|
|
import json |
|
|
import logging |
|
|
import os |
|
|
import random |
|
|
import re |
|
|
import subprocess |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
import requests |
|
|
import tiktoken |
|
|
|
|
|
from src.download_swebench_leaderboard import download_leaderboard |
|
|
|
|
|
|
|
|
# Cache of tiktoken encoders, keyed by encoding name (populated by get_tokenizer).
_tokenizer_cache = {}

# On-disk layout: trajectories, leaderboard snapshot and the price table all
# live under data/.
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
# Source bucket for the SWE-bench "bash-only" experiment artifacts.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
# Upstream price / context-window table maintained by the litellm project.
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
LOG_DIR = Path("logs")

# Models offered as quick picks in the UI. Entries are either a plain model id
# or an (id, display_label) pair.
# NOTE(review): only the last entry is a tuple — confirm all consumers handle
# both shapes.
QUICK_SELECT_MODELS = [
    "openrouter/anthropic/claude-opus-4.5",
    "openrouter/anthropic/claude-sonnet-4.5",
    "openrouter/google/gemini-3-pro-preview",
    "openrouter/openai/gpt-5-codex",
    "openrouter/openai/gpt-oss-120b",
    "deepinfra/Qwen/Qwen3-14B",
    "deepinfra/Qwen/Qwen3-32B",
    "deepinfra/Qwen/Qwen3-73B",
    "deepinfra/Qwen/Qwen3-235B-A22B",
    "deepinfra/Qwen/Qwen3-30B-A3B",
    ("deepinfra/Qwen/Qwen3-Coder-480B-A35B-Instruct", "Qwen3-Coder-480B-A35B"),
]

LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / "app.log"

# Log to both a file and stdout; force=True replaces any handlers that
# imported libraries may have installed before this point.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force=True,
)
|
|
|
|
|
|
|
|
def _log_unhandled(exc_type, exc_value, exc_traceback):
    """Global excepthook: route uncaught exceptions through logging.

    KeyboardInterrupt is forwarded to the default hook so Ctrl-C keeps its
    normal traceback behavior instead of being swallowed into the log file.
    """
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return
    logging.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))


# Install the hook so crashes end up in logs/app.log as well as stdout.
sys.excepthook = _log_unhandled
|
|
|
|
|
# Process-wide memoization caches.
_litellm_prices_cache = None       # raw litellm price table (all modes)
_litellm_chat_prices_cache = None  # price table filtered to mode == "chat"
_trajectories_cache = {}           # presumably loaded trajectory data — usage not visible in this chunk
_calculated_tokens_cache = {}      # per-folder recomputed token DataFrames
_trajectory_steps_cache = {}       # per-folder parsed step sequences
|
|
|
|
|
|
|
|
def calculate_routing_tokens(steps: list[dict]) -> dict:
    """
    Simulate prompt caching across a step sequence and total tokens per model.

    Args:
        steps: list of dicts with keys:
            - model: str (model name)
            - system_user: int (tokens for system/user message, usually only step 0)
            - completion: int (generated tokens)
            - observation: int or None (env response tokens, None for last step)

    Returns:
        dict with per-model totals:
        {model_name: {cache_read, uncached_input, completion, observation, cache_creation}}
    """
    caches: dict[str, int] = {}
    totals: dict[str, dict] = {}

    context_so_far = 0
    last_observation = 0

    for idx, step in enumerate(steps):
        model = step["model"]
        system_user = step.get("system_user", 0)
        completion = step.get("completion", 0)
        observation = step.get("observation") or 0

        caches.setdefault(model, 0)
        bucket = totals.setdefault(model, {
            "cache_read": 0,
            "uncached_input": 0,
            "completion": 0,
            "observation": 0,
            "cache_creation": 0,
        })

        cache_read = caches[model]

        # The first step pays for the system+user prompt; later steps replay
        # the accumulated context plus the previous observation, minus what
        # this model already holds in cache.
        if idx == 0:
            uncached_input = system_user
        else:
            uncached_input = (context_so_far + last_observation) - cache_read

        # Everything newly sent plus the new completion becomes cached.
        cache_creation = uncached_input + completion
        caches[model] = cache_read + cache_creation

        bucket["cache_read"] += cache_read
        bucket["uncached_input"] += uncached_input
        bucket["completion"] += completion
        bucket["observation"] += observation
        bucket["cache_creation"] += cache_creation

        context_so_far = cache_read + uncached_input + completion
        last_observation = observation

    return totals
|
|
|
|
|
|
|
|
def calculate_per_step_tokens(steps: list[dict]) -> list[dict]:
    """
    Simulate prompt caching over a step sequence and return a per-step breakdown.

    Returns list of per-step data:
    [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
    """
    breakdown: list[dict] = []
    cached_tokens = 0
    context_so_far = 0
    last_observation = 0

    for idx, step in enumerate(steps):
        completion = step.get("completion", 0)
        observation = step.get("observation") or 0

        cache_read = cached_tokens

        # Step 0 pays only for the system/user prompt; later steps replay the
        # whole prior context plus the last observation, minus the cache hit.
        if idx == 0:
            uncached_input = step.get("system_user", 0)
        else:
            uncached_input = (context_so_far + last_observation) - cache_read

        cache_creation = uncached_input + completion
        cached_tokens = cache_read + cache_creation

        breakdown.append({
            "step": idx,
            "cache_read": cache_read,
            "uncached_input": uncached_input,
            "completion": completion,
            "cache_creation": cache_creation,
        })

        context_so_far = cache_read + uncached_input + completion
        last_observation = observation

    return breakdown
|
|
|
|
|
|
|
|
def _parse_usage_from_log_line(line: str) -> dict | None: |
|
|
""" |
|
|
Parse usage info from log line containing ModelResponse or similar format. |
|
|
Returns dict with prompt_tokens, completion_tokens, cached_tokens, etc. |
|
|
""" |
|
|
if "usage=" not in line: |
|
|
return None |
|
|
|
|
|
result = {} |
|
|
|
|
|
for field in ["completion_tokens", "prompt_tokens", "total_tokens"]: |
|
|
match = re.search(rf'{field}=(\d+)', line) |
|
|
if match: |
|
|
result[field] = int(match.group(1)) |
|
|
|
|
|
cached_match = re.search(r'cached_tokens=(\d+)', line) |
|
|
if cached_match: |
|
|
result["cached_tokens"] = int(cached_match.group(1)) |
|
|
|
|
|
return result if result else None |
|
|
|
|
|
|
|
|
def _parse_old_format_log(log_path: Path) -> list[dict]:
    """
    Parse an old SWE-agent format .info.log file into per-step token usage.

    Each line containing "usage=Usage(" yields one step. Best-effort: read or
    parse failures are logged at debug level and whatever was collected so far
    is returned (possibly an empty list).
    """
    parsed_steps: list[dict] = []

    try:
        with open(log_path, "r", encoding="utf-8") as fh:
            for raw_line in fh:
                if "usage=Usage(" not in raw_line:
                    continue

                usage = _parse_usage_from_log_line(raw_line)
                if not usage:
                    continue

                cached = usage.get("cached_tokens", 0)
                prompt = usage.get("prompt_tokens", 0)

                parsed_steps.append({
                    "step": len(parsed_steps),
                    "cache_read": cached,
                    # Old logs don't separate cache-creation; everything not
                    # cached counts as plain input.
                    "uncached_input": max(0, prompt - cached),
                    "completion": usage.get("completion_tokens", 0),
                    "cache_creation": 0,
                })
    except Exception as e:
        logging.debug("Error parsing log file %s: %s", log_path, e)

    return parsed_steps
|
|
|
|
|
|
|
|
def parse_trajectory_metadata_per_step(traj_path: Path) -> list[dict]:
    """
    Extract per-step token metadata from a trajectory file's usage fields.

    Supports both new format (.traj.json with messages[].extra.response.usage)
    and old format (.traj with separate .info.log file).

    Returns list of per-step data:
    [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
    """
    with open(traj_path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)

    per_step: list[dict] = []

    for msg in payload.get("messages", []):
        if msg.get("role") != "assistant":
            continue

        # Usage may live directly on the message or nested under
        # extra.response (litellm-style responses).
        if "usage" in msg:
            usage = msg["usage"]
        elif isinstance(msg.get("extra"), dict):
            response = msg["extra"].get("response", {})
            usage = response.get("usage", {}) if isinstance(response, dict) else None
        else:
            usage = None

        if not usage:
            continue

        prompt = usage.get("prompt_tokens", 0) or 0
        completion = usage.get("completion_tokens", 0) or 0
        cache_read = usage.get("cache_read_input_tokens", 0) or 0
        cache_creation = usage.get("cache_creation_input_tokens", 0) or 0

        # OpenAI-style responses report cache hits under prompt_tokens_details.
        details = usage.get("prompt_tokens_details", {})
        if isinstance(details, dict):
            detail_cached = details.get("cached_tokens", 0) or 0
            if detail_cached > 0 and cache_read == 0:
                cache_read = detail_cached

        per_step.append({
            "step": len(per_step),
            "cache_read": cache_read,
            "uncached_input": max(0, prompt - cache_read - cache_creation),
            "completion": completion,
            "cache_creation": cache_creation,
        })

    if per_step:
        return per_step

    # Old format fallback: token usage lives in a sibling .info.log file.
    log_path = traj_path.with_suffix(".info.log")
    if not log_path.exists():
        base_name = traj_path.stem.replace(".traj", "")
        log_path = traj_path.parent / f"{base_name}.info.log"

    if log_path.exists():
        return _parse_old_format_log(log_path)

    return per_step
|
|
|
|
|
|
|
|
def load_all_trajectory_metadata_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load per-step metadata for all trajectories in a folder.

    Returns:
        dict mapping instance_id -> list of per-step metadata
    """
    output_dir = TRAJS_DIR / folder

    # Try glob patterns from most to least specific; first non-empty match wins.
    traj_files: list[Path] = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    parsed: dict[str, list[dict]] = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_metadata_per_step(traj_path)
            if steps:
                parsed[instance_id] = steps
        except Exception as e:
            logging.error("Error parsing metadata steps for %s: %s", traj_path, e, exc_info=True)

    return parsed
|
|
|
|
|
|
|
|
def create_single_trajectory_meta_chart(steps: list[dict]):
    """Create stacked bar chart for a single trajectory showing metadata tokens per step.

    Args:
        steps: per-step dicts with step / cache_read / uncached_input /
            completion / cache_creation keys (token counts).

    Returns:
        plotly Figure, or None when steps is empty.
    """
    # NOTE: the redundant local `import plotly.graph_objects as go` was
    # removed — it shadowed the identical module-level import.
    if not steps:
        return None

    x_labels = [f"Step {d['step']}" for d in steps]

    # (trace label, source key, bar color) — one stacked segment per category.
    series = [
        ("Uncached Input", "uncached_input", "#EF553B"),
        ("Cache Read", "cache_read", "#19D3F3"),
        ("Cache Creation", "cache_creation", "#FFA15A"),
        ("Completion", "completion", "#AB63FA"),
    ]

    fig = go.Figure()
    for label, key, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[d[key] / 1e3 for d in steps],  # tokens in thousands
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: %{{y:.2f}}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
|
|
|
|
|
|
|
def create_single_trajectory_meta_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create stacked bar chart for a single trajectory showing metadata cost per step.

    Args:
        steps: per-step dicts with cache_read / uncached_input / completion /
            cache_creation token counts.
        input_price, cache_read_price, cache_creation_price, completion_price:
            prices in $ per 1M tokens.

    Returns:
        plotly Figure, or None when steps is empty.
    """
    # NOTE: the redundant local `import plotly.graph_objects as go` was
    # removed — it shadowed the identical module-level import.
    if not steps:
        return None

    x_labels = [f"Step {d['step']}" for d in steps]

    # (trace label, source key, $/1M price, bar color)
    series = [
        ("Uncached Input", "uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion", completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, key, price, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[d[key] * price / 1e6 for d in steps],  # dollars
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: $%{{y:.4f}}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
|
|
|
|
|
|
|
def create_single_trajectory_chart(steps: list[dict], overhead: float = 1.0, with_cache: bool = True):
    """Create stacked bar chart for a single trajectory showing tokens per step.

    Args:
        steps: raw step dicts understood by calculate_per_step_tokens.
        overhead: tokenizer overhead multiplier applied to every count.
        with_cache: when False, all prompt tokens are shown as uncached input.

    Returns:
        plotly Figure, or None when steps is empty.
    """
    # NOTE: the redundant local `import plotly.graph_objects as go` was
    # removed — it shadowed the identical module-level import.
    if not steps:
        return None

    per_step_data = calculate_per_step_tokens(steps)

    x_labels = [f"Step {d['step']}" for d in per_step_data]
    cache_read_raw = [d["cache_read"] * overhead for d in per_step_data]
    cache_creation_raw = [d["cache_creation"] * overhead for d in per_step_data]
    completion_raw = [d["completion"] * overhead for d in per_step_data]
    prompt_raw = [(d["cache_read"] + d["uncached_input"]) * overhead for d in per_step_data]

    if with_cache:
        uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_raw, cache_read_raw, cache_creation_raw)]
        cache_read = cache_read_raw
        cache_creation = cache_creation_raw
    else:
        # Without caching, every prompt token is plain uncached input.
        uncached = prompt_raw
        cache_read = [0] * len(per_step_data)
        cache_creation = [0] * len(per_step_data)

    # (trace label, token counts, bar color)
    series = [
        ("Uncached Input", uncached, "#EF553B"),
        ("Cache Read", cache_read, "#19D3F3"),
        ("Cache Creation", cache_creation, "#FFA15A"),
        ("Completion", completion_raw, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, counts, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[c / 1e3 for c in counts],  # tokens in thousands
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: %{{y:.2f}}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
|
|
|
|
|
|
|
def create_single_trajectory_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float, overhead: float = 1.0, with_cache: bool = True):
    """Create stacked bar chart for a single trajectory showing cost per step.

    Args:
        steps: raw step dicts understood by calculate_per_step_tokens.
        input_price, cache_read_price, cache_creation_price, completion_price:
            prices in $ per 1M tokens.
        overhead: tokenizer overhead multiplier applied to every count.
        with_cache: when False, all prompt tokens are billed at the input rate.

    Returns:
        plotly Figure, or None when steps is empty.
    """
    # NOTE: the redundant local `import plotly.graph_objects as go` was
    # removed — it shadowed the identical module-level import.
    if not steps:
        return None

    per_step_data = calculate_per_step_tokens(steps)

    x_labels = [f"Step {d['step']}" for d in per_step_data]
    cache_read_raw = [d["cache_read"] * overhead for d in per_step_data]
    cache_creation_raw = [d["cache_creation"] * overhead for d in per_step_data]
    completion_raw = [d["completion"] * overhead for d in per_step_data]
    prompt_raw = [(d["cache_read"] + d["uncached_input"]) * overhead for d in per_step_data]

    if with_cache:
        uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_raw, cache_read_raw, cache_creation_raw)]
        cache_read = cache_read_raw
        cache_creation = cache_creation_raw
    else:
        # Without caching, every prompt token is billed at the input rate.
        uncached = prompt_raw
        cache_read = [0] * len(per_step_data)
        cache_creation = [0] * len(per_step_data)

    # (trace label, token counts, $/1M price, bar color)
    series = [
        ("Uncached Input", uncached, input_price, "#EF553B"),
        ("Cache Read", cache_read, cache_read_price, "#19D3F3"),
        ("Cache Creation", cache_creation, cache_creation_price, "#FFA15A"),
        ("Completion", completion_raw, completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, counts, price, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[c * price / 1e6 for c in counts],  # dollars
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: $%{{y:.4f}}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
|
|
|
|
|
|
|
def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
    """
    Parse trajectory file into step format for calculate_routing_tokens.

    Returns list of steps with:
    - model: base model name
    - system_user: tokens for system + user message (step 0 only)
    - completion: assistant response tokens
    - observation: env response tokens (None for last step)

    Token counts are estimated from message text with the model's tokenizer.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    messages = data.get("messages", [])
    trajectory_data = data.get("trajectory", [])

    # Some dumps carry a "trajectory" array instead of chat messages.
    if not messages and trajectory_data:
        return _parse_trajectory_format_to_steps(trajectory_data, model_name)

    if not messages:
        return []

    count_tokens, _ = get_tokenizer(model_name)

    steps = []
    system_user_tokens = 0

    # BUG FIX: the original while-loop only advanced its index for the
    # system/user/assistant roles, so any other role (e.g. "tool") hung the
    # parser in an infinite loop. Iterating the list directly removes that
    # failure mode; unknown roles are now simply skipped. Also dropped the
    # unused current_completion/pending_observation locals.
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        if isinstance(content, list):
            content = json.dumps(content)
        tokens = count_tokens(str(content))

        if role == "system":
            system_user_tokens += tokens
        elif role == "user":
            if not steps:
                # Pre-assistant user turns count toward the initial prompt.
                system_user_tokens += tokens
            else:
                # Later user turns are the environment's observation for the
                # previous assistant step.
                steps[-1]["observation"] = tokens
        elif role == "assistant":
            steps.append({
                "model": model_name,
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
                "content": str(content),
            })
            system_user_tokens = 0

    return steps
|
|
|
|
|
|
|
|
def _parse_trajectory_format_to_steps(trajectory_data: list, model_name: str) -> list[dict]:
    """
    Parse the alternative trajectory format (a "trajectory" array) into steps.

    Token counts are estimated with the model's tokenizer, since this format
    carries raw text rather than usage metadata.
    """
    count_tokens, _ = get_tokenizer(model_name)

    steps = []
    for idx, entry in enumerate(trajectory_data):
        response_text = entry.get("response", "")
        observation_text = entry.get("observation", "")

        # Only the first step carries the system/user prompt.
        prompt_tokens = 0
        if idx == 0:
            for message in entry.get("query", []):
                content = message.get("content", "")
                if isinstance(content, list):
                    content = json.dumps(content)
                prompt_tokens += count_tokens(str(content))

        steps.append({
            "model": model_name,
            "system_user": prompt_tokens,
            "completion": count_tokens(str(response_text)) if response_text else 0,
            "observation": count_tokens(str(observation_text)) if observation_text else None,
            "content": str(response_text) if response_text else "",
        })

    return steps
|
|
|
|
|
|
|
|
def get_default_overhead(model_name: str) -> float:
    """Return the default tokenizer-overhead multiplier for a model's provider."""
    lowered = (model_name or "").lower()

    # Anthropic models get a correction factor; Gemini, OpenAI and every
    # other provider use the counts as-is.
    if "claude" in lowered or "anthropic" in lowered:
        return 1.24
    return 1.0
|
|
|
|
|
|
|
|
def get_tokenizer(model_name: str):
    """Return (count_fn, tokenizer_name) suited to *model_name*.

    count_fn maps a string to an estimated token count. Gemini models use a
    chars-per-token approximation; everything else uses a tiktoken encoding
    (memoized in _tokenizer_cache).
    """
    global _tokenizer_cache

    lowered = model_name.lower() if model_name else ""

    if "gpt-4o" in lowered or "o1" in lowered or "o3" in lowered:
        encoding_name = "o200k_base"
    elif "gpt" in lowered or "claude" in lowered or "anthropic" in lowered:
        encoding_name = "cl100k_base"
    elif "gemini" in lowered or "google" in lowered:
        # No public Gemini tokenizer; approximate by character ratio.
        return lambda text: int(len(text) / 3.23), "gemini_approx"
    else:
        # Unknown providers fall back to the cl100k encoding.
        encoding_name = "cl100k_base"

    encoder = _tokenizer_cache.get(encoding_name)
    if encoder is None:
        encoder = tiktoken.get_encoding(encoding_name)
        _tokenizer_cache[encoding_name] = encoder

    return lambda text: len(encoder.encode(text)), encoding_name
|
|
|
|
|
|
|
|
def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
    """Scale every token-count column by *overhead* and recompute total_tokens.

    Returns the input DataFrame untouched when it is empty or the multiplier
    is exactly 1.0; otherwise returns a scaled copy.
    """
    if df.empty or overhead == 1.0:
        return df

    scaled = df.copy()
    for col in ("prompt_tokens", "completion_tokens", "cache_read_tokens", "cache_creation_tokens"):
        scaled[col] = (scaled[col] * overhead).astype(int)
    scaled["total_tokens"] = scaled["prompt_tokens"] + scaled["completion_tokens"]
    return scaled
|
|
|
|
|
|
|
|
def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
    """Zero out cache columns so every token is treated as uncached input.

    Returns the input DataFrame untouched when empty, otherwise a copy.
    """
    if df.empty:
        return df

    uncached = df.copy()
    uncached["cache_read_tokens"] = 0
    uncached["cache_creation_tokens"] = 0
    return uncached
|
|
|
|
|
|
|
|
def ensure_token_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure the token-count columns exist and hold integers.

    Missing columns are added as 0; existing ones (and total_tokens, if
    present) are coerced to numeric with NaNs replaced by 0. None/empty
    frames pass through unchanged.
    """
    if df is None or df.empty:
        return df

    out = df.copy()
    for col in ("prompt_tokens", "completion_tokens", "cache_read_tokens", "cache_creation_tokens"):
        if col not in out.columns:
            out[col] = 0
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0).astype(int)

    if "total_tokens" in out.columns:
        out["total_tokens"] = pd.to_numeric(out["total_tokens"], errors="coerce").fillna(0).astype(int)

    return out
|
|
|
|
|
|
|
|
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
    """Load trajectories with self-calculated token counts via calculate_routing_tokens.

    Results are memoized per folder in _calculated_tokens_cache.
    """
    global _calculated_tokens_cache

    cache_key = f"calculated_{folder}"
    if cache_key in _calculated_tokens_cache:
        return ensure_token_columns(_calculated_tokens_cache[cache_key])

    rows = []
    for instance_id, steps in load_all_trajectory_steps(folder).items():
        if not steps:
            continue

        try:
            model = steps[0].get("model", "")
            totals = calculate_routing_tokens(steps).get(model, {})

            cache_read = totals.get("cache_read", 0)
            completion = totals.get("completion", 0)
            prompt = cache_read + totals.get("uncached_input", 0)

            rows.append({
                "instance_id": instance_id,
                "model_name": model,
                "api_calls": len(steps),
                "instance_cost": 0,
                "prompt_tokens": prompt,
                "completion_tokens": completion,
                "total_tokens": prompt + completion,
                "cache_read_tokens": cache_read,
                "cache_creation_tokens": totals.get("cache_creation", 0),
            })
        except Exception as e:
            logging.error("Error calculating tokens for %s: %s", instance_id, e, exc_info=True)

    df = ensure_token_columns(pd.DataFrame(rows))
    _calculated_tokens_cache[cache_key] = df
    return df
|
|
|
|
|
|
|
|
def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load all trajectories as step sequences for routing calculations.

    Results are memoized per folder in _trajectory_steps_cache.

    Returns:
        dict mapping instance_id -> list of steps for calculate_routing_tokens
    """
    global _trajectory_steps_cache

    cache_key = f"steps_{folder}"
    if cache_key in _trajectory_steps_cache:
        return _trajectory_steps_cache[cache_key]

    output_dir = TRAJS_DIR / folder

    # Try glob patterns from most to least specific; first non-empty match wins.
    traj_files: list[Path] = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    # The model name is read once from the first trajectory's config.
    model_name = ""
    if traj_files:
        try:
            # FIX: added encoding="utf-8", consistent with every other file
            # open in this module.
            with open(traj_files[0], "r", encoding="utf-8") as f:
                first_data = json.load(f)
            config = first_data.get("info", {}).get("config", {}).get("model", {})
            model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
        except Exception:
            # Best-effort: fall back to an empty model name.
            pass

    result = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_to_steps(traj_path, model_name)
            if steps:
                result[instance_id] = steps
        except Exception as e:
            logging.error("Error parsing steps for %s: %s", traj_path, e, exc_info=True)

    _trajectory_steps_cache[cache_key] = result
    return result
|
|
|
|
|
|
|
|
def refresh_litellm_prices() -> bool:
    """Force refresh litellm prices from remote. Returns True if successful.

    On success the in-memory caches are updated (the derived chat-only view
    is invalidated) and the raw table is written to disk.
    """
    global _litellm_prices_cache, _litellm_chat_prices_cache

    try:
        resp = requests.get(LITELLM_PRICES_URL, timeout=30)
        resp.raise_for_status()
        _litellm_prices_cache = resp.json()
        # Invalidate the filtered view so it is rebuilt from fresh data.
        _litellm_chat_prices_cache = None

        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as fh:
            json.dump(_litellm_prices_cache, fh)
        logging.info("Successfully refreshed litellm prices")
        return True
    except Exception as e:
        logging.warning(f"Failed to refresh litellm prices: {e}")
        return False
|
|
|
|
|
|
|
|
def get_litellm_prices_raw() -> dict:
    """Get raw litellm prices (all modes, unfiltered).

    Resolution order: in-memory cache, on-disk cache, then the network.
    Degrades to an empty dict when offline with no cached copy.
    """
    global _litellm_prices_cache

    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    # Prefer the on-disk cache before hitting the network.
    if LITELLM_PRICES_CACHE.exists():
        with open(LITELLM_PRICES_CACHE) as fh:
            _litellm_prices_cache = json.load(fh)
        return _litellm_prices_cache

    try:
        resp = requests.get(LITELLM_PRICES_URL, timeout=30)
        resp.raise_for_status()
        _litellm_prices_cache = resp.json()

        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as fh:
            json.dump(_litellm_prices_cache, fh)
    except Exception:
        # Offline and no cache: degrade to an empty price table.
        _litellm_prices_cache = {}

    return _litellm_prices_cache
|
|
|
|
|
|
|
|
def get_litellm_prices() -> dict:
    """Get litellm prices filtered to chat models only (memoized)."""
    global _litellm_chat_prices_cache

    if _litellm_chat_prices_cache is None:
        _litellm_chat_prices_cache = {
            name: info
            for name, info in get_litellm_prices_raw().items()
            if isinstance(info, dict) and info.get("mode") == "chat"
        }
    return _litellm_chat_prices_cache
|
|
|
|
|
|
|
|
def get_litellm_model_list() -> list[str]:
    """Get the sorted list of chat model names from litellm prices."""
    # Sorting the dict iterates its keys.
    return sorted(get_litellm_prices())
|
|
|
|
|
|
|
|
def normalize_model_name(name: str) -> str:
    """Normalize model name for comparison: lowercase, remove separators."""
    # Strip the separator characters "-", "_", "." and "/" in one pass.
    return name.lower().translate(str.maketrans("", "", "-_./"))
|
|
|
|
|
|
|
|
def _search_model_in_prices(model_name: str, prices: dict) -> dict | None:
    """Search for a model in the prices dict using several name variations.

    Tries exact keys first (with/without provider prefix and trailing date),
    then falls back to fuzzy matching on separator-stripped names.
    """
    clean = model_name.replace("anthropic/", "").replace("openai/", "")
    undated = re.sub(r'-\d{8}$', '', clean)

    # Exact-key candidates, most specific first.
    for candidate in (
        model_name,
        clean,
        undated,
        f"anthropic/{clean}",
        f"openai/{clean}",
        f"anthropic/{undated}",
        f"openai/{undated}",
    ):
        if candidate in prices:
            return prices[candidate]

    norm_clean = normalize_model_name(clean)
    norm_undated = normalize_model_name(undated)

    # Fuzzy pass: substring match on the full key, then exact match against
    # the provider-less tail of the key.
    for key, entry in prices.items():
        norm_key = normalize_model_name(key)
        if norm_clean in norm_key or norm_undated in norm_key:
            return entry
        tail = key.rsplit('/', 1)[-1]
        norm_tail = normalize_model_name(tail)
        if norm_clean == norm_tail or norm_undated == norm_tail:
            return entry

    return None
|
|
|
|
|
|
|
|
def get_model_prices(model_name: str) -> dict | None:
    """Resolve the litellm price entry for *model_name*.

    If the model is missing from the cached price table, the on-disk
    litellm cache is refreshed once and the lookup retried — new models
    may have been added upstream since the cache was written.
    Returns the price dict, or None when the model cannot be found.
    """
    if not model_name:
        return None

    result = _search_model_in_prices(model_name, get_litellm_prices())

    # One retry after refreshing the cached price table from upstream.
    if result is None and LITELLM_PRICES_CACHE.exists():
        logging.info(f"Model '{model_name}' not found in litellm prices, refreshing cache...")
        if refresh_litellm_prices():
            result = _search_model_in_prices(model_name, get_litellm_prices())
            if result is None:
                logging.warning(f"Model '{model_name}' still not found after refresh")

    return result
|
|
|
|
|
|
|
|
def _read_leaderboard_cache():
    """Load and parse the on-disk leaderboard JSON cache."""
    with open(LEADERBOARD_CACHE) as f:
        return json.load(f)


def load_or_download_leaderboard(force_refresh: bool = False):
    """Return leaderboard data, downloading a fresh copy when needed.

    Args:
        force_refresh: when True, bypass the cache and re-download.

    Returns:
        The parsed leaderboard JSON.

    Raises:
        Re-raises the download error when the download fails and no
        cached copy exists to fall back on.
    """
    if not force_refresh and LEADERBOARD_CACHE.exists():
        return _read_leaderboard_cache()

    try:
        filename = download_leaderboard(output_dir=str(DATA_DIR))
        # os.replace (not os.rename): overwrites an existing cache file,
        # which os.rename refuses to do on Windows (FileExistsError) —
        # exactly the force_refresh=True case.
        os.replace(filename, LEADERBOARD_CACHE)
        logging.info("Successfully downloaded fresh leaderboard data")
    except Exception as e:
        logging.warning(f"Failed to download leaderboard: {e}")
        # Best effort: serve stale data rather than fail, when available.
        if LEADERBOARD_CACHE.exists():
            logging.info("Using cached leaderboard data")
            return _read_leaderboard_cache()
        raise

    return _read_leaderboard_cache()
|
|
|
|
|
|
|
|
def get_bash_only_df():
    """Build a display DataFrame for the 'bash-only' leaderboard.

    Returns an empty DataFrame when that leaderboard section is missing.
    """
    data = load_or_download_leaderboard()
    bash_only = next(
        (lb for lb in data.get("leaderboards", []) if lb["name"] == "bash-only"),
        None,
    )
    if not bash_only:
        return pd.DataFrame()

    def _fmt_resolved(value):
        # Numeric scores get one-decimal percent formatting; anything else verbatim.
        if isinstance(value, (int, float)):
            return f"{value:.1f}%"
        return str(value)

    rows = [
        {
            "name": entry.get("name", ""),
            "% resolved": _fmt_resolved(entry.get("resolved", 0)),
            "date": entry.get("date", ""),
            "cost": round(entry.get("cost") or 0, 2),
            "instance_cost": round(entry.get("instance_cost") or 0, 4),
            "instance_calls": entry.get("instance_calls") or 0,
            "folder": entry.get("folder", ""),
            "os_model": "✅" if entry.get("os_model") else "❌",
        }
        for entry in bash_only["results"]
    ]
    return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
def get_model_details(folder: str):
    """Find the bash-only leaderboard entry whose folder matches *folder*.

    Returns (model_dict, None) on success, or (None, error_message) when
    the selection, leaderboard, or model cannot be resolved.
    """
    if not folder:
        return None, "Select a model from the table"

    data = load_or_download_leaderboard()
    bash_only = next(
        (lb for lb in data.get("leaderboards", []) if lb["name"] == "bash-only"),
        None,
    )
    if not bash_only:
        return None, "Leaderboard not found"

    for entry in bash_only["results"]:
        if entry.get("folder") == folder:
            return entry, None
    return None, f"Model with folder '{folder}' not found"
|
|
|
|
|
|
|
|
def check_trajectories_downloaded(folder: str) -> bool:
    """True if a non-empty local trajectory directory exists for *folder*."""
    if not folder:
        return False
    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    # any() on the iterator is a cheap "directory has at least one entry" check.
    return any(target.iterdir())
|
|
|
|
|
|
|
|
def _count_trajectory_files(output_dir: Path) -> int:
    """Count trajectory files under *output_dir*, trying known layouts in order."""
    for pattern in ("*/*.traj.json", "*/*.traj", "*.json"):
        count = len(list(output_dir.glob(pattern)))
        if count:
            return count
    return 0


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download SWE-bench trajectories for *folder* from the public S3 bucket.

    Skips the download when a non-empty local copy already exists.

    Returns:
        A (status_message, gr.update) tuple; the update toggles visibility
        of the downstream analysis UI.
    """
    if not folder:
        return "❌ No model selected", gr.update(visible=False)

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}", gr.update(visible=False)

    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)

    progress(0, desc="Starting S3 download...")

    try:
        # --no-sign-request: the bucket is public; no AWS credentials needed.
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )

        if result.returncode != 0:
            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)

        file_count = _count_trajectory_files(output_dir)
        if file_count == 0:
            return f"❌ No trajectory files found on S3 for {folder}", gr.update(visible=False)

        # Summarize resolution stats from the leaderboard entry.
        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)

        if total_count > 0:
            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
        else:
            resolved_pct = "N/A"

        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
        return status, gr.update(visible=True)

    except subprocess.TimeoutExpired:
        return "❌ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        # `aws` executable missing from PATH.
        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"❌ Error: {e}", gr.update(visible=False)
|
|
|
|
|
|
|
|
def parse_trajectory(traj_path: Path) -> dict:
    """Parse one trajectory file into a flat stats dict.

    Handles two on-disk formats: newer files with a top-level "messages"
    list carrying per-message token usage, and older files with a
    "trajectory" step list (whose token counts may only exist in a
    sibling ``.info.log`` file).

    Returns a dict with instance_id, model_name, api_calls, instance_cost,
    and prompt/completion/total/cache token counts.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    info = data.get("info", {})
    model_stats = info.get("model_stats", {})
    config = info.get("config", {})
    model_config = config.get("model", {})
    # Prefer the cost-calculation override when present, else the raw model name.
    model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))

    trajectory_steps = data.get("trajectory", [])
    # Old format: has a "trajectory" step list and no top-level "messages".
    is_trajectory_format = len(trajectory_steps) > 0 and "messages" not in data

    # Heuristic model detection for old-format files with no recorded name:
    # scan system prompts for a llama/meta mention.
    if is_trajectory_format and not model_name:
        for step in trajectory_steps:
            query = step.get("query", [])
            for q in query:
                if q.get("role") == "system":
                    content = q.get("content", "")
                    if "llama" in content.lower() or "meta" in content.lower():
                        model_name = "llama"
                        break
            if model_name:
                break

    api_calls = model_stats.get("api_calls", 0)
    # Old format lacks an api_calls stat; approximate with the step count.
    if api_calls == 0 and is_trajectory_format:
        api_calls = len(trajectory_steps)

    result = {
        "instance_id": data.get("instance_id", traj_path.stem),
        "model_name": model_name,
        "api_calls": api_calls,
        "instance_cost": model_stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    # Accumulate token usage from the messages list (new format).
    messages = data.get("messages", [])
    for msg in messages:
        # Usage may live directly on the message or nested under extra.response.
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif "extra" in msg and isinstance(msg["extra"], dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})

        if usage:
            # "or 0" guards against explicit nulls in the JSON.
            result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
            result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
            result["total_tokens"] += usage.get("total_tokens", 0) or 0

            # Anthropic-style cache fields.
            cache_read = usage.get("cache_read_input_tokens", 0) or 0
            cache_creation = usage.get("cache_creation_input_tokens", 0) or 0

            # OpenAI-style cache accounting under prompt_tokens_details;
            # only used when the Anthropic-style field was absent/zero.
            prompt_tokens_details = usage.get("prompt_tokens_details", {})
            if isinstance(prompt_tokens_details, dict):
                cached_from_details = prompt_tokens_details.get("cached_tokens", 0) or 0
                if cached_from_details > 0 and cache_read == 0:
                    cache_read = cached_from_details

            result["cache_read_tokens"] += cache_read
            result["cache_creation_tokens"] += cache_creation

    # No usage data in the JSON: fall back to the sibling .info.log
    # produced by the old pipeline.
    if result["prompt_tokens"] == 0 and result["completion_tokens"] == 0:
        log_path = traj_path.with_suffix(".info.log")
        if not log_path.exists():
            # Some layouts name the log after the stem without ".traj".
            base_name = traj_path.stem.replace(".traj", "")
            log_path = traj_path.parent / f"{base_name}.info.log"

        if log_path.exists():
            steps = _parse_old_format_log(log_path)
            for step_data in steps:
                result["prompt_tokens"] += step_data["cache_read"] + step_data["uncached_input"]
                result["completion_tokens"] += step_data["completion"]
                result["cache_read_tokens"] += step_data["cache_read"]
            result["total_tokens"] = result["prompt_tokens"] + result["completion_tokens"]
            if result["api_calls"] == 0:
                result["api_calls"] = len(steps)

    return result
|
|
|
|
|
|
|
|
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory file under TRAJS_DIR/<folder> into a DataFrame.

    Results are memoized in the module-level _trajectories_cache.
    Individual files that fail to parse are logged and skipped.
    """
    global _trajectories_cache

    if folder in _trajectories_cache:
        return ensure_token_columns(_trajectories_cache[folder])

    output_dir = TRAJS_DIR / folder

    # Different uploads use different layouts; take the first glob pattern
    # that matches anything instead of a chain of repeated if-empty checks.
    traj_files = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    rows = []
    for traj_path in traj_files:
        try:
            rows.append(parse_trajectory(traj_path))
        except Exception as e:
            logging.error("Error parsing %s: %s", traj_path, e, exc_info=True)

    df = ensure_token_columns(pd.DataFrame(rows))
    _trajectories_cache[folder] = df
    return df
|
|
|
|
|
|
|
|
def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create Total Cost by Token Type chart (can be called separately for price updates)"""
    if df.empty:
        return None

    # Uncached input = prompt tokens minus cache reads/writes, floored at 0.
    uncached = (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]).clip(lower=0)

    token_totals = {
        "Uncached Input": uncached.sum(),
        "Cache Read": df["cache_read_tokens"].sum(),
        "Cache Creation": df["cache_creation_tokens"].sum(),
        "Completion": df["completion_tokens"].sum(),
    }
    # Price per 1M tokens for each type, in the same order.
    per_million = {
        "Uncached Input": input_price,
        "Cache Read": cache_read_price,
        "Cache Creation": cache_creation_price,
        "Completion": completion_price,
    }
    costs = [token_totals[kind] * per_million[kind] / 1e6 for kind in token_totals]

    cost_data = pd.DataFrame({
        "Token Type": list(token_totals),
        "Cost ($)": costs,
    })

    fig = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig.update_layout(
        xaxis_title="",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )

    # Overall total shown in the chart corner.
    fig.add_annotation(
        text=f"Total: ${sum(costs):.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    return fig
|
|
|
|
|
|
|
|
def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create only token-related charts (for source switching)

    Returns a (token-totals bar, cost-by-type bar, per-trajectory stacked
    bar) tuple, or (None, None, None) when df is empty.
    """
    if df.empty:
        return None, None, None

    # Aggregate token counts per type across all trajectories.
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    df_temp = df.copy()
    # Uncached input = prompt tokens not covered by cache reads/writes; clip guards against negatives.
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()

    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens (M)": [total_uncached_input / 1e6, total_cache_read / 1e6, total_cache_creation / 1e6, total_completion / 1e6],
    })

    # Chart 1: total tokens per type.
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens (M)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )
    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(
        text=f"Total: {total_all/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Chart 2: cost by token type, shared with the price-update path.
    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Chart 3: per-trajectory stacked bars, sorted by total tokens (largest first).
    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    df_sorted["trajectory_idx"] = range(len(df_sorted))

    fig_stacked = go.Figure()
    fig_stacked.add_trace(go.Bar(
        name="Uncached Input", x=df_sorted["trajectory_idx"], y=df_sorted["uncached_input_tokens"] / 1e6,
        marker_color="#EF553B", hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:.2f}M<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Read", x=df_sorted["trajectory_idx"], y=df_sorted["cache_read_tokens"] / 1e6,
        marker_color="#19D3F3", hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:.2f}M<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Creation", x=df_sorted["trajectory_idx"], y=df_sorted["cache_creation_tokens"] / 1e6,
        marker_color="#FFA15A", hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:.2f}M<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Completion", x=df_sorted["trajectory_idx"], y=df_sorted["completion_tokens"] / 1e6,
        marker_color="#AB63FA", hovertemplate="Trajectory: %{x}<br>Completion: %{y:.2f}M<extra></extra>",
    ))
    fig_stacked.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig_tokens, fig_tokens_cost, fig_stacked
|
|
|
|
|
|
|
|
def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the main dashboard charts for a trajectory DataFrame.

    Returns a 5-tuple: (steps histogram, cost histogram, token-totals bar,
    cost-by-type bar, per-trajectory stacked bar), or five Nones when df
    is empty.
    """
    if df.empty:
        return None, None, None, None, None

    # Chart 1: histogram of API-call counts per trajectory.
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_steps.add_annotation(
        text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Chart 2: histogram of per-trajectory cost.
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_cost.add_annotation(
        text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Aggregate token counts per type across all trajectories.
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()

    df_temp = df.copy()
    # Uncached input = prompt tokens not covered by cache reads/writes; clip guards against negatives.
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()

    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Tokens (M)": [total_uncached_input / 1e6, total_cache_read / 1e6, total_cache_creation / 1e6, total_completion / 1e6],
    })

    # Chart 3: total tokens per type.
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Tokens (M)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )

    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(
        text=f"Total: {total_all/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Chart 4: cost by token type, shared with the price-update path.
    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Chart 5: per-trajectory stacked bars, sorted by total tokens (largest first).
    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    df_sorted["trajectory_idx"] = range(len(df_sorted))

    fig_stacked = go.Figure()

    fig_stacked.add_trace(go.Bar(
        name="Uncached Input",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["uncached_input_tokens"] / 1e6,
        marker_color="#EF553B",
        hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:.3f}M<extra></extra>",
    ))

    fig_stacked.add_trace(go.Bar(
        name="Cache Read",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["cache_read_tokens"] / 1e6,
        marker_color="#19D3F3",
        hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:.3f}M<extra></extra>",
    ))

    fig_stacked.add_trace(go.Bar(
        name="Cache Creation",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["cache_creation_tokens"] / 1e6,
        marker_color="#FFA15A",
        hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:.3f}M<extra></extra>",
    ))

    fig_stacked.add_trace(go.Bar(
        name="Completion",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["completion_tokens"] / 1e6,
        marker_color="#AB63FA",
        hovertemplate="Trajectory: %{x}<br>Completion: %{y:.3f}M<extra></extra>",
    ))

    fig_stacked.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked
|
|
|
|
|
|
|
|
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Stacked per-trajectory cost chart, one bar segment per token type.

    Trajectories are sorted by their total token count (descending) so
    the most token-hungry runs appear first. Returns None when df is empty.
    """
    if df.empty:
        return None

    plot_df = df.copy()
    # Uncached input = prompt tokens minus cache reads/writes, floored at 0.
    plot_df["uncached_input_tokens"] = (plot_df["prompt_tokens"] - plot_df["cache_read_tokens"] - plot_df["cache_creation_tokens"]).clip(lower=0)
    plot_df["total_stacked"] = (
        plot_df["uncached_input_tokens"]
        + plot_df["cache_read_tokens"]
        + plot_df["cache_creation_tokens"]
        + plot_df["completion_tokens"]
    )
    plot_df = plot_df.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    plot_df["trajectory_idx"] = range(len(plot_df))

    # (label, token column, $/1M price, bar color) for each stacked segment.
    segments = [
        ("Uncached Input", "uncached_input_tokens", input_price, "#EF553B"),
        ("Cache Read", "cache_read_tokens", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", cache_creation_price, "#FFA15A"),
        ("Completion", "completion_tokens", completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    total_cost = 0.0
    for label, column, price, color in segments:
        segment_cost = plot_df[column] * price / 1e6
        total_cost += segment_cost.sum()
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=plot_df["trajectory_idx"],
            y=segment_cost,
            marker_color=color,
            hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )

    return fig
|
|
|
|
|
|
|
|
def extract_model_from_folder(folder: str) -> str:
    """Extract model name from folder like '20251124_mini-v1.16.0_claude-opus-4-5-20251101'"""
    if not folder:
        return ""
    # Folder layout is <date>_<agent-version>_<model...>; everything after
    # the second underscore-delimited part is the model name.
    prefix_count = 2
    parts = folder.split("_")
    if len(parts) > prefix_count:
        return "_".join(parts[prefix_count:])
    return folder
|
|
|
|
|
|
|
|
def get_prices_for_folder(folder: str) -> tuple[dict, str]:
    """Get prices from litellm based on folder name.

    Returns (prices_dict, model_name) where prices_dict has 'value' and 'found' for each price type."""
    model_hint = extract_model_from_folder(folder)

    result = {
        key: {"value": 0, "found": False}
        for key in ("input", "cache_read", "cache_creation", "completion")
    }

    if not model_hint:
        return result, ""

    prices = get_model_prices(model_hint)
    if prices:
        scale = 1e6  # litellm stores $/token; the UI works in $/1M tokens.
        known = {
            "input": prices.get("input_cost_per_token", 0) * scale,
            "cache_read": prices.get("cache_read_input_token_cost", 0) * scale,
            "cache_creation": prices.get("cache_creation_input_token_cost", 0) * scale,
            "completion": prices.get("output_cost_per_token", 0) * scale,
        }
        for key, value in known.items():
            result[key] = {"value": value, "found": value > 0}

        input_price = known["input"]
        completion = known["completion"]

        # Fill gaps with rule-of-thumb ratios: cache read ≈ 10% of input,
        # cache creation ≈ 125% of input, completion ≈ 5x input.
        if input_price > 0:
            if not result["cache_read"]["found"]:
                result["cache_read"]["value"] = input_price * 0.1
            if not result["cache_creation"]["found"]:
                result["cache_creation"]["value"] = input_price * 1.25
            if not result["completion"]["found"]:
                result["completion"]["value"] = input_price * 5
        elif completion > 0:
            # No input price available: back out an estimate from completion.
            estimated_input = completion / 5
            if not result["input"]["found"]:
                result["input"]["value"] = estimated_input
            if not result["cache_read"]["found"]:
                result["cache_read"]["value"] = estimated_input * 0.1
            if not result["cache_creation"]["found"]:
                result["cache_creation"]["value"] = estimated_input * 1.25

    return result, model_hint
|
|
|
|
|
|
|
|
def _build_selection_payload(row_idx: int | None, df: pd.DataFrame):
    """Build the Gradio component updates for a leaderboard row selection.

    Returns (folder, name, visibility update, four price-field updates,
    model hint, overhead update); a cleared payload when nothing is selected.
    """
    if df is None or df.empty or row_idx is None:
        # Nothing selected: clear every field and hide the detail panel.
        return (
            "", "",
            gr.update(visible=False),
            gr.update(value=0, label="Input"),
            gr.update(value=0, label="Cache Read"),
            gr.update(value=0, label="Cache Creation"),
            gr.update(value=0, label="Completion"),
            "",
            gr.update(value=1.0),
        )

    selected = df.iloc[row_idx]
    folder = selected["folder"]
    display_name = selected["name"]

    prices_dict, model_hint = get_prices_for_folder(folder)
    default_overhead = get_default_overhead(model_hint)

    def _price_field(info, label):
        # ✅ = price came from litellm; ❌ (est.) = heuristic fallback; ❌ = unknown.
        if info["found"]:
            return gr.update(value=info["value"], label=f"✅ {label}")
        if info["value"] > 0:
            return gr.update(value=info["value"], label=f"❌ {label} (est.)")
        return gr.update(value=0, label=f"❌ {label}")

    return (
        folder, display_name,
        gr.update(visible=True),
        _price_field(prices_dict["input"], "Input"),
        _price_field(prices_dict["cache_read"], "Cache Read"),
        _price_field(prices_dict["cache_creation"], "Cache Creation"),
        _price_field(prices_dict["completion"], "Completion"),
        model_hint,
        gr.update(value=default_overhead),
    )
|
|
|
|
|
|
|
|
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a table row-click event and forward the row index to the payload builder."""
    row_idx = None
    if evt is not None and evt.index is not None:
        idx = evt.index
        # Gradio may deliver the index as (row, col) or as a bare int.
        row_idx = idx[0] if isinstance(idx, (list, tuple)) else idx
    return _build_selection_payload(row_idx, df)
|
|
|
|
|
|
|
|
def select_first_row(df: pd.DataFrame):
    """Auto-select the first leaderboard row, or clear the selection if the table is empty."""
    if df is None or df.empty:
        return _build_selection_payload(None, df)
    return _build_selection_payload(0, df)
|
|
|
|
|
|
|
|
def create_routed_token_chart(original_tokens: dict, base_tokens: dict, additional_models: list, base_model_name: str = "Base"):
    """
    Create grouped+stacked bar chart comparing Calculated vs Routed tokens.

    Two offset groups share each x category: "calculated" holds the single
    no-routing reference bar (hatched), "routed" stacks the base model's
    share plus each additional model's share.

    Args:
        original_tokens: dict with uncached_input, cache_read, cache_creation, completion (from Calculated)
        base_tokens: dict with uncached_input, cache_read, cache_creation, completion (base portion in routing)
        additional_models: list of (model_name, tokens_dict) tuples
        base_model_name: name of the base model
    """
    # Uses the module-level `import plotly.graph_objects as go`; the previous
    # function-local re-import and the unused base_color_light were removed.
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    token_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
    base_color_dark = "#636EFA"
    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    fig = go.Figure()

    # No-routing reference bar: translucent fill plus hatch so it reads as a baseline.
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [no routing]",
        x=categories,
        y=[original_tokens.get(k, 0) / 1e6 for k in token_keys],
        marker_color="rgba(99, 110, 250, 0.3)",
        marker_line_color=base_color_dark,
        marker_line_width=1,
        marker_pattern_shape="/",
        marker_pattern_fgcolor=base_color_dark,
        offsetgroup="calculated",
        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: %{y:.3f}M<extra></extra>",
    ))

    # Base model's share of routed traffic (bottom of the routed stack).
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [with routing]",
        x=categories,
        y=[base_tokens.get(k, 0) / 1e6 for k in token_keys],
        marker_color=base_color_dark,
        offsetgroup="routed",
        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: %{y:.3f}M<extra></extra>",
    ))

    # Each routed-to model stacks on top of the base bar.
    for i, (model_name, tokens) in enumerate(additional_models):
        fig.add_trace(go.Bar(
            name=model_name or f"Model {i+1}",
            x=categories,
            y=[tokens.get(k, 0) / 1e6 for k in token_keys],
            marker_color=model_colors[i % len(model_colors)],
            offsetgroup="routed",
            hovertemplate="%{x}<br>" + (model_name or f"Model {i+1}") + ": %{y:.3f}M<extra></extra>",
        ))

    original_total = sum(original_tokens.get(k, 0) for k in token_keys)
    routed_total = sum(base_tokens.get(k, 0) for k in token_keys) + sum(
        sum(m[1].get(k, 0) for k in token_keys) for m in additional_models
    )

    annotation_lines = [
        f"<b>No routing: {original_total/1e6:.2f}M</b>",
        f"<b>With routing: {routed_total/1e6:.2f}M</b>",
    ]

    fig.update_layout(
        yaxis_title="Tokens (M)",
        barmode="stack",
        bargroupgap=0.1,
        margin=dict(l=40, r=40, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
|
|
|
|
|
|
|
|
def create_routed_cost_chart(original_costs: dict, base_costs: dict, additional_models: list, base_model_name: str = "Base"):
    """
    Create a grouped+stacked bar chart comparing Calculated vs Routed costs.

    Two offset groups share each x category: "calculated" holds the base
    model's cost without routing (hatched outline), while "routed" stacks the
    base model's remaining share together with every additional routing model.
    A text annotation summarizes both totals.

    Args:
        original_costs: dict with uncached_input, cache_read, cache_creation, completion (from Calculated)
        base_costs: dict with uncached_input, cache_read, cache_creation, completion (base portion in routing)
        additional_models: list of (model_name, costs_dict) tuples
        base_model_name: name of the base model

    Returns:
        A plotly Figure with one bar trace per model plus the totals annotation.
    """
    # NOTE: uses the module-level `import plotly.graph_objects as go`; the
    # previous function-local re-import was redundant and has been removed.
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    cost_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
    base_color_dark = "#636EFA"
    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    fig = go.Figure()

    # "No routing" reference bar: translucent fill + hatch so it reads as a baseline.
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [no routing]",
        x=categories,
        y=[original_costs.get(k, 0) for k in cost_keys],
        marker_color="rgba(99, 110, 250, 0.3)",
        marker_line_color=base_color_dark,
        marker_line_width=1,
        marker_pattern_shape="/",
        marker_pattern_fgcolor=base_color_dark,
        offsetgroup="calculated",
        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: $%{y:.2f}<extra></extra>",
    ))

    # Base model's share of the routed configuration (bottom of the stack).
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [with routing]",
        x=categories,
        y=[base_costs.get(k, 0) for k in cost_keys],
        marker_color=base_color_dark,
        offsetgroup="routed",
        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: $%{y:.2f}<extra></extra>",
    ))

    # One stacked segment per routing model; colors cycle if > 4 models.
    for i, (model_name, costs) in enumerate(additional_models):
        display_name = model_name or f"Model {i+1}"
        fig.add_trace(go.Bar(
            name=display_name,
            x=categories,
            y=[costs.get(k, 0) for k in cost_keys],
            marker_color=model_colors[i % len(model_colors)],
            offsetgroup="routed",
            hovertemplate="%{x}<br>" + display_name + ": $%{y:.2f}<extra></extra>",
        ))

    original_total = sum(original_costs.get(k, 0) for k in cost_keys)
    routed_total = sum(base_costs.get(k, 0) for k in cost_keys) + sum(
        sum(m[1].get(k, 0) for k in cost_keys) for m in additional_models
    )

    annotation_lines = [
        f"<b>No routing: ${original_total:.2f}</b>",
        f"<b>With routing: ${routed_total:.2f}</b>",
    ]

    fig.update_layout(
        yaxis_title="Cost ($)",
        barmode="stack",
        bargroupgap=0.1,
        margin=dict(l=40, r=40, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
|
|
|
|
|
|
|
|
def build_app(): |
|
|
leaderboard_df = get_bash_only_df() |
|
|
|
|
|
with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app: |
|
|
gr.HTML(""" |
|
|
<style> |
|
|
.quick-select-row { |
|
|
flex-wrap: wrap !important; |
|
|
gap: 6px !important; |
|
|
margin-bottom: 8px !important; |
|
|
} |
|
|
.quick-select-row button { |
|
|
background: white !important; |
|
|
color: #333 !important; |
|
|
border: 1px solid #ccc !important; |
|
|
border-radius: 4px !important; |
|
|
padding: 4px 10px !important; |
|
|
font-size: 12px !important; |
|
|
transition: all 0.15s ease !important; |
|
|
} |
|
|
.quick-select-row button:hover { |
|
|
background: #f0f0f0 !important; |
|
|
border-color: #999 !important; |
|
|
} |
|
|
</style> |
|
|
""") |
|
|
trajectories_state = gr.State(None) |
|
|
|
|
|
gr.Markdown("# 🧮 SWE-bench Costs Calculator `v0.3.46`") |
|
|
gr.Markdown("### *Calculate cost savings with different routing strategies.*") |
|
|
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=3): |
|
|
leaderboard_table = gr.Dataframe( |
|
|
value=leaderboard_df, |
|
|
label="Bash-Only Leaderboard", |
|
|
interactive=False, |
|
|
wrap=True, |
|
|
elem_id="leaderboard-table", |
|
|
) |
|
|
|
|
|
with gr.Column(visible=False) as analysis_section: |
|
|
gr.Markdown("## 📊 Trajectory Analysis") |
|
|
|
|
|
with gr.Accordion("Leaderboard data", open=True): |
|
|
with gr.Row(): |
|
|
plot_steps = gr.Plot(label="Distribution of API Calls (Steps) per Trajectory") |
|
|
plot_cost = gr.Plot(label="Distribution of Cost Reported by Leaderboard ($)") |
|
|
|
|
|
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [AGGREGATED ALL]", open=True): |
|
|
with gr.Row(): |
|
|
plot_tokens_meta = gr.Plot(label="Total Tokens by Type") |
|
|
plot_tokens_cost_meta = gr.Plot(label="Total Cost by Token Type ($)") |
|
|
|
|
|
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [AGGREGATED BY TRAJECTORY]", open=True): |
|
|
with gr.Row(): |
|
|
plot_stacked_meta = gr.Plot(label="Tokens per Trajectory (stacked)") |
|
|
with gr.Row(): |
|
|
plot_cost_breakdown_meta = gr.Plot(label="Cost per Trajectory") |
|
|
|
|
|
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [ONE TRAJECTORY]", open=True, visible=False) as single_traj_meta_accordion: |
|
|
with gr.Row(): |
|
|
single_traj_meta_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True) |
|
|
with gr.Row(): |
|
|
single_traj_meta_plot = gr.Plot(label="Tokens per Step (stacked)") |
|
|
with gr.Row(): |
|
|
single_traj_meta_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)") |
|
|
|
|
|
with gr.Accordion("Token counts CALCULATED from .traj files [AGGREGATED ALL]", open=True): |
|
|
with gr.Row(): |
|
|
plot_tokens_calc = gr.Plot(label="Total Tokens by Type") |
|
|
plot_tokens_cost_calc = gr.Plot(label="Total Cost by Token Type ($)") |
|
|
|
|
|
with gr.Accordion("Token counts CALCULATED from .traj files [AGGREGATED BY TRAJECTORY]", open=True): |
|
|
with gr.Row(): |
|
|
plot_stacked_calc = gr.Plot(label="Tokens per Trajectory (stacked)") |
|
|
with gr.Row(): |
|
|
plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory") |
|
|
|
|
|
with gr.Accordion("Token counts CALCULATED from .traj files [ONE TRAJECTORY]", open=True, visible=False) as single_traj_accordion: |
|
|
with gr.Row(): |
|
|
single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True) |
|
|
with gr.Row(): |
|
|
single_traj_plot = gr.Plot(label="Tokens per Step (stacked)") |
|
|
with gr.Row(): |
|
|
single_traj_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)") |
|
|
|
|
|
with gr.Accordion("Token counts CALCULATED from .traj files, with ROUTING [AGGREGATED ALL]", open=True, visible=False) as routing_plots_row: |
|
|
with gr.Row(): |
|
|
routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)") |
|
|
routing_cost_plot = gr.Plot(label="Cost by Type (per Model) ($)") |
|
|
gr.Markdown("*With routing all messages in the trajectory remain as they are, but messages that match the selected filters are assigned to selected models for routing to.*") |
|
|
|
|
|
with gr.Column(scale=1): |
|
|
selected_folder = gr.State("") |
|
|
gr.Markdown("### Selected Model") |
|
|
selected_name = gr.Textbox(label="Model Name", interactive=False) |
|
|
|
|
|
analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary") |
|
|
download_status = gr.Textbox(label="Status", interactive=False, lines=3) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*") |
|
|
detected_model = gr.Textbox(label="Detected Model", interactive=False) |
|
|
with gr.Row(): |
|
|
price_input = gr.Number(label="Input", value=0, precision=2, scale=1) |
|
|
price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1) |
|
|
price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1) |
|
|
price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### 🔢 Calculated Token Options") |
|
|
thinking_overhead = gr.Number( |
|
|
label="Tokenizer Overhead", |
|
|
value=1.21, |
|
|
precision=2, |
|
|
info="Multiplier for Calculated tokens (tiktoken → native)", |
|
|
) |
|
|
use_cache = gr.Checkbox( |
|
|
label="Use Cache", |
|
|
value=True, |
|
|
info="If disabled, all tokens are Uncached Input or Completion", |
|
|
) |
|
|
|
|
|
gr.Markdown("---") |
|
|
add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False) |
|
|
gr.Markdown("*With routing all messages in the trajectory remain as they are, but messages that match the selected filters are assigned to selected models for routing to.*") |
|
|
|
|
|
with gr.Column(visible=False) as routing_section: |
|
|
gr.Markdown("### 🔀 Routing Models") |
|
|
|
|
|
with gr.Column(): |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 1") |
|
|
with gr.Row(elem_classes=["quick-select-row"]): |
|
|
quick_btns_1 = [] |
|
|
for item in QUICK_SELECT_MODELS: |
|
|
if isinstance(item, tuple): |
|
|
model, short_name = item |
|
|
else: |
|
|
model = item |
|
|
short_name = model.split("/")[-1] |
|
|
btn = gr.Button(short_name, size="sm", scale=0, min_width=80) |
|
|
quick_btns_1.append((btn, model)) |
|
|
routing_model_1 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_1_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
|
|
|
add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as routing_block_2: |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 2") |
|
|
with gr.Row(elem_classes=["quick-select-row"]): |
|
|
quick_btns_2 = [] |
|
|
for item in QUICK_SELECT_MODELS: |
|
|
if isinstance(item, tuple): |
|
|
model, short_name = item |
|
|
else: |
|
|
model = item |
|
|
short_name = model.split("/")[-1] |
|
|
btn = gr.Button(short_name, size="sm", scale=0, min_width=80) |
|
|
quick_btns_2.append((btn, model)) |
|
|
routing_model_2 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_2_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
|
|
|
add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as routing_block_3: |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 3") |
|
|
with gr.Row(elem_classes=["quick-select-row"]): |
|
|
quick_btns_3 = [] |
|
|
for item in QUICK_SELECT_MODELS: |
|
|
if isinstance(item, tuple): |
|
|
model, short_name = item |
|
|
else: |
|
|
model = item |
|
|
short_name = model.split("/")[-1] |
|
|
btn = gr.Button(short_name, size="sm", scale=0, min_width=80) |
|
|
quick_btns_3.append((btn, model)) |
|
|
routing_model_3 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_3_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### 🎯 Router Strategy") |
|
|
|
|
|
selected_strategy = gr.Radio( |
|
|
choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Resolved/Unresolved", "Replace part of trajectory"], |
|
|
value="Random router", |
|
|
label="", |
|
|
interactive=True, |
|
|
) |
|
|
num_routing_models = gr.State(1) |
|
|
|
|
|
with gr.Column(visible=True) as random_block: |
|
|
random_hint = gr.Markdown("*Weights must sum to 1.0*") |
|
|
weight_base = gr.Number(label="Base weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True) |
|
|
weight_model_1 = gr.Number(label="Model 1 weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True) |
|
|
weight_model_2 = gr.Number(label="Model 2 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False) |
|
|
weight_model_3 = gr.Number(label="Model 3 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as every_k_block: |
|
|
every_k_hint = gr.Markdown("*First model has priority on overlaps*") |
|
|
k_model_1 = gr.Number(label="k₁ (Model 1)", value=2, minimum=1, precision=0, interactive=True) |
|
|
k_model_2 = gr.Number(label="k₂ (Model 2)", value=3, minimum=1, precision=0, interactive=True, visible=False) |
|
|
k_model_3 = gr.Number(label="k₃ (Model 3)", value=5, minimum=1, precision=0, interactive=True, visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as slice_block: |
|
|
slice_hint = gr.Markdown("*First model has priority on overlaps*") |
|
|
slice_model_1 = gr.Textbox(label="M1 slice", value="[0::3]", interactive=True) |
|
|
slice_model_2 = gr.Textbox(label="M2 slice", value="[1::3]", interactive=True, visible=False) |
|
|
slice_model_3 = gr.Textbox(label="M3 slice", value="[2::3]", interactive=True, visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as grep_block: |
|
|
grep_hint = gr.Markdown("*Use `|` for OR, `&` for AND (don't mix). First model has priority on overlaps*") |
|
|
grep_model_1 = gr.Textbox(label="M1 grep", value="ls|find", interactive=True) |
|
|
grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False) |
|
|
grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as resolved_block: |
|
|
resolved_hint = gr.Markdown("*Route all steps based on trajectory resolution status*") |
|
|
resolved_model = gr.Dropdown( |
|
|
label="Model for resolved trajectories", |
|
|
choices=["Base", "M1", "M2", "M3"], |
|
|
value="Base", |
|
|
interactive=True, |
|
|
) |
|
|
unresolved_model = gr.Dropdown( |
|
|
label="Model for unresolved trajectories", |
|
|
choices=["Base", "M1", "M2", "M3"], |
|
|
value="M1", |
|
|
interactive=True, |
|
|
) |
|
|
|
|
|
with gr.Column(visible=False) as part_block: |
|
|
part_hint = gr.Markdown("*Ranges must not overlap*") |
|
|
part_mode = gr.Radio( |
|
|
choices=["Indexes", "Percentages"], |
|
|
value="Percentages", |
|
|
label="Mode", |
|
|
interactive=True, |
|
|
) |
|
|
start_1 = gr.Number(label="M1 Start", value=0, minimum=0, precision=0, interactive=True) |
|
|
end_1 = gr.Number(label="M1 End", value=30, minimum=0, precision=0, interactive=True) |
|
|
start_2 = gr.Number(label="M2 Start", value=30, minimum=0, precision=0, interactive=True, visible=False) |
|
|
end_2 = gr.Number(label="M2 End", value=60, minimum=0, precision=0, interactive=True, visible=False) |
|
|
start_3 = gr.Number(label="M3 Start", value=60, minimum=0, precision=0, interactive=True, visible=False) |
|
|
end_3 = gr.Number(label="M3 End", value=100, minimum=0, precision=0, interactive=True, visible=False) |
|
|
|
|
|
gr.Markdown("---") |
|
|
route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False) |
|
|
routing_result = gr.Markdown(visible=False) |
|
|
|
|
|
|
|
|
def toggle_routing_section():
    """Make the routing configuration column visible."""
    reveal = gr.update(visible=True)
    return reveal
|
|
|
|
|
add_routing_btn.click( |
|
|
fn=toggle_routing_section, |
|
|
outputs=[routing_section], |
|
|
) |
|
|
|
|
|
def on_strategy_change(strategy, num_models):
    """Toggle visibility of the per-strategy option blocks and their widgets.

    Returns 34 gr.update(visible=...) values in the fixed order expected by
    the `selected_strategy.change` outputs list: 6 strategy blocks, then the
    widgets inside each block (Model 2/3 widgets only appear when that many
    routing models are configured).
    """
    random_on = strategy == "Random router"
    every_k_on = strategy == "Every k-th step"
    slice_on = strategy == "Python list slices"
    grep_on = strategy == "Grep"
    resolved_on = strategy == "Resolved/Unresolved"
    part_on = strategy == "Replace part of trajectory"
    m2 = num_models >= 2
    m3 = num_models >= 3

    visibility_flags = [
        # the six strategy container blocks
        random_on, every_k_on, slice_on, grep_on, resolved_on, part_on,
        # random: hint, base weight, M1 weight, M2 weight, M3 weight
        random_on, random_on, random_on, random_on and m2, random_on and m3,
        # every-k: hint, k1, k2, k3
        every_k_on, every_k_on, every_k_on and m2, every_k_on and m3,
        # slices: hint, s1, s2, s3
        slice_on, slice_on, slice_on and m2, slice_on and m3,
        # grep: hint, g1, g2, g3
        grep_on, grep_on, grep_on and m2, grep_on and m3,
        # resolved: hint, resolved dropdown, unresolved dropdown
        resolved_on, resolved_on, resolved_on,
        # part: hint, mode, start1, end1, start2, end2, start3, end3
        part_on, part_on, part_on, part_on,
        part_on and m2, part_on and m2, part_on and m3, part_on and m3,
    ]
    return [gr.update(visible=flag) for flag in visibility_flags]
|
|
|
|
|
selected_strategy.change( |
|
|
fn=on_strategy_change, |
|
|
inputs=[selected_strategy, num_routing_models], |
|
|
outputs=[ |
|
|
random_block, every_k_block, slice_block, grep_block, resolved_block, part_block, |
|
|
random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3, |
|
|
every_k_hint, k_model_1, k_model_2, k_model_3, |
|
|
slice_hint, slice_model_1, slice_model_2, slice_model_3, |
|
|
grep_hint, grep_model_1, grep_model_2, grep_model_3, |
|
|
resolved_hint, resolved_model, unresolved_model, |
|
|
part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3, |
|
|
], |
|
|
) |
|
|
|
|
|
def filter_models(query):
    """Filter models based on search query (starts at 3 chars)"""
    if not query or len(query) < 3:
        return gr.update(choices=[])
    needle = query.lower()
    matches = []
    for candidate in get_litellm_model_list():
        if needle in candidate.lower():
            matches.append(candidate)
            # Cap the dropdown at 50 suggestions.
            if len(matches) == 50:
                break
    return gr.update(choices=matches)
|
|
|
|
|
routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1]) |
|
|
routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2]) |
|
|
routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3]) |
|
|
|
|
|
def make_quick_select_fn_1(full_model_name):
    """Build a click handler that fills the Model-1 slot with *full_model_name*."""
    def handler():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (
            gr.update(value=full_model_name),
            *price_updates,
            gr.update(visible=True),      # reveal "+ Add another model"
            gr.update(interactive=True),  # enable the ROUTE button
        )
    return handler
|
|
|
|
|
def make_quick_select_fn_2(full_model_name):
    """Build a click handler that fills the Model-2 slot with *full_model_name*."""
    def handler():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (
            gr.update(value=full_model_name),
            *price_updates,
            gr.update(visible=True),  # reveal the "add Model 3" button
        )
    return handler
|
|
|
|
|
def make_quick_select_fn_3(full_model_name):
    """Build a click handler that fills the Model-3 slot with *full_model_name*."""
    def handler():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (gr.update(value=full_model_name), *price_updates)
    return handler
|
|
|
|
|
for btn, full_model in quick_btns_1: |
|
|
btn.click( |
|
|
fn=make_quick_select_fn_1(full_model), |
|
|
outputs=[routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn] |
|
|
) |
|
|
|
|
|
for btn, full_model in quick_btns_2: |
|
|
btn.click( |
|
|
fn=make_quick_select_fn_2(full_model), |
|
|
outputs=[routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn] |
|
|
) |
|
|
|
|
|
for btn, full_model in quick_btns_3: |
|
|
btn.click( |
|
|
fn=make_quick_select_fn_3(full_model), |
|
|
outputs=[routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion] |
|
|
) |
|
|
|
|
|
def get_routing_prices_with_labels(model_name):
    """Get all 4 prices for a routing model with found/estimated labels"""
    field_names = ("Input", "Cache Read", "Cache Creation", "Completion")

    # No model selected: reset every field to zero with a plain label.
    if not model_name:
        return tuple(gr.update(value=0, label=name) for name in field_names)

    model_prices = get_litellm_prices().get(model_name, {})

    def per_million(key):
        # litellm stores $/token; the UI shows $/1M tokens.
        return model_prices.get(key, 0) * 1e6

    input_price = per_million("input_cost_per_token")
    cache_read = per_million("cache_read_input_token_cost")
    cache_creation = per_million("cache_creation_input_token_cost")
    completion = per_million("output_cost_per_token")

    input_found = input_price > 0
    cache_read_found = cache_read > 0
    cache_creation_found = cache_creation > 0
    completion_found = completion > 0

    # Missing cache prices are estimated from the input price
    # (10% for reads, 125% for creation); their labels keep the ❌ marker.
    if input_found:
        if not cache_read_found:
            cache_read = input_price * 0.1
        if not cache_creation_found:
            cache_creation = input_price * 1.25

    values = (input_price, cache_read, cache_creation, completion)
    found_flags = (input_found, cache_read_found, cache_creation_found, completion_found)
    return tuple(
        gr.update(value=value, label=("✅ " if found else "❌ ") + name)
        for value, found, name in zip(values, found_flags, field_names)
    )
|
|
|
|
|
def on_routing_model_1_select(model_name):
    """Refresh Model-1 prices; enable follow-up controls once a model is set."""
    has_model = bool(model_name)
    return (
        *get_routing_prices_with_labels(model_name),
        gr.update(visible=has_model),      # "+ Add another model" button
        gr.update(interactive=has_model),  # ROUTE button
    )
|
|
|
|
|
def on_routing_model_2_select(model_name):
    """Refresh Model-2 prices; show the "add Model 3" button once a model is set."""
    has_model = bool(model_name)
    return (
        *get_routing_prices_with_labels(model_name),
        gr.update(visible=has_model),
    )
|
|
|
|
|
def on_routing_model_3_select(model_name):
    """Refresh Model-3 price fields for the selected model."""
    price_updates = get_routing_prices_with_labels(model_name)
    return price_updates
|
|
|
|
|
routing_model_1.change( |
|
|
fn=on_routing_model_1_select, |
|
|
inputs=[routing_model_1], |
|
|
outputs=[routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn], |
|
|
) |
|
|
|
|
|
def show_model_2(strategy):
    """Reveal the Model-2 block and its strategy inputs; set model count to 2.

    Output order: routing_block_2, add_model_2_btn, weight_model_2, k_model_2,
    slice_model_2, grep_model_2, start_2, end_2, num_routing_models.
    """
    is_part = strategy == "Replace part of trajectory"
    per_strategy_flags = [
        strategy == "Random router",       # weight_model_2
        strategy == "Every k-th step",     # k_model_2
        strategy == "Python list slices",  # slice_model_2
        strategy == "Grep",                # grep_model_2
    ]
    updates = [gr.update(visible=True), gr.update(visible=False)]
    updates.extend(gr.update(visible=flag) for flag in per_strategy_flags)
    updates.extend([gr.update(visible=is_part), gr.update(visible=is_part)])
    return (*updates, 2)
|
|
|
|
|
add_model_2_btn.click( |
|
|
fn=show_model_2, |
|
|
inputs=[selected_strategy], |
|
|
outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, slice_model_2, grep_model_2, start_2, end_2, num_routing_models], |
|
|
) |
|
|
|
|
|
routing_model_2.change( |
|
|
fn=on_routing_model_2_select, |
|
|
inputs=[routing_model_2], |
|
|
outputs=[routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn], |
|
|
) |
|
|
|
|
|
def show_model_3(strategy):
    """Reveal the Model-3 block and its strategy inputs; set model count to 3.

    Output order: routing_block_3, add_model_3_btn, weight_model_3, k_model_3,
    slice_model_3, grep_model_3, start_3, end_3, num_routing_models.
    """
    is_part = strategy == "Replace part of trajectory"
    per_strategy_flags = [
        strategy == "Random router",       # weight_model_3
        strategy == "Every k-th step",     # k_model_3
        strategy == "Python list slices",  # slice_model_3
        strategy == "Grep",                # grep_model_3
    ]
    updates = [gr.update(visible=True), gr.update(visible=False)]
    updates.extend(gr.update(visible=flag) for flag in per_strategy_flags)
    updates.extend([gr.update(visible=is_part), gr.update(visible=is_part)])
    return (*updates, 3)
|
|
|
|
|
add_model_3_btn.click( |
|
|
fn=show_model_3, |
|
|
inputs=[selected_strategy], |
|
|
outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, slice_model_3, grep_model_3, start_3, end_3, num_routing_models], |
|
|
) |
|
|
|
|
|
routing_model_3.change( |
|
|
fn=on_routing_model_3_select, |
|
|
inputs=[routing_model_3], |
|
|
outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion], |
|
|
) |
|
|
|
|
|
def run_routing( |
|
|
state_data, |
|
|
base_input, base_cache_read, base_cache_creation, base_completion, |
|
|
routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion, |
|
|
routing_model_2_val, r2_input, r2_cache_read, r2_cache_creation, r2_completion, |
|
|
routing_model_3_val, r3_input, r3_cache_read, r3_cache_creation, r3_completion, |
|
|
strategy_val, |
|
|
weight_base_val, weight_1_val, weight_2_val, weight_3_val, |
|
|
k_1_val, k_2_val, k_3_val, |
|
|
slice_1_val, slice_2_val, slice_3_val, |
|
|
grep_1_val, grep_2_val, grep_3_val, |
|
|
resolved_model_val, unresolved_model_val, |
|
|
part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val, |
|
|
overhead, with_cache, |
|
|
detected_model_val |
|
|
): |
|
|
if state_data is None: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ No trajectories loaded. Click 'Load & Analyze' first."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
if not routing_model_1_val: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ Please select at least one routing model."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
trajectory_steps = state_data.get("steps", {}) |
|
|
resolved_instances = state_data.get("resolved", {}) |
|
|
if not trajectory_steps: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ No trajectory steps data available."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
|
|
|
df_calc = state_data.get("calculated") |
|
|
if df_calc is not None and not df_calc.empty: |
|
|
df_for_cost = apply_thinking_overhead(df_calc.copy(), overhead) |
|
|
if not with_cache: |
|
|
df_for_cost = apply_no_cache(df_for_cost) |
|
|
df_temp = df_for_cost.copy() |
|
|
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0) |
|
|
total_original_cost_from_df = ( |
|
|
df_temp["uncached_input"].sum() * base_input / 1e6 + |
|
|
df_for_cost["cache_read_tokens"].sum() * base_cache_read / 1e6 + |
|
|
df_for_cost["cache_creation_tokens"].sum() * base_cache_creation / 1e6 + |
|
|
df_for_cost["completion_tokens"].sum() * base_completion / 1e6 |
|
|
) |
|
|
else: |
|
|
total_original_cost_from_df = None |
|
|
|
|
|
base_prices = { |
|
|
"input": base_input, |
|
|
"cache_read": base_cache_read, |
|
|
"cache_creation": base_cache_creation, |
|
|
"completion": base_completion, |
|
|
} |
|
|
|
|
|
routing_models = [] |
|
|
if routing_model_1_val: |
|
|
routing_models.append({ |
|
|
"name": routing_model_1_val, |
|
|
"prices": {"input": r1_input, "cache_read": r1_cache_read, "cache_creation": r1_cache_creation, "completion": r1_completion}, |
|
|
}) |
|
|
if routing_model_2_val: |
|
|
routing_models.append({ |
|
|
"name": routing_model_2_val, |
|
|
"prices": {"input": r2_input, "cache_read": r2_cache_read, "cache_creation": r2_cache_creation, "completion": r2_completion}, |
|
|
}) |
|
|
if routing_model_3_val: |
|
|
routing_models.append({ |
|
|
"name": routing_model_3_val, |
|
|
"prices": {"input": r3_input, "cache_read": r3_cache_read, "cache_creation": r3_cache_creation, "completion": r3_completion}, |
|
|
}) |
|
|
|
|
|
if strategy_val == "Replace part of trajectory": |
|
|
ranges = [(start_1_val, end_1_val)] |
|
|
if len(routing_models) > 1: |
|
|
ranges.append((start_2_val, end_2_val)) |
|
|
if len(routing_models) > 2: |
|
|
ranges.append((start_3_val, end_3_val)) |
|
|
for i, (s, e) in enumerate(ranges): |
|
|
if s >= e: |
|
|
yield (gr.update(visible=True, value=f"❌ Model {i+1}: Start must be less than End"), gr.update(visible=False), None, None) |
|
|
return |
|
|
for i in range(len(ranges)): |
|
|
for j in range(i+1, len(ranges)): |
|
|
s1, e1 = ranges[i] |
|
|
s2, e2 = ranges[j] |
|
|
if not (e1 <= s2 or e2 <= s1): |
|
|
yield (gr.update(visible=True, value=f"❌ Model {i+1} and Model {j+1} ranges overlap"), gr.update(visible=False), None, None) |
|
|
return |
|
|
|
|
|
weights = None |
|
|
if strategy_val == "Random router": |
|
|
weights = [weight_base_val, weight_1_val] |
|
|
if len(routing_models) > 1: |
|
|
weights.append(weight_2_val) |
|
|
if len(routing_models) > 2: |
|
|
weights.append(weight_3_val) |
|
|
total_weight = sum(weights) |
|
|
if abs(total_weight - 1.0) > 0.01: |
|
|
yield (gr.update(visible=True, value=f"❌ Weights must sum to 1.0 (current: {total_weight:.2f})"), gr.update(visible=False), None, None) |
|
|
return |
|
|
|
|
|
k_values = [k_1_val, k_2_val, k_3_val][:len(routing_models)] |
|
|
slice_values = [slice_1_val, slice_2_val, slice_3_val][:len(routing_models)] |
|
|
grep_values = [grep_1_val, grep_2_val, grep_3_val][:len(routing_models)] |
|
|
part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:len(routing_models)] |
|
|
|
|
|
if strategy_val == "Grep": |
|
|
for i, gv in enumerate(grep_values): |
|
|
if gv and "|" in gv and "&" in gv: |
|
|
yield (gr.update(visible=True, value=f"❌ M{i+1} grep: cannot mix | and & operators"), gr.update(visible=False), None, None) |
|
|
return |
|
|
|
|
|
def grep_matches(text, pattern):
    """Check if text matches grep pattern (whole words joined by | for OR, & for AND)."""
    if not pattern or not text:
        return False
    pattern = pattern.strip()

    def has_word(word):
        # Whole-word match: the term must sit on word boundaries.
        return re.search(r'\b' + re.escape(word) + r'\b', text) is not None

    if "|" in pattern:
        terms = [t.strip() for t in pattern.split("|") if t.strip()]
        return any(has_word(t) for t in terms)
    if "&" in pattern:
        terms = [t.strip() for t in pattern.split("&") if t.strip()]
        return all(has_word(t) for t in terms)
    return has_word(pattern)
|
|
|
|
|
def parse_slice(slice_str, length):
    """Translate Python slice notation (e.g. "[0::3]" or "1:5") into the
    list of indices it selects from a sequence of *length* items.

    Surrounding brackets are optional.  Anything that is not a 2- or
    3-part colon expression yields an empty list.  Non-integer parts
    raise ValueError (callers guard with try/except).
    """
    text = slice_str.strip()
    if text.startswith("[") and text.endswith("]"):
        text = text[1:-1]
    fields = text.split(":")
    if len(fields) not in (2, 3):
        return []
    # Empty fields mean "use the default", exactly like a[::] syntax.
    start, stop, *rest = (int(f) if f else None for f in fields)
    step = rest[0] if rest else None
    return list(range(length))[slice(start, stop, step)]
|
|
|
|
|
# Sentinel key for tokens left on the base model; routing models are
# keyed "__routing_0__" .. "__routing_2__".
BASE_MODEL = "__base__"
model_keys = [BASE_MODEL] + [f"__routing_{i}__" for i in range(len(routing_models))]

# Token totals per model under the routed scenario, plus the baseline
# totals with every step billed to the base model.
all_tokens = {key: {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} for key in model_keys}
total_original_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}

for instance_id, steps in trajectory_steps.items():
    if not steps:
        continue

    total_steps = len(steps)

    # step index -> model key; unassigned steps fall back to BASE_MODEL.
    step_to_model = {}

    if strategy_val == "Random router":
        # Sample a model per step using the validated weight list.
        model_choices = [BASE_MODEL] + [f"__routing_{j}__" for j in range(len(routing_models))]
        for i in range(total_steps):
            step_to_model[i] = random.choices(model_choices, weights=weights)[0]

    elif strategy_val == "Every k-th step":
        # Steps are counted 1-based; the lowest-index model that claims
        # a step keeps it (earlier models win conflicts).
        for j, k_val in enumerate(k_values):
            if k_val and k_val > 0:
                for i in range(total_steps):
                    if (i + 1) % int(k_val) == 0:
                        if i not in step_to_model:
                            step_to_model[i] = f"__routing_{j}__"

    elif strategy_val == "Python list slices":
        for j, slice_val in enumerate(slice_values):
            if slice_val:
                try:
                    indices = parse_slice(slice_val, total_steps)
                    for i in indices:
                        if i not in step_to_model:
                            step_to_model[i] = f"__routing_{j}__"
                except Exception:
                    # Malformed slice strings are silently ignored for
                    # that model slot (best-effort what-if UI).
                    pass

    elif strategy_val == "Grep":
        # First matching pattern (lowest model index) claims the step.
        for i, step in enumerate(steps):
            content = step.get("content", "")
            for j, grep_val in enumerate(grep_values):
                if grep_val and i not in step_to_model:
                    if grep_matches(content, grep_val):
                        step_to_model[i] = f"__routing_{j}__"

    elif strategy_val == "Resolved/Unresolved":
        # Route the whole trajectory based on whether the instance was
        # resolved in the submission's results.
        is_resolved = resolved_instances.get(instance_id, False)
        target_model = resolved_model_val if is_resolved else unresolved_model_val
        if target_model and target_model != "Base":
            model_idx = {"M1": 0, "M2": 1, "M3": 2}.get(target_model)
            if model_idx is not None and model_idx < len(routing_models):
                for i in range(total_steps):
                    step_to_model[i] = f"__routing_{model_idx}__"

    elif strategy_val == "Replace part of trajectory":
        # Ranges were validated as non-overlapping earlier, so each
        # model writes only its own step window.
        for j, (start_val, end_val) in enumerate(part_ranges):
            if part_mode_val == "Percentages":
                start_idx = int(total_steps * start_val / 100)
                end_idx = int(total_steps * end_val / 100)
            else:
                start_idx = int(start_val)
                end_idx = min(int(end_val), total_steps)
            for i in range(start_idx, end_idx):
                step_to_model[i] = f"__routing_{j}__"

    # Re-tag every step with its assigned model and apply the thinking
    # overhead multiplier to completion tokens.
    modified_steps = []
    for i, step in enumerate(steps):
        model = step_to_model.get(i, BASE_MODEL)
        modified_steps.append({
            "model": model,
            "system_user": step.get("system_user", 0),
            "completion": int(step.get("completion", 0) * overhead),
            "observation": step.get("observation"),
        })

    model_totals = calculate_routing_tokens(modified_steps)

    # Accumulate each model's token buckets across instances.
    for key in model_keys:
        totals = model_totals.get(key, {})
        all_tokens[key]["cache_read"] += totals.get("cache_read", 0)
        all_tokens[key]["uncached_input"] += totals.get("uncached_input", 0)
        all_tokens[key]["completion"] += totals.get("completion", 0)
        all_tokens[key]["cache_creation"] += totals.get("cache_creation", 0)

    # Baseline run: identical steps, all attributed to the base model,
    # so savings are computed against the same overhead-adjusted counts.
    original_steps = []
    for step in steps:
        original_steps.append({
            "model": BASE_MODEL,
            "system_user": step.get("system_user", 0),
            "completion": int(step.get("completion", 0) * overhead),
            "observation": step.get("observation"),
        })
    original_totals = calculate_routing_tokens(original_steps)
    orig = original_totals.get(BASE_MODEL, {})
    total_original_tokens["cache_read"] += orig.get("cache_read", 0)
    total_original_tokens["uncached_input"] += orig.get("uncached_input", 0)
    total_original_tokens["completion"] += orig.get("completion", 0)
    total_original_tokens["cache_creation"] += orig.get("cache_creation", 0)
|
|
|
|
|
def calc_cost(tokens: dict, prices: dict) -> float:
    """Total dollar cost of a token-count dict, with prices per 1M tokens."""
    # Pairs each token bucket with its key on the price side; only
    # "uncached_input" is named differently ("input").
    buckets = (
        ("uncached_input", "input"),
        ("cache_read", "cache_read"),
        ("cache_creation", "cache_creation"),
        ("completion", "completion"),
    )
    return sum(tokens[t] * prices[p] / 1e6 for t, p in buckets)
|
|
|
|
|
def tokens_to_costs(tokens: dict, prices: dict) -> dict:
    """Convert each token bucket into its dollar cost (prices per 1M tokens)."""
    # Only "uncached_input" has a different name on the price side.
    bucket_to_price = {
        "uncached_input": "input",
        "cache_read": "cache_read",
        "cache_creation": "cache_creation",
        "completion": "completion",
    }
    return {
        bucket: count * prices[bucket_to_price[bucket]] / 1e6
        for bucket, count in tokens.items()
    }
|
|
|
|
|
# Cost of the base-model share under the routed scenario.
total_base_tokens = all_tokens[BASE_MODEL]
base_costs = tokens_to_costs(total_base_tokens, base_prices)
total_base_cost = calc_cost(total_base_tokens, base_prices)

# Per-routing-model costs, each priced with its own price set.
routing_costs_list = []
total_routing_cost = 0
for i, rm in enumerate(routing_models):
    key = f"__routing_{i}__"
    tokens = all_tokens[key]
    costs = tokens_to_costs(tokens, rm["prices"])
    cost = calc_cost(tokens, rm["prices"])
    routing_costs_list.append({"name": rm["name"], "tokens": tokens, "costs": costs, "cost": cost})
    total_routing_cost += cost

# Baseline: everything billed to the base model at base prices.
total_original_cost = calc_cost(total_original_tokens, base_prices)

total_routed_cost = total_base_cost + total_routing_cost
savings = total_original_cost - total_routed_cost
# Guard against division by zero when there is no baseline cost.
savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0

# Render the comparison as a markdown table for the results panel.
result_lines = [
    "## 🚀 Routing Results",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    f"| **Original Cost (base model only)** | ${total_original_cost:.2f} |",
    f"| **Routed Cost** | ${total_routed_cost:.2f} |",
    f"| ↳ Base model portion | ${total_base_cost:.2f} |",
]
for rc in routing_costs_list:
    result_lines.append(f"| ↳ {rc['name']} | ${rc['cost']:.2f} |")
savings_color = "green" if savings >= 0 else "red"
result_lines.append(f'| **Savings** | <span style="color: {savings_color}; font-weight: bold;">${savings:.2f} · {savings_pct:.1f}%</span> |')
result_text = "\n".join(result_lines)
|
|
|
|
|
def apply_display_formula(tokens: dict) -> dict:
    """Reshape raw token buckets for display.

    With caching enabled (enclosing-scope ``with_cache``), the uncached
    bucket is re-derived from the total prompt minus the cache-read and
    cache-creation portions, floored at zero.  Without caching, the
    whole prompt is folded into a single uncached bucket.
    """
    total_prompt = tokens["cache_read"] + tokens["uncached_input"]
    if not with_cache:
        # No-cache view: the entire prompt counts as uncached input.
        return {
            "uncached_input": total_prompt,
            "cache_read": 0,
            "cache_creation": 0,
            "completion": tokens["completion"],
        }
    shown_uncached = max(0, total_prompt - tokens["cache_read"] - tokens["cache_creation"])
    return {
        "uncached_input": shown_uncached,
        "cache_read": tokens["cache_read"],
        "cache_creation": tokens["cache_creation"],
        "completion": tokens["completion"],
    }
|
|
|
|
|
# Re-shape token totals for charting (collapses cache buckets when the
# cache toggle is off).
total_base_tokens_display = apply_display_formula(total_base_tokens)
# NOTE(review): this rebinds base_costs with display-formula values,
# leaving the earlier tokens_to_costs(total_base_tokens, ...) result
# unused — confirm that is intentional.
base_costs = tokens_to_costs(total_base_tokens_display, base_prices)

additional_token_models = [(rc["name"], apply_display_formula(rc["tokens"])) for rc in routing_costs_list]
additional_cost_models = []
for i, rc in enumerate(routing_costs_list):
    model_prices = routing_models[i]["prices"]
    additional_cost_models.append((rc["name"], tokens_to_costs(apply_display_formula(rc["tokens"]), model_prices)))

# Prefer the recalculated DataFrame for the "original" bar when
# available; otherwise fall back to the per-step aggregate.
# NOTE(review): the guard checks df_calc but the body reads df_for_cost
# — verify the two are always None/empty together.
if df_calc is not None and not df_calc.empty:
    df_temp = df_for_cost.copy()
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    original_tokens_from_df = {
        "uncached_input": df_temp["uncached_input"].sum(),
        "cache_read": df_for_cost["cache_read_tokens"].sum(),
        "cache_creation": df_for_cost["cache_creation_tokens"].sum(),
        "completion": df_for_cost["completion_tokens"].sum(),
    }
else:
    original_tokens_from_df = apply_display_formula(total_original_tokens)

original_costs = tokens_to_costs(original_tokens_from_df, base_prices)

# Build the original-vs-routed comparison charts and emit the final
# (result markdown, plots-row visibility, tokens chart, cost chart).
base_model_name = detected_model_val or "Base"
tokens_chart = create_routed_token_chart(original_tokens_from_df, total_base_tokens_display, additional_token_models, base_model_name)
cost_chart = create_routed_cost_chart(original_costs, base_costs, additional_cost_models, base_model_name)

yield (
    gr.update(visible=True, value=result_text),
    gr.update(visible=True),
    tokens_chart,
    cost_chart,
)
|
|
|
|
|
# Kick off the routing simulation.  The inputs must stay aligned, in
# order, with run_routing's positional parameters (the same list is
# duplicated as routing_inputs further down for the option-change chains).
route_btn.click(
    fn=run_routing,
    inputs=[
        trajectories_state,
        # Base-model prices ($ per 1M tokens).
        price_input, price_cache_read, price_cache_creation, price_completion,
        # Routing model slots 1-3: model name plus its four prices.
        routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
        routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
        routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
        selected_strategy,
        # Per-strategy options.
        weight_base, weight_model_1, weight_model_2, weight_model_3,
        k_model_1, k_model_2, k_model_3,
        slice_model_1, slice_model_2, slice_model_3,
        grep_model_1, grep_model_2, grep_model_3,
        resolved_model, unresolved_model,
        part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
        thinking_overhead, use_cache,
        detected_model,
    ],
    outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
)
|
|
|
|
|
# Clicking a leaderboard row stores the selected submission and
# pre-fills the price fields / detected model for it.
leaderboard_table.select(
    fn=on_row_select,
    inputs=[leaderboard_table],
    outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
)
|
|
|
|
|
# On startup, select the first leaderboard row server-side and mirror
# the selection in the browser by clicking the first table row via JS
# (keeps the DataFrame's visual highlight in sync with the state).
app.load(
    fn=select_first_row,
    inputs=[leaderboard_table],
    outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
    js="""
    (data) => {
        const row = gradioApp()?.querySelector('#leaderboard-table table tbody tr');
        if (row) {
            row.click();
        }
        return data;
    }
    """,
)
|
|
|
|
|
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache, progress=gr.Progress()):
    """Download (if needed), load, and chart every trajectory for *folder*.

    Generator wired to ``analyze_btn.click``: each ``yield`` is a tuple
    whose shape matches that handler's outputs list (status text,
    visibility updates, chart figures, the shared state dict, dropdown
    updates).  Intermediate yields stream download/loading status; the
    final yield carries the complete analysis.
    """
    progress(0, desc="Ready")
    # Blank tuple shaped like the outputs list, used for early exits.
    empty_result = (
        "",
        gr.update(visible=False),
        None, None,
        None, None, None, None,
        None, None, None, None,
        None,
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(),
        gr.update(),
        gr.update(),
        gr.update(visible=False),
        gr.update(),
        gr.update(),
        gr.update(),
    )

    if not folder:
        progress(1, desc="No folder selected")
        yield empty_result
        return

    # Fetch the trajectories from S3 on first use, streaming a status
    # message while the download runs.
    if not check_trajectories_downloaded(folder):
        progress(0.1, desc="Preparing download")
        yield (
            "⏳ Downloading trajectories...",
            gr.update(visible=False),
            None, None,
            None, None, None, None,
            None, None, None, None,
            None,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
        )
        progress(0.3, desc="Downloading")
        status, _ = download_trajectories_from_s3(folder)
        # The download helper reports failure by embedding ❌ in status.
        if "❌" in status:
            progress(1, desc="Download failed")
            yield (
                status,
                gr.update(visible=False),
                None, None,
                None, None, None, None,
                None, None, None, None,
                None,
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(),
                gr.update(),
                gr.update(),
                gr.update(visible=False),
                gr.update(),
                gr.update(),
                gr.update(),
            )
            return
    progress(0.45, desc="Loading trajectories")

    # Show the (still empty) analysis section while files are parsed.
    yield (
        "⏳ Loading trajectories...",
        gr.update(visible=True),
        None, None,
        None, None, None, None,
        None, None, None, None,
        None,
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(),
        gr.update(),
        gr.update(),
        gr.update(visible=False),
        gr.update(),
        gr.update(),
        gr.update(),
    )

    progress(0.6, desc="Reading metadata")
    df_meta = ensure_token_columns(load_all_trajectories(folder))
    progress(0.7, desc="Reading calculated")
    df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
    # NOTE(review): copying via .values assumes both DataFrames have the
    # same length and row order — confirm the loaders guarantee this.
    df_calc["api_calls"] = df_meta["api_calls"].values
    df_calc["instance_cost"] = df_meta["instance_cost"].values
    progress(0.75, desc="Reading steps")
    trajectory_steps = load_all_trajectory_steps(folder)
    progress(0.8, desc="Reading metadata steps")
    metadata_steps = load_all_trajectory_metadata_steps(folder)

    # Map instance id -> resolved flag from the submission's results.
    model_details, _ = get_model_details(folder)
    resolved_instances = {}
    if model_details:
        per_instance = model_details.get("per_instance_details", {})
        for inst_id, details in per_instance.items():
            resolved_instances[inst_id] = details.get("resolved", False)

    # Shared state consumed by the recalculation / routing callbacks.
    state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "metadata_steps": metadata_steps, "resolved": resolved_instances}

    if df_meta.empty:
        progress(1, desc="No trajectories found")
        yield (
            "❌ No trajectories found",
            gr.update(visible=False),
            None, None,
            None, None, None, None,
            None, None, None, None,
            None,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
        )
        return

    progress(0.9, desc="Building charts")
    fig_steps, fig_cost, _, _, _ = create_basic_histograms(
        df_meta, input_price, cache_read_price, cache_creation_price, completion_price
    )

    # Charts built from provider-reported (metadata) token counts.
    fig_tokens_meta, fig_tokens_cost_meta, fig_stacked_meta = create_token_charts(
        df_meta, input_price, cache_read_price, cache_creation_price, completion_price
    )
    fig_cost_breakdown_meta = create_cost_breakdown(
        df_meta, input_price, cache_read_price, cache_creation_price, completion_price
    )

    # Charts built from locally recalculated counts, with the thinking
    # overhead multiplier and optional no-cache adjustment applied.
    df_calc_processed = apply_thinking_overhead(df_calc.copy(), overhead)
    if not with_cache:
        df_calc_processed = apply_no_cache(df_calc_processed)

    fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc = create_token_charts(
        df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
    )
    fig_cost_breakdown_calc = create_cost_breakdown(
        df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
    )

    # Pre-select the first issue in each single-trajectory dropdown.
    issue_ids = sorted(trajectory_steps.keys())
    first_issue = issue_ids[0] if issue_ids else None

    meta_issue_ids = sorted(metadata_steps.keys())
    first_meta_issue = meta_issue_ids[0] if meta_issue_ids else None
    has_meta_steps = len(meta_issue_ids) > 0

    fig_single_traj = None
    fig_single_traj_cost = None
    if first_issue and first_issue in trajectory_steps:
        calc_steps = trajectory_steps[first_issue]
        fig_single_traj = create_single_trajectory_chart(calc_steps, overhead, with_cache)
        fig_single_traj_cost = create_single_trajectory_cost_chart(calc_steps, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache)

    fig_single_traj_meta = None
    fig_single_traj_meta_cost = None
    if first_meta_issue and first_meta_issue in metadata_steps:
        meta_steps = metadata_steps[first_meta_issue]
        fig_single_traj_meta = create_single_trajectory_meta_chart(meta_steps)
        fig_single_traj_meta_cost = create_single_trajectory_meta_cost_chart(meta_steps, input_price, cache_read_price, cache_creation_price, completion_price)

    progress(1, desc="Done")
    # Final payload: everything visible, all charts populated.
    yield (
        f"✅ Loaded {len(df_meta)} trajectories",
        gr.update(visible=True),
        fig_steps, fig_cost,
        fig_tokens_meta, fig_tokens_cost_meta, fig_stacked_meta, fig_cost_breakdown_meta,
        fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc, fig_cost_breakdown_calc,
        state_data,
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(choices=issue_ids, value=first_issue),
        fig_single_traj,
        fig_single_traj_cost,
        gr.update(visible=has_meta_steps),
        gr.update(choices=meta_issue_ids, value=first_meta_issue),
        fig_single_traj_meta,
        fig_single_traj_meta_cost,
    )
|
|
|
|
|
def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Redraw the per-step token and cost charts for the chosen issue.

    Returns (tokens_chart, cost_chart), or (None, None) when no state
    has been loaded yet or the issue id is unknown.
    """
    if state_data is None or not issue_id:
        return None, None
    steps_by_issue = state_data.get("steps", {})
    if issue_id not in steps_by_issue:
        return None, None
    chosen_steps = steps_by_issue[issue_id]
    return (
        create_single_trajectory_chart(chosen_steps, overhead, with_cache),
        create_single_trajectory_cost_chart(chosen_steps, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache),
    )
|
|
|
|
|
def on_single_traj_meta_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
    """Redraw the metadata-based per-step charts for the chosen issue.

    Returns (tokens_chart, cost_chart), or (None, None) when no state is
    loaded or the issue id is not present in the metadata steps.
    """
    if state_data is None or not issue_id:
        return None, None
    meta_by_issue = state_data.get("metadata_steps", {})
    if issue_id not in meta_by_issue:
        return None, None
    chosen_steps = meta_by_issue[issue_id]
    return (
        create_single_trajectory_meta_chart(chosen_steps),
        create_single_trajectory_meta_cost_chart(chosen_steps, input_price, cache_read_price, cache_creation_price, completion_price),
    )
|
|
|
|
|
# Main analysis trigger: load_and_analyze streams its yields into these
# outputs (the order must match the function's yield tuples exactly).
analyze_btn.click(
    fn=load_and_analyze,
    inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
    outputs=[
        download_status,
        analysis_section,
        plot_steps, plot_cost,
        # Metadata-derived charts.
        plot_tokens_meta, plot_tokens_cost_meta, plot_stacked_meta, plot_cost_breakdown_meta,
        # Locally recalculated charts.
        plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc,
        trajectories_state,
        add_routing_btn,
        # Single-trajectory (calculated) viewer.
        single_traj_accordion,
        single_traj_dropdown,
        single_traj_plot,
        single_traj_cost_plot,
        # Single-trajectory (metadata) viewer.
        single_traj_meta_accordion,
        single_traj_meta_dropdown,
        single_traj_meta_plot,
        single_traj_meta_cost_plot,
    ],
)
|
|
|
|
|
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Rebuild the four price-sensitive cost charts after a price edit.

    Returns (meta cost-by-type, meta breakdown, calc cost-by-type,
    calc breakdown) figures, or four Nones when nothing is loaded.
    """
    if state_data is None:
        return None, None, None, None

    df_meta = state_data["meta"]
    df_calc = state_data["calculated"]

    if df_meta.empty:
        return None, None, None, None

    prices = (input_price, cache_read_price, cache_creation_price, completion_price)

    # Metadata charts use the provider-reported numbers unchanged.
    meta_cost_fig = create_cost_by_type_chart(df_meta, *prices)
    meta_breakdown_fig = create_cost_breakdown(df_meta, *prices)

    # Calculated charts honour the thinking-overhead and cache toggles.
    df_adjusted = apply_thinking_overhead(df_calc.copy(), overhead)
    if not with_cache:
        df_adjusted = apply_no_cache(df_adjusted)

    calc_cost_fig = create_cost_by_type_chart(df_adjusted, *prices)
    calc_breakdown_fig = create_cost_breakdown(df_adjusted, *prices)

    return meta_cost_fig, meta_breakdown_fig, calc_cost_fig, calc_breakdown_fig
|
|
|
|
|
# Price edits only affect the cost charts; recompute those four charts
# on any change to any of the base-model price fields.
price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
price_outputs = [plot_tokens_cost_meta, plot_cost_breakdown_meta, plot_tokens_cost_calc, plot_cost_breakdown_calc]

price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_read.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
|
|
|
|
def on_calc_options_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Recalculate only the calculated-token charts when the thinking
    overhead or cache toggle changes.

    Returns (tokens, cost-by-type, stacked, breakdown) figures, or four
    Nones when no data has been loaded yet.
    """
    if state_data is None:
        return None, None, None, None

    df_calc = state_data["calculated"]
    if df_calc.empty:
        return None, None, None, None

    # Apply the current what-if toggles before charting.
    df_adjusted = apply_thinking_overhead(df_calc.copy(), overhead)
    if not with_cache:
        df_adjusted = apply_no_cache(df_adjusted)

    prices = (input_price, cache_read_price, cache_creation_price, completion_price)
    tokens_fig, tokens_cost_fig, stacked_fig = create_token_charts(df_adjusted, *prices)
    breakdown_fig = create_cost_breakdown(df_adjusted, *prices)

    return tokens_fig, tokens_cost_fig, stacked_fig, breakdown_fig
|
|
|
|
|
# Overhead/cache toggles feed the calculated-token charts.
calc_options_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
calc_options_outputs = [plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc]

# Redraw the per-issue (calculated) charts when a new issue is picked.
single_traj_dropdown.change(
    fn=on_single_traj_select,
    inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
    outputs=[single_traj_plot, single_traj_cost_plot],
)

# Same for the metadata-based per-issue charts.
single_traj_meta_dropdown.change(
    fn=on_single_traj_meta_select,
    inputs=[trajectories_state, single_traj_meta_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
    outputs=[single_traj_meta_plot, single_traj_meta_cost_plot],
)
|
|
|
|
|
# Reusable input/output lists shared by the option-change chains below.
single_traj_inputs = [trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
single_traj_outputs = [single_traj_plot, single_traj_cost_plot]

# Must mirror run_routing's parameter order (same list as route_btn.click).
routing_inputs = [
    trajectories_state,
    price_input, price_cache_read, price_cache_creation, price_completion,
    routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
    routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
    routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
    selected_strategy,
    weight_base, weight_model_1, weight_model_2, weight_model_3,
    k_model_1, k_model_2, k_model_3,
    slice_model_1, slice_model_2, slice_model_3,
    grep_model_1, grep_model_2, grep_model_3,
    resolved_model, unresolved_model,
    part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
    thinking_overhead, use_cache,
    detected_model,
]
routing_outputs = [routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot]
|
|
|
|
|
# Changing the overhead multiplier refreshes, in order: the calculated
# charts, the single-trajectory charts, then the routing simulation.
thinking_overhead.change(
    fn=on_calc_options_change,
    inputs=calc_options_inputs,
    outputs=calc_options_outputs,
).then(
    fn=on_single_traj_select,
    inputs=single_traj_inputs,
    outputs=single_traj_outputs,
).then(
    fn=run_routing,
    inputs=routing_inputs,
    outputs=routing_outputs,
)
|
|
|
|
|
# Toggling cache simulation triggers the same refresh chain as the
# thinking-overhead slider above.
use_cache.change(
    fn=on_calc_options_change,
    inputs=calc_options_inputs,
    outputs=calc_options_outputs,
).then(
    fn=on_single_traj_select,
    inputs=single_traj_inputs,
    outputs=single_traj_outputs,
).then(
    fn=run_routing,
    inputs=routing_inputs,
    outputs=routing_outputs,
)
|
|
|
|
|
return app  # hand the fully wired Blocks application back to the caller
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Refresh the cached leaderboard before building the UI so the
    # table renders with current data on first load.
    logging.info("Refreshing leaderboard data on startup...")
    load_or_download_leaderboard(force_refresh=True)
    app = build_app()
    # Queueing is required for generator callbacks to stream updates.
    app.queue()
    app.launch()
|
|
|