Spaces:

JetBrains-Research
/

SWE-bench-Costs-Calculator

Sleeping

App Files Files Community

SWE-bench-Costs-Calculator / app.py

IgorSlinko

Fix routing bugs and unify chart formatting

a75cc98 9 days ago

raw

history blame

76.6 kB

	import json
	import os
	import re
	import subprocess
	from pathlib import Path

	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import requests
	import tiktoken

	from src.download_swebench_leaderboard import download_leaderboard

	# --- Local storage layout ---------------------------------------------
	# Everything the app persists lives under DATA_DIR.
	DATA_DIR = Path("data")
	TRAJS_DIR = DATA_DIR / "swebench_trajs"
	LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
	LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"

	# --- Remote sources ---------------------------------------------------
	S3_BUCKET = "s3://swe-bench-experiments/bash-only"
	LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"

	# --- In-process caches (live for the lifetime of the process) ---------
	# tiktoken encodings keyed by encoding name.
	_tokenizer_cache = {}
	# Parsed litellm price table; None means "not loaded yet".
	_litellm_prices_cache = None
	# DataFrames of reported usage keyed by leaderboard folder.
	_trajectories_cache = {}
	# DataFrames of self-calculated usage keyed by "calculated_<folder>".
	_calculated_tokens_cache = {}
	# Parsed step sequences keyed by "steps_<folder>".
	_trajectory_steps_cache = {}


	def parse_step_or_ratio(value: float, total_steps: int) -> int:
	    """
	    Interpret ``value`` as an absolute step number or as a fraction of the run.

	    A whole number that is at least 1 (3.0, 5.0, ...) is returned unchanged as
	    a step number. Anything else (0, 0.25, 0.5, 2.5, ...) is multiplied by
	    ``total_steps`` and truncated toward zero.

	    Note that 1.0 counts as "step 1", not as "100% of the trajectory".

	    Returns:
	        The resulting step index as an int.
	    """
	    whole = int(value)
	    if whole == value and value >= 1:
	        return whole
	    return int(value * total_steps)


	def get_routed_steps(total_steps: int, strategy: str, params: dict) -> set:
	    """
	    Pick the steps that are handed to the alternative (routing) model.

	    Supported ``strategy`` values and the ``params`` keys they read:
	        "Replace on random steps"    - "percentage" (default 50): share of
	                                       steps, sampled without replacement.
	        "Replace every step k"       - "k" (default 2): steps 0, k, 2k, ...
	        "Replace part of trajectory" - "start" (default 0) and "end"
	                                       (default 0.5), each parsed with
	                                       parse_step_or_ratio; the end is
	                                       exclusive and capped at total_steps.

	    Any other strategy routes nothing.

	    Returns:
	        Set of 0-based step indices that should use the routing model.
	    """
	    import random

	    if strategy == "Replace on random steps":
	        fraction = params.get("percentage", 50) / 100.0
	        sample_size = int(total_steps * fraction)
	        if sample_size <= 0:
	            return set()
	        # Cap the sample so percentages above 100 cannot over-draw.
	        return set(random.sample(range(total_steps), min(sample_size, total_steps)))

	    if strategy == "Replace every step k":
	        stride = int(params.get("k", 2))
	        return set(range(0, total_steps, stride)) if stride > 0 else set()

	    if strategy == "Replace part of trajectory":
	        first = parse_step_or_ratio(params.get("start", 0), total_steps)
	        stop = parse_step_or_ratio(params.get("end", 0.5), total_steps)
	        return set(range(first, min(stop, total_steps)))

	    return set()


	def calculate_routing_tokens(steps: list[dict]) -> dict:
	    """
	    Simulate per-model prompt caching over a trajectory and total the tokens.

	    Each model keeps its own cache size. On every step the acting model reads
	    whatever it has cached so far, pays uncached input for the rest of the
	    context, and then caches that input plus its own completion.

	    Args:
	        steps: list of dicts with keys:
	            - model: str, model that handles the step (required)
	            - system_user: int, system/user prompt tokens; only the value on
	              the first step is used
	            - completion: int, generated tokens
	            - observation: int or None, environment response tokens

	    Returns:
	        dict mapping model name to
	        {cache_read, uncached_input, completion, observation, cache_creation}.
	    """
	    cached_prefix = {}  # model name -> tokens currently held in its cache
	    totals = {}

	    context_size = 0       # tokens in context after the previous step's completion
	    last_observation = 0   # observation produced by the previous step

	    for index, step in enumerate(steps):
	        model = step["model"]
	        completion = step.get("completion", 0)
	        observation = step.get("observation") or 0

	        cache_read = cached_prefix.get(model, 0)
	        if index == 0:
	            uncached_input = step.get("system_user", 0)
	        else:
	            # Everything in the context that this model has not cached yet.
	            uncached_input = context_size + last_observation - cache_read
	        cache_creation = uncached_input + completion
	        cached_prefix[model] = cache_read + cache_creation

	        bucket = totals.setdefault(model, {
	            "cache_read": 0,
	            "uncached_input": 0,
	            "completion": 0,
	            "observation": 0,
	            "cache_creation": 0,
	        })
	        for key, amount in (
	            ("cache_read", cache_read),
	            ("uncached_input", uncached_input),
	            ("completion", completion),
	            ("observation", observation),
	            ("cache_creation", cache_creation),
	        ):
	            bucket[key] += amount

	        context_size = cache_read + uncached_input + completion
	        last_observation = observation

	    return totals


	def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
	    """
	    Parse a trajectory file into the step format of calculate_routing_tokens.

	    The file is JSON with a "messages" list. Messages are folded as follows:
	        - "system" messages, and "user" messages seen before the first
	          assistant turn, are summed into the first step's ``system_user``.
	        - every "assistant" message opens a new step; its tokens are the
	          step's ``completion``.
	        - a "user" message after an assistant turn is that step's
	          ``observation`` (a later one replaces an earlier one).
	        - messages with any other role are ignored.

	    Args:
	        traj_path: path of the trajectory JSON file.
	        model_name: name stored on every step and used to pick the tokenizer.

	    Returns:
	        List of step dicts with keys model, system_user, completion and
	        observation (None when no observation followed the step). Empty when
	        the file has no messages.
	    """
	    with open(traj_path, "r", encoding="utf-8") as f:
	        data = json.load(f)

	    messages = data.get("messages", [])
	    if not messages:
	        return []

	    count_tokens, _ = get_tokenizer(model_name)

	    steps = []
	    system_user_tokens = 0

	    # A plain for-loop guarantees progress; the previous index-based loop only
	    # advanced inside the known-role branches and spun forever on other roles.
	    for msg in messages:
	        role = msg.get("role", "user")
	        content = msg.get("content", "")
	        if content is None:
	            # A null content must not be tokenized as the literal text "None".
	            content = ""
	        elif isinstance(content, list):
	            content = json.dumps(content)
	        tokens = count_tokens(str(content))

	        if role == "system":
	            system_user_tokens += tokens
	        elif role == "user":
	            if steps:
	                steps[-1]["observation"] = tokens
	            else:
	                system_user_tokens += tokens
	        elif role == "assistant":
	            steps.append({
	                "model": model_name,
	                "system_user": system_user_tokens if not steps else 0,
	                "completion": tokens,
	                "observation": None,
	            })
	            system_user_tokens = 0
	        # Any other role is skipped on purpose.

	    return steps


	def get_default_overhead(model_name: str) -> float:
	    """
	    Return the default tokenizer overhead multiplier for a model.

	    Names containing "claude" or "anthropic" (case-insensitive) get 1.24.
	    Every other name, including Gemini/Google and GPT/OpenAI models as well as
	    an empty or missing name, gets 1.0.
	    """
	    lowered = (model_name or "").lower()
	    if any(marker in lowered for marker in ("claude", "anthropic")):
	        return 1.24
	    return 1.0


	def get_tokenizer(model_name: str):
	    """
	    Choose a token counter for a model.

	    Selection is by case-insensitive substring, checked in this order:
	        1. "gpt-4o", "o1", "o3"          -> tiktoken "o200k_base"
	        2. "gpt", "claude", "anthropic"  -> tiktoken "cl100k_base"
	        3. "gemini", "google"            -> length / 3.23 approximation
	        4. anything else                 -> tiktoken "cl100k_base"

	    tiktoken encodings are kept in the module-level ``_tokenizer_cache``.

	    Returns:
	        (count_fn, name) where count_fn(text) returns an int token count and
	        name identifies the tokenizer that was chosen.
	    """
	    lowered = (model_name or "").lower()

	    def mentions(*markers):
	        return any(marker in lowered for marker in markers)

	    if mentions("gpt-4o", "o1", "o3"):
	        encoding_name = "o200k_base"
	    elif mentions("gpt", "claude", "anthropic"):
	        encoding_name = "cl100k_base"
	    elif mentions("gemini", "google"):
	        def approximate_count(text):
	            return int(len(text) / 3.23)

	        return approximate_count, "gemini_approx"
	    else:
	        encoding_name = "cl100k_base"

	    encoding = _tokenizer_cache.get(encoding_name)
	    if encoding is None:
	        encoding = tiktoken.get_encoding(encoding_name)
	        _tokenizer_cache[encoding_name] = encoding

	    def exact_count(text):
	        return len(encoding.encode(text))

	    return exact_count, encoding_name


	def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
	    """
	    Scale every token count by a tokenizer overhead multiplier.

	    The prompt, completion, cache-read and cache-creation columns are
	    multiplied by ``overhead`` and truncated to int; ``total_tokens`` is then
	    rebuilt as prompt + completion. The input frame is never modified: an
	    empty frame or an overhead of exactly 1.0 returns the same object,
	    otherwise a copy is returned.
	    """
	    if df.empty or overhead == 1.0:
	        return df

	    scaled = df.copy()
	    for column in (
	        "prompt_tokens",
	        "completion_tokens",
	        "cache_read_tokens",
	        "cache_creation_tokens",
	    ):
	        scaled[column] = (scaled[column] * overhead).astype(int)
	    scaled["total_tokens"] = scaled["prompt_tokens"] + scaled["completion_tokens"]
	    return scaled


	def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Return the usage table as it would look without prompt caching.

	    The cache-read and cache-creation columns are zeroed on a copy, so the
	    whole prompt then counts as uncached input wherever uncached input is
	    derived as prompt - cache_read - cache_creation. An empty frame is
	    returned unchanged (same object).
	    """
	    if df.empty:
	        return df

	    uncached = df.copy()
	    for column in ("cache_read_tokens", "cache_creation_tokens"):
	        uncached[column] = 0
	    return uncached


	def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
	    """
	    Build the per-trajectory usage table from self-calculated token counts.

	    Steps come from load_all_trajectory_steps and are run through
	    calculate_routing_tokens. Only the totals of the first step's model are
	    reported, and ``instance_cost`` is always 0 in this table. A trajectory
	    whose calculation raises is reported on stdout and left out.

	    The resulting frame is cached per folder for the life of the process.
	    """
	    cache_key = f"calculated_{folder}"
	    cached = _calculated_tokens_cache.get(cache_key)
	    if cached is not None:
	        return cached

	    rows = []
	    for instance_id, steps in load_all_trajectory_steps(folder).items():
	        if not steps:
	            continue

	        try:
	            per_model = calculate_routing_tokens(steps)
	            step_model = steps[0].get("model", "")
	            totals = per_model.get(step_model, {})

	            cache_read, uncached_input, completion, cache_creation = (
	                totals.get(key, 0)
	                for key in ("cache_read", "uncached_input", "completion", "cache_creation")
	            )
	            # The prompt is everything sent to the model: cached plus uncached.
	            prompt_tokens = cache_read + uncached_input

	            rows.append(dict(
	                instance_id=instance_id,
	                model_name=step_model,
	                api_calls=len(steps),
	                instance_cost=0,
	                prompt_tokens=prompt_tokens,
	                completion_tokens=completion,
	                total_tokens=prompt_tokens + completion,
	                cache_read_tokens=cache_read,
	                cache_creation_tokens=cache_creation,
	            ))
	        except Exception as e:
	            print(f"Error calculating tokens for {instance_id}: {e}")

	    table = pd.DataFrame(rows)
	    _calculated_tokens_cache[cache_key] = table
	    return table


	def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
	    """
	    Load all trajectories of a folder as step sequences for routing calculations.

	    Files are looked up under ``TRAJS_DIR / folder``; the first of these glob
	    patterns that matches anything wins: ``**/*.traj.json``, ``**/*.traj``,
	    ``*.traj.json``, ``*.traj``, ``*.json``.

	    The model name is read once, from the first file found
	    (info.config.model.cost_calc_model_override, falling back to model_name),
	    and applied to every trajectory. Files that fail to parse are reported on
	    stdout and skipped.

	    Returns:
	        dict mapping instance_id -> list of steps for calculate_routing_tokens.
	        The result is cached per folder for the life of the process.
	    """
	    cache_key = f"steps_{folder}"
	    if cache_key in _trajectory_steps_cache:
	        return _trajectory_steps_cache[cache_key]

	    output_dir = TRAJS_DIR / folder

	    # Patterns must be relative: Path.glob rejects patterns starting with "/".
	    traj_files = []
	    for pattern in ("**/*.traj.json", "**/*.traj", "*.traj.json", "*.traj", "*.json"):
	        traj_files = list(output_dir.glob(pattern))
	        if traj_files:
	            break

	    model_name = ""
	    if traj_files:
	        try:
	            with open(traj_files[0], "r", encoding="utf-8") as f:
	                first_data = json.load(f)
	            config = first_data.get("info", {}).get("config", {}).get("model", {})
	            model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
	        except Exception as e:
	            # Best effort: continue with an empty model name, but say why.
	            print(f"Could not read model name from {traj_files[0]}: {e}")

	    result = {}
	    for traj_path in traj_files:
	        try:
	            # "x.traj.json" has stem "x.traj"; drop the inner suffix as well.
	            instance_id = traj_path.stem.replace(".traj", "")
	            steps = parse_trajectory_to_steps(traj_path, model_name)
	            if steps:
	                result[instance_id] = steps
	        except Exception as e:
	            print(f"Error parsing steps for {traj_path}: {e}")

	    _trajectory_steps_cache[cache_key] = result
	    return result


	def get_litellm_model_list() -> list[str]:
	    """Return every key of the litellm price table, sorted."""
	    return sorted(get_litellm_prices())


	def get_litellm_prices() -> dict:
	    """
	    Return the litellm model price table, loading it at most once per process.

	    Lookup order:
	        1. the in-memory copy, if already loaded;
	        2. the on-disk cache at LITELLM_PRICES_CACHE, if it can be read;
	        3. a download from LITELLM_PRICES_URL, which is then written to the
	           on-disk cache.

	    An unreadable or corrupt cache file is ignored and a download is tried
	    instead. A failed download yields an empty dict, which is remembered for
	    the rest of the process. Failing to write the cache file does not discard
	    prices that were downloaded successfully.
	    """
	    global _litellm_prices_cache
	    if _litellm_prices_cache is not None:
	        return _litellm_prices_cache

	    if LITELLM_PRICES_CACHE.exists():
	        try:
	            with open(LITELLM_PRICES_CACHE, encoding="utf-8") as f:
	                _litellm_prices_cache = json.load(f)
	            return _litellm_prices_cache
	        except (OSError, ValueError) as e:
	            # ValueError covers json.JSONDecodeError and decoding errors.
	            print(f"Ignoring unreadable price cache {LITELLM_PRICES_CACHE}: {e}")

	    try:
	        response = requests.get(LITELLM_PRICES_URL, timeout=30)
	        response.raise_for_status()
	        prices = response.json()
	    except Exception as e:
	        print(f"Could not download litellm prices: {e}")
	        _litellm_prices_cache = {}
	        return _litellm_prices_cache

	    _litellm_prices_cache = prices

	    # Persisting is a convenience only; keep the downloaded table on failure.
	    try:
	        DATA_DIR.mkdir(parents=True, exist_ok=True)
	        with open(LITELLM_PRICES_CACHE, "w", encoding="utf-8") as f:
	            json.dump(prices, f)
	    except OSError as e:
	        print(f"Could not write price cache {LITELLM_PRICES_CACHE}: {e}")

	    return _litellm_prices_cache


	def normalize_model_name(name: str) -> str:
	    """Lowercase ``name`` and strip the separators ``-``, ``_``, ``.`` and ``/``."""
	    lowered = name.lower()
	    return re.sub(r'[-_./]', '', lowered)


	def get_model_prices(model_name: str) -> dict \| None:
	if not model_name:
	return None

	prices = get_litellm_prices()

	clean_name = model_name.replace("anthropic/", "").replace("openai/", "")

	name_without_date = re.sub(r'-\d{8}$', '', clean_name)

	candidates = [
	model_name,
	clean_name,
	name_without_date,
	f"anthropic/{clean_name}",
	f"openai/{clean_name}",
	f"anthropic/{name_without_date}",
	f"openai/{name_without_date}",
	]

	for key in candidates:
	if key in prices:
	return prices[key]

	normalized_name = normalize_model_name(clean_name)
	normalized_no_date = normalize_model_name(name_without_date)

	for key, value in prices.items():
	key_normalized = normalize_model_name(key)
	if normalized_name in key_normalized or normalized_no_date in key_normalized:
	return value
	key_last_part = key.split('/')[-1] if '/' in key else key
	key_last_normalized = normalize_model_name(key_last_part)
	if normalized_name == key_last_normalized or normalized_no_date == key_last_normalized:
	return value

	return None


	def load_or_download_leaderboard():
	    """
	    Return the parsed leaderboard JSON, downloading it first if necessary.

	    When LEADERBOARD_CACHE does not exist, download_leaderboard is asked to
	    fetch into DATA_DIR and the file it reports is renamed to the cache path.
	    The cache file is then read and returned.
	    """
	    if not LEADERBOARD_CACHE.exists():
	        downloaded = download_leaderboard(output_dir=str(DATA_DIR))
	        os.rename(downloaded, LEADERBOARD_CACHE)

	    with open(LEADERBOARD_CACHE) as f:
	        return json.load(f)


	def get_bash_only_df():
	    """
	    Build the table of "bash-only" leaderboard entries shown in the UI.

	    Returns a DataFrame with the columns name, "% resolved", date, cost,
	    instance_cost, instance_calls, folder and os_model, or an empty DataFrame
	    when the leaderboard data has no "bash-only" board.
	    """
	    boards = load_or_download_leaderboard().get("leaderboards", [])
	    bash_only = next((board for board in boards if board["name"] == "bash-only"), None)
	    if not bash_only:
	        return pd.DataFrame()

	    def format_resolved(value):
	        # Numbers get one decimal and a percent sign; anything else is shown as-is.
	        if isinstance(value, (int, float)):
	            return f"{value:.1f}%"
	        return str(value)

	    rows = [
	        {
	            "name": entry.get("name", ""),
	            "% resolved": format_resolved(entry.get("resolved", 0)),
	            "date": entry.get("date", ""),
	            "cost": round(entry.get("cost", 0), 2),
	            "instance_cost": round(entry.get("instance_cost", 0), 4),
	            "instance_calls": entry.get("instance_calls", 0),
	            "folder": entry.get("folder", ""),
	            "os_model": "✅" if entry.get("os_model") else "❌",
	        }
	        for entry in bash_only["results"]
	    ]
	    return pd.DataFrame(rows)


	def get_model_details(folder: str):
	    """
	    Look up the "bash-only" leaderboard entry whose ``folder`` matches.

	    Returns:
	        (entry, None) on success, otherwise (None, message) where message
	        says what went wrong: no folder given, no "bash-only" leaderboard, or
	        no entry with that folder.
	    """
	    if not folder:
	        return None, "Select a model from the table"

	    boards = load_or_download_leaderboard().get("leaderboards", [])

	    bash_only = None
	    for board in boards:
	        if board["name"] == "bash-only":
	            bash_only = board
	            break
	    if not bash_only:
	        return None, "Leaderboard not found"

	    for entry in bash_only["results"]:
	        if entry.get("folder") == folder:
	            return entry, None
	    return None, f"Model with folder '{folder}' not found"


	def check_trajectories_downloaded(folder: str) -> bool:
	    """Return True when ``TRAJS_DIR / folder`` exists and has at least one entry."""
	    if not folder:
	        return False

	    target = TRAJS_DIR / folder
	    if not target.exists():
	        return False
	    return any(target.iterdir())


	def _count_trajectory_files(directory: Path) -> int:
	    """Count trajectory files under ``directory`` using the first glob pattern that matches."""
	    # Patterns must be relative: Path.glob rejects patterns starting with "/".
	    for pattern in ("**/*.traj.json", "**/*.traj", "*.json"):
	        count = sum(1 for _ in directory.glob(pattern))
	        if count:
	            return count
	    return 0


	def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
	    """
	    Download a model's trajectories from the public S3 bucket via the AWS CLI.

	    Files are copied from ``{S3_BUCKET}/{folder}/trajs/`` into
	    ``TRAJS_DIR / folder`` with ``aws s3 cp --recursive --no-sign-request``
	    (10 minute timeout). If the target directory already has content, nothing
	    is downloaded and the existing files are counted instead.

	    Args:
	        folder: leaderboard folder name of the selected model.
	        progress: Gradio progress tracker; only the initial update is sent.

	    Returns:
	        (status_message, gr.update(visible=...)) - the update is visible only
	        when trajectories are available locally. All failures (no selection,
	        unknown model, CLI missing, timeout, non-zero exit, no files) are
	        reported in the message rather than raised.
	    """
	    if not folder:
	        return "❌ No model selected", gr.update(visible=False)

	    model, error = get_model_details(folder)
	    if error:
	        return f"❌ {error}", gr.update(visible=False)

	    output_dir = TRAJS_DIR / folder
	    if output_dir.exists() and any(output_dir.iterdir()):
	        file_count = _count_trajectory_files(output_dir)
	        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)

	    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
	    output_dir.mkdir(parents=True, exist_ok=True)

	    progress(0, desc="Starting S3 download...")

	    try:
	        # Argument list, no shell: the folder name is never interpreted by a shell.
	        result = subprocess.run(
	            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
	            capture_output=True,
	            text=True,
	            timeout=600,
	        )

	        if result.returncode != 0:
	            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)

	        file_count = _count_trajectory_files(output_dir)
	        if file_count == 0:
	            return f"❌ No trajectory files found on S3 for {folder}", gr.update(visible=False)

	        per_instance = model.get("per_instance_details", {})
	        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
	        total_count = len(per_instance)

	        if total_count > 0:
	            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
	        else:
	            resolved_pct = "N/A"

	        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
	        return status, gr.update(visible=True)

	    except subprocess.TimeoutExpired:
	        return "❌ Download timed out (>10 min)", gr.update(visible=False)
	    except FileNotFoundError:
	        # subprocess raises this when the "aws" executable is not on PATH.
	        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
	    except Exception as e:
	        return f"❌ Error: {e}", gr.update(visible=False)


	def parse_trajectory(traj_path: Path) -> dict:
	    """
	    Read one trajectory file and total the token usage it reports.

	    Metadata comes from ``info`` (model_stats for api_calls and instance_cost,
	    config.model for the model name, preferring cost_calc_model_override).
	    Usage is summed over all messages; for each message it is taken from
	    ``msg["usage"]`` when that key exists, otherwise from
	    ``msg["extra"]["response"]["usage"]``. Missing or null counters count as 0.

	    Returns:
	        dict with instance_id, model_name, api_calls, instance_cost,
	        prompt_tokens, completion_tokens, total_tokens, cache_read_tokens and
	        cache_creation_tokens.
	    """
	    with open(traj_path, "r", encoding="utf-8") as handle:
	        payload = json.load(handle)

	    info = payload.get("info", {})
	    stats = info.get("model_stats", {})
	    model_cfg = info.get("config", {}).get("model", {})

	    summary = {
	        "instance_id": payload.get("instance_id", traj_path.stem),
	        "model_name": model_cfg.get("cost_calc_model_override", model_cfg.get("model_name", "")),
	        "api_calls": stats.get("api_calls", 0),
	        "instance_cost": stats.get("instance_cost", 0),
	        "prompt_tokens": 0,
	        "completion_tokens": 0,
	        "total_tokens": 0,
	        "cache_read_tokens": 0,
	        "cache_creation_tokens": 0,
	    }

	    # (key in the summary, key in the provider's usage record)
	    usage_fields = (
	        ("prompt_tokens", "prompt_tokens"),
	        ("completion_tokens", "completion_tokens"),
	        ("total_tokens", "total_tokens"),
	        ("cache_read_tokens", "cache_read_input_tokens"),
	        ("cache_creation_tokens", "cache_creation_input_tokens"),
	    )

	    for message in payload.get("messages", []):
	        if "usage" in message:
	            usage = message["usage"]
	        elif "extra" in message and isinstance(message["extra"], dict):
	            response = message["extra"].get("response", {})
	            usage = response.get("usage", {}) if isinstance(response, dict) else None
	        else:
	            usage = None

	        if not usage:
	            continue
	        for summary_key, usage_key in usage_fields:
	            summary[summary_key] += usage.get(usage_key, 0) or 0

	    return summary


	def load_all_trajectories(folder: str) -> pd.DataFrame:
	    """
	    Load the reported usage of every trajectory in a folder into a DataFrame.

	    Files are looked up under ``TRAJS_DIR / folder``; the first of these glob
	    patterns that matches anything wins: ``**/*.traj.json``, ``**/*.traj``,
	    ``*.traj.json``, ``*.traj``, ``*.json``. Each file becomes one row via
	    parse_trajectory; files that fail to parse are reported on stdout and
	    skipped.

	    The resulting frame is cached per folder for the life of the process.
	    """
	    if folder in _trajectories_cache:
	        return _trajectories_cache[folder]

	    output_dir = TRAJS_DIR / folder

	    # Patterns must be relative: Path.glob rejects patterns starting with "/".
	    traj_files = []
	    for pattern in ("**/*.traj.json", "**/*.traj", "*.traj.json", "*.traj", "*.json"):
	        traj_files = list(output_dir.glob(pattern))
	        if traj_files:
	            break

	    rows = []
	    for traj_path in traj_files:
	        try:
	            rows.append(parse_trajectory(traj_path))
	        except Exception as e:
	            print(f"Error parsing {traj_path}: {e}")

	    df = pd.DataFrame(rows)
	    _trajectories_cache[folder] = df
	    return df


	def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
	    """
	    Build the "Total Cost by Token Type" bar chart.

	    Token totals are summed over all trajectories and priced per million
	    tokens. Uncached input is prompt - cache_read - cache_creation per
	    trajectory, floored at 0. Kept separate so the chart can be rebuilt when
	    only prices change.

	    Returns:
	        A plotly figure, or None when ``df`` is empty.
	    """
	    if df.empty:
	        return None

	    uncached_per_row = (
	        df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]
	    ).clip(lower=0)

	    token_totals = [
	        uncached_per_row.sum(),
	        df["cache_read_tokens"].sum(),
	        df["cache_creation_tokens"].sum(),
	        df["completion_tokens"].sum(),
	    ]
	    unit_prices = [input_price, cache_read_price, cache_creation_price, completion_price]
	    costs = [tokens * price / 1e6 for tokens, price in zip(token_totals, unit_prices)]

	    cost_data = pd.DataFrame({
	        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
	        "Cost ($)": costs,
	    })

	    fig = px.bar(
	        cost_data,
	        x="Token Type",
	        y="Cost ($)",
	        title="Total Cost by Token Type ($)",
	        color="Token Type",
	        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
	    )
	    fig.update_layout(
	        xaxis_title="Token Type",
	        yaxis_title="Cost ($)",
	        showlegend=False,
	        margin=dict(l=40, r=20, t=40, b=40),
	    )

	    total_cost = costs[0] + costs[1] + costs[2] + costs[3]
	    fig.add_annotation(
	        text=f"Total: ${total_cost:.2f}",
	        xref="paper", yref="paper",
	        x=0.95, y=0.95, showarrow=False,
	        font=dict(size=12),
	    )

	    return fig


	def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
	    """
	    Build the three token-related charts (used when the token source changes).

	    Returns:
	        (fig_tokens, fig_tokens_cost, fig_stacked):
	            - total tokens by type, in millions;
	            - total cost by type, from create_cost_by_type_chart;
	            - stacked tokens per trajectory, sorted by total descending.
	        All three are None when ``df`` is empty.
	    """
	    if df.empty:
	        return None, None, None

	    # (legend label, column, bar colour) in stacking order
	    series_specs = [
	        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
	        ("Cache Read", "cache_read_tokens", "#19D3F3"),
	        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
	        ("Completion", "completion_tokens", "#AB63FA"),
	    ]
	    labels = [label for label, _, _ in series_specs]
	    colors = [color for _, _, color in series_specs]

	    per_traj = df.copy()
	    # Uncached input is whatever part of the prompt was neither read from nor
	    # written to the cache; floored at 0.
	    per_traj["uncached_input_tokens"] = (
	        per_traj["prompt_tokens"] - per_traj["cache_read_tokens"] - per_traj["cache_creation_tokens"]
	    ).clip(lower=0)

	    totals = [per_traj[column].sum() for _, column, _ in series_specs]

	    token_data = pd.DataFrame({
	        "Token Type": labels,
	        "Total Tokens (M)": [total / 1e6 for total in totals],
	    })
	    fig_tokens = px.bar(
	        token_data,
	        x="Token Type",
	        y="Total Tokens (M)",
	        title="Total Tokens by Type",
	        color="Token Type",
	        color_discrete_sequence=colors,
	    )
	    fig_tokens.update_layout(
	        xaxis_title="Token Type",
	        yaxis_title="Tokens (M)",
	        showlegend=False,
	        margin=dict(l=40, r=20, t=40, b=40),
	    )
	    total_all = totals[0] + totals[1] + totals[2] + totals[3]
	    fig_tokens.add_annotation(
	        text=f"Total: {total_all/1e6:.2f}M",
	        xref="paper", yref="paper",
	        x=0.95, y=0.95, showarrow=False,
	        font=dict(size=12),
	    )

	    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

	    # Order trajectories by the full height of their stacked bar.
	    per_traj["total_stacked"] = (
	        per_traj["uncached_input_tokens"]
	        + per_traj["cache_read_tokens"]
	        + per_traj["cache_creation_tokens"]
	        + per_traj["completion_tokens"]
	    )
	    per_traj = per_traj.sort_values("total_stacked", ascending=False).reset_index(drop=True)
	    per_traj["trajectory_idx"] = range(len(per_traj))

	    fig_stacked = go.Figure()
	    for label, column, color in series_specs:
	        fig_stacked.add_trace(go.Bar(
	            name=label,
	            x=per_traj["trajectory_idx"],
	            y=per_traj[column] / 1e6,
	            marker_color=color,
	            hovertemplate="Trajectory: %{x}<br>" + label + ": %{y:.2f}M<extra></extra>",
	        ))
	    fig_stacked.update_layout(
	        barmode="stack",
	        title="Tokens per Trajectory (stacked)",
	        xaxis_title="Trajectory (sorted by total tokens)",
	        yaxis_title="Tokens (M)",
	        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	        margin=dict(l=50, r=20, t=60, b=40),
	    )

	    return fig_tokens, fig_tokens_cost, fig_stacked


	def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
	    """
	    Build the five overview charts for a set of trajectories.

	    Returns:
	        (fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked):
	            - histogram of API calls per trajectory;
	            - histogram of the cost reported by the leaderboard;
	            - total tokens by type, in millions;
	            - total cost by type, from create_cost_by_type_chart;
	            - stacked tokens per trajectory, sorted by total descending.
	        All five are None when ``df`` is empty.
	    """
	    if df.empty:
	        return None, None, None, None, None

	    fig_steps = px.histogram(
	        df,
	        x="api_calls",
	        nbins=30,
	        title="Distribution of API Calls (Steps) per Trajectory",
	        color_discrete_sequence=["#636EFA"],
	    )
	    fig_steps.update_layout(
	        xaxis_title="API Calls (Steps)",
	        yaxis_title="Number of Trajectories",
	        showlegend=False,
	        margin=dict(l=40, r=20, t=40, b=40),
	    )
	    # Plain "|" separator: "\|" is an invalid escape and would show a backslash.
	    fig_steps.add_annotation(
	        text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
	        xref="paper", yref="paper",
	        x=0.95, y=0.95, showarrow=False,
	        font=dict(size=12),
	    )

	    fig_cost = px.histogram(
	        df,
	        x="instance_cost",
	        nbins=30,
	        title="Distribution of Cost Reported by Leaderboard ($)",
	        color_discrete_sequence=["#00CC96"],
	    )
	    fig_cost.update_layout(
	        xaxis_title="Cost ($)",
	        yaxis_title="Number of Trajectories",
	        showlegend=False,
	        margin=dict(l=40, r=20, t=40, b=40),
	    )
	    fig_cost.add_annotation(
	        text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
	        xref="paper", yref="paper",
	        x=0.95, y=0.95, showarrow=False,
	        font=dict(size=12),
	    )

	    # (legend label, column, bar colour) in stacking order
	    series_specs = [
	        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
	        ("Cache Read", "cache_read_tokens", "#19D3F3"),
	        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
	        ("Completion", "completion_tokens", "#AB63FA"),
	    ]
	    labels = [label for label, _, _ in series_specs]
	    colors = [color for _, _, color in series_specs]

	    per_traj = df.copy()
	    # Uncached input = prompt - cache_read - cache_creation (per trajectory,
	    # floored at 0, then summed).
	    per_traj["uncached_input_tokens"] = (
	        per_traj["prompt_tokens"] - per_traj["cache_read_tokens"] - per_traj["cache_creation_tokens"]
	    ).clip(lower=0)

	    totals = [per_traj[column].sum() for _, column, _ in series_specs]

	    token_data = pd.DataFrame({
	        "Token Type": labels,
	        "Tokens (M)": [total / 1e6 for total in totals],
	    })
	    fig_tokens = px.bar(
	        token_data,
	        x="Token Type",
	        y="Tokens (M)",
	        title="Total Tokens by Type",
	        color="Token Type",
	        color_discrete_sequence=colors,
	    )
	    fig_tokens.update_layout(
	        xaxis_title="Token Type",
	        yaxis_title="Tokens (M)",
	        showlegend=False,
	        margin=dict(l=40, r=20, t=40, b=40),
	    )
	    total_all = totals[0] + totals[1] + totals[2] + totals[3]
	    fig_tokens.add_annotation(
	        text=f"Total: {total_all/1e6:.2f}M",
	        xref="paper", yref="paper",
	        x=0.95, y=0.95, showarrow=False,
	        font=dict(size=12),
	    )

	    # Cost by token type (use separate function)
	    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

	    # Order trajectories by the full height of their stacked bar.
	    per_traj["total_stacked"] = (
	        per_traj["uncached_input_tokens"]
	        + per_traj["cache_read_tokens"]
	        + per_traj["cache_creation_tokens"]
	        + per_traj["completion_tokens"]
	    )
	    per_traj = per_traj.sort_values("total_stacked", ascending=False).reset_index(drop=True)
	    per_traj["trajectory_idx"] = range(len(per_traj))

	    fig_stacked = go.Figure()
	    for label, column, color in series_specs:
	        fig_stacked.add_trace(go.Bar(
	            name=label,
	            x=per_traj["trajectory_idx"],
	            y=per_traj[column] / 1e6,
	            marker_color=color,
	            hovertemplate="Trajectory: %{x}<br>" + label + ": %{y:.3f}M<extra></extra>",
	        ))
	    fig_stacked.update_layout(
	        barmode="stack",
	        title="Tokens per Trajectory (stacked)",
	        xaxis_title="Trajectory (sorted by total tokens)",
	        yaxis_title="Tokens (M)",
	        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	        margin=dict(l=50, r=20, t=60, b=40),
	    )

	    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked


	def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
	    """
	    Build the stacked "Cost per Trajectory" chart.

	    Each trajectory's tokens are priced per million by type. Trajectories are
	    ordered by their total token count (not by cost), descending, and the
	    overall total is shown as an annotation.

	    Returns:
	        A plotly figure, or None when ``df`` is empty.
	    """
	    if df.empty:
	        return None

	    per_traj = df.copy()
	    per_traj["uncached_input_tokens"] = (
	        per_traj["prompt_tokens"] - per_traj["cache_read_tokens"] - per_traj["cache_creation_tokens"]
	    ).clip(lower=0)
	    per_traj["total_stacked"] = (
	        per_traj["uncached_input_tokens"]
	        + per_traj["cache_read_tokens"]
	        + per_traj["cache_creation_tokens"]
	        + per_traj["completion_tokens"]
	    )
	    per_traj = per_traj.sort_values("total_stacked", ascending=False).reset_index(drop=True)
	    per_traj["trajectory_idx"] = range(len(per_traj))

	    # (legend label, token column, cost column, price per 1M, bar colour)
	    series_specs = [
	        ("Uncached Input", "uncached_input_tokens", "cost_uncached_input", input_price, "#EF553B"),
	        ("Cache Read", "cache_read_tokens", "cost_cache_read", cache_read_price, "#19D3F3"),
	        ("Cache Creation", "cache_creation_tokens", "cost_cache_creation", cache_creation_price, "#FFA15A"),
	        ("Completion", "completion_tokens", "cost_completion", completion_price, "#AB63FA"),
	    ]
	    for _, token_column, cost_column, price, _ in series_specs:
	        per_traj[cost_column] = per_traj[token_column] * price / 1e6

	    fig = go.Figure()
	    for label, _, cost_column, price, color in series_specs:
	        fig.add_trace(go.Bar(
	            name=f"{label} (${price:.2f}/1M)",
	            x=per_traj["trajectory_idx"],
	            y=per_traj[cost_column],
	            marker_color=color,
	            hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
	        ))

	    total_cost = (
	        per_traj["cost_uncached_input"].sum()
	        + per_traj["cost_cache_read"].sum()
	        + per_traj["cost_cache_creation"].sum()
	        + per_traj["cost_completion"].sum()
	    )

	    fig.update_layout(
	        barmode="stack",
	        title="Cost per Trajectory",
	        xaxis_title="Trajectory (sorted by total tokens)",
	        yaxis_title="Cost ($)",
	        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	        margin=dict(l=50, r=20, t=60, b=40),
	    )
	    fig.add_annotation(
	        text=f"Total: ${total_cost:.2f}",
	        xref="paper", yref="paper",
	        x=0.95, y=0.95, showarrow=False,
	        font=dict(size=14),
	        bgcolor="white",
	    )

	    return fig


	def extract_model_from_folder(folder: str) -> str:
	    """
	    Return the model part of a folder name such as
	    '20251124_mini-v1.16.0_claude-opus-4-5-20251101'.

	    The model is everything after the second underscore. A name with fewer
	    than two underscores is returned unchanged; an empty name gives "".
	    """
	    if not folder:
	        return ""

	    pieces = folder.split("_", 2)
	    return pieces[2] if len(pieces) == 3 else folder


	def get_prices_for_folder(folder: str) -> tuple[dict, str]:
	    """
	    Look up per-million-token prices for the model encoded in a folder name.

	    Returns:
	        (prices_dict, model_name). prices_dict has the keys "input",
	        "cache_read", "cache_creation" and "completion", each mapping to
	        {"value": price per 1M tokens, "found": True if litellm lists a
	        positive price}. Prices litellm does not list are estimated from the
	        ones it does; estimated entries keep "found" False. model_name is ""
	        when no model could be extracted from the folder.
	    """
	    model_hint = extract_model_from_folder(folder)

	    result = {
	        kind: {"value": 0, "found": False}
	        for kind in ("input", "cache_read", "cache_creation", "completion")
	    }

	    if not model_hint:
	        return result, ""

	    prices = get_model_prices(model_hint)
	    if not prices:
	        return result, model_hint

	    # litellm stores cost per token; the UI works in cost per million tokens.
	    litellm_fields = {
	        "input": "input_cost_per_token",
	        "cache_read": "cache_read_input_token_cost",
	        "cache_creation": "cache_creation_input_token_cost",
	        "completion": "output_cost_per_token",
	    }
	    for kind, field in litellm_fields.items():
	        per_million = prices.get(field, 0) * 1e6
	        result[kind] = {"value": per_million, "found": per_million > 0}

	    def fill_missing(kind, estimate):
	        if not result[kind]["found"]:
	            result[kind]["value"] = estimate

	    input_price = result["input"]["value"]
	    completion = result["completion"]["value"]

	    # Fallback estimates from standard ratios:
	    #   cache read = input * 0.1, cache creation = input * 1.25,
	    #   completion = input * 5.
	    if input_price > 0:
	        fill_missing("cache_read", input_price * 0.1)
	        fill_missing("cache_creation", input_price * 1.25)
	        fill_missing("completion", input_price * 5)
	    elif completion > 0:
	        # Only the completion price is known: derive the input price from it.
	        estimated_input = completion / 5
	        fill_missing("input", estimated_input)
	        fill_missing("cache_read", estimated_input * 0.1)
	        fill_missing("cache_creation", estimated_input * 1.25)

	    return result, model_hint


	def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
	    """Handle a click on a leaderboard row.

	    Returns a 9-tuple of values/updates: folder, display name, visibility of
	    the analyze button, the four price fields (input, cache read, cache
	    creation, completion), the detected model name and the tokenizer overhead.
	    With no selection, everything is reset and the analyze button is hidden.
	    """
	    price_fields = (
	        ("input", "Input"),
	        ("cache_read", "Cache Read"),
	        ("cache_creation", "Cache Creation"),
	        ("completion", "Completion"),
	    )

	    selection = evt.index
	    if selection is None:
	        cleared = [gr.update(value=0, label=title) for _, title in price_fields]
	        return ("", "", gr.update(visible=False), *cleared, "", gr.update(value=1.0))

	    # The event index may be a (row, col) pair or a bare row index.
	    if isinstance(selection, (list, tuple)):
	        selection = selection[0]
	    record = df.iloc[selection]
	    folder = record["folder"]
	    display_name = record["name"]

	    prices_dict, model_hint = get_prices_for_folder(folder)
	    default_overhead = get_default_overhead(model_hint)

	    def labelled_price(info, title):
	        # A check mark means the price was found; a cross means it was
	        # estimated ("est.") or is unknown (shown as 0).
	        amount = info["value"]
	        if info["found"]:
	            return gr.update(value=amount, label=f"✅ {title}")
	        if amount > 0:
	            return gr.update(value=amount, label=f"❌ {title} (est.)")
	        return gr.update(value=0, label=f"❌ {title}")

	    price_updates = [labelled_price(prices_dict[kind], title) for kind, title in price_fields]

	    return (
	        folder,
	        display_name,
	        gr.update(visible=True),
	        *price_updates,
	        model_hint,
	        gr.update(value=default_overhead),
	    )


	def create_routed_token_chart(base_tokens: dict, additional_models: list):
	    """
	    Build a grouped bar chart of token counts by type, one bar series per model
	    (the base model first, then each additional model).

	    Args:
	        base_tokens: dict with uncached_input, cache_read, cache_creation, completion
	        additional_models: list of (model_name, tokens_dict) tuples

	    Returns the plotly figure. Bars are shown in millions of tokens, and a boxed
	    annotation lists the grand total plus each model's total.
	    """
	    import plotly.graph_objects as go

	    token_keys = ("uncached_input", "cache_read", "cache_creation", "completion")
	    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
	    palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

	    # Treat the base model as the first series so all models share one loop.
	    series = [("Base Model", base_tokens)]
	    series.extend(
	        (model_name or f"Model {number}", tokens)
	        for number, (model_name, tokens) in enumerate(additional_models, start=1)
	    )

	    fig = go.Figure()
	    per_model_totals = []
	    for slot, (label, tokens) in enumerate(series):
	        counts = [tokens.get(key, 0) for key in token_keys]
	        per_model_totals.append((label, sum(counts)))
	        fig.add_trace(
	            go.Bar(
	                name=label,
	                x=categories,
	                y=[count / 1e6 for count in counts],
	                # Colours cycle through the palette when there are many models.
	                marker_color=palette[slot % len(palette)],
	            )
	        )

	    grand_total = sum(total for _, total in per_model_totals)
	    summary = [f"<b>Total: {grand_total/1e6:.2f}M</b>"]
	    summary += [f"{label}: {total/1e6:.2f}M" for label, total in per_model_totals]

	    fig.update_layout(
	        title="Tokens by Type (per Model)",
	        yaxis_title="Tokens (M)",
	        barmode="group",
	        margin=dict(l=40, r=40, t=80, b=40),
	        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	    )
	    fig.add_annotation(
	        text="<br>".join(summary),
	        xref="paper", yref="paper",
	        x=0.02, y=0.98, showarrow=False,
	        font=dict(size=11),
	        align="left",
	        bgcolor="rgba(255,255,255,0.8)",
	        bordercolor="gray",
	        borderwidth=1,
	    )
	    return fig


	def create_routed_cost_chart(base_costs: dict, additional_models: list):
	    """
	    Build a grouped bar chart of costs by token type, one bar series per model
	    (the base model first, then each additional model).

	    Args:
	        base_costs: dict with uncached_input, cache_read, cache_creation, completion
	        additional_models: list of (model_name, costs_dict) tuples

	    Returns the plotly figure. Bars are in dollars, and a boxed annotation lists
	    the grand total plus each model's total.
	    """
	    import plotly.graph_objects as go

	    cost_keys = ("uncached_input", "cache_read", "cache_creation", "completion")
	    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
	    palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

	    # Treat the base model as the first series so all models share one loop.
	    series = [("Base Model", base_costs)]
	    for number, (model_name, costs) in enumerate(additional_models, start=1):
	        series.append((model_name or f"Model {number}", costs))

	    fig = go.Figure()
	    per_model_totals = []
	    for slot, (label, costs) in enumerate(series):
	        amounts = [costs.get(key, 0) for key in cost_keys]
	        per_model_totals.append((label, sum(amounts)))
	        # Colours cycle through the palette when there are many models.
	        bar_color = palette[slot % len(palette)]
	        fig.add_trace(go.Bar(name=label, x=categories, y=amounts, marker_color=bar_color))

	    grand_total = sum(total for _, total in per_model_totals)
	    summary = [f"<b>Total: ${grand_total:.2f}</b>"]
	    summary += [f"{label}: ${total:.2f}" for label, total in per_model_totals]

	    fig.update_layout(
	        title="Cost by Type (per Model) ($)",
	        yaxis_title="Cost ($)",
	        barmode="group",
	        margin=dict(l=40, r=40, t=80, b=40),
	        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
	    )
	    fig.add_annotation(
	        text="<br>".join(summary),
	        xref="paper", yref="paper",
	        x=0.02, y=0.98, showarrow=False,
	        font=dict(size=11),
	        align="left",
	        bgcolor="rgba(255,255,255,0.8)",
	        bordercolor="gray",
	        borderwidth=1,
	    )
	    return fig


	def build_app():
	    """Build the Gradio Blocks UI and wire up all event handlers.

	    The UI has a leaderboard table, an initially hidden trajectory-analysis
	    section with plots, a settings column (token prices, token count source)
	    and an initially hidden routing section with up to three routing-model
	    forms. Returns the `gr.Blocks` app; launching is left to the caller.
	    """
	    leaderboard_df = get_bash_only_df()

	    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
	        # Holds the loaded trajectory data (a dict with "meta", "calculated",
	        # "folder" and "steps"), set by load_and_analyze below.
	        trajectories_state = gr.State(None)

	        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
	        gr.Markdown("Select a model to use as base for cost analysis")

	        with gr.Row():
	            with gr.Column(scale=3):
	                leaderboard_table = gr.Dataframe(
	                    value=leaderboard_df,
	                    label="Bash-Only Leaderboard",
	                    interactive=False,
	                    wrap=True,
	                )

	                # Hidden until trajectories start loading (see load_and_analyze).
	                with gr.Column(visible=False) as analysis_section:
	                    gr.Markdown("## 📊 Trajectory Analysis")

	                    with gr.Row():
	                        plot_steps = gr.Plot(label="API Calls Distribution")
	                        plot_cost = gr.Plot(label="Cost Distribution")

	                    with gr.Row():
	                        plot_tokens = gr.Plot(label="Token Usage by Type")
	                        plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")

	                    # Shown only after a routing run has produced results.
	                    with gr.Row(visible=False) as routing_plots_row:
	                        routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
	                        routing_cost_plot = gr.Plot(label="Cost by Type (per Model)")

	                    with gr.Row():
	                        plot_stacked = gr.Plot(label="Tokens per Trajectory")
	                        plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)")

	            with gr.Column(scale=1):
	                selected_folder = gr.State("")
	                gr.Markdown("### Selected Model")
	                selected_name = gr.Textbox(label="Model Name", interactive=False)

	                analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary")
	                download_status = gr.Textbox(label="Status", interactive=False, lines=3)

	                gr.Markdown("---")
	                gr.Markdown("### 💰 Token Prices ($/1M) · [litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)")
	                detected_model = gr.Textbox(label="Detected Model", interactive=False)
	                with gr.Row():
	                    price_input = gr.Number(label="Input", value=0, precision=2, scale=1)
	                    price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1)
	                    price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1)
	                    price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1)

	                gr.Markdown("---")
	                gr.Markdown("### 📊 Token Count Source")
	                token_source = gr.Radio(
	                    choices=["Metadata", "Calculated"],
	                    value="Metadata",
	                )
	                # The next two controls are only shown while the source is
	                # "Calculated" (see update_calculated_options_visibility).
	                thinking_overhead = gr.Number(
	                    label="🔢 Tokenizer Overhead",
	                    value=1.21,
	                    precision=2,
	                    info="Multiplier for Calculated tokens (tiktoken → native)",
	                    visible=False,
	                )
	                use_cache = gr.Checkbox(
	                    label="Use Cache",
	                    value=True,
	                    info="If disabled, all tokens are Uncached Input or Completion",
	                    visible=False,
	                )

	                gr.Markdown("---")
	                # Made visible once trajectories have been loaded successfully.
	                add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False)

	                with gr.Column(visible=False) as routing_section:
	                    gr.Markdown("### 🔀 Routing Models")

	                    # These strings are compared verbatim in on_strategy_change
	                    # and run_routing.
	                    STRATEGY_CHOICES = [
	                        "Replace on random steps",
	                        "Replace every step k",
	                        "Replace part of trajectory",
	                    ]

	                    with gr.Column():
	                        with gr.Group():
	                            gr.Markdown("#### Route to Model 1")
	                            routing_model_1 = gr.Dropdown(
	                                label="Model (type 3+ chars to search)",
	                                choices=[],
	                                allow_custom_value=True,
	                                interactive=True,
	                            )
	                            with gr.Row():
	                                routing_price_1_input = gr.Number(label="Input", precision=3, scale=1)
	                                routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
	                                routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
	                                routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1)
	                            strategy_1 = gr.Dropdown(
	                                label="Strategy",
	                                choices=STRATEGY_CHOICES,
	                                value="Replace on random steps",
	                                interactive=True,
	                            )
	                            # One parameter row per strategy; only the row that
	                            # matches the selected strategy is visible.
	                            with gr.Row(visible=True) as random_params_1:
	                                random_pct_1 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True)
	                            with gr.Row(visible=False) as every_k_params_1:
	                                step_k_1 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True)
	                            with gr.Row(visible=False) as part_params_1:
	                                start_step_1 = gr.Number(label="Start (int=step; 0,0-1,0=ratio)", value=0, minimum=0, precision=2, interactive=True)
	                                end_step_1 = gr.Number(label="End (int=step; 0,0-1,0=ratio)", value=0.5, minimum=0, precision=2, interactive=True)

	                        add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False)

	                    with gr.Column(visible=False) as routing_block_2:
	                        with gr.Group():
	                            gr.Markdown("#### Route to Model 2")
	                            routing_model_2 = gr.Dropdown(
	                                label="Model (type 3+ chars to search)",
	                                choices=[],
	                                allow_custom_value=True,
	                                interactive=True,
	                            )
	                            with gr.Row():
	                                routing_price_2_input = gr.Number(label="Input", precision=3, scale=1)
	                                routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
	                                routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
	                                routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1)
	                            strategy_2 = gr.Dropdown(
	                                label="Strategy",
	                                choices=STRATEGY_CHOICES,
	                                value="Replace on random steps",
	                                interactive=True,
	                            )
	                            with gr.Row(visible=True) as random_params_2:
	                                random_pct_2 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True)
	                            with gr.Row(visible=False) as every_k_params_2:
	                                step_k_2 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True)
	                            with gr.Row(visible=False) as part_params_2:
	                                start_step_2 = gr.Number(label="Start (int=step; 0,0-1,0=ratio)", value=0, minimum=0, precision=2, interactive=True)
	                                end_step_2 = gr.Number(label="End (int=step; 0,0-1,0=ratio)", value=0.5, minimum=0, precision=2, interactive=True)

	                        add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False)

	                    with gr.Column(visible=False) as routing_block_3:
	                        with gr.Group():
	                            gr.Markdown("#### Route to Model 3")
	                            routing_model_3 = gr.Dropdown(
	                                label="Model (type 3+ chars to search)",
	                                choices=[],
	                                allow_custom_value=True,
	                                interactive=True,
	                            )
	                            with gr.Row():
	                                routing_price_3_input = gr.Number(label="Input", precision=3, scale=1)
	                                routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
	                                routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
	                                routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1)
	                            strategy_3 = gr.Dropdown(
	                                label="Strategy",
	                                choices=STRATEGY_CHOICES,
	                                value="Replace on random steps",
	                                interactive=True,
	                            )
	                            with gr.Row(visible=True) as random_params_3:
	                                random_pct_3 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True)
	                            with gr.Row(visible=False) as every_k_params_3:
	                                step_k_3 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True)
	                            with gr.Row(visible=False) as part_params_3:
	                                start_step_3 = gr.Number(label="Start (int=step; 0,0-1,0=ratio)", value=0, minimum=0, precision=2, interactive=True)
	                                end_step_3 = gr.Number(label="End (int=step; 0,0-1,0=ratio)", value=0.5, minimum=0, precision=2, interactive=True)

	                    gr.Markdown("---")
	                    # Enabled once a model is chosen for routing model 1.
	                    route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False)
	                    routing_result = gr.Markdown(visible=False)


	        def on_strategy_change(strategy):
	            """Return visibility updates for the (random, every-k, part) parameter rows."""
	            return (
	                gr.update(visible=strategy == "Replace on random steps"),
	                gr.update(visible=strategy == "Replace every step k"),
	                gr.update(visible=strategy == "Replace part of trajectory"),
	            )

	        def toggle_routing_section():
	            """Reveal the routing section (it is never hidden again by this handler)."""
	            return gr.update(visible=True)

	        add_routing_btn.click(
	            fn=toggle_routing_section,
	            outputs=[routing_section],
	        )

	        strategy_1.change(
	            fn=on_strategy_change,
	            inputs=[strategy_1],
	            outputs=[random_params_1, every_k_params_1, part_params_1],
	        )

	        strategy_2.change(
	            fn=on_strategy_change,
	            inputs=[strategy_2],
	            outputs=[random_params_2, every_k_params_2, part_params_2],
	        )

	        strategy_3.change(
	            fn=on_strategy_change,
	            inputs=[strategy_3],
	            outputs=[random_params_3, every_k_params_3, part_params_3],
	        )

	        def filter_models(query):
	            """Filter models based on search query (starts at 3 chars)"""
	            if not query or len(query) < 3:
	                return gr.update(choices=[])
	            all_models = get_litellm_model_list()
	            query_lower = query.lower()
	            # Case-insensitive substring match, capped at the first 50 hits.
	            filtered = [m for m in all_models if query_lower in m.lower()][:50]
	            return gr.update(choices=filtered)

	        # Each dropdown feeds its own typed text back in to refresh its choices.
	        routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1])
	        routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
	        routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])

	        def get_routing_prices_with_labels(model_name):
	            """Get all 4 prices for a routing model with found/estimated labels"""
	            if not model_name:
	                return (
	                    gr.update(value=0, label="Input"),
	                    gr.update(value=0, label="Cache Read"),
	                    gr.update(value=0, label="Cache Creation"),
	                    gr.update(value=0, label="Completion"),
	                )

	            prices = get_litellm_prices()
	            # Exact-key lookup; an unknown model yields all-zero prices.
	            model_prices = prices.get(model_name, {})

	            # Per-token prices converted to per-million-token prices.
	            input_price = model_prices.get("input_cost_per_token", 0) * 1e6
	            cache_read = model_prices.get("cache_read_input_token_cost", 0) * 1e6
	            cache_creation = model_prices.get("cache_creation_input_token_cost", 0) * 1e6
	            completion = model_prices.get("output_cost_per_token", 0) * 1e6

	            input_found = input_price > 0
	            cache_read_found = cache_read > 0
	            cache_creation_found = cache_creation > 0
	            completion_found = completion > 0

	            # Missing cache prices are estimated from the input price
	            # (0.1x for reads, 1.25x for creation); the label still shows ❌.
	            # No estimate is made here for a missing completion price.
	            if not cache_read_found and input_price > 0:
	                cache_read = input_price * 0.1
	            if not cache_creation_found and input_price > 0:
	                cache_creation = input_price * 1.25

	            def label(name, found):
	                return f"✅ {name}" if found else f"❌ {name}"

	            return (
	                gr.update(value=input_price, label=label("Input", input_found)),
	                gr.update(value=cache_read, label=label("Cache Read", cache_read_found)),
	                gr.update(value=cache_creation, label=label("Cache Creation", cache_creation_found)),
	                gr.update(value=completion, label=label("Completion", completion_found)),
	            )

	        def on_routing_model_1_select(model_name):
	            """Fill model 1 prices; show the 'add model 2' button and enable routing when a model is set."""
	            prices = get_routing_prices_with_labels(model_name)
	            show_btn = bool(model_name)
	            return *prices, gr.update(visible=show_btn), gr.update(interactive=show_btn)

	        def on_routing_model_2_select(model_name):
	            """Fill model 2 prices; show the 'add model 3' button when a model is set."""
	            prices = get_routing_prices_with_labels(model_name)
	            show_btn = bool(model_name)
	            return *prices, gr.update(visible=show_btn)

	        def on_routing_model_3_select(model_name):
	            """Fill model 3 prices."""
	            return get_routing_prices_with_labels(model_name)

	        routing_model_1.change(
	            fn=on_routing_model_1_select,
	            inputs=[routing_model_1],
	            outputs=[routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn],
	        )

	        # Clicking "+ Add another model" reveals the next block and hides the button.
	        add_model_2_btn.click(
	            fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
	            outputs=[routing_block_2, add_model_2_btn],
	        )

	        routing_model_2.change(
	            fn=on_routing_model_2_select,
	            inputs=[routing_model_2],
	            outputs=[routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn],
	        )

	        add_model_3_btn.click(
	            fn=lambda: (gr.update(visible=True), gr.update(visible=False)),
	            outputs=[routing_block_3, add_model_3_btn],
	        )

	        routing_model_3.change(
	            fn=on_routing_model_3_select,
	            inputs=[routing_model_3],
	            outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion],
	        )

	        def run_routing(
	            state_data,
	            base_input, base_cache_read, base_cache_creation, base_completion,
	            routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion,
	            strategy_1_val, random_pct_1_val, step_k_1_val, start_1_val, end_1_val,
	            source, overhead, with_cache
	        ):
	            """Simulate routing some steps to routing model 1 and report the cost change.

	            Generator: each yield is a 4-tuple for (routing_result,
	            routing_plots_row, routing_tokens_plot, routing_cost_plot).
	            Only routing model 1 is taken as input here; the model 2 and
	            model 3 forms are not passed to this function.
	            """
	            if state_data is None:
	                yield (
	                    gr.update(visible=True, value="❌ No trajectories loaded. Click 'Load & Analyze' first."),
	                    gr.update(visible=False),
	                    None, None,
	                )
	                return

	            if not routing_model_1_val:
	                yield (
	                    gr.update(visible=True, value="❌ Please select at least one routing model."),
	                    gr.update(visible=False),
	                    None, None,
	                )
	                return

	            trajectory_steps = state_data.get("steps", {})
	            if not trajectory_steps:
	                yield (
	                    gr.update(visible=True, value="❌ No trajectory steps data available."),
	                    gr.update(visible=False),
	                    None, None,
	                )
	                return

	            # Preferred source for the "original" cost: the per-trajectory
	            # DataFrame for the selected token source, priced at base prices.
	            df_key = "meta" if source == "Metadata" else "calculated"
	            df = state_data.get(df_key)
	            if df is not None and not df.empty:
	                if source == "Calculated":
	                    df = apply_thinking_overhead(df.copy(), overhead)
	                    if not with_cache:
	                        df = apply_no_cache(df)
	                df_temp = df.copy()
	                # Uncached input = prompt tokens minus cached reads and cache
	                # writes, floored at zero.
	                df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
	                total_original_cost_from_df = (
	                    df_temp["uncached_input"].sum() * base_input / 1e6 +
	                    df["cache_read_tokens"].sum() * base_cache_read / 1e6 +
	                    df["cache_creation_tokens"].sum() * base_cache_creation / 1e6 +
	                    df["completion_tokens"].sum() * base_completion / 1e6
	                )
	            else:
	                total_original_cost_from_df = None

	            base_prices = {
	                "input": base_input,
	                "cache_read": base_cache_read,
	                "cache_creation": base_cache_creation,
	                "completion": base_completion,
	            }
	            routing_prices = {
	                "input": r1_input,
	                "cache_read": r1_cache_read,
	                "cache_creation": r1_cache_creation,
	                "completion": r1_completion,
	            }

	            # Pass only the parameters relevant to the chosen strategy.
	            strategy_params = {}
	            if strategy_1_val == "Replace on random steps":
	                strategy_params["percentage"] = random_pct_1_val
	            elif strategy_1_val == "Replace every step k":
	                strategy_params["k"] = step_k_1_val
	            elif strategy_1_val == "Replace part of trajectory":
	                strategy_params["start"] = start_1_val
	                strategy_params["end"] = end_1_val

	            total_base_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
	            total_routing_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
	            total_original_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}

	            # Placeholder model ids used to tag each step for calculate_routing_tokens.
	            BASE_MODEL = "__base__"
	            ROUTING_MODEL = "__routing__"

	            for instance_id, steps in trajectory_steps.items():
	                if not steps:
	                    continue

	                total_steps = len(steps)
	                routed_step_indices = get_routed_steps(total_steps, strategy_1_val, strategy_params)

	                # Re-tag every step with the model that would serve it; the
	                # overhead multiplier is applied to completion tokens only
	                # when the source is "Calculated".
	                modified_steps = []
	                for i, step in enumerate(steps):
	                    model = ROUTING_MODEL if i in routed_step_indices else BASE_MODEL
	                    modified_steps.append({
	                        "model": model,
	                        "system_user": step.get("system_user", 0),
	                        "completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)),
	                        "observation": step.get("observation"),
	                    })

	                model_totals = calculate_routing_tokens(modified_steps)

	                base_totals = model_totals.get(BASE_MODEL, {
	                    "cache_read": 0, "uncached_input": 0, "completion": 0, "cache_creation": 0
	                })
	                routing_totals = model_totals.get(ROUTING_MODEL, {
	                    "cache_read": 0, "uncached_input": 0, "completion": 0, "cache_creation": 0
	                })

	                total_base_tokens["cache_read"] += base_totals.get("cache_read", 0)
	                total_base_tokens["uncached_input"] += base_totals.get("uncached_input", 0)
	                total_base_tokens["completion"] += base_totals.get("completion", 0)
	                total_base_tokens["cache_creation"] += base_totals.get("cache_creation", 0)

	                total_routing_tokens["cache_read"] += routing_totals.get("cache_read", 0)
	                total_routing_tokens["uncached_input"] += routing_totals.get("uncached_input", 0)
	                total_routing_tokens["completion"] += routing_totals.get("completion", 0)
	                total_routing_tokens["cache_creation"] += routing_totals.get("cache_creation", 0)

	                # Same trajectory with every step on the base model; these
	                # totals are only used as a fallback for the original cost.
	                original_steps = []
	                for step in steps:
	                    original_steps.append({
	                        "model": BASE_MODEL,
	                        "system_user": step.get("system_user", 0),
	                        "completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)),
	                        "observation": step.get("observation"),
	                    })
	                original_totals = calculate_routing_tokens(original_steps)
	                orig = original_totals.get(BASE_MODEL, {})
	                total_original_tokens["cache_read"] += orig.get("cache_read", 0)
	                total_original_tokens["uncached_input"] += orig.get("uncached_input", 0)
	                total_original_tokens["completion"] += orig.get("completion", 0)
	                total_original_tokens["cache_creation"] += orig.get("cache_creation", 0)

	            def calc_cost(tokens: dict, prices: dict) -> float:
	                """Total cost in dollars for token counts at $/1M prices."""
	                return (
	                    tokens["uncached_input"] * prices["input"] / 1e6 +
	                    tokens["cache_read"] * prices["cache_read"] / 1e6 +
	                    tokens["cache_creation"] * prices["cache_creation"] / 1e6 +
	                    tokens["completion"] * prices["completion"] / 1e6
	                )

	            # Per-token-type cost dicts for the charts; the inline mapping
	            # translates token-type keys to price keys ("uncached_input" -> "input").
	            base_costs = {k: total_base_tokens[k] * base_prices[{"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}[k]] / 1e6 for k in total_base_tokens}
	            routing_costs = {k: total_routing_tokens[k] * routing_prices[{"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}[k]] / 1e6 for k in total_routing_tokens}

	            total_base_cost = calc_cost(total_base_tokens, base_prices)
	            total_routing_cost = calc_cost(total_routing_tokens, routing_prices)

	            # Prefer the DataFrame-based original cost; fall back to the
	            # step-based simulation when no DataFrame was available.
	            if total_original_cost_from_df is not None:
	                total_original_cost = total_original_cost_from_df
	            else:
	                total_original_cost = calc_cost(total_original_tokens, base_prices)

	            total_routed_cost = total_base_cost + total_routing_cost
	            savings = total_original_cost - total_routed_cost
	            savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0

	            # NOTE(review): the table pipes below are backslash-escaped; `\|` is
	            # not a recognized Python escape sequence, so confirm this is intended.
	            result_text = f"""
	## 🚀 Routing Results

	\| Metric \| Value \|
	\|--------\|-------\|
	\| Original Cost (base model only) \| ${total_original_cost:.2f} \|
	\| Routed Cost \| ${total_routed_cost:.2f} \|
	\| ↳ Base model portion \| ${total_base_cost:.2f} \|
	\| ↳ Routing model portion \| ${total_routing_cost:.2f} \|
	\| Savings \| ${savings:.2f} ({savings_pct:+.1f}%) \|

	Strategy: {strategy_1_val}
	Routing model: {routing_model_1_val}
	"""

	            additional_token_models = [(routing_model_1_val, total_routing_tokens)]
	            additional_cost_models = [(routing_model_1_val, routing_costs)]

	            # Intermediate update shown while the charts are being built.
	            yield (
	                gr.update(visible=True, value="⏳ Creating charts..."),
	                gr.update(visible=True),
	                None,
	                None,
	            )

	            tokens_chart = create_routed_token_chart(total_base_tokens, additional_token_models)
	            cost_chart = create_routed_cost_chart(base_costs, additional_cost_models)

	            yield (
	                gr.update(visible=True, value=result_text),
	                gr.update(visible=True),
	                tokens_chart,
	                cost_chart,
	            )

	        route_btn.click(
	            fn=run_routing,
	            inputs=[
	                trajectories_state,
	                price_input, price_cache_read, price_cache_creation, price_completion,
	                routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
	                strategy_1, random_pct_1, step_k_1, start_step_1, end_step_1,
	                token_source, thinking_overhead, use_cache,
	            ],
	            outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
	        )

	        def update_calculated_options_visibility(source):
	            """Show the overhead and cache controls only for the "Calculated" source."""
	            is_calc = source == "Calculated"
	            return gr.update(visible=is_calc), gr.update(visible=is_calc)

	        token_source.change(
	            fn=update_calculated_options_visibility,
	            inputs=[token_source],
	            outputs=[thinking_overhead, use_cache],
	        )

	        leaderboard_table.select(
	            fn=on_row_select,
	            inputs=[leaderboard_table],
	            outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
	        )

	        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache, progress=gr.Progress()):
	            """Download (if needed) and load trajectories for `folder`, then build the plots.

	            Generator: each yield is a 10-tuple for (download_status,
	            analysis_section, six plots, trajectories_state, add_routing_btn).
	            """
	            empty_result = (
	                "",
	                gr.update(visible=False),
	                None, None, None, None, None, None,
	                None,
	                gr.update(visible=False),
	            )

	            if not folder:
	                yield empty_result
	                return

	            if not check_trajectories_downloaded(folder):
	                yield (
	                    "⏳ Downloading trajectories...",
	                    gr.update(visible=False),
	                    None, None, None, None, None, None,
	                    None,
	                    gr.update(visible=False),
	                )
	                status, _ = download_trajectories_from_s3(folder)
	                # A status containing ❌ is treated as a failed download.
	                if "❌" in status:
	                    yield (
	                        status,
	                        gr.update(visible=False),
	                        None, None, None, None, None, None,
	                        None,
	                        gr.update(visible=False),
	                    )
	                    return

	            yield (
	                "⏳ Loading trajectories...",
	                gr.update(visible=True),
	                None, None, None, None, None, None,
	                None,
	                gr.update(visible=False),
	            )

	            df_meta = load_all_trajectories(folder)
	            df_calc = load_all_trajectories_calculated(folder)
	            # Copy api_calls and instance_cost from the metadata frame by
	            # position; assumes both frames have the same rows in the same
	            # order — TODO confirm.
	            df_calc["api_calls"] = df_meta["api_calls"].values
	            df_calc["instance_cost"] = df_meta["instance_cost"].values
	            trajectory_steps = load_all_trajectory_steps(folder)

	            state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}

	            if source == "Metadata":
	                df = df_meta
	            else:
	                df = apply_thinking_overhead(df_calc.copy(), overhead)
	                if not with_cache:
	                    df = apply_no_cache(df)

	            if df.empty:
	                yield (
	                    "❌ No trajectories found",
	                    gr.update(visible=False),
	                    None, None, None, None, None, None,
	                    None,
	                    gr.update(visible=False),
	                )
	                return

	            fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
	                df, input_price, cache_read_price, cache_creation_price, completion_price
	            )
	            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)

	            # Final update: plots, stored state, and the "Add Routing" button.
	            yield (
	                f"✅ Loaded {len(df)} trajectories",
	                gr.update(visible=True),
	                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
	                state_data,
	                gr.update(visible=True),
	            )

	        analyze_btn.click(
	            fn=load_and_analyze,
	            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache],
	            outputs=[
	                download_status,
	                analysis_section,
	                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
	                trajectories_state,
	                add_routing_btn,
	            ],
	        )

	        def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
	            """Rebuild the two cost plots after a price change.

	            Returns (cost-by-token-type figure, cost-per-trajectory figure),
	            or (None, None) when nothing is loaded or the data is empty.
	            """
	            if state_data is None:
	                return None, None

	            if source == "Metadata":
	                df = state_data["meta"]
	            else:
	                df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
	                if not with_cache:
	                    df = apply_no_cache(df)

	            if df.empty:
	                return None, None

	            fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
	            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
	            return fig_tokens_cost, fig_cost_breakdown

	        price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
	        price_outputs = [plot_tokens_cost, plot_cost_breakdown]

	        # Any change to a base price refreshes only the cost plots.
	        price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
	        price_cache_read.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
	        price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
	        price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)

	        def on_source_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
	            """Recalculate only token-dependent charts when source changes"""
	            if state_data is None:
	                return None, None, None, None

	            if source == "Metadata":
	                df = state_data["meta"]
	            else:
	                df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
	                if not with_cache:
	                    df = apply_no_cache(df)

	            if df.empty:
	                return None, None, None, None

	            fig_tokens, fig_tokens_cost, fig_stacked = create_token_charts(
	                df, input_price, cache_read_price, cache_creation_price, completion_price
	            )
	            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)

	            return fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown

	        source_change_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
	        source_change_outputs = [plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown]

	        # The token source, the overhead multiplier and the cache toggle all
	        # refresh the same four token-dependent plots.
	        token_source.change(
	            fn=on_source_change,
	            inputs=source_change_inputs,
	            outputs=source_change_outputs,
	        )

	        thinking_overhead.change(
	            fn=on_source_change,
	            inputs=source_change_inputs,
	            outputs=source_change_outputs,
	        )

	        use_cache.change(
	            fn=on_source_change,
	            inputs=source_change_inputs,
	            outputs=source_change_outputs,
	        )

	    return app


	# Script entry point: build the UI, enable the request queue, then launch.
	if __name__ == "__main__":
	    app = build_app()
	    # queue() is called before launch(); run_routing and load_and_analyze
	    # are generator handlers that stream several updates per click.
	    app.queue()
	    app.launch()