# Space: JetBrains-Research / SWE-bench-Costs-Calculator (status: Running)
# File size: 132,789 bytes

import json
import logging
import os
import random
import re
import subprocess
import sys
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import tiktoken

from src.download_swebench_leaderboard import download_leaderboard

# Cache of tiktoken encodings keyed by encoding name; filled lazily by get_tokenizer().
_tokenizer_cache = {}

# On-disk locations for trajectory folders and cached JSON downloads.
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
# NOTE(review): presumably the remote source of the folders stored under TRAJS_DIR —
# confirm against the download code (not used in this part of the file).
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
# Remote JSON with per-model prices; fetched by refresh_litellm_prices() and
# get_litellm_prices_raw() and mirrored to LITELLM_PRICES_CACHE.
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
LOG_DIR = Path("logs")

# Entries are either a plain model id or a 2-tuple; the tuple is presumably
# (model id, display label) — verify against the UI code that consumes this list.
# NOTE(review): "deepinfra/Qwen/Qwen3-73B" looks unusual next to the other Qwen3
# sizes listed here — confirm that this id exists in the price list.
QUICK_SELECT_MODELS = [
    "openrouter/anthropic/claude-opus-4.5",
    "openrouter/anthropic/claude-sonnet-4.5",
    "openrouter/google/gemini-3-pro-preview",
    "openrouter/openai/gpt-5-codex",
    "openrouter/openai/gpt-oss-120b",
    "deepinfra/Qwen/Qwen3-14B",
    "deepinfra/Qwen/Qwen3-32B",
    "deepinfra/Qwen/Qwen3-73B",
    "deepinfra/Qwen/Qwen3-235B-A22B",
    "deepinfra/Qwen/Qwen3-30B-A3B",
    ("deepinfra/Qwen/Qwen3-Coder-480B-A35B-Instruct", "Qwen3-Coder-480B-A35B"),
]
# The log directory must exist before the FileHandler below opens LOG_FILE.
LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / "app.log"

# Log to both the file and stdout. force=True removes any handlers already
# attached to the root logger so this configuration always takes effect.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force=True,
)


def _log_unhandled(exc_type, exc_value, exc_traceback):
    """Exception hook that routes uncaught exceptions to the logging system.

    KeyboardInterrupt is handed to the interpreter's original hook so Ctrl+C
    keeps its usual behaviour; everything else is logged at ERROR level
    together with its traceback.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        logging.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
        return
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install the logging hook so uncaught exceptions are written to the log.
sys.excepthook = _log_unhandled

# Process-wide in-memory caches.
# Raw litellm price table (None until first loaded).
_litellm_prices_cache = None
# Price table filtered to chat models (None until first computed; reset on refresh).
_litellm_chat_prices_cache = None
# NOTE(review): not referenced in this part of the file — confirm it is still used.
_trajectories_cache = {}
# "calculated_<folder>" -> DataFrame built by load_all_trajectories_calculated().
_calculated_tokens_cache = {}
# "steps_<folder>" -> {instance_id: steps} built by load_all_trajectory_steps().
_trajectory_steps_cache = {}


def calculate_routing_tokens(steps: list[dict]) -> dict:
    """
    Simulate prompt caching over a trajectory and total the tokens per model.

    Every model keeps its own cache. On each step the active model reads its
    whole cache, pays uncached input for the part of the conversation it has
    not seen yet, and then stores that new input plus its completion.

    Args:
        steps: list of dicts with keys:
            - model: str (model name, required)
            - system_user: int (system/user prompt tokens; only read for the first step)
            - completion: int (generated tokens)
            - observation: int or None (env response tokens, None counts as 0)

    Returns:
        dict with per-model totals:
            {model_name: {cache_read, uncached_input, completion, observation, cache_creation}}
    """
    cached_by_model = {}
    totals_by_model = {}

    context_so_far = 0
    last_observation = 0

    for index, entry in enumerate(steps):
        name = entry["model"]
        first_prompt = entry.get("system_user", 0)
        generated = entry.get("completion", 0)
        observed = entry.get("observation") or 0

        already_cached = cached_by_model.setdefault(name, 0)
        bucket = totals_by_model.setdefault(name, {
            "cache_read": 0,
            "uncached_input": 0,
            "completion": 0,
            "observation": 0,
            "cache_creation": 0,
        })

        if index == 0:
            fresh_input = first_prompt
        else:
            # Whatever part of the conversation this model has not cached yet.
            fresh_input = (context_so_far + last_observation) - already_cached

        written = fresh_input + generated
        cached_by_model[name] = already_cached + written

        bucket["cache_read"] += already_cached
        bucket["uncached_input"] += fresh_input
        bucket["completion"] += generated
        bucket["observation"] += observed
        bucket["cache_creation"] += written

        context_so_far = already_cached + fresh_input + generated
        last_observation = observed

    return totals_by_model


def calculate_per_step_tokens(steps: list[dict]) -> list[dict]:
    """
    Simulate prompt caching with a single cache and report tokens step by step.

    Each step reads everything cached so far, pays uncached input for the new
    part of the context (the initial prompt on the first step, afterwards the
    context that is not cached yet), and caches that input plus its completion.

    Returns list of per-step data:
        [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
    """
    rows = []
    cached = 0
    context_so_far = 0
    last_observation = 0

    for index, entry in enumerate(steps):
        first_prompt = entry.get("system_user", 0)
        generated = entry.get("completion", 0)
        observed = entry.get("observation") or 0

        read_now = cached
        if index == 0:
            fresh_input = first_prompt
        else:
            fresh_input = (context_so_far + last_observation) - read_now
        written = fresh_input + generated

        rows.append({
            "step": index,
            "cache_read": read_now,
            "uncached_input": fresh_input,
            "completion": generated,
            "cache_creation": written,
        })

        cached = read_now + written
        context_so_far = read_now + fresh_input + generated
        last_observation = observed

    return rows


def _parse_usage_from_log_line(line: str) -> dict | None:
    """
    Parse usage info from log line containing ModelResponse or similar format.
    Returns dict with prompt_tokens, completion_tokens, cached_tokens, etc.
    """
    if "usage=" not in line:
        return None

    result = {}

    for field in ["completion_tokens", "prompt_tokens", "total_tokens"]:
        match = re.search(rf'{field}=(\d+)', line)
        if match:
            result[field] = int(match.group(1))

    cached_match = re.search(r'cached_tokens=(\d+)', line)
    if cached_match:
        result["cached_tokens"] = int(cached_match.group(1))

    return result if result else None


def _parse_old_format_log(log_path: Path) -> list[dict]:
    """
    Read per-step token usage out of an old SWE-agent style .info.log file.

    Every line containing ``usage=Usage(`` that yields usage counters becomes
    one step, numbered in order of appearance. The log carries no cache-write
    figure, so ``cache_creation`` is always 0. Any error while reading is
    logged at DEBUG level and the steps gathered so far are returned.
    """
    entries = []

    try:
        with open(log_path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                usage = _parse_usage_from_log_line(raw_line) if "usage=Usage(" in raw_line else None
                if not usage:
                    continue

                prompt = usage.get("prompt_tokens", 0)
                generated = usage.get("completion_tokens", 0)
                cached = usage.get("cached_tokens", 0)

                entries.append({
                    # Steps are numbered consecutively, so the index equals the count so far.
                    "step": len(entries),
                    "cache_read": cached,
                    "uncached_input": max(0, prompt - cached),
                    "completion": generated,
                    "cache_creation": 0,
                })
    except Exception as e:
        logging.debug("Error parsing log file %s: %s", log_path, e)

    return entries


def parse_trajectory_metadata_per_step(traj_path: Path) -> list[dict]:
    """
    Collect the usage figures recorded in a trajectory file, one entry per step.

    New-format files keep usage on each assistant message, either directly
    under ``usage`` or under ``extra.response.usage``. When no assistant
    message yields usage, a sibling ``.info.log`` file (old format) is parsed
    instead, if one exists.

    Returns list of per-step data:
        [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
    """
    with open(traj_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    per_step = []
    for message in payload.get("messages", []):
        if message.get("role") != "assistant":
            continue

        usage = None
        if "usage" in message:
            usage = message["usage"]
        elif "extra" in message and isinstance(message["extra"], dict):
            response = message["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})
        if not usage:
            continue

        prompt = usage.get("prompt_tokens", 0) or 0
        generated = usage.get("completion_tokens", 0) or 0
        cached = usage.get("cache_read_input_tokens", 0) or 0
        written = usage.get("cache_creation_input_tokens", 0) or 0

        # Fall back to prompt_tokens_details.cached_tokens when the dedicated
        # cache-read field is absent or zero.
        details = usage.get("prompt_tokens_details", {})
        if isinstance(details, dict):
            cached_in_details = details.get("cached_tokens", 0) or 0
            if cached_in_details > 0 and cached == 0:
                cached = cached_in_details

        per_step.append({
            "step": len(per_step),
            "cache_read": cached,
            "uncached_input": max(0, prompt - cached - written),
            "completion": generated,
            "cache_creation": written,
        })

    if per_step:
        return per_step

    # Old format: look for "<name>.info.log" next to the trajectory file.
    candidate = traj_path.with_suffix(".info.log")
    if not candidate.exists():
        bare_name = traj_path.stem.replace(".traj", "")
        candidate = traj_path.parent / f"{bare_name}.info.log"

    if candidate.exists():
        per_step = _parse_old_format_log(candidate)

    return per_step


def load_all_trajectory_metadata_steps(folder: str) -> dict[str, list[dict]]:
    """
    Parse the recorded per-step usage of every trajectory in ``TRAJS_DIR / folder``.

    Files are discovered with the first glob pattern that matches anything,
    tried from most to least specific. Trajectories that fail to parse are
    logged and skipped; trajectories without any usage data are left out.

    Returns:
        dict mapping instance_id -> list of per-step metadata
    """
    root = TRAJS_DIR / folder

    # Only the first pattern that yields files is used.
    found = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        found = list(root.glob(pattern))
        if found:
            break

    by_instance = {}
    for traj_path in found:
        try:
            per_step = parse_trajectory_metadata_per_step(traj_path)
            if not per_step:
                continue
            by_instance[traj_path.stem.replace(".traj", "")] = per_step
        except Exception as e:
            logging.error("Error parsing metadata steps for %s: %s", traj_path, e, exc_info=True)

    return by_instance


def create_single_trajectory_meta_chart(steps: list[dict]):
    """Create stacked bar chart for a single trajectory showing metadata tokens per step.

    Args:
        steps: per-step dicts with the keys ``step``, ``uncached_input``,
            ``cache_read``, ``cache_creation`` and ``completion`` (token counts).

    Returns:
        A plotly Figure with one stacked bar per step (values in thousands of
        tokens), or None when ``steps`` is empty.
    """
    if not steps:
        return None

    import plotly.graph_objects as go

    x_labels = [f"Step {d['step']}" for d in steps]

    # (legend name, source key, bar colour) in stacking order.
    series = [
        ("Uncached Input", "uncached_input", "#EF553B"),
        ("Cache Read", "cache_read", "#19D3F3"),
        ("Cache Creation", "cache_creation", "#FFA15A"),
        ("Completion", "completion", "#AB63FA"),
    ]

    fig = go.Figure()
    for label, key, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[d[key] / 1e3 for d in steps],
            marker_color=color,
            # x already reads "Step N"; prefixing it again showed "Step Step N".
            hovertemplate="%{x}<br>" + label + ": %{y:.2f}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig


def create_single_trajectory_meta_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create stacked bar chart for a single trajectory showing metadata cost per step.

    Args:
        steps: per-step dicts with the keys ``step``, ``uncached_input``,
            ``cache_read``, ``cache_creation`` and ``completion`` (token counts).
        input_price: price per million uncached input tokens.
        cache_read_price: price per million cache-read tokens.
        cache_creation_price: price per million cache-creation tokens.
        completion_price: price per million completion tokens.

    Returns:
        A plotly Figure with one stacked bar per step (values in dollars), or
        None when ``steps`` is empty.
    """
    if not steps:
        return None

    import plotly.graph_objects as go

    x_labels = [f"Step {d['step']}" for d in steps]

    # (legend name, source key, price per 1M tokens, bar colour) in stacking order.
    series = [
        ("Uncached Input", "uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion", completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, key, price, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[d[key] * price / 1e6 for d in steps],
            marker_color=color,
            # x already reads "Step N"; prefixing it again showed "Step Step N".
            hovertemplate="%{x}<br>" + label + ": $%{y:.4f}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig


def create_single_trajectory_chart(steps: list[dict], overhead: float = 1.0, with_cache: bool = True):
    """Create stacked bar chart for a single trajectory showing tokens per step.

    Token counts come from the caching simulation in calculate_per_step_tokens
    and are multiplied by ``overhead`` before plotting.

    Args:
        steps: step sequence accepted by calculate_per_step_tokens.
        overhead: multiplier applied to every token count.
        with_cache: when False, cache read and cache creation are shown as 0
            and the whole prompt (cache_read + uncached_input) is plotted as
            uncached input.

    Returns:
        A plotly Figure (values in thousands of tokens), or None when
        ``steps`` is empty.
    """
    if not steps:
        return None

    import plotly.graph_objects as go

    per_step_data = calculate_per_step_tokens(steps)

    x_labels = [f"Step {d['step']}" for d in per_step_data]
    cache_read = [d["cache_read"] * overhead for d in per_step_data]
    cache_creation = [d["cache_creation"] * overhead for d in per_step_data]
    completion = [d["completion"] * overhead for d in per_step_data]
    prompt_tokens = [(d["cache_read"] + d["uncached_input"]) * overhead for d in per_step_data]

    if with_cache:
        # Prompt tokens that are neither read from nor written to the cache.
        uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_tokens, cache_read, cache_creation)]
    else:
        uncached = prompt_tokens
        cache_read = [0] * len(per_step_data)
        cache_creation = [0] * len(per_step_data)

    # (legend name, token values, bar colour) in stacking order.
    series = [
        ("Uncached Input", uncached, "#EF553B"),
        ("Cache Read", cache_read, "#19D3F3"),
        ("Cache Creation", cache_creation, "#FFA15A"),
        ("Completion", completion, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, values, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[v / 1e3 for v in values],
            marker_color=color,
            # x already reads "Step N"; prefixing it again showed "Step Step N".
            hovertemplate="%{x}<br>" + label + ": %{y:.2f}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig


def create_single_trajectory_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float, overhead: float = 1.0, with_cache: bool = True):
    """Create stacked bar chart for a single trajectory showing cost per step.

    Token counts come from the caching simulation in calculate_per_step_tokens,
    are multiplied by ``overhead`` and then priced per million tokens.

    Args:
        steps: step sequence accepted by calculate_per_step_tokens.
        input_price: price per million uncached input tokens.
        cache_read_price: price per million cache-read tokens.
        cache_creation_price: price per million cache-creation tokens.
        completion_price: price per million completion tokens.
        overhead: multiplier applied to every token count.
        with_cache: when False, cache read and cache creation are priced as 0
            tokens and the whole prompt (cache_read + uncached_input) is
            charged as uncached input.

    Returns:
        A plotly Figure (values in dollars), or None when ``steps`` is empty.
    """
    if not steps:
        return None

    import plotly.graph_objects as go

    per_step_data = calculate_per_step_tokens(steps)

    x_labels = [f"Step {d['step']}" for d in per_step_data]
    cache_read = [d["cache_read"] * overhead for d in per_step_data]
    cache_creation = [d["cache_creation"] * overhead for d in per_step_data]
    completion = [d["completion"] * overhead for d in per_step_data]
    prompt_tokens = [(d["cache_read"] + d["uncached_input"]) * overhead for d in per_step_data]

    if with_cache:
        # Prompt tokens that are neither read from nor written to the cache.
        uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_tokens, cache_read, cache_creation)]
    else:
        uncached = prompt_tokens
        cache_read = [0] * len(per_step_data)
        cache_creation = [0] * len(per_step_data)

    # (legend name, token values, price per 1M tokens, bar colour) in stacking order.
    series = [
        ("Uncached Input", uncached, input_price, "#EF553B"),
        ("Cache Read", cache_read, cache_read_price, "#19D3F3"),
        ("Cache Creation", cache_creation, cache_creation_price, "#FFA15A"),
        ("Completion", completion, completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, values, price, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[v * price / 1e6 for v in values],
            marker_color=color,
            # x already reads "Step N"; prefixing it again showed "Step Step N".
            hovertemplate="%{x}<br>" + label + ": $%{y:.4f}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig


def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
    """
    Parse trajectory file into step format for calculate_routing_tokens.

    Files with a ``messages`` list are walked message by message: system
    messages and the user messages before the first assistant reply form the
    initial prompt, each assistant message starts a new step, and a later user
    message is recorded as the observation of the step before it. Files that
    only have a ``trajectory`` array are delegated to
    _parse_trajectory_format_to_steps. Messages with any other role are ignored.

    Args:
        traj_path: path to the trajectory JSON file.
        model_name: model name stored on every step and used to pick the tokenizer.

    Returns list of steps with:
        - model: base model name
        - system_user: tokens for system + user message (step 0 only)
        - completion: assistant response tokens
        - observation: env response tokens (None for last step)
        - content: assistant message text
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    messages = data.get("messages", [])
    trajectory_data = data.get("trajectory", [])

    if not messages and trajectory_data:
        return _parse_trajectory_format_to_steps(trajectory_data, model_name)

    if not messages:
        return []

    count_tokens, _ = get_tokenizer(model_name)

    steps = []
    system_user_tokens = 0

    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content")
        if content is None:
            # A null content would otherwise be tokenized as the literal text "None".
            content = ""
        elif isinstance(content, list):
            content = json.dumps(content)
        text = str(content)
        tokens = count_tokens(text)

        if role == "system":
            system_user_tokens += tokens
        elif role == "user":
            if steps:
                # A user message after an assistant turn is the environment's reply.
                steps[-1]["observation"] = tokens
            else:
                system_user_tokens += tokens
        elif role == "assistant":
            steps.append({
                "model": model_name,
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
                "content": text,
            })
            system_user_tokens = 0

    return steps


def _parse_trajectory_format_to_steps(trajectory_data: list, model_name: str) -> list[dict]:
    """
    Convert the alternative ``trajectory`` array layout into step dicts.

    Each entry supplies ``query``, ``response`` and ``observation``. Only the
    first entry's query messages are counted as the initial prompt; an empty
    response counts as 0 tokens and an empty observation is stored as None.
    """
    tokenize, _ = get_tokenizer(model_name)

    parsed = []
    for index, record in enumerate(trajectory_data):
        query = record.get("query", [])
        response_text = record.get("response", "")
        observation_text = record.get("observation", "")

        prompt_tokens = 0
        if index == 0:
            for item in query:
                body = item.get("content", "")
                prompt_tokens += tokenize(str(json.dumps(body) if isinstance(body, list) else body))

        if response_text:
            completion_tokens = tokenize(str(response_text))
            reply = str(response_text)
        else:
            completion_tokens = 0
            reply = ""

        observation_tokens = None
        if observation_text:
            observation_tokens = tokenize(str(observation_text))

        parsed.append({
            "model": model_name,
            "system_user": prompt_tokens,
            "completion": completion_tokens,
            "observation": observation_tokens,
            "content": reply,
        })

    return parsed


def get_default_overhead(model_name: str) -> float:
    """Return the default tokenizer overhead multiplier for a model.

    Names containing "claude" or "anthropic" (case-insensitive) get 1.24;
    every other model, including an empty or missing name, gets 1.0.
    """
    lowered = model_name.lower() if model_name else ""
    is_anthropic = any(tag in lowered for tag in ("claude", "anthropic"))
    return 1.24 if is_anthropic else 1.0


def get_tokenizer(model_name: str):
    """Pick a token counter for a model. Returns (count_function, tokenizer_name).

    Names containing "gpt-4o", "o1" or "o3" use tiktoken's o200k_base. Names
    containing "gemini" or "google" (and none of "gpt", "claude", "anthropic")
    use a character-length approximation. Everything else uses cl100k_base.
    tiktoken encodings are created once and kept in ``_tokenizer_cache``.
    """
    global _tokenizer_cache

    lowered = model_name.lower() if model_name else ""

    def mentions(*tags):
        return any(tag in lowered for tag in tags)

    if mentions("gpt-4o", "o1", "o3"):
        encoding_name = "o200k_base"
    elif not mentions("gpt", "claude", "anthropic") and mentions("gemini", "google"):
        # No tiktoken encoding is used here: estimate from the character count.
        return (lambda text: int(len(text) / 3.23)), "gemini_approx"
    else:
        encoding_name = "cl100k_base"

    if encoding_name not in _tokenizer_cache:
        _tokenizer_cache[encoding_name] = tiktoken.get_encoding(encoding_name)
    encoder = _tokenizer_cache[encoding_name]

    return (lambda text: len(encoder.encode(text))), encoding_name


def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
    """Scale every token column by ``overhead`` and recompute ``total_tokens``.

    An empty frame or an overhead of exactly 1.0 returns the input object
    unchanged; otherwise a modified copy is returned. Scaled values are
    truncated to integers, and ``total_tokens`` becomes prompt + completion.
    """
    if df.empty or overhead == 1.0:
        return df

    scaled = df.copy()
    for column in ("prompt_tokens", "completion_tokens", "cache_read_tokens", "cache_creation_tokens"):
        scaled[column] = (scaled[column] * overhead).astype(int)
    scaled["total_tokens"] = scaled["prompt_tokens"] + scaled["completion_tokens"]
    return scaled


def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with both cache token columns set to 0.

    An empty frame is returned as-is; the input frame is never modified.
    """
    if df.empty:
        return df

    uncached = df.copy()
    for column in ("cache_read_tokens", "cache_creation_tokens"):
        uncached[column] = 0
    return uncached


def ensure_token_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose token columns all exist and hold integers.

    The four required columns are created with 0 when missing. Values that
    cannot be read as numbers become 0. ``total_tokens`` is converted the same
    way, but only when the column is already present. None and empty frames
    are returned as-is.
    """
    if df is None or df.empty:
        return df

    def as_int(series):
        return pd.to_numeric(series, errors="coerce").fillna(0).astype(int)

    normalized = df.copy()
    for name in ("prompt_tokens", "completion_tokens", "cache_read_tokens", "cache_creation_tokens"):
        if name not in normalized.columns:
            normalized[name] = 0
        normalized[name] = as_int(normalized[name])

    if "total_tokens" in normalized.columns:
        normalized["total_tokens"] = as_int(normalized["total_tokens"])
    return normalized


def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
    """Build one row of simulated token totals per trajectory in ``folder``.

    Steps come from load_all_trajectory_steps and are run through
    calculate_routing_tokens. Only the totals of the model named on the first
    step are reported, and ``instance_cost`` is always 0. Results are cached
    per folder for the lifetime of the process.
    """
    global _calculated_tokens_cache

    cache_key = f"calculated_{folder}"
    if cache_key in _calculated_tokens_cache:
        return ensure_token_columns(_calculated_tokens_cache[cache_key])

    records = []
    for instance_id, steps in load_all_trajectory_steps(folder).items():
        if not steps:
            continue

        try:
            per_model = calculate_routing_tokens(steps)
            first_model = steps[0].get("model", "")
            totals = per_model.get(first_model, {})

            read = totals.get("cache_read", 0)
            fresh = totals.get("uncached_input", 0)
            generated = totals.get("completion", 0)
            written = totals.get("cache_creation", 0)
            # The prompt is everything sent to the model: cached plus uncached input.
            prompt = read + fresh

            records.append({
                "instance_id": instance_id,
                "model_name": first_model,
                "api_calls": len(steps),
                "instance_cost": 0,
                "prompt_tokens": prompt,
                "completion_tokens": generated,
                "total_tokens": prompt + generated,
                "cache_read_tokens": read,
                "cache_creation_tokens": written,
            })
        except Exception as e:
            logging.error("Error calculating tokens for %s: %s", instance_id, e, exc_info=True)

    frame = ensure_token_columns(pd.DataFrame(records))
    _calculated_tokens_cache[cache_key] = frame
    return frame


def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load all trajectories as step sequences for routing calculations.

    Trajectory files are looked up under ``TRAJS_DIR / folder`` with the first
    glob pattern that matches anything. The model name is read once, from the
    first file's ``info.config.model`` section (``cost_calc_model_override``
    if present, otherwise ``model_name``), and applied to every trajectory.
    If it cannot be read, a warning is logged and an empty name is used.
    Results are cached per folder for the lifetime of the process.

    Returns:
        dict mapping instance_id -> list of steps for calculate_routing_tokens
    """
    global _trajectory_steps_cache

    cache_key = f"steps_{folder}"
    if cache_key in _trajectory_steps_cache:
        return _trajectory_steps_cache[cache_key]

    output_dir = TRAJS_DIR / folder

    # Only the first pattern that yields files is used.
    traj_files = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    model_name = ""
    if traj_files:
        try:
            # Read as UTF-8, like every other trajectory reader in this module.
            with open(traj_files[0], "r", encoding="utf-8") as f:
                first_data = json.load(f)
            config = first_data.get("info", {}).get("config", {}).get("model", {})
            model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
        except Exception as e:
            # Best effort: continue with an empty model name, but leave a trace.
            logging.warning("Could not read model name from %s: %s", traj_files[0], e)

    result = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_to_steps(traj_path, model_name)
            if steps:
                result[instance_id] = steps
        except Exception as e:
            logging.error("Error parsing steps for %s: %s", traj_path, e, exc_info=True)

    _trajectory_steps_cache[cache_key] = result
    return result


def refresh_litellm_prices() -> bool:
    """Download the litellm price table again, replacing memory and disk caches.

    The chat-only cache is cleared so it is rebuilt from the new data.
    Returns True when both the download and the cache file write succeed;
    any failure is logged as a warning and False is returned.
    """
    global _litellm_prices_cache, _litellm_chat_prices_cache
    try:
        reply = requests.get(LITELLM_PRICES_URL, timeout=30)
        reply.raise_for_status()

        _litellm_prices_cache = reply.json()
        _litellm_chat_prices_cache = None

        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as cache_file:
            json.dump(_litellm_prices_cache, cache_file)

        logging.info("Successfully refreshed litellm prices")
    except Exception as e:
        logging.warning(f"Failed to refresh litellm prices: {e}")
        return False
    return True


def get_litellm_prices_raw() -> dict:
    """Get raw litellm prices (all modes, unfiltered).

    Lookup order: in-memory cache, then the JSON file at LITELLM_PRICES_CACHE,
    then a download from LITELLM_PRICES_URL. An unreadable or corrupt cache
    file is ignored and replaced by a fresh download. If the download fails,
    an empty dict is returned and remembered. A failure to write the cache
    file does not discard prices that were downloaded successfully.
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE, encoding="utf-8") as f:
                _litellm_prices_cache = json.load(f)
            return _litellm_prices_cache
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            logging.warning("Ignoring unreadable litellm price cache %s: %s", LITELLM_PRICES_CACHE, e)

    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        prices = response.json()
    except Exception as e:
        logging.warning("Failed to download litellm prices: %s", e)
        _litellm_prices_cache = {}
        return _litellm_prices_cache

    _litellm_prices_cache = prices

    # Persisting is best effort: the downloaded prices stay usable in memory.
    try:
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w", encoding="utf-8") as f:
            json.dump(prices, f)
    except (OSError, TypeError, ValueError) as e:
        logging.warning("Could not write litellm price cache %s: %s", LITELLM_PRICES_CACHE, e)

    return _litellm_prices_cache


def get_litellm_prices() -> dict:
    """Return the litellm price entries whose ``mode`` is "chat".

    The filtered mapping is computed once from get_litellm_prices_raw() and
    then reused until the chat cache is reset.
    """
    global _litellm_chat_prices_cache
    if _litellm_chat_prices_cache is None:
        chat_only = {}
        for name, info in get_litellm_prices_raw().items():
            if isinstance(info, dict) and info.get("mode") == "chat":
                chat_only[name] = info
        _litellm_chat_prices_cache = chat_only
    return _litellm_chat_prices_cache


def get_litellm_model_list() -> list[str]:
    """Return the names of all chat models in the litellm price table, sorted."""
    names = list(get_litellm_prices())
    names.sort()
    return names


def normalize_model_name(name: str) -> str:
    """Lowercase ``name`` and drop the separators ``-``, ``_``, ``.`` and ``/``."""
    drop_separators = str.maketrans("", "", "-_./")
    return name.lower().translate(drop_separators)


def _search_model_in_prices(model_name: str, prices: dict) -> dict | None:
    """Search for model in prices dict using various name variations."""
    clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
    name_without_date = re.sub(r'-\d{8}$', '', clean_name)

    candidates = [
        model_name,
        clean_name,
        name_without_date,
        f"anthropic/{clean_name}",
        f"openai/{clean_name}",
        f"anthropic/{name_without_date}",
        f"openai/{name_without_date}",
    ]

    for key in candidates:
        if key in prices:
            return prices[key]

    normalized_name = normalize_model_name(clean_name)
    normalized_no_date = normalize_model_name(name_without_date)

    for key, value in prices.items():
        key_normalized = normalize_model_name(key)
        if normalized_name in key_normalized or normalized_no_date in key_normalized:
            return value
        key_last_part = key.split('/')[-1] if '/' in key else key
        key_last_normalized = normalize_model_name(key_last_part)
        if normalized_name == key_last_normalized or normalized_no_date == key_last_normalized:
            return value

    return None


def get_model_prices(model_name: str) -> dict | None:
    if not model_name:
        return None

    prices = get_litellm_prices()
    result = _search_model_in_prices(model_name, prices)

    if result is None and LITELLM_PRICES_CACHE.exists():
        logging.info(f"Model '{model_name}' not found in litellm prices, refreshing cache...")
        if refresh_litellm_prices():
            prices = get_litellm_prices()
            result = _search_model_in_prices(model_name, prices)
            if result is None:
                logging.warning(f"Model '{model_name}' still not found after refresh")

    return result


def load_or_download_leaderboard(force_refresh: bool = False):
    """Return the leaderboard data, downloading a fresh copy when required.

    Args:
        force_refresh: When false and the cache file exists, the cache is
            returned without any download. When true, a download is attempted
            first.

    Returns:
        The parsed JSON content of the leaderboard cache file.

    Raises:
        Exception: Whatever the download raised, if it fails and no cached
            copy exists to fall back on.
    """
    def _read_cache():
        with open(LEADERBOARD_CACHE) as f:
            return json.load(f)

    if not force_refresh and LEADERBOARD_CACHE.exists():
        return _read_cache()

    try:
        filename = download_leaderboard(output_dir=str(DATA_DIR))
        # os.replace overwrites an existing cache on every platform; os.rename
        # raises on Windows when the destination exists, which made a forced
        # refresh silently fall back to the stale cache there.
        os.replace(filename, LEADERBOARD_CACHE)
        logging.info("Successfully downloaded fresh leaderboard data")
    except Exception as e:
        logging.warning(f"Failed to download leaderboard: {e}")
        if not LEADERBOARD_CACHE.exists():
            raise
        logging.info("Using cached leaderboard data")

    return _read_cache()


def get_bash_only_df():
    """Build a DataFrame with one row per entry of the "bash-only" leaderboard.

    Returns an empty DataFrame when the leaderboard data has no "bash-only"
    board. Numeric "resolved" values are rendered as a percentage string with
    one decimal; missing or falsy cost/call fields are reported as 0; the
    ``os_model`` flag is rendered as a check mark or a cross.
    """
    leaderboards = load_or_download_leaderboard().get("leaderboards", [])

    bash_only = None
    for board in leaderboards:
        if board["name"] == "bash-only":
            bash_only = board
            break
    if not bash_only:
        return pd.DataFrame()

    def _format_resolved(value):
        # Numbers become e.g. "74.4%"; anything else is shown as-is.
        if isinstance(value, (int, float)):
            return f"{value:.1f}%"
        return str(value)

    rows = [
        {
            "name": entry.get("name", ""),
            "% resolved": _format_resolved(entry.get("resolved", 0)),
            "date": entry.get("date", ""),
            "cost": round(entry.get("cost") or 0, 2),
            "instance_cost": round(entry.get("instance_cost") or 0, 4),
            "instance_calls": entry.get("instance_calls") or 0,
            "folder": entry.get("folder", ""),
            "os_model": "✅" if entry.get("os_model") else "❌",
        }
        for entry in bash_only["results"]
    ]
    return pd.DataFrame(rows)


def get_model_details(folder: str):
    """Find the "bash-only" leaderboard entry whose ``folder`` equals *folder*.

    Returns:
        ``(entry, None)`` on success, otherwise ``(None, message)`` where the
        message says why no entry could be returned.
    """
    if not folder:
        return None, "Select a model from the table"

    leaderboards = load_or_download_leaderboard().get("leaderboards", [])
    bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
    if not bash_only:
        return None, "Leaderboard not found"

    for entry in bash_only["results"]:
        if entry.get("folder") == folder:
            return entry, None
    return None, f"Model with folder '{folder}' not found"


def check_trajectories_downloaded(folder: str) -> bool:
    """Tell whether the local trajectory directory for *folder* exists and is non-empty."""
    if not folder:
        return False
    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    return any(target.iterdir())


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download the trajectories of one leaderboard submission from S3.

    The files are copied with the AWS CLI (unsigned request) from
    ``{S3_BUCKET}/{folder}/trajs/`` into ``TRAJS_DIR / folder``. If that
    directory already contains anything, no download is performed.

    Args:
        folder: Leaderboard folder name of the submission.
        progress: Gradio progress tracker, used to announce the download start.

    Returns:
        A ``(status_message, gr.update(visible=...))`` tuple. The update is
        visible when trajectories are available locally and hidden on any
        failure.
    """
    if not folder:
        return "❌ No model selected", gr.update(visible=False)

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}", gr.update(visible=False)

    def count_trajectory_files(directory):
        # Same layouts, in the same priority order, as load_all_trajectories,
        # so a directory the loader can read is never reported as empty here.
        for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
            found = sum(1 for _ in directory.glob(pattern))
            if found:
                return found
        return 0

    output_dir = TRAJS_DIR / folder
    # NOTE(review): a failed or timed-out download can leave a partially filled
    # directory behind, which this check will later report as already downloaded.
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)

    progress(0, desc="Starting S3 download...")

    try:
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )

        if result.returncode != 0:
            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)

        file_count = count_trajectory_files(output_dir)
        if file_count == 0:
            return f"❌ No trajectory files found on S3 for {folder}", gr.update(visible=False)

        # Resolution statistics come from the leaderboard entry, not from the files.
        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)

        if total_count > 0:
            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
        else:
            resolved_pct = "N/A"

        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
        return status, gr.update(visible=True)

    except subprocess.TimeoutExpired:
        return "❌ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        # subprocess raises this when the "aws" executable is not on PATH.
        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"❌ Error: {e}", gr.update(visible=False)


def parse_trajectory(traj_path: Path) -> dict:
    """Parse one trajectory JSON file into a flat record of usage statistics.

    Token counts are summed over the ``usage`` data attached to the entries of
    ``messages`` (either directly or under ``extra.response``). When that
    yields neither prompt nor completion tokens, a sibling ``.info.log`` file
    is parsed instead, if one exists.

    Returns:
        A dict with ``instance_id``, ``model_name``, ``api_calls``,
        ``instance_cost`` and the token counters ``prompt_tokens``,
        ``completion_tokens``, ``total_tokens``, ``cache_read_tokens`` and
        ``cache_creation_tokens``.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    info = data.get("info", {})
    model_stats = info.get("model_stats", {})
    model_config = info.get("config", {}).get("model", {})
    model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))

    trajectory_steps = data.get("trajectory", [])
    # Files with a "trajectory" list and no "messages" key use the step-based layout.
    is_trajectory_format = len(trajectory_steps) > 0 and "messages" not in data

    def _system_prompt_mentions_llama(step):
        for query_item in step.get("query", []):
            if query_item.get("role") == "system":
                text = query_item.get("content", "").lower()
                if "llama" in text or "meta" in text:
                    return True
        return False

    if is_trajectory_format and not model_name:
        # No configured model name: guess from the system prompts of the steps.
        if any(_system_prompt_mentions_llama(step) for step in trajectory_steps):
            model_name = "llama"

    api_calls = model_stats.get("api_calls", 0)
    if api_calls == 0 and is_trajectory_format:
        api_calls = len(trajectory_steps)

    result = {
        "instance_id": data.get("instance_id", traj_path.stem),
        "model_name": model_name,
        "api_calls": api_calls,
        "instance_cost": model_stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    for msg in data.get("messages", []):
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif "extra" in msg and isinstance(msg["extra"], dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})

        if not usage:
            continue

        for counter in ("prompt_tokens", "completion_tokens", "total_tokens"):
            result[counter] += usage.get(counter, 0) or 0

        cache_read = usage.get("cache_read_input_tokens", 0) or 0
        cache_creation = usage.get("cache_creation_input_tokens", 0) or 0

        # Fall back to prompt_tokens_details.cached_tokens when the dedicated
        # cache-read field is absent or zero.
        details = usage.get("prompt_tokens_details", {})
        if isinstance(details, dict):
            cached_from_details = details.get("cached_tokens", 0) or 0
            if cached_from_details > 0 and cache_read == 0:
                cache_read = cached_from_details

        result["cache_read_tokens"] += cache_read
        result["cache_creation_tokens"] += cache_creation

    if result["prompt_tokens"] != 0 or result["completion_tokens"] != 0:
        return result

    # No usage found in the messages: look for a log file next to the trajectory.
    log_path = traj_path.with_suffix(".info.log")
    if not log_path.exists():
        base_name = traj_path.stem.replace(".traj", "")
        log_path = traj_path.parent / f"{base_name}.info.log"
    if not log_path.exists():
        return result

    steps = _parse_old_format_log(log_path)
    for step_data in steps:
        result["prompt_tokens"] += step_data["cache_read"] + step_data["uncached_input"]
        result["completion_tokens"] += step_data["completion"]
        result["cache_read_tokens"] += step_data["cache_read"]
    result["total_tokens"] = result["prompt_tokens"] + result["completion_tokens"]
    if result["api_calls"] == 0:
        result["api_calls"] = len(steps)

    return result


def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory file of *folder* into one DataFrame.

    Results are memoised per folder in ``_trajectories_cache``. Several
    directory layouts are tried in order and the first glob pattern that
    matches any file decides which files are parsed. Files that fail to parse
    are logged and skipped.

    Returns:
        One row per successfully parsed trajectory, passed through
        ``ensure_token_columns``.
    """
    if folder in _trajectories_cache:
        return ensure_token_columns(_trajectories_cache[folder])

    output_dir = TRAJS_DIR / folder

    traj_files = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    rows = []
    for traj_path in traj_files:
        try:
            rows.append(parse_trajectory(traj_path))
        except Exception as e:
            logging.error("Error parsing %s: %s", traj_path, e, exc_info=True)

    df = ensure_token_columns(pd.DataFrame(rows))
    # Only cache when files were actually present: caching the empty result of
    # a not-yet-downloaded folder would hide the data once it is downloaded.
    if traj_files:
        _trajectories_cache[folder] = df
    return df


def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the bar chart of total cost per token type.

    Prices are applied per one million tokens. Uncached input is computed per
    row as prompt tokens minus cache-read and cache-creation tokens, floored
    at zero. Returns ``None`` for an empty frame.
    """
    if df.empty:
        return None

    completion_tokens = df["completion_tokens"].sum()
    cache_read_tokens = df["cache_read_tokens"].sum()
    cache_creation_tokens = df["cache_creation_tokens"].sum()
    uncached_tokens = (
        (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"])
        .clip(lower=0)
        .sum()
    )

    cost_uncached_input = uncached_tokens * input_price / 1e6
    cost_cache_read = cache_read_tokens * cache_read_price / 1e6
    cost_cache_creation = cache_creation_tokens * cache_creation_price / 1e6
    cost_completion = completion_tokens * completion_price / 1e6

    labels = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    costs = [cost_uncached_input, cost_cache_read, cost_cache_creation, cost_completion]

    fig = px.bar(
        pd.DataFrame({"Token Type": labels, "Cost ($)": costs}),
        x="Token Type",
        y="Cost ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig.update_layout(
        xaxis_title="",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )

    # Grand total shown in the top-right corner of the plot area.
    total_cost = cost_uncached_input + cost_cache_read + cost_cache_creation + cost_completion
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    return fig


def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the three token-related charts.

    Returns:
        ``(fig_tokens, fig_tokens_cost, fig_stacked)``: total tokens per token
        type, total cost per token type, and a per-trajectory stacked bar chart
        sorted by total tokens. ``(None, None, None)`` for an empty frame.
    """
    if df.empty:
        return None, None, None

    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    # Uncached input is derived per row and floored at zero before summing.
    total_uncached_input = (
        (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"])
        .clip(lower=0)
        .sum()
    )

    totals_frame = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens (M)": [
            total_uncached_input / 1e6,
            total_cache_read / 1e6,
            total_cache_creation / 1e6,
            total_completion / 1e6,
        ],
    })

    fig_tokens = px.bar(
        totals_frame,
        x="Token Type",
        y="Total Tokens (M)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )
    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(
        text=f"Total: {total_all/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Per-trajectory view: largest trajectories (by stacked total) first.
    per_traj = df.copy()
    per_traj["uncached_input_tokens"] = (
        per_traj["prompt_tokens"] - per_traj["cache_read_tokens"] - per_traj["cache_creation_tokens"]
    ).clip(lower=0)
    per_traj["total_stacked"] = (
        per_traj["uncached_input_tokens"]
        + per_traj["cache_read_tokens"]
        + per_traj["cache_creation_tokens"]
        + per_traj["completion_tokens"]
    )
    per_traj = per_traj.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    per_traj["trajectory_idx"] = range(len(per_traj))

    layers = (
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    )
    fig_stacked = go.Figure()
    for label, column, color in layers:
        fig_stacked.add_trace(go.Bar(
            name=label,
            x=per_traj["trajectory_idx"],
            y=per_traj[column] / 1e6,
            marker_color=color,
            hovertemplate="Trajectory: %{x}<br>" + label + ": %{y:.2f}M<extra></extra>",
        ))
    fig_stacked.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig_tokens, fig_tokens_cost, fig_stacked


def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the five overview charts for a set of trajectories.

    Returns:
        ``(fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked)``:
        histogram of API calls, histogram of instance cost, total tokens per
        token type, total cost per token type, and a per-trajectory stacked
        token chart sorted by total tokens. Five ``None`` values for an empty
        frame.
    """
    if df.empty:
        return None, None, None, None, None

    def _corner_note(text):
        # Annotation settings shared by the summary notes (top-right, no arrow).
        return dict(
            text=text,
            xref="paper", yref="paper",
            x=0.95, y=0.95, showarrow=False,
            font=dict(size=12),
        )

    # Histogram of API calls per trajectory.
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_steps.add_annotation(
        **_corner_note(f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}")
    )

    # Histogram of recorded cost per trajectory.
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_cost.add_annotation(
        **_corner_note(f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}")
    )

    # Totals per token type; uncached input is derived per row, floored at zero.
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    total_uncached_input = (
        (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"])
        .clip(lower=0)
        .sum()
    )

    totals_frame = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Tokens (M)": [
            total_uncached_input / 1e6,
            total_cache_read / 1e6,
            total_cache_creation / 1e6,
            total_completion / 1e6,
        ],
    })

    fig_tokens = px.bar(
        totals_frame,
        x="Token Type",
        y="Tokens (M)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )
    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(**_corner_note(f"Total: {total_all/1e6:.2f}M"))

    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Per-trajectory stacked view, largest stacked total first.
    per_traj = df.copy()
    per_traj["uncached_input_tokens"] = (
        per_traj["prompt_tokens"] - per_traj["cache_read_tokens"] - per_traj["cache_creation_tokens"]
    ).clip(lower=0)
    per_traj["total_stacked"] = (
        per_traj["uncached_input_tokens"]
        + per_traj["cache_read_tokens"]
        + per_traj["cache_creation_tokens"]
        + per_traj["completion_tokens"]
    )
    per_traj = per_traj.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    per_traj["trajectory_idx"] = range(len(per_traj))

    layers = (
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    )
    fig_stacked = go.Figure()
    for label, column, color in layers:
        fig_stacked.add_trace(go.Bar(
            name=label,
            x=per_traj["trajectory_idx"],
            y=per_traj[column] / 1e6,
            marker_color=color,
            hovertemplate="Trajectory: %{x}<br>" + label + ": %{y:.3f}M<extra></extra>",
        ))
    fig_stacked.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked


def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the per-trajectory stacked cost chart.

    Each trajectory's tokens are priced per one million tokens and split into
    uncached input, cache read, cache creation and completion. Trajectories are
    ordered by their total token count, largest first. Returns ``None`` for an
    empty frame.
    """
    if df.empty:
        return None

    per_traj = df.copy()
    per_traj["uncached_input_tokens"] = (
        per_traj["prompt_tokens"] - per_traj["cache_read_tokens"] - per_traj["cache_creation_tokens"]
    ).clip(lower=0)
    per_traj["total_stacked"] = (
        per_traj["uncached_input_tokens"]
        + per_traj["cache_read_tokens"]
        + per_traj["cache_creation_tokens"]
        + per_traj["completion_tokens"]
    )
    per_traj = per_traj.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    per_traj["trajectory_idx"] = range(len(per_traj))

    per_traj["cost_uncached_input"] = per_traj["uncached_input_tokens"] * input_price / 1e6
    per_traj["cost_cache_read"] = per_traj["cache_read_tokens"] * cache_read_price / 1e6
    per_traj["cost_cache_creation"] = per_traj["cache_creation_tokens"] * cache_creation_price / 1e6
    per_traj["cost_completion"] = per_traj["completion_tokens"] * completion_price / 1e6

    # (legend label, cost column, bar colour) for each stacked layer, bottom to top.
    layers = (
        (f"Uncached Input (${input_price:.2f}/1M)", "cost_uncached_input", "#EF553B"),
        (f"Cache Read (${cache_read_price:.2f}/1M)", "cost_cache_read", "#19D3F3"),
        (f"Cache Creation (${cache_creation_price:.2f}/1M)", "cost_cache_creation", "#FFA15A"),
        (f"Completion (${completion_price:.2f}/1M)", "cost_completion", "#AB63FA"),
    )

    fig = go.Figure()
    for legend_label, cost_column, color in layers:
        fig.add_trace(go.Bar(
            name=legend_label,
            x=per_traj["trajectory_idx"],
            y=per_traj[cost_column],
            marker_color=color,
            hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))

    total_cost = (
        per_traj["cost_uncached_input"].sum() +
        per_traj["cost_cache_read"].sum() +
        per_traj["cost_cache_creation"].sum() +
        per_traj["cost_completion"].sum()
    )

    fig.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )

    return fig


def extract_model_from_folder(folder: str) -> str:
    """Return the model part of a submission folder name.

    Everything after the second underscore is treated as the model name, e.g.
    '20251124_mini-v1.16.0_claude-opus-4-5-20251101' gives
    'claude-opus-4-5-20251101'. Names with fewer than two underscores are
    returned unchanged; an empty value gives ''.
    """
    if not folder:
        return ""
    pieces = folder.split("_", 2)
    return pieces[2] if len(pieces) == 3 else folder


def get_prices_for_folder(folder: str) -> tuple[dict, str]:
    """Look up per-million-token prices for the model named in *folder*.

    Args:
        folder: Submission folder name; the model name is extracted from it
            with ``extract_model_from_folder``.

    Returns:
        ``(prices_dict, model_name)``. ``prices_dict`` maps ``"input"``,
        ``"cache_read"``, ``"cache_creation"`` and ``"completion"`` to
        ``{"value": price_per_1M, "found": bool}``, where ``found`` tells
        whether a positive price was present in the litellm data. Prices that
        were not found are estimated from the input price when one is known
        (or from completion / 5 otherwise) and stay flagged ``found=False``.
        ``model_name`` is ``""`` when no model name could be extracted.
    """
    model_hint = extract_model_from_folder(folder)

    result = {
        kind: {"value": 0, "found": False}
        for kind in ("input", "cache_read", "cache_creation", "completion")
    }

    if not model_hint:
        return result, ""

    prices = get_model_prices(model_hint)
    if not prices:
        return result, model_hint

    litellm_fields = {
        "input": "input_cost_per_token",
        "cache_read": "cache_read_input_token_cost",
        "cache_creation": "cache_creation_input_token_cost",
        "completion": "output_cost_per_token",
    }
    for kind, field in litellm_fields.items():
        # ``or 0`` also covers a field that is present with a null value, which
        # would otherwise raise TypeError in the multiplication.
        per_million = (prices.get(field) or 0) * 1e6
        result[kind] = {"value": per_million, "found": per_million > 0}

    # Pick the input price the estimates are based on.
    if result["input"]["found"]:
        base_input = result["input"]["value"]
    elif result["completion"]["found"]:
        # Only the completion price is known: derive an input estimate from it.
        base_input = result["completion"]["value"] / 5
        result["input"]["value"] = base_input
    else:
        return result, model_hint

    # Standard ratios relative to the input price:
    # cache read = 10% of input, cache creation = 125%, completion = 5x.
    estimate_ratios = {"cache_read": 0.1, "cache_creation": 1.25, "completion": 5}
    for kind, ratio in estimate_ratios.items():
        if not result[kind]["found"]:
            result[kind]["value"] = base_input * ratio

    return result, model_hint


def _build_selection_payload(row_idx: int | None, df: pd.DataFrame):
    """Build the tuple of UI updates for the leaderboard row at *row_idx*.

    Returns:
        ``(folder, name, visibility update, input price update, cache-read
        price update, cache-creation price update, completion price update,
        model hint, overhead update)``. Without a usable selection, the strings
        are empty, the section is hidden, prices are 0 and the overhead is 1.0.
    """
    price_fields = (
        ("input", "Input"),
        ("cache_read", "Cache Read"),
        ("cache_creation", "Cache Creation"),
        ("completion", "Completion"),
    )

    if df is None or df.empty or row_idx is None:
        return (
            "", "",
            gr.update(visible=False),
            *(gr.update(value=0, label=label) for _, label in price_fields),
            "",
            gr.update(value=1.0),
        )

    selected = df.iloc[row_idx]
    folder, name = selected["folder"], selected["name"]

    prices_dict, model_hint = get_prices_for_folder(folder)
    default_overhead = get_default_overhead(model_hint)

    def labelled_update(kind, label):
        # Check mark: price found; cross with "(est.)": estimated; bare cross: unknown.
        info = prices_dict[kind]
        if info["found"]:
            return gr.update(value=info["value"], label=f"✅ {label}")
        if info["value"] > 0:
            return gr.update(value=info["value"], label=f"❌ {label} (est.)")
        return gr.update(value=0, label=f"❌ {label}")

    return (
        folder, name,
        gr.update(visible=True),
        *(labelled_update(kind, label) for kind, label in price_fields),
        model_hint,
        gr.update(value=default_overhead),
    )


def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a table selection event by building the payload for the chosen row.

    When the event index is a list or tuple, its first element is used as the
    row index. A missing event or index yields the "no selection" payload.
    """
    if evt is None or evt.index is None:
        return _build_selection_payload(None, df)

    selected = evt.index
    if isinstance(selected, (list, tuple)):
        selected = selected[0]
    return _build_selection_payload(selected, df)


def select_first_row(df: pd.DataFrame):
    """Build the selection payload for the first row, or for "no selection" if *df* is None or empty."""
    has_rows = df is not None and not df.empty
    return _build_selection_payload(0 if has_rows else None, df)


def create_routed_token_chart(original_tokens: dict, base_tokens: dict, additional_models: list, base_model_name: str = "Base"):
    """
    Build a bar chart contrasting token usage without routing against usage with routing.

    For every token category there are two bar groups: the original
    (no-routing) tokens, and a stack made of the base model's share plus one
    bar per additional model. Values are plotted in millions of tokens.

    Args:
        original_tokens: dict with uncached_input, cache_read, cache_creation, completion (no routing)
        base_tokens: dict with the same keys (share that stays on the base model when routing)
        additional_models: list of (model_name, tokens_dict) tuples
        base_model_name: display name of the base model
    """
    import plotly.graph_objects as go

    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    token_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
    base_color = "#636EFA"
    palette = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    def in_millions(tokens):
        # One value per category, missing keys counted as zero.
        return [tokens.get(key, 0) / 1e6 for key in token_keys]

    def total_of(tokens):
        return sum(tokens.get(key, 0) for key in token_keys)

    fig = go.Figure()

    # Hatched, translucent bar: the reference without routing.
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [no routing]",
        x=categories,
        y=in_millions(original_tokens),
        marker_color="rgba(99, 110, 250, 0.3)",
        marker_line_color=base_color,
        marker_line_width=1,
        marker_pattern_shape="/",
        marker_pattern_fgcolor=base_color,
        offsetgroup="calculated",
        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: %{y:.3f}M<extra></extra>",
    ))

    # Solid bars sharing the "routed" offset group stack on top of each other.
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [with routing]",
        x=categories,
        y=in_millions(base_tokens),
        marker_color=base_color,
        offsetgroup="routed",
        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: %{y:.3f}M<extra></extra>",
    ))

    for position, (model_name, tokens) in enumerate(additional_models):
        display_name = model_name or f"Model {position+1}"
        fig.add_trace(go.Bar(
            name=display_name,
            x=categories,
            y=in_millions(tokens),
            marker_color=palette[position % len(palette)],
            offsetgroup="routed",
            hovertemplate="%{x}<br>" + display_name + ": %{y:.3f}M<extra></extra>",
        ))

    original_total = total_of(original_tokens)
    routed_total = total_of(base_tokens) + sum(total_of(entry[1]) for entry in additional_models)

    summary = "<br>".join([
        f"<b>No routing: {original_total/1e6:.2f}M</b>",
        f"<b>With routing: {routed_total/1e6:.2f}M</b>",
    ])

    fig.update_layout(
        yaxis_title="Tokens (M)",
        barmode="stack",
        bargroupgap=0.1,
        margin=dict(l=40, r=40, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
    )
    fig.add_annotation(
        text=summary,
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig


def create_routed_cost_chart(original_costs: dict, base_costs: dict, additional_models: list, base_model_name: str = "Base"):
    """
    Create grouped+stacked bar chart comparing Calculated vs Routed costs.

    The "no routing" bars are drawn in their own offset group ("calculated").
    The base model's routed share and every additional model share the "routed"
    offset group, so with ``barmode="stack"`` they pile up into one bar per
    category. An annotation in the top-left corner shows both grand totals.

    Args:
        original_costs: dict with uncached_input, cache_read, cache_creation, completion (from Calculated)
        base_costs: dict with uncached_input, cache_read, cache_creation, completion (base portion in routing)
        additional_models: list of (model_name, costs_dict) tuples; ``None`` is treated as empty
        base_model_name: name of the base model; ``None``/empty falls back to "Base"

    Returns:
        The populated ``plotly.graph_objects.Figure``.
    """
    import plotly.graph_objects as go

    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    cost_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
    base_color_dark = "#636EFA"
    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    # The hover templates are built by string concatenation, so a None name
    # would raise TypeError; use the same kind of fallback the per-model
    # traces already have.
    base_label = base_model_name or "Base"
    # Iterated twice below (traces, then totals): materialise once so None or a
    # one-shot iterable cannot break the second pass.
    additional_models = list(additional_models or [])

    fig = go.Figure()

    # Hatched, semi-transparent bars: cost of the unrouted (calculated) run.
    fig.add_trace(go.Bar(
        name=f"{base_label} [no routing]",
        x=categories,
        y=[original_costs.get(k, 0) for k in cost_keys],
        marker_color="rgba(99, 110, 250, 0.3)",
        marker_line_color=base_color_dark,
        marker_line_width=1,
        marker_pattern_shape="/",
        marker_pattern_fgcolor=base_color_dark,
        offsetgroup="calculated",
        hovertemplate="%{x}<br>" + base_label + " [no routing]: $%{y:.2f}<extra></extra>",
    ))

    # Solid bars: what the base model still costs once routing is applied.
    fig.add_trace(go.Bar(
        name=f"{base_label} [with routing]",
        x=categories,
        y=[base_costs.get(k, 0) for k in cost_keys],
        marker_color=base_color_dark,
        offsetgroup="routed",
        hovertemplate="%{x}<br>" + base_label + " [with routing]: $%{y:.2f}<extra></extra>",
    ))

    # One trace per routed-to model, stacked onto the base model's routed bar.
    for i, (model_name, costs) in enumerate(additional_models):
        model_label = model_name or f"Model {i+1}"
        fig.add_trace(go.Bar(
            name=model_label,
            x=categories,
            y=[costs.get(k, 0) for k in cost_keys],
            # Palette is cycled when there are more models than colours.
            marker_color=model_colors[i % len(model_colors)],
            offsetgroup="routed",
            hovertemplate="%{x}<br>" + model_label + ": $%{y:.2f}<extra></extra>",
        ))

    original_total = sum(original_costs.get(k, 0) for k in cost_keys)
    routed_total = sum(base_costs.get(k, 0) for k in cost_keys) + sum(
        sum(costs.get(k, 0) for k in cost_keys) for _, costs in additional_models
    )

    annotation_lines = [
        f"<b>No routing: ${original_total:.2f}</b>",
        f"<b>With routing: ${routed_total:.2f}</b>",
    ]

    fig.update_layout(
        yaxis_title="Cost ($)",
        barmode="stack",
        bargroupgap=0.1,
        margin=dict(l=40, r=40, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig


def build_app():
    leaderboard_df = get_bash_only_df()

    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        gr.HTML("""
        <style>
        .quick-select-row {
            flex-wrap: wrap !important;
            gap: 6px !important;
            margin-bottom: 8px !important;
        }
        .quick-select-row button {
            background: white !important;
            color: #333 !important;
            border: 1px solid #ccc !important;
            border-radius: 4px !important;
            padding: 4px 10px !important;
            font-size: 12px !important;
            transition: all 0.15s ease !important;
        }
        .quick-select-row button:hover {
            background: #f0f0f0 !important;
            border-color: #999 !important;
        }
        </style>
        """)
        trajectories_state = gr.State(None)

        gr.Markdown("# 🧮 SWE-bench Costs Calculator `v0.3.46`")
        gr.Markdown("### *Calculate cost savings with different routing strategies.*")
        gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")

        with gr.Row():
            with gr.Column(scale=3):
                leaderboard_table = gr.Dataframe(
                    value=leaderboard_df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                    elem_id="leaderboard-table",
                )

                with gr.Column(visible=False) as analysis_section:
                    gr.Markdown("## 📊 Trajectory Analysis")

                    with gr.Accordion("Leaderboard data", open=True):
                        with gr.Row():
                            plot_steps = gr.Plot(label="Distribution of API Calls (Steps) per Trajectory")
                            plot_cost = gr.Plot(label="Distribution of Cost Reported by Leaderboard ($)")

                    with gr.Accordion("Token counts REPORTED in the metadata of .traj files [AGGREGATED ALL]", open=True):
                        with gr.Row():
                            plot_tokens_meta = gr.Plot(label="Total Tokens by Type")
                            plot_tokens_cost_meta = gr.Plot(label="Total Cost by Token Type ($)")

                    with gr.Accordion("Token counts REPORTED in the metadata of .traj files [AGGREGATED BY TRAJECTORY]", open=True):
                        with gr.Row():
                            plot_stacked_meta = gr.Plot(label="Tokens per Trajectory (stacked)")
                        with gr.Row():
                            plot_cost_breakdown_meta = gr.Plot(label="Cost per Trajectory")

                    with gr.Accordion("Token counts REPORTED in the metadata of .traj files [ONE TRAJECTORY]", open=True, visible=False) as single_traj_meta_accordion:
                        with gr.Row():
                            single_traj_meta_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
                        with gr.Row():
                            single_traj_meta_plot = gr.Plot(label="Tokens per Step (stacked)")
                        with gr.Row():
                            single_traj_meta_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)")

                    with gr.Accordion("Token counts CALCULATED from .traj files [AGGREGATED ALL]", open=True):
                        with gr.Row():
                            plot_tokens_calc = gr.Plot(label="Total Tokens by Type")
                            plot_tokens_cost_calc = gr.Plot(label="Total Cost by Token Type ($)")

                    with gr.Accordion("Token counts CALCULATED from .traj files [AGGREGATED BY TRAJECTORY]", open=True):
                        with gr.Row():
                            plot_stacked_calc = gr.Plot(label="Tokens per Trajectory (stacked)")
                        with gr.Row():
                            plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory")

                    with gr.Accordion("Token counts CALCULATED from .traj files [ONE TRAJECTORY]", open=True, visible=False) as single_traj_accordion:
                        with gr.Row():
                            single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True)
                        with gr.Row():
                            single_traj_plot = gr.Plot(label="Tokens per Step (stacked)")
                        with gr.Row():
                            single_traj_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)")

                    with gr.Accordion("Token counts CALCULATED from .traj files, with ROUTING [AGGREGATED ALL]", open=True, visible=False) as routing_plots_row:
                        with gr.Row():
                            routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)")
                            routing_cost_plot = gr.Plot(label="Cost by Type (per Model) ($)")
                        gr.Markdown("*With routing all messages in the trajectory remain as they are, but messages that match the selected filters are assigned to selected models for routing to.*")

            with gr.Column(scale=1):
                selected_folder = gr.State("")
                gr.Markdown("### Selected Model")
                selected_name = gr.Textbox(label="Model Name", interactive=False)

                analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary")
                download_status = gr.Textbox(label="Status", interactive=False, lines=3)

                gr.Markdown("---")
                gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
                detected_model = gr.Textbox(label="Detected Model", interactive=False)
                with gr.Row():
                    price_input = gr.Number(label="Input", value=0, precision=2, scale=1)
                    price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1)
                    price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1)
                    price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1)

                gr.Markdown("---")
                gr.Markdown("### 🔢 Calculated Token Options")
                thinking_overhead = gr.Number(
                    label="Tokenizer Overhead",
                    value=1.21,
                    precision=2,
                    info="Multiplier for Calculated tokens (tiktoken → native)",
                )
                use_cache = gr.Checkbox(
                    label="Use Cache",
                    value=True,
                    info="If disabled, all tokens are Uncached Input or Completion",
                )

                gr.Markdown("---")
                add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False)
                gr.Markdown("*With routing all messages in the trajectory remain as they are, but messages that match the selected filters are assigned to selected models for routing to.*")

                with gr.Column(visible=False) as routing_section:
                    gr.Markdown("### 🔀 Routing Models")

                    with gr.Column():
                        with gr.Group():
                            gr.Markdown("#### Route to Model 1")
                            with gr.Row(elem_classes=["quick-select-row"]):
                                quick_btns_1 = []
                                for item in QUICK_SELECT_MODELS:
                                    if isinstance(item, tuple):
                                        model, short_name = item
                                    else:
                                        model = item
                                        short_name = model.split("/")[-1]
                                    btn = gr.Button(short_name, size="sm", scale=0, min_width=80)
                                    quick_btns_1.append((btn, model))
                            routing_model_1 = gr.Dropdown(
                                label="Model (type 3+ chars to search)",
                                choices=[],
                                allow_custom_value=True,
                                interactive=True,
                            )
                            with gr.Row():
                                routing_price_1_input = gr.Number(label="Input", precision=3, scale=1)
                                routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
                                routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
                                routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1)

                        add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False)

                        with gr.Column(visible=False) as routing_block_2:
                            with gr.Group():
                                gr.Markdown("#### Route to Model 2")
                                with gr.Row(elem_classes=["quick-select-row"]):
                                    quick_btns_2 = []
                                    for item in QUICK_SELECT_MODELS:
                                        if isinstance(item, tuple):
                                            model, short_name = item
                                        else:
                                            model = item
                                            short_name = model.split("/")[-1]
                                        btn = gr.Button(short_name, size="sm", scale=0, min_width=80)
                                        quick_btns_2.append((btn, model))
                                routing_model_2 = gr.Dropdown(
                                    label="Model (type 3+ chars to search)",
                                    choices=[],
                                    allow_custom_value=True,
                                    interactive=True,
                                )
                                with gr.Row():
                                    routing_price_2_input = gr.Number(label="Input", precision=3, scale=1)
                                    routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
                                    routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
                                    routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1)

                            add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False)

                        with gr.Column(visible=False) as routing_block_3:
                            with gr.Group():
                                gr.Markdown("#### Route to Model 3")
                                with gr.Row(elem_classes=["quick-select-row"]):
                                    quick_btns_3 = []
                                    for item in QUICK_SELECT_MODELS:
                                        if isinstance(item, tuple):
                                            model, short_name = item
                                        else:
                                            model = item
                                            short_name = model.split("/")[-1]
                                        btn = gr.Button(short_name, size="sm", scale=0, min_width=80)
                                        quick_btns_3.append((btn, model))
                                routing_model_3 = gr.Dropdown(
                                    label="Model (type 3+ chars to search)",
                                    choices=[],
                                    allow_custom_value=True,
                                    interactive=True,
                                )
                                with gr.Row():
                                    routing_price_3_input = gr.Number(label="Input", precision=3, scale=1)
                                    routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1)
                                    routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1)
                                    routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1)

                    gr.Markdown("---")
                    gr.Markdown("### 🎯 Router Strategy")

                    selected_strategy = gr.Radio(
                        choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Resolved/Unresolved", "Replace part of trajectory"],
                        value="Random router",
                        label="",
                        interactive=True,
                    )
                    num_routing_models = gr.State(1)

                    with gr.Column(visible=True) as random_block:
                        random_hint = gr.Markdown("*Weights must sum to 1.0*")
                        weight_base = gr.Number(label="Base weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True)
                        weight_model_1 = gr.Number(label="Model 1 weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True)
                        weight_model_2 = gr.Number(label="Model 2 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False)
                        weight_model_3 = gr.Number(label="Model 3 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False)

                    with gr.Column(visible=False) as every_k_block:
                        every_k_hint = gr.Markdown("*First model has priority on overlaps*")
                        k_model_1 = gr.Number(label="k₁ (Model 1)", value=2, minimum=1, precision=0, interactive=True)
                        k_model_2 = gr.Number(label="k₂ (Model 2)", value=3, minimum=1, precision=0, interactive=True, visible=False)
                        k_model_3 = gr.Number(label="k₃ (Model 3)", value=5, minimum=1, precision=0, interactive=True, visible=False)

                    with gr.Column(visible=False) as slice_block:
                        slice_hint = gr.Markdown("*First model has priority on overlaps*")
                        slice_model_1 = gr.Textbox(label="M1 slice", value="[0::3]", interactive=True)
                        slice_model_2 = gr.Textbox(label="M2 slice", value="[1::3]", interactive=True, visible=False)
                        slice_model_3 = gr.Textbox(label="M3 slice", value="[2::3]", interactive=True, visible=False)

                    with gr.Column(visible=False) as grep_block:
                        grep_hint = gr.Markdown("*Use `|` for OR, `&` for AND (don't mix). First model has priority on overlaps*")
                        grep_model_1 = gr.Textbox(label="M1 grep", value="ls|find", interactive=True)
                        grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
                        grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)

                    with gr.Column(visible=False) as resolved_block:
                        resolved_hint = gr.Markdown("*Route all steps based on trajectory resolution status*")
                        resolved_model = gr.Dropdown(
                            label="Model for resolved trajectories",
                            choices=["Base", "M1", "M2", "M3"],
                            value="Base",
                            interactive=True,
                        )
                        unresolved_model = gr.Dropdown(
                            label="Model for unresolved trajectories",
                            choices=["Base", "M1", "M2", "M3"],
                            value="M1",
                            interactive=True,
                        )

                    with gr.Column(visible=False) as part_block:
                        part_hint = gr.Markdown("*Ranges must not overlap*")
                        part_mode = gr.Radio(
                            choices=["Indexes", "Percentages"],
                            value="Percentages",
                            label="Mode",
                            interactive=True,
                        )
                        start_1 = gr.Number(label="M1 Start", value=0, minimum=0, precision=0, interactive=True)
                        end_1 = gr.Number(label="M1 End", value=30, minimum=0, precision=0, interactive=True)
                        start_2 = gr.Number(label="M2 Start", value=30, minimum=0, precision=0, interactive=True, visible=False)
                        end_2 = gr.Number(label="M2 End", value=60, minimum=0, precision=0, interactive=True, visible=False)
                        start_3 = gr.Number(label="M3 Start", value=60, minimum=0, precision=0, interactive=True, visible=False)
                        end_3 = gr.Number(label="M3 End", value=100, minimum=0, precision=0, interactive=True, visible=False)

                    gr.Markdown("---")
                    route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False)
                    routing_result = gr.Markdown(visible=False)


        def toggle_routing_section():
            """Return a Gradio update that makes its output component visible."""
            reveal = gr.update(visible=True)
            return reveal

        add_routing_btn.click(
            fn=toggle_routing_section,
            outputs=[routing_section],
        )

        def on_strategy_change(strategy, num_models):
            """Compute visibility updates for every strategy-specific control.

            Exactly one strategy block is shown (the one whose name equals
            ``strategy``). Inside it, the controls for Model 2 / Model 3 are
            only shown when ``num_models`` is at least 2 / 3.

            Returns a list of 34 ``gr.update`` objects in this fixed order:
            six block containers, then the random, every-k, slice, grep,
            resolved and part controls.
            """
            def vis(flag):
                return gr.update(visible=flag)

            is_random = strategy == "Random router"
            is_every_k = strategy == "Every k-th step"
            is_slice = strategy == "Python list slices"
            is_grep = strategy == "Grep"
            is_resolved = strategy == "Resolved/Unresolved"
            is_part = strategy == "Replace part of trajectory"
            second = num_models >= 2
            third = num_models >= 3

            # Containers: random, every_k, slice, grep, resolved, part.
            updates = [
                vis(active)
                for active in (is_random, is_every_k, is_slice, is_grep, is_resolved, is_part)
            ]

            # Random router: hint, base weight, then one weight per routing model.
            updates += [
                vis(is_random),
                vis(is_random),
                vis(is_random),
                vis(is_random and second),
                vis(is_random and third),
            ]

            # Every-k, slice and grep share one shape: hint + one control per model.
            for active in (is_every_k, is_slice, is_grep):
                updates += [
                    vis(active),
                    vis(active),
                    vis(active and second),
                    vis(active and third),
                ]

            # Resolved/Unresolved: hint and two dropdowns, independent of model count.
            updates += [vis(is_resolved), vis(is_resolved), vis(is_resolved)]

            # Replace part: hint, mode, then a start/end pair per model.
            updates += [
                vis(is_part),
                vis(is_part),
                vis(is_part),
                vis(is_part),
                vis(is_part and second),
                vis(is_part and second),
                vis(is_part and third),
                vis(is_part and third),
            ]
            return updates

        selected_strategy.change(
            fn=on_strategy_change,
            inputs=[selected_strategy, num_routing_models],
            outputs=[
                random_block, every_k_block, slice_block, grep_block, resolved_block, part_block,
                random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
                every_k_hint, k_model_1, k_model_2, k_model_3,
                slice_hint, slice_model_1, slice_model_2, slice_model_3,
                grep_hint, grep_model_1, grep_model_2, grep_model_3,
                resolved_hint, resolved_model, unresolved_model,
                part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
            ],
        )

        def filter_models(query):
            """Return dropdown choices matching ``query`` (case-insensitive substring).

            Queries shorter than 3 characters (or empty) clear the choices; at
            most the first 50 matches are offered.
            """
            if query and len(query) >= 3:
                candidates = get_litellm_model_list()
                needle = query.lower()
                matches = []
                for candidate in candidates:
                    if needle in candidate.lower():
                        matches.append(candidate)
                return gr.update(choices=matches[:50])
            return gr.update(choices=[])

        routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1])
        routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
        routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])

        def make_quick_select_fn_1(full_model_name):
            """Build a zero-argument handler that selects ``full_model_name`` as Model 1.

            The handler returns: the dropdown value update, the four price
            updates, an update making a component visible and an update making
            a component interactive.
            """
            def fn():
                price_updates = get_routing_prices_with_labels(full_model_name)
                model_update = gr.update(value=full_model_name)
                show_update = gr.update(visible=True)
                enable_update = gr.update(interactive=True)
                return (model_update, *price_updates, show_update, enable_update)
            return fn

        def make_quick_select_fn_2(full_model_name):
            """Build a zero-argument handler that selects ``full_model_name`` as Model 2.

            The handler returns: the dropdown value update, the four price
            updates and an update making a component visible.
            """
            def fn():
                price_updates = get_routing_prices_with_labels(full_model_name)
                model_update = gr.update(value=full_model_name)
                show_update = gr.update(visible=True)
                return (model_update, *price_updates, show_update)
            return fn

        def make_quick_select_fn_3(full_model_name):
            """Build a zero-argument handler that selects ``full_model_name`` as Model 3.

            The handler returns the dropdown value update followed by the four
            price updates.
            """
            def fn():
                price_updates = get_routing_prices_with_labels(full_model_name)
                model_update = gr.update(value=full_model_name)
                return (model_update, *price_updates)
            return fn

        for btn, full_model in quick_btns_1:
            btn.click(
                fn=make_quick_select_fn_1(full_model),
                outputs=[routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn]
            )

        for btn, full_model in quick_btns_2:
            btn.click(
                fn=make_quick_select_fn_2(full_model),
                outputs=[routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn]
            )

        for btn, full_model in quick_btns_3:
            btn.click(
                fn=make_quick_select_fn_3(full_model),
                outputs=[routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion]
            )

        def get_routing_prices_with_labels(model_name):
            """Get all 4 prices for a routing model with found/estimated labels.

            Prices are looked up by exact model name in the table returned by
            ``get_litellm_prices()`` and scaled from per-token to per-1M-token
            values. A price counts as "found" when it is greater than zero.
            When a cache price is not found but the input price is, it is
            estimated from the input price (cache read = 10%, cache creation
            = 125%) and still labelled as not found.

            Args:
                model_name: model identifier to look up; a falsy value resets
                    all four fields to 0 with plain labels.

            Returns:
                Tuple of four ``gr.update`` objects (Input, Cache Read, Cache
                Creation, Completion), each carrying a value and a label
                prefixed with ✅ (found) or ❌ (missing/estimated).
            """
            if not model_name:
                return (
                    gr.update(value=0, label="Input"),
                    gr.update(value=0, label="Cache Read"),
                    gr.update(value=0, label="Cache Creation"),
                    gr.update(value=0, label="Completion"),
                )

            # `or {}` keeps the lookup safe if the price table or the entry is
            # missing/None; an unknown model then yields all-zero prices.
            model_prices = (get_litellm_prices() or {}).get(model_name) or {}

            def per_million(key):
                # `.get(key, 0)` only covers a missing key; a key present with
                # a null value would make `None * 1e6` raise TypeError.
                return (model_prices.get(key) or 0) * 1e6

            input_price = per_million("input_cost_per_token")
            cache_read = per_million("cache_read_input_token_cost")
            cache_creation = per_million("cache_creation_input_token_cost")
            completion = per_million("output_cost_per_token")

            # Record what was actually present before any estimate is filled in.
            input_found = input_price > 0
            cache_read_found = cache_read > 0
            cache_creation_found = cache_creation > 0
            completion_found = completion > 0

            # Estimate missing cache prices from the input price when possible.
            if not cache_read_found and input_price > 0:
                cache_read = input_price * 0.1
            if not cache_creation_found and input_price > 0:
                cache_creation = input_price * 1.25

            def label(name, found):
                return f"✅ {name}" if found else f"❌ {name}"

            return (
                gr.update(value=input_price, label=label("Input", input_found)),
                gr.update(value=cache_read, label=label("Cache Read", cache_read_found)),
                gr.update(value=cache_creation, label=label("Cache Creation", cache_creation_found)),
                gr.update(value=completion, label=label("Completion", completion_found)),
            )

        def on_routing_model_1_select(model_name):
            """Return price updates for Model 1 plus two follow-up updates.

            The trailing updates set ``visible`` and ``interactive``
            respectively to whether a model name is present.
            """
            price_updates = get_routing_prices_with_labels(model_name)
            has_model = bool(model_name)
            return (
                *price_updates,
                gr.update(visible=has_model),
                gr.update(interactive=has_model),
            )

        def on_routing_model_2_select(model_name):
            """Return price updates for Model 2 plus a visibility update.

            The trailing update sets ``visible`` to whether a model name is
            present.
            """
            price_updates = get_routing_prices_with_labels(model_name)
            has_model = bool(model_name)
            return (
                *price_updates,
                gr.update(visible=has_model),
            )

        def on_routing_model_3_select(model_name):
            """Return the four price updates for Model 3."""
            price_updates = get_routing_prices_with_labels(model_name)
            return price_updates

        routing_model_1.change(
            fn=on_routing_model_1_select,
            inputs=[routing_model_1],
            outputs=[routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn],
        )

        def show_model_2(strategy):
            """Build the updates that reveal the second routing model.

            Returns a 9-tuple: show update (block 2), hide update (add button),
            one visibility update each for the Model 2 weight, k, slice and
            grep controls, two for the part start/end controls, and the new
            model count ``2``. A strategy control is only shown when its
            strategy is the one currently selected.
            """
            per_strategy_controls = tuple(
                gr.update(visible=(strategy == name))
                for name in ("Random router", "Every k-th step", "Python list slices", "Grep")
            )
            part_active = strategy == "Replace part of trajectory"
            return (
                gr.update(visible=True),         # show block 2
                gr.update(visible=False),        # hide add button
                *per_strategy_controls,          # weight2, k2, slice2, grep2
                gr.update(visible=part_active),  # start2
                gr.update(visible=part_active),  # end2
                2,
            )

        add_model_2_btn.click(
            fn=show_model_2,
            inputs=[selected_strategy],
            outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, slice_model_2, grep_model_2, start_2, end_2, num_routing_models],
        )

        routing_model_2.change(
            fn=on_routing_model_2_select,
            inputs=[routing_model_2],
            outputs=[routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn],
        )

        def show_model_3(strategy):
            """Build the updates that reveal the third routing model.

            Returns a 9-tuple: show update (block 3), hide update (add button),
            one visibility update each for the Model 3 weight, k, slice and
            grep controls, two for the part start/end controls, and the new
            model count ``3``. A strategy control is only shown when its
            strategy is the one currently selected.
            """
            per_strategy_controls = tuple(
                gr.update(visible=(strategy == name))
                for name in ("Random router", "Every k-th step", "Python list slices", "Grep")
            )
            part_active = strategy == "Replace part of trajectory"
            return (
                gr.update(visible=True),         # show block 3
                gr.update(visible=False),        # hide add button
                *per_strategy_controls,          # weight3, k3, slice3, grep3
                gr.update(visible=part_active),  # start3
                gr.update(visible=part_active),  # end3
                3,
            )

        # Clicking the "add model 3" button runs show_model_3 with the currently
        # selected strategy. The nine returned values map positionally onto the
        # nine components below: block 3, the add button itself, the weight / k /
        # slice / grep / start / end inputs of model 3, and num_routing_models
        # (which receives the integer 3).
        add_model_3_btn.click(
            fn=show_model_3,
            inputs=[selected_strategy],
            outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, slice_model_3, grep_model_3, start_3, end_3, num_routing_models],
        )

        # Whenever the model-3 selection changes, on_routing_model_3_select provides
        # new values for the four model-3 price fields. Unlike the model-2 wiring
        # there is no further "add" button among the outputs.
        routing_model_3.change(
            fn=on_routing_model_3_select,
            inputs=[routing_model_3],
            outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion],
        )

        def run_routing(
            state_data,
            base_input, base_cache_read, base_cache_creation, base_completion,
            routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion,
            routing_model_2_val, r2_input, r2_cache_read, r2_cache_creation, r2_completion,
            routing_model_3_val, r3_input, r3_cache_read, r3_cache_creation, r3_completion,
            strategy_val,
            weight_base_val, weight_1_val, weight_2_val, weight_3_val,
            k_1_val, k_2_val, k_3_val,
            slice_1_val, slice_2_val, slice_3_val,
            grep_1_val, grep_2_val, grep_3_val,
            resolved_model_val, unresolved_model_val,
            part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
            overhead, with_cache,
            detected_model_val
        ):
            """Estimate the cost of routing trajectory steps to alternative models.

            Every step of every loaded trajectory is assigned either to the base
            model or to one of up to three routing models according to
            ``strategy_val``. Per-model token totals are obtained from
            ``calculate_routing_tokens`` and priced with each model's prices
            (prices are per one million tokens).

            This is a generator used as a Gradio event handler. Each yielded value
            is a 4-tuple ``(result_markdown_update, plots_row_update, tokens_chart,
            cost_chart)``. On a validation failure a single error tuple is yielded
            (message shown, plots hidden, charts ``None``) and the generator stops.

            Args:
                state_data: Dict produced by the analysis step; the keys read here
                    are ``"steps"``, ``"resolved"`` and ``"calculated"``. ``None``
                    means nothing has been loaded yet.
                base_input, base_cache_read, base_cache_creation, base_completion:
                    Base-model prices.
                routing_model_N_val, rN_input, rN_cache_read, rN_cache_creation,
                rN_completion: Name and prices of routing model N (1..3). A model
                    with an empty name is ignored.
                strategy_val: Name of the routing strategy.
                weight_base_val, weight_N_val: "Random router" weights; they must
                    be non-negative and sum to 1.0 (±0.01).
                k_N_val: "Every k-th step" period for model N.
                slice_N_val: "Python list slices" spec for model N, e.g. ``[0::3]``.
                grep_N_val: "Grep" pattern for model N: words joined by ``|`` (any)
                    or ``&`` (all), but not both.
                resolved_model_val, unresolved_model_val: "Resolved/Unresolved"
                    targets: ``"Base"``, ``"M1"``, ``"M2"`` or ``"M3"``.
                part_mode_val, start_N_val, end_N_val: "Replace part of trajectory"
                    ranges, read as percentages when ``part_mode_val`` is
                    ``"Percentages"`` and as step indices otherwise.
                overhead: Multiplier applied to each step's completion tokens.
                with_cache: Whether the charts show cached token categories.
                detected_model_val: Display name of the base model (falls back to
                    ``"Base"``).
            """

            def error_result(message):
                """Return the output tuple that shows *message* and hides the plots."""
                return (
                    gr.update(visible=True, value=message),
                    gr.update(visible=False),
                    None, None,
                )

            if state_data is None:
                yield error_result("❌ No trajectories loaded. Click 'Load & Analyze' first.")
                return

            if not routing_model_1_val:
                yield error_result("❌ Please select at least one routing model.")
                return

            trajectory_steps = state_data.get("steps", {})
            resolved_instances = state_data.get("resolved", {})
            if not trajectory_steps:
                yield error_result("❌ No trajectory steps data available.")
                return

            df_calc = state_data.get("calculated")

            base_prices = {
                "input": base_input,
                "cache_read": base_cache_read,
                "cache_creation": base_cache_creation,
                "completion": base_completion,
            }

            # Only models with a non-empty name take part in routing.
            # NOTE(review): the per-model parameters below are matched to this list
            # by position, so if model 2 is left empty while model 3 is selected,
            # model 3 is paired with the model-2 inputs — confirm the UI prevents it.
            routing_models = [
                {
                    "name": name,
                    "prices": {"input": p_input, "cache_read": p_read, "cache_creation": p_creation, "completion": p_completion},
                }
                for name, p_input, p_read, p_creation, p_completion in (
                    (routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion),
                    (routing_model_2_val, r2_input, r2_cache_read, r2_cache_creation, r2_completion),
                    (routing_model_3_val, r3_input, r3_cache_read, r3_cache_creation, r3_completion),
                )
                if name
            ]
            n_models = len(routing_models)

            k_values = [k_1_val, k_2_val, k_3_val][:n_models]
            slice_values = [slice_1_val, slice_2_val, slice_3_val][:n_models]
            grep_values = [grep_1_val, grep_2_val, grep_3_val][:n_models]
            part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:n_models]

            if strategy_val == "Replace part of trajectory":
                for i, (s, e) in enumerate(part_ranges):
                    # A cleared number field arrives as None; report it instead of
                    # failing on the comparison below.
                    if s is None or e is None:
                        yield error_result(f"❌ Model {i+1}: Start and End must be set")
                        return
                    if s >= e:
                        yield error_result(f"❌ Model {i+1}: Start must be less than End")
                        return
                for i, (s1, e1) in enumerate(part_ranges):
                    for j in range(i + 1, len(part_ranges)):
                        s2, e2 = part_ranges[j]
                        # Half-open ranges overlap unless one ends before the other starts.
                        if not (e1 <= s2 or e2 <= s1):
                            yield error_result(f"❌ Model {i+1} and Model {j+1} ranges overlap")
                            return

            weights = None
            if strategy_val == "Random router":
                # Base weight first, then one weight per active routing model.
                weights = [weight_base_val, weight_1_val, weight_2_val, weight_3_val][:n_models + 1]
                # Negative weights could still sum to 1.0 but are not valid for
                # random.choices; unset (None) weights cannot be summed.
                if any(w is None or w < 0 for w in weights):
                    yield error_result("❌ Weights must be non-negative numbers")
                    return
                total_weight = sum(weights)
                if abs(total_weight - 1.0) > 0.01:
                    yield error_result(f"❌ Weights must sum to 1.0 (current: {total_weight:.2f})")
                    return

            if strategy_val == "Grep":
                for i, gv in enumerate(grep_values):
                    if gv and "|" in gv and "&" in gv:
                        yield error_result(f"❌ M{i+1} grep: cannot mix | and & operators")
                        return

            def grep_matches(text, pattern):
                """Check if text matches grep pattern (whole words joined by | or &)."""
                if not pattern or not text:
                    return False
                pattern = pattern.strip()
                if not pattern:
                    return False

                def has_word(word):
                    return re.search(r'\b' + re.escape(word) + r'\b', text) is not None

                if "|" in pattern:
                    words = [w.strip() for w in pattern.split("|") if w.strip()]
                    return any(has_word(word) for word in words)
                if "&" in pattern:
                    words = [w.strip() for w in pattern.split("&") if w.strip()]
                    # A pattern made only of separators names no word, so it matches nothing.
                    return bool(words) and all(has_word(word) for word in words)
                return has_word(pattern)

            def parse_slice(slice_str):
                """Parse Python slice notation like [0::3] into a slice object.

                Raises ValueError when the text is not ``start:stop`` or
                ``start:stop:step`` with integer (or empty) parts, or when the
                step is zero.
                """
                spec = slice_str.strip()
                if spec.startswith("[") and spec.endswith("]"):
                    spec = spec[1:-1]
                parts = spec.split(":")
                if len(parts) not in (2, 3):
                    raise ValueError("expected start:stop or start:stop:step")
                bounds = [int(part) if part.strip() else None for part in parts]
                if len(bounds) == 3 and bounds[2] == 0:
                    raise ValueError("slice step cannot be zero")
                return slice(*bounds)

            # Slice specs do not depend on the trajectory, so parse them once here
            # and report malformed input instead of silently routing nothing.
            slice_specs = [None] * n_models
            if strategy_val == "Python list slices":
                for i, sv in enumerate(slice_values):
                    if not sv or not sv.strip():
                        continue
                    try:
                        slice_specs[i] = parse_slice(sv)
                    except ValueError as exc:
                        yield error_result(f"❌ M{i+1} slice: invalid slice ({exc})")
                        return

            # Truncate k to an integer period once; a period that truncates to 0
            # (e.g. k=0.5) would otherwise divide by zero, so it is treated as unset.
            k_periods = [int(k) if (k and k > 0) else 0 for k in k_values]

            BASE_MODEL = "__base__"
            model_keys = [BASE_MODEL] + [f"__routing_{i}__" for i in range(n_models)]
            token_fields = ("uncached_input", "cache_read", "cache_creation", "completion")

            all_tokens = {key: {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} for key in model_keys}
            total_original_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}

            def as_routing_step(step, model):
                """Build the step record passed to calculate_routing_tokens."""
                return {
                    "model": model,
                    "system_user": step.get("system_user", 0),
                    "completion": int(step.get("completion", 0) * overhead),
                    "observation": step.get("observation"),
                }

            for instance_id, steps in trajectory_steps.items():
                if not steps:
                    continue

                total_steps = len(steps)

                # Step index -> model key; steps without an entry stay on the base
                # model. When several models claim a step, the first model wins.
                step_to_model = {}

                if strategy_val == "Random router":
                    picks = random.choices(model_keys, weights=weights, k=total_steps)
                    step_to_model = dict(enumerate(picks))

                elif strategy_val == "Every k-th step":
                    for j, period in enumerate(k_periods):
                        if period > 0:
                            # 1-based steps k, 2k, 3k, ... are indices k-1, 2k-1, ...
                            for i in range(period - 1, total_steps, period):
                                step_to_model.setdefault(i, f"__routing_{j}__")

                elif strategy_val == "Python list slices":
                    for j, spec in enumerate(slice_specs):
                        if spec is not None:
                            for i in range(total_steps)[spec]:
                                step_to_model.setdefault(i, f"__routing_{j}__")

                elif strategy_val == "Grep":
                    for i, step in enumerate(steps):
                        content = step.get("content", "")
                        for j, grep_val in enumerate(grep_values):
                            if grep_val and i not in step_to_model:
                                if grep_matches(content, grep_val):
                                    step_to_model[i] = f"__routing_{j}__"

                elif strategy_val == "Resolved/Unresolved":
                    is_resolved = resolved_instances.get(instance_id, False)
                    target_model = resolved_model_val if is_resolved else unresolved_model_val
                    if target_model and target_model != "Base":
                        model_idx = {"M1": 0, "M2": 1, "M3": 2}.get(target_model)
                        if model_idx is not None and model_idx < n_models:
                            for i in range(total_steps):
                                step_to_model[i] = f"__routing_{model_idx}__"

                elif strategy_val == "Replace part of trajectory":
                    for j, (start_val, end_val) in enumerate(part_ranges):
                        if part_mode_val == "Percentages":
                            start_idx = int(total_steps * start_val / 100)
                            end_idx = int(total_steps * end_val / 100)
                        else:
                            start_idx = int(start_val)
                            end_idx = min(int(end_val), total_steps)
                        for i in range(start_idx, end_idx):
                            step_to_model[i] = f"__routing_{j}__"

                modified_steps = [
                    as_routing_step(step, step_to_model.get(i, BASE_MODEL))
                    for i, step in enumerate(steps)
                ]
                model_totals = calculate_routing_tokens(modified_steps)

                for key in model_keys:
                    totals = model_totals.get(key, {})
                    for field in token_fields:
                        all_tokens[key][field] += totals.get(field, 0)

                # Baseline for comparison: the same trajectory run entirely on the base model.
                original_steps = [as_routing_step(step, BASE_MODEL) for step in steps]
                original_totals = calculate_routing_tokens(original_steps)
                orig = original_totals.get(BASE_MODEL, {})
                for field in token_fields:
                    total_original_tokens[field] += orig.get(field, 0)

            def calc_cost(tokens: dict, prices: dict) -> float:
                """Total cost of *tokens* at *prices* (prices are per 1e6 tokens)."""
                return (
                    tokens["uncached_input"] * prices["input"] / 1e6 +
                    tokens["cache_read"] * prices["cache_read"] / 1e6 +
                    tokens["cache_creation"] * prices["cache_creation"] / 1e6 +
                    tokens["completion"] * prices["completion"] / 1e6
                )

            def tokens_to_costs(tokens: dict, prices: dict) -> dict:
                """Per-category cost of *tokens* at *prices*, keyed like *tokens*."""
                price_map = {"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}
                return {k: tokens[k] * prices[price_map[k]] / 1e6 for k in tokens}

            total_base_tokens = all_tokens[BASE_MODEL]
            total_base_cost = calc_cost(total_base_tokens, base_prices)

            routing_costs_list = []
            total_routing_cost = 0
            for i, rm in enumerate(routing_models):
                tokens = all_tokens[f"__routing_{i}__"]
                cost = calc_cost(tokens, rm["prices"])
                routing_costs_list.append({"name": rm["name"], "tokens": tokens, "cost": cost})
                total_routing_cost += cost

            total_original_cost = calc_cost(total_original_tokens, base_prices)

            total_routed_cost = total_base_cost + total_routing_cost
            savings = total_original_cost - total_routed_cost
            savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0

            result_lines = [
                "## 🚀 Routing Results",
                "",
                "| Metric | Value |",
                "|--------|-------|",
                f"| **Original Cost (base model only)** | ${total_original_cost:.2f} |",
                f"| **Routed Cost** | ${total_routed_cost:.2f} |",
                f"| ↳ Base model portion | ${total_base_cost:.2f} |",
            ]
            for rc in routing_costs_list:
                result_lines.append(f"| ↳ {rc['name']} | ${rc['cost']:.2f} |")
            savings_color = "green" if savings >= 0 else "red"
            result_lines.append(f'| **Savings** | <span style="color: {savings_color}; font-weight: bold;">${savings:.2f} · {savings_pct:.1f}%</span> |')
            result_text = "\n".join(result_lines)

            # NOTE(review): the summary table above prices the raw routing totals,
            # while the charts below use the display formula (which depends on
            # with_cache) — confirm the two are meant to differ.
            def apply_display_formula(tokens: dict) -> dict:
                """Convert raw routing totals into the token categories shown in charts."""
                prompt = tokens["cache_read"] + tokens["uncached_input"]
                if with_cache:
                    uncached_display = max(0, prompt - tokens["cache_read"] - tokens["cache_creation"])
                    return {
                        "uncached_input": uncached_display,
                        "cache_read": tokens["cache_read"],
                        "cache_creation": tokens["cache_creation"],
                        "completion": tokens["completion"],
                    }
                # Without caching the whole prompt is billed as uncached input.
                return {
                    "uncached_input": prompt,
                    "cache_read": 0,
                    "cache_creation": 0,
                    "completion": tokens["completion"],
                }

            total_base_tokens_display = apply_display_formula(total_base_tokens)
            base_costs = tokens_to_costs(total_base_tokens_display, base_prices)

            additional_token_models = []
            additional_cost_models = []
            for rc, rm in zip(routing_costs_list, routing_models):
                display_tokens = apply_display_formula(rc["tokens"])
                additional_token_models.append((rc["name"], display_tokens))
                additional_cost_models.append((rc["name"], tokens_to_costs(display_tokens, rm["prices"])))

            # The "original" bars prefer the pre-computed per-trajectory table when
            # it is available, and fall back to the routing-based baseline otherwise.
            if df_calc is not None and not df_calc.empty:
                df_for_cost = apply_thinking_overhead(df_calc.copy(), overhead)
                if not with_cache:
                    df_for_cost = apply_no_cache(df_for_cost)
                uncached_series = (
                    df_for_cost["prompt_tokens"] - df_for_cost["cache_read_tokens"] - df_for_cost["cache_creation_tokens"]
                ).clip(lower=0)
                original_tokens_from_df = {
                    "uncached_input": uncached_series.sum(),
                    "cache_read": df_for_cost["cache_read_tokens"].sum(),
                    "cache_creation": df_for_cost["cache_creation_tokens"].sum(),
                    "completion": df_for_cost["completion_tokens"].sum(),
                }
            else:
                original_tokens_from_df = apply_display_formula(total_original_tokens)

            original_costs = tokens_to_costs(original_tokens_from_df, base_prices)

            base_model_name = detected_model_val or "Base"
            tokens_chart = create_routed_token_chart(original_tokens_from_df, total_base_tokens_display, additional_token_models, base_model_name)
            cost_chart = create_routed_cost_chart(original_costs, base_costs, additional_cost_models, base_model_name)

            yield (
                gr.update(visible=True, value=result_text),
                gr.update(visible=True),
                tokens_chart,
                cost_chart,
            )

        # Run the routing simulation when the route button is clicked.
        # The `inputs` list is positional: it must stay in exactly the same order
        # as run_routing's parameters (state, base prices, the three routing models
        # with their prices, strategy, then the per-strategy settings, overhead /
        # cache options and the detected base-model name). The four outputs match
        # the 4-tuples yielded by run_routing.
        route_btn.click(
            fn=run_routing,
            inputs=[
                trajectories_state,
                price_input, price_cache_read, price_cache_creation, price_completion,
                routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
                routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
                routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
                selected_strategy,
                weight_base, weight_model_1, weight_model_2, weight_model_3,
                k_model_1, k_model_2, k_model_3,
                slice_model_1, slice_model_2, slice_model_3,
                grep_model_1, grep_model_2, grep_model_3,
                resolved_model, unresolved_model,
                part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
                thinking_overhead, use_cache,
                detected_model,
            ],
            outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
        )

        # Selecting a row in the leaderboard table runs on_row_select; its return
        # values are written positionally to the selection fields, the analyze
        # button, the four base-price inputs, the detected model and the overhead.
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
        )

        # On app load, select_first_row writes to the same outputs as the
        # row-select handler above. The `js` snippet runs in the browser: it clicks
        # the first body row of the element with id "leaderboard-table" (if one
        # exists) and returns its argument unchanged.
        app.load(
            fn=select_first_row,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
            js="""
(data) => {
    const row = gradioApp()?.querySelector('#leaderboard-table table tbody tr');
    if (row) {
        row.click();
    }
    return data;
}
""",
        )

        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache, progress=gr.Progress()):
            """Download (if needed), load and chart the trajectories under *folder*.

            This is a generator used as a Gradio event handler. Every yielded value
            is a 22-tuple whose positions match the ``outputs`` list of the analyze
            button: status text, analysis-section visibility, two basic histograms,
            four metadata charts, four calculated charts, the trajectories state,
            the add-routing button and single-trajectory accordion visibility, the
            single-trajectory dropdown and its two charts, and the same four
            elements for the metadata-based single-trajectory view.

            Intermediate and failure yields only carry a status message; the final
            yield carries all charts and the state dict.

            Args:
                folder: Identifier of the trajectories folder to load; falsy means
                    nothing is selected.
                input_price, cache_read_price, cache_creation_price,
                completion_price: Prices passed through to the chart builders.
                overhead: Thinking-overhead value applied to the calculated data.
                with_cache: When false, the calculated data is converted with
                    ``apply_no_cache`` before charting.
                progress: Gradio progress tracker.
            """

            def status_only(message, analysis_visible=False):
                """Build an output tuple that reports *message* and carries no charts."""
                return (
                    message,
                    gr.update(visible=analysis_visible),
                    None, None,
                    None, None, None, None,
                    None, None, None, None,
                    None,
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(),
                    gr.update(),
                    gr.update(),
                    gr.update(visible=False),
                    gr.update(),
                    gr.update(),
                    gr.update(),
                )

            progress(0, desc="Ready")

            if not folder:
                progress(1, desc="No folder selected")
                yield status_only("")
                return

            if not check_trajectories_downloaded(folder):
                progress(0.1, desc="Preparing download")
                yield status_only("⏳ Downloading trajectories...")
                progress(0.3, desc="Downloading")
                status, _ = download_trajectories_from_s3(folder)
                # The download helper signals failure through its status text.
                if "❌" in status:
                    progress(1, desc="Download failed")
                    yield status_only(status)
                    return
            progress(0.45, desc="Loading trajectories")

            yield status_only("⏳ Loading trajectories...", analysis_visible=True)

            progress(0.6, desc="Reading metadata")
            df_meta = ensure_token_columns(load_all_trajectories(folder))

            # Bail out before touching any df_meta columns: an empty frame may not
            # have "api_calls" / "instance_cost" at all, which would raise before
            # the user ever sees the "no trajectories" message.
            if df_meta.empty:
                progress(1, desc="No trajectories found")
                yield status_only("❌ No trajectories found")
                return

            progress(0.7, desc="Reading calculated")
            df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
            # NOTE(review): assumes df_calc and df_meta have the same rows in the
            # same order, since values are copied positionally — confirm in loaders.
            df_calc["api_calls"] = df_meta["api_calls"].values
            df_calc["instance_cost"] = df_meta["instance_cost"].values
            progress(0.75, desc="Reading steps")
            trajectory_steps = load_all_trajectory_steps(folder)
            progress(0.8, desc="Reading metadata steps")
            metadata_steps = load_all_trajectory_metadata_steps(folder)

            # Map instance id -> resolved flag for the Resolved/Unresolved strategy.
            model_details, _ = get_model_details(folder)
            resolved_instances = {}
            if model_details:
                per_instance = model_details.get("per_instance_details", {})
                resolved_instances = {
                    inst_id: details.get("resolved", False)
                    for inst_id, details in per_instance.items()
                }

            state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "metadata_steps": metadata_steps, "resolved": resolved_instances}

            progress(0.9, desc="Building charts")
            fig_steps, fig_cost, _, _, _ = create_basic_histograms(
                df_meta, input_price, cache_read_price, cache_creation_price, completion_price
            )

            fig_tokens_meta, fig_tokens_cost_meta, fig_stacked_meta = create_token_charts(
                df_meta, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown_meta = create_cost_breakdown(
                df_meta, input_price, cache_read_price, cache_creation_price, completion_price
            )

            # The calculated charts honour the overhead and cache options.
            df_calc_processed = apply_thinking_overhead(df_calc.copy(), overhead)
            if not with_cache:
                df_calc_processed = apply_no_cache(df_calc_processed)

            fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc = create_token_charts(
                df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown_calc = create_cost_breakdown(
                df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
            )

            # Pre-select the first issue (in sorted order) in each dropdown.
            issue_ids = sorted(trajectory_steps.keys())
            first_issue = issue_ids[0] if issue_ids else None

            meta_issue_ids = sorted(metadata_steps.keys())
            first_meta_issue = meta_issue_ids[0] if meta_issue_ids else None
            has_meta_steps = len(meta_issue_ids) > 0

            fig_single_traj = None
            fig_single_traj_cost = None
            if first_issue and first_issue in trajectory_steps:
                calc_steps = trajectory_steps[first_issue]
                fig_single_traj = create_single_trajectory_chart(calc_steps, overhead, with_cache)
                fig_single_traj_cost = create_single_trajectory_cost_chart(calc_steps, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache)

            fig_single_traj_meta = None
            fig_single_traj_meta_cost = None
            if first_meta_issue and first_meta_issue in metadata_steps:
                meta_steps = metadata_steps[first_meta_issue]
                fig_single_traj_meta = create_single_trajectory_meta_chart(meta_steps)
                fig_single_traj_meta_cost = create_single_trajectory_meta_cost_chart(meta_steps, input_price, cache_read_price, cache_creation_price, completion_price)

            progress(1, desc="Done")
            yield (
                f"✅ Loaded {len(df_meta)} trajectories",
                gr.update(visible=True),
                fig_steps, fig_cost,
                fig_tokens_meta, fig_tokens_cost_meta, fig_stacked_meta, fig_cost_breakdown_meta,
                fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc, fig_cost_breakdown_calc,
                state_data,
                gr.update(visible=True),
                gr.update(visible=True),
                gr.update(choices=issue_ids, value=first_issue),
                fig_single_traj,
                fig_single_traj_cost,
                gr.update(visible=has_meta_steps),
                gr.update(choices=meta_issue_ids, value=first_meta_issue),
                fig_single_traj_meta,
                fig_single_traj_meta_cost,
            )

        def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
            """Build the token and cost charts for one calculated trajectory.

            Returns ``(tokens_chart, cost_chart)``, or ``(None, None)`` when no
            state is loaded, no issue is selected, or the issue has no entry
            under the state's ``"steps"`` mapping.
            """
            if not issue_id or state_data is None:
                return None, None

            steps_by_issue = state_data.get("steps", {})
            try:
                selected_steps = steps_by_issue[issue_id]
            except KeyError:
                return None, None

            return (
                create_single_trajectory_chart(selected_steps, overhead, with_cache),
                create_single_trajectory_cost_chart(
                    selected_steps,
                    input_price, cache_read_price, cache_creation_price, completion_price,
                    overhead, with_cache,
                ),
            )

        def on_single_traj_meta_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
            """Build the token and cost charts for one metadata-based trajectory.

            Returns ``(tokens_chart, cost_chart)``, or ``(None, None)`` when no
            state is loaded, no issue is selected, or the issue has no entry
            under the state's ``"metadata_steps"`` mapping.
            """
            if not issue_id or state_data is None:
                return None, None

            meta_steps_by_issue = state_data.get("metadata_steps", {})
            try:
                selected_steps = meta_steps_by_issue[issue_id]
            except KeyError:
                return None, None

            return (
                create_single_trajectory_meta_chart(selected_steps),
                create_single_trajectory_meta_cost_chart(
                    selected_steps,
                    input_price, cache_read_price, cache_creation_price, completion_price,
                ),
            )

        # Run the full load/analyze pipeline when the analyze button is clicked.
        # load_and_analyze is a generator, so each tuple it yields updates the UI.
        # The 22 components in `outputs` are filled positionally from those
        # tuples, so this list and the tuples in load_and_analyze must keep the
        # same order and length.
        analyze_btn.click(
            fn=load_and_analyze,
            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
            outputs=[
                download_status,
                analysis_section,
                plot_steps, plot_cost,
                plot_tokens_meta, plot_tokens_cost_meta, plot_stacked_meta, plot_cost_breakdown_meta,
                plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc,
                trajectories_state,
                add_routing_btn,
                single_traj_accordion,
                single_traj_dropdown,
                single_traj_plot,
                single_traj_cost_plot,
                single_traj_meta_accordion,
                single_traj_meta_dropdown,
                single_traj_meta_plot,
                single_traj_meta_cost_plot,
            ],
        )

        def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
            """Rebuild the four price-dependent charts for the loaded trajectories.

            Returns a 4-tuple: cost-by-type and cost-breakdown charts for the
            metadata frame, followed by the same two charts for the calculated
            frame (after the overhead and, if requested, the no-cache conversion).
            All four are ``None`` when nothing is loaded or the metadata frame is
            empty.
            """
            nothing = (None, None, None, None)
            if state_data is None:
                return nothing

            meta_frame, calc_frame = state_data["meta"], state_data["calculated"]
            if meta_frame.empty:
                return nothing

            prices = (input_price, cache_read_price, cache_creation_price, completion_price)

            meta_by_type = create_cost_by_type_chart(meta_frame, *prices)
            meta_breakdown = create_cost_breakdown(meta_frame, *prices)

            # The calculated data is adjusted on a copy so the stored state stays untouched.
            adjusted = apply_thinking_overhead(calc_frame.copy(), overhead)
            if not with_cache:
                adjusted = apply_no_cache(adjusted)

            calc_by_type = create_cost_by_type_chart(adjusted, *prices)
            calc_breakdown = create_cost_breakdown(adjusted, *prices)

            return meta_by_type, meta_breakdown, calc_by_type, calc_breakdown

        # Editing any of the four price fields re-renders the four cost charts
        # through recalculate_costs; all fields share the same inputs and outputs.
        price_inputs = [
            trajectories_state,
            price_input,
            price_cache_read,
            price_cache_creation,
            price_completion,
            thinking_overhead,
            use_cache,
        ]
        price_outputs = [
            plot_tokens_cost_meta,
            plot_cost_breakdown_meta,
            plot_tokens_cost_calc,
            plot_cost_breakdown_calc,
        ]

        for price_field in (price_input, price_cache_read, price_cache_creation, price_completion):
            price_field.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)

        def on_calc_options_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
            """Re-render the four "calculated" charts for new overhead/cache settings.

            Returns a 4-tuple (tokens, tokens cost, stacked, cost breakdown).
            Every entry is None when no state is available yet or the
            "calculated" dataset is empty.
            """
            blank = (None, None, None, None)
            if state_data is None:
                return blank

            calculated = state_data["calculated"]
            if calculated.empty:
                return blank

            prices = (input_price, cache_read_price, cache_creation_price, completion_price)

            # Adjust a copy so the dataset stored in the state stays untouched.
            adjusted = apply_thinking_overhead(calculated.copy(), overhead)
            adjusted = adjusted if with_cache else apply_no_cache(adjusted)

            tokens_fig, tokens_cost_fig, stacked_fig = create_token_charts(adjusted, *prices)
            breakdown_fig = create_cost_breakdown(adjusted, *prices)

            return tokens_fig, tokens_cost_fig, stacked_fig, breakdown_fig

        # Component lists shared by the handlers registered below.
        calc_options_inputs = [
            trajectories_state,
            price_input,
            price_cache_read,
            price_cache_creation,
            price_completion,
            thinking_overhead,
            use_cache,
        ]
        calc_options_outputs = [
            plot_tokens_calc,
            plot_tokens_cost_calc,
            plot_stacked_calc,
            plot_cost_breakdown_calc,
        ]

        single_traj_inputs = [
            trajectories_state,
            single_traj_dropdown,
            price_input,
            price_cache_read,
            price_cache_creation,
            price_completion,
            thinking_overhead,
            use_cache,
        ]
        single_traj_outputs = [single_traj_plot, single_traj_cost_plot]

        # Picking a trajectory redraws its two plots.
        single_traj_dropdown.change(
            fn=on_single_traj_select,
            inputs=single_traj_inputs,
            outputs=single_traj_outputs,
        )

        # Same for the metadata-based view; it takes no overhead/cache options.
        single_traj_meta_dropdown.change(
            fn=on_single_traj_meta_select,
            inputs=[
                trajectories_state,
                single_traj_meta_dropdown,
                price_input,
                price_cache_read,
                price_cache_creation,
                price_completion,
            ],
            outputs=[single_traj_meta_plot, single_traj_meta_cost_plot],
        )

        # Positional inputs of run_routing: keep the order exactly as listed.
        routing_inputs = [
            trajectories_state,
            price_input, price_cache_read, price_cache_creation, price_completion,
            routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
            routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
            routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
            selected_strategy,
            weight_base, weight_model_1, weight_model_2, weight_model_3,
            k_model_1, k_model_2, k_model_3,
            slice_model_1, slice_model_2, slice_model_3,
            grep_model_1, grep_model_2, grep_model_3,
            resolved_model, unresolved_model,
            part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
            thinking_overhead, use_cache,
            detected_model,
        ]
        routing_outputs = [routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot]

        # Changing the thinking overhead or the cache toggle refreshes, one after
        # another: the calculated charts, the single-trajectory plots, and the
        # routing result.
        for calc_option in (thinking_overhead, use_cache):
            calc_option.change(
                fn=on_calc_options_change,
                inputs=calc_options_inputs,
                outputs=calc_options_outputs,
            ).then(
                fn=on_single_traj_select,
                inputs=single_traj_inputs,
                outputs=single_traj_outputs,
            ).then(
                fn=run_routing,
                inputs=routing_inputs,
                outputs=routing_outputs,
            )

    return app


if __name__ == "__main__":
    # Script entry point: force a leaderboard refresh first, then build the UI
    # and serve it with request queuing enabled.
    logging.info("Refreshing leaderboard data on startup...")
    load_or_download_leaderboard(force_refresh=True)

    app = build_app()
    # Gradio's queue() hands back the app itself, so launch() can be chained.
    app.queue().launch()