Spaces:

JetBrains-Research
/

SWE-bench-Costs-Calculator

Sleeping

File size: 23,707 Bytes

import json
import os
import subprocess
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests

from src.download_swebench_leaderboard import download_leaderboard

# Root directory for everything this app stores on disk.
DATA_DIR = Path("data")
# Downloaded trajectories live in one sub-directory per leaderboard "folder".
TRAJS_DIR = DATA_DIR / "swebench_trajs"
# On-disk copy of the leaderboard JSON; read instead of re-downloading when present.
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
# On-disk copy of the litellm price table; read instead of re-downloading when present.
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
# S3 prefix the trajectories are copied from (accessed with --no-sign-request).
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
# Source of the litellm per-model price table.
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"

# In-process memo of the price table; None means "not loaded yet".
_litellm_prices_cache = None
# In-process memo of parsed trajectories, keyed by leaderboard folder name.
_trajectories_cache = {}


def get_litellm_prices() -> dict:
    """Return the litellm model-price table, loading it at most once per process.

    Lookup order:
      1. the in-memory memo ``_litellm_prices_cache``;
      2. the on-disk cache ``LITELLM_PRICES_CACHE`` (ignored if unreadable or
         not a JSON object);
      3. a download from ``LITELLM_PRICES_URL``.

    A failed download yields an empty dict, which is memoized so the network
    is not retried on every call. A failure to *write* the disk cache is only
    reported; the freshly downloaded prices are still returned.

    Returns:
        Mapping of model key to its price entry, or ``{}`` when no prices
        could be obtained.
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE, encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # A truncated/corrupt cache must not take the app down; re-download.
            print(f"Ignoring unreadable price cache {LITELLM_PRICES_CACHE}: {e}")
            cached = None
        if isinstance(cached, dict):
            _litellm_prices_cache = cached
            return _litellm_prices_cache

    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        prices = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Could not download litellm prices: {e}")
        _litellm_prices_cache = {}
        return _litellm_prices_cache

    if not isinstance(prices, dict):
        print("Unexpected litellm price payload (not a JSON object); ignoring it")
        _litellm_prices_cache = {}
        return _litellm_prices_cache

    _litellm_prices_cache = prices

    # Persisting is best-effort: a read-only disk must not discard the prices
    # that were just downloaded successfully.
    try:
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w", encoding="utf-8") as f:
            json.dump(prices, f)
    except OSError as e:
        print(f"Could not write price cache {LITELLM_PRICES_CACHE}: {e}")

    return _litellm_prices_cache


def get_model_prices(model_name: str) -> dict | None:
    if not model_name:
        return None

    prices = get_litellm_prices()

    clean_name = model_name.replace("anthropic/", "").replace("openai/", "")

    candidates = [
        model_name,
        clean_name,
        f"anthropic/{clean_name}",
        f"openai/{clean_name}",
    ]

    for key in candidates:
        if key in prices:
            return prices[key]

    for key, value in prices.items():
        if clean_name in key or model_name in key:
            return value

    return None


def load_or_download_leaderboard():
    """Return the parsed leaderboard JSON, downloading it first if not cached.

    When ``LEADERBOARD_CACHE`` is missing, ``download_leaderboard`` is asked to
    write into ``DATA_DIR`` and the file it reports is renamed to the cache path.
    """
    if not LEADERBOARD_CACHE.exists():
        downloaded = download_leaderboard(output_dir=str(DATA_DIR))
        os.rename(downloaded, LEADERBOARD_CACHE)

    with open(LEADERBOARD_CACHE) as fh:
        return json.load(fh)


def get_bash_only_df():
    """Build the table shown in the UI from the "bash-only" leaderboard.

    Returns:
        A DataFrame with one row per leaderboard result (columns: name, date,
        cost, instance_cost, instance_calls, folder, os_model, os_system), or
        an empty DataFrame when the "bash-only" leaderboard is absent.
    """
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    # .get() so that an entry without a "name" is skipped instead of raising.
    bash_only = next((lb for lb in leaderboards if lb.get("name") == "bash-only"), None)

    if not bash_only:
        return pd.DataFrame()

    def flag(value) -> str:
        return "✅" if value else "❌"

    rows = [
        {
            "name": r.get("name", ""),
            "date": r.get("date", ""),
            # `or 0`: the key may be present with an explicit null, which
            # round() cannot handle.
            "cost": round(r.get("cost") or 0, 2),
            "instance_cost": round(r.get("instance_cost") or 0, 4),
            "instance_calls": r.get("instance_calls", 0),
            "folder": r.get("folder", ""),
            "os_model": flag(r.get("os_model")),
            "os_system": flag(r.get("os_system")),
        }
        for r in bash_only.get("results") or []
    ]

    return pd.DataFrame(rows)


def get_model_details(folder: str):
    """Find the "bash-only" leaderboard entry whose ``folder`` equals *folder*.

    Returns:
        ``(entry, None)`` on success, otherwise ``(None, message)`` where
        *message* explains what is missing.
    """
    if not folder:
        return None, "Select a model from the table"

    boards = load_or_download_leaderboard().get("leaderboards", [])

    bash_only = None
    for board in boards:
        if board["name"] == "bash-only":
            bash_only = board
            break
    if not bash_only:
        return None, "Leaderboard not found"

    for entry in bash_only["results"]:
        if entry.get("folder") == folder:
            return entry, None

    return None, f"Model with folder '{folder}' not found"


def check_trajectories_downloaded(folder: str) -> bool:
    """Tell whether ``TRAJS_DIR / folder`` exists and contains at least one entry."""
    if not folder:
        return False

    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    # any() stops at the first directory entry, so the listing is not exhausted.
    return any(target.iterdir())


def _count_trajectory_files(output_dir: Path) -> int:
    """Count trajectory files in *output_dir*: nested ``*/*.traj.json`` first, flat ``*.json`` as fallback."""
    count = len(list(output_dir.glob("*/*.traj.json")))
    if count == 0:
        count = len(list(output_dir.glob("*.json")))
    return count


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Copy a model's trajectories from S3 into ``TRAJS_DIR / folder``.

    Runs ``aws s3 cp --recursive ... --no-sign-request`` with a 10 minute
    timeout. Nothing is downloaded when the target directory already has
    content.

    Args:
        folder: Leaderboard folder name of the selected model.
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        ``(status_text, update)`` where *update* toggles the visibility of the
        "Load & Analyze" button: visible after a successful or pre-existing
        download, hidden on any failure. This function never raises; every
        error is reported through *status_text*.
    """
    if not folder:
        return "❌ No model selected", gr.update(visible=False)

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}", gr.update(visible=False)

    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)

    progress(0, desc="Starting S3 download...")

    try:
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )

        if result.returncode != 0:
            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)

        file_count = _count_trajectory_files(output_dir)

        per_instance = model.get("per_instance_details") or {}
        resolved_count = sum(
            1 for v in per_instance.values() if isinstance(v, dict) and v.get("resolved")
        )
        total_count = len(per_instance)

        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files"
        # Only report the resolve rate when there is something to divide by;
        # otherwise a successful download would be reported as a
        # "division by zero" error by the handler below.
        if total_count:
            status += f"\nResolved: {resolved_count}/{total_count} ({100*resolved_count/total_count:.1f}%)"
        return status, gr.update(visible=True)

    except subprocess.TimeoutExpired:
        return "❌ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        # subprocess raises this when the `aws` executable is not on PATH.
        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        # UI boundary: surface anything unexpected as a status message.
        return f"❌ Error: {e}", gr.update(visible=False)


def parse_trajectory(traj_path: Path) -> dict:
    """Summarize one trajectory JSON file.

    Reads run metadata from ``info`` and sums the token usage attached to each
    message. A message's usage is taken from ``msg["usage"]`` when that key is
    present, otherwise from ``msg["extra"]["response"]["usage"]``.

    Missing, null or wrongly-typed optional sections (``info``, ``model_stats``,
    ``config``, ``messages``, a message's ``usage``) are treated as absent
    rather than aborting the whole file.

    Args:
        traj_path: Path to the trajectory JSON file (UTF-8).

    Returns:
        Dict with keys ``instance_id``, ``model_name``, ``api_calls``,
        ``instance_cost``, ``prompt_tokens``, ``completion_tokens``,
        ``total_tokens``, ``cache_read_tokens`` and ``cache_creation_tokens``.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the file is not valid JSON.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    def as_dict(value) -> dict:
        # JSON null (or any non-object) is handled like a missing section.
        return value if isinstance(value, dict) else {}

    info = as_dict(data.get("info"))
    model_stats = as_dict(info.get("model_stats"))
    model_config = as_dict(as_dict(info.get("config")).get("model"))
    # `or` (not a .get default) so that an explicit null/empty override
    # still falls back to the configured model name.
    model_name = model_config.get("cost_calc_model_override") or model_config.get("model_name") or ""

    result = {
        "instance_id": data.get("instance_id", traj_path.stem),
        "model_name": model_name,
        "api_calls": model_stats.get("api_calls") or 0,
        "instance_cost": model_stats.get("instance_cost") or 0,
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    # (key in result, key in the usage record)
    token_fields = (
        ("prompt_tokens", "prompt_tokens"),
        ("completion_tokens", "completion_tokens"),
        ("total_tokens", "total_tokens"),
        ("cache_read_tokens", "cache_read_input_tokens"),
        ("cache_creation_tokens", "cache_creation_input_tokens"),
    )

    for msg in data.get("messages") or []:
        if not isinstance(msg, dict):
            continue

        if "usage" in msg:
            usage = msg["usage"]
        else:
            usage = as_dict(as_dict(msg.get("extra")).get("response")).get("usage")

        if not isinstance(usage, dict):
            continue

        for result_key, usage_key in token_fields:
            result[result_key] += usage.get(usage_key) or 0

    return result


def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory file of *folder* into one DataFrame (one row per file).

    Files are searched under ``TRAJS_DIR / folder`` using the first pattern
    that matches anything: ``*/*.traj.json``, then ``*.traj.json``, then
    ``*.json``. Files that fail to parse are reported on stdout and skipped.

    Non-empty results are memoized per folder in ``_trajectories_cache``.
    Empty results are deliberately *not* memoized, so that trajectories
    downloaded later in the same process are picked up on the next call.
    """
    cached = _trajectories_cache.get(folder)
    if cached is not None:
        return cached

    output_dir = TRAJS_DIR / folder

    traj_files = []
    for pattern in ("*/*.traj.json", "*.traj.json", "*.json"):
        # sorted() makes the row order independent of directory listing order.
        traj_files = sorted(output_dir.glob(pattern))
        if traj_files:
            break

    rows = []
    for traj_path in traj_files:
        try:
            rows.append(parse_trajectory(traj_path))
        except Exception as e:
            # Best-effort: one bad file must not prevent analysing the rest.
            print(f"Error parsing {traj_path}: {e}")

    df = pd.DataFrame(rows)
    if not df.empty:
        _trajectories_cache[folder] = df
    return df


def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the five overview figures for a trajectories DataFrame.

    Prices are in dollars per one million tokens. "Uncached input" is
    ``prompt_tokens - cache_read_tokens - cache_creation_tokens`` per
    instance, floored at zero.

    Returns:
        ``(fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked)``,
        or five ``None`` values when *df* is empty.
    """
    if df.empty:
        return None, None, None, None, None

    # Pieces shared by several figures.
    compact_margin = dict(l=40, r=20, t=40, b=40)
    corner_note = dict(xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=12))
    type_labels = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    type_colors = ["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"]

    # --- Histogram: API calls per instance ---------------------------------
    calls = df["api_calls"]
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        title="Distribution of API Calls (Steps) per Instance",
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=compact_margin,
    )
    fig_steps.add_annotation(
        text=f"Mean: {calls.mean():.1f} | Median: {calls.median():.0f}",
        **corner_note,
    )

    # --- Histogram: recorded cost per instance -----------------------------
    recorded_cost = df["instance_cost"]
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        title="Distribution of Cost per Instance ($)",
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=compact_margin,
    )
    fig_cost.add_annotation(
        text=f"Mean: ${recorded_cost.mean():.4f} | Total: ${recorded_cost.sum():.2f}",
        **corner_note,
    )

    # --- Totals per token type (same order as type_labels) -----------------
    uncached = (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]).clip(lower=0)
    token_totals = [
        uncached.sum(),
        df["cache_read_tokens"].sum(),
        df["cache_creation_tokens"].sum(),
        df["completion_tokens"].sum(),
    ]

    token_data = pd.DataFrame({"Token Type": type_labels, "Total Tokens": token_totals})
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=type_colors,
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Total Tokens",
        showlegend=False,
        margin=compact_margin,
    )
    total_all = token_data["Total Tokens"].sum()
    fig_tokens.add_annotation(text=f"Total: {total_all:,.0f}", **corner_note)

    # --- Cost per token type ------------------------------------------------
    unit_prices = [input_price, cache_read_price, cache_creation_price, completion_price]
    type_costs = [count * price / 1e6 for count, price in zip(token_totals, unit_prices)]

    cost_data = pd.DataFrame({"Token Type": type_labels, "Cost ($)": type_costs})
    fig_tokens_cost = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=type_colors,
    )
    fig_tokens_cost.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=compact_margin,
    )
    total_cost = type_costs[0] + type_costs[1] + type_costs[2] + type_costs[3]
    fig_tokens_cost.add_annotation(text=f"Total: ${total_cost:.2f}", **corner_note)

    # --- Stacked bars: tokens per instance, heaviest cache readers first ----
    ranked = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    ranked["instance_idx"] = range(len(ranked))
    ranked["uncached_input_tokens"] = (
        ranked["prompt_tokens"] - ranked["cache_read_tokens"] - ranked["cache_creation_tokens"]
    ).clip(lower=0)

    token_columns = ["uncached_input_tokens", "cache_read_tokens", "cache_creation_tokens", "completion_tokens"]

    fig_stacked = go.Figure()
    for label, column, color in zip(type_labels, token_columns, type_colors):
        fig_stacked.add_trace(go.Bar(
            name=label,
            x=ranked["instance_idx"],
            y=ranked[column],
            marker_color=color,
            hovertemplate=f"Instance: %{{x}}<br>{label}: %{{y:,.0f}}<extra></extra>",
        ))

    fig_stacked.update_layout(
        barmode="stack",
        title="Billable Tokens per Instance (stacked)",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Tokens",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )

    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked


def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build a stacked bar chart of per-instance cost, split by token type.

    Prices are in dollars per one million tokens. Instances are ordered by
    descending ``cache_read_tokens``. Returns ``None`` when *df* is empty.
    """
    if df.empty:
        return None

    ranked = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    ranked["instance_idx"] = range(len(ranked))

    # Input tokens that were neither served from nor written to the cache.
    ranked["uncached_input_tokens"] = (
        ranked["prompt_tokens"] - ranked["cache_read_tokens"] - ranked["cache_creation_tokens"]
    ).clip(lower=0)

    # (legend label, token column, cost column, $/1M price, bar colour)
    components = [
        ("Uncached Input", "uncached_input_tokens", "cost_uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read_tokens", "cost_cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "cost_cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion_tokens", "cost_completion", completion_price, "#AB63FA"),
    ]

    for _, token_col, cost_col, price, _ in components:
        ranked[cost_col] = ranked[token_col] * price / 1e6

    fig = go.Figure()
    for label, _, cost_col, price, color in components:
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=ranked["instance_idx"],
            y=ranked[cost_col],
            marker_color=color,
            hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))

    column_totals = [ranked[cost_col].sum() for _, _, cost_col, _, _ in components]
    total_cost = column_totals[0] + column_totals[1] + column_totals[2] + column_totals[3]

    fig.update_layout(
        barmode="stack",
        title="Cost Breakdown per Instance",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )

    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )

    return fig


def extract_model_from_folder(folder: str) -> str:
    """Return the model part of a folder name.

    Everything after the second underscore is the model name, e.g.
    '20251124_mini-v1.16.0_claude-opus-4-5-20251101' -> 'claude-opus-4-5-20251101'.
    A name with fewer than three underscore-separated parts is returned
    unchanged; an empty/None folder yields ''.
    """
    if not folder:
        return ""

    # maxsplit=2 keeps any further underscores inside the third piece.
    pieces = folder.split("_", 2)
    return pieces[2] if len(pieces) == 3 else folder


def get_prices_for_folder(folder: str) -> tuple[float, float, float, float, str]:
    """Resolve $/1M-token prices for the model encoded in *folder*.

    Returns ``(input, cache_read, cache_creation, completion, model_name)``.
    Per-token prices from the litellm entry are scaled by 1e6; a price field
    missing from the entry counts as 0. All four prices are 0 when no entry
    is found, and the model name is '' when none can be extracted.
    """
    model_hint = extract_model_from_folder(folder)
    if not model_hint:
        return 0, 0, 0, 0, ""

    entry = get_model_prices(model_hint)
    if not entry:
        return 0, 0, 0, 0, model_hint

    per_token_fields = (
        "input_cost_per_token",
        "cache_read_input_token_cost",
        "cache_creation_input_token_cost",
        "output_cost_per_token",
    )
    per_million = tuple(entry.get(field, 0) * 1e6 for field in per_token_fields)
    return (*per_million, model_hint)


def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a click on a leaderboard row.

    Returns a 9-tuple of values/updates: folder, model name, download button
    state, analyze button visibility, the four price fields (input, cache
    read, cache creation, completion) and the detected model name. A price
    field is labelled ✅ when a positive price was found and ❌ otherwise.
    """
    price_fields = ("Input", "Cache Read", "Cache Creation", "Completion")

    # No selection: reset everything to its initial state.
    if evt.index is None:
        return (
            "", "",
            gr.update(interactive=False),
            gr.update(visible=False),
            *(gr.update(value=0, label=f"💲 {field}") for field in price_fields),
            "",
        )

    # The index may be a (row, column) pair or a bare row number.
    selected = evt.index
    if isinstance(selected, (list, tuple)):
        selected = selected[0]

    record = df.iloc[selected]
    folder = record["folder"]
    name = record["name"]

    show_analyze = check_trajectories_downloaded(folder)
    *price_values, model_hint = get_prices_for_folder(folder)

    def labelled(value, field):
        marker = "✅" if value > 0 else "❌"
        return gr.update(value=value, label=f"{marker} {field}")

    return (
        folder, name,
        gr.update(interactive=True),
        gr.update(visible=show_analyze),
        *(labelled(value, field) for value, field in zip(price_values, price_fields)),
        model_hint,
    )


def build_app():
    """Assemble the Gradio Blocks UI and wire its event handlers.

    Layout: a wide left column with the leaderboard table and the (initially
    hidden) analysis plots, and a narrow right column with the selected model,
    the download/analyze buttons and the editable token prices.

    Returns:
        The constructed ``gr.Blocks`` app (not yet launched).
    """
    # Loaded once when the UI is built; the table is not refreshed afterwards.
    leaderboard_df = get_bash_only_df()

    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        # NOTE(review): not referenced by any handler in this function.
        trajectories_state = gr.State(None)

        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard")
        gr.Markdown("Select a model to use as base for cost analysis")

        with gr.Row():
            # Left column: leaderboard table plus the analysis plots.
            with gr.Column(scale=3):
                leaderboard_table = gr.Dataframe(
                    value=leaderboard_df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                )

                # Hidden until load_and_analyze makes it visible.
                with gr.Column(visible=False) as analysis_section:
                    gr.Markdown("## 📊 Trajectory Analysis")

                    with gr.Row():
                        plot_steps = gr.Plot(label="API Calls Distribution")
                        plot_cost = gr.Plot(label="Cost Distribution")

                    with gr.Row():
                        plot_tokens = gr.Plot(label="Token Usage by Type")
                        plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")

                    with gr.Row():
                        plot_stacked = gr.Plot(label="Billable Tokens per Instance")

                    with gr.Row():
                        plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")

            # Right column: selection, actions and prices.
            with gr.Column(scale=1):
                # Folder name of the selected row; set by on_row_select.
                selected_folder = gr.State("")
                gr.Markdown("### Selected Model")
                selected_name = gr.Textbox(label="Model Name", interactive=False)

                # Enabled by on_row_select once a row is selected.
                download_btn = gr.Button("📥 Download Trajectories", interactive=False)
                download_status = gr.Textbox(label="Status", interactive=False, lines=3)

                # Shown only when trajectories for the selected model are on disk.
                analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary")

                gr.Markdown("---")
                gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
                detected_model = gr.Textbox(label="Detected Model", interactive=False)
                # Filled in by on_row_select; the values are passed to load_and_analyze.
                price_input = gr.Number(label="💲 Input", value=0, precision=2)
                price_cache_read = gr.Number(label="💲 Cache Read", value=0, precision=2)
                price_cache_creation = gr.Number(label="💲 Cache Creation", value=0, precision=2)
                price_completion = gr.Number(label="💲 Completion", value=0, precision=2)

        # The outputs order must match the 9-tuple returned by on_row_select.
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model],
        )

        download_btn.click(
            fn=download_trajectories_from_s3,
            inputs=[selected_folder],
            outputs=[download_status, analyze_btn],
        )

        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price):
            """Generator handler: reveal the analysis section, then fill in the plots.

            Each yielded 7-tuple is (analysis_section update, six figures) in
            the order of the ``outputs`` list of ``analyze_btn.click`` below.
            """
            # Hides the section and clears all six plots.
            empty_result = (
                gr.update(visible=False),
                None, None, None, None, None, None,
            )

            if not folder:
                yield empty_result
                return

            # First update: show the section with empty plots while data loads.
            yield (
                gr.update(visible=True),
                None, None, None, None, None, None,
            )

            df = load_all_trajectories(folder)
            if df.empty:
                yield empty_result
                return

            fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
                df, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)

            # Final update: the populated figures.
            yield (
                gr.update(visible=True),
                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
            )

        analyze_btn.click(
            fn=load_and_analyze,
            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion],
            outputs=[
                analysis_section,
                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
            ],
        )

    return app


# Script entry point: build the UI, enable Gradio's queue, then launch the app.
if __name__ == "__main__":
    app = build_app()
    app.queue()
    app.launch()