# Exported from the Hugging Face file viewer.
# Commit 10ece01: "Add input price and fix uncached input calculation" (IgorSlinko), 23.7 kB.
import json
import os
import subprocess
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
from src.download_swebench_leaderboard import download_leaderboard
# Local cache locations for all downloaded artifacts.
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"  # per-model trajectory dumps live under here
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
# Public S3 bucket holding the official SWE-bench "bash-only" experiment runs.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
# Upstream litellm price table (per-token prices keyed by model name).
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
# Process-lifetime memoization: the litellm price table and per-folder
# trajectory DataFrames (keyed by folder name).
_litellm_prices_cache = None
_trajectories_cache = {}
def get_litellm_prices() -> dict:
    """Return the litellm model-price table, cached in memory and on disk.

    Resolution order: in-process cache -> local JSON cache file -> network
    download (persisted to disk for next time). Any failure degrades to an
    empty dict rather than raising, since pricing is optional.
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache
    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE) as f:
                _litellm_prices_cache = json.load(f)
            return _litellm_prices_cache
        except (OSError, json.JSONDecodeError):
            # Corrupt or unreadable cache file: fall through to a fresh download
            # instead of crashing on startup.
            pass
    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        _litellm_prices_cache = response.json()
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as f:
            json.dump(_litellm_prices_cache, f)
    except Exception:
        # Best-effort: the UI still works without prices (fields show 0).
        _litellm_prices_cache = {}
    return _litellm_prices_cache
def get_model_prices(model_name: str) -> dict | None:
    """Look up the litellm price entry for *model_name*.

    Tries exact key candidates first (with/without provider prefixes), then
    falls back to a substring scan over every known model. Returns None when
    nothing matches or *model_name* is empty.
    """
    if not model_name:
        return None
    price_table = get_litellm_prices()
    bare = model_name.replace("anthropic/", "").replace("openai/", "")
    exact_candidates = (
        model_name,
        bare,
        f"anthropic/{bare}",
        f"openai/{bare}",
    )
    for candidate in exact_candidates:
        if candidate in price_table:
            return price_table[candidate]
    # Fuzzy fallback: first entry whose key contains the model name.
    return next(
        (entry for key, entry in price_table.items() if bare in key or model_name in key),
        None,
    )
def load_or_download_leaderboard():
    """Return the SWE-bench leaderboard JSON, downloading it on first use.

    The downloaded file is moved to LEADERBOARD_CACHE so subsequent calls
    read from disk only.
    """
    if not LEADERBOARD_CACHE.exists():
        filename = download_leaderboard(output_dir=str(DATA_DIR))
        # os.replace (not os.rename) overwrites the destination on every
        # platform; os.rename raises on Windows if the target exists.
        os.replace(filename, LEADERBOARD_CACHE)
    with open(LEADERBOARD_CACHE) as f:
        return json.load(f)
def get_bash_only_df():
    """Build the display DataFrame for the 'bash-only' leaderboard.

    Returns an empty DataFrame when that leaderboard is absent.
    """
    payload = load_or_download_leaderboard()
    bash_only = next(
        (lb for lb in payload.get("leaderboards", []) if lb["name"] == "bash-only"),
        None,
    )
    if not bash_only:
        return pd.DataFrame()
    records = [
        {
            "name": entry.get("name", ""),
            "date": entry.get("date", ""),
            "cost": round(entry.get("cost", 0), 2),
            "instance_cost": round(entry.get("instance_cost", 0), 4),
            "instance_calls": entry.get("instance_calls", 0),
            "folder": entry.get("folder", ""),
            "os_model": "โœ…" if entry.get("os_model") else "โŒ",
            "os_system": "โœ…" if entry.get("os_system") else "โŒ",
        }
        for entry in bash_only["results"]
    ]
    return pd.DataFrame(records)
def get_model_details(folder: str):
    """Return (model_dict, None) for *folder*, or (None, error_message) on failure."""
    if not folder:
        return None, "Select a model from the table"
    leaderboards = load_or_download_leaderboard().get("leaderboards", [])
    bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
    if not bash_only:
        return None, "Leaderboard not found"
    for entry in bash_only["results"]:
        if entry.get("folder") == folder:
            return entry, None
    return None, f"Model with folder '{folder}' not found"
def check_trajectories_downloaded(folder: str) -> bool:
    """Return True when at least one trajectory file for *folder* exists locally."""
    if not folder:
        return False
    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    return any(target.iterdir())
def _count_traj_files(output_dir) -> int:
    """Count trajectory files, preferring the nested '*/*.traj.json' layout."""
    count = len(list(output_dir.glob("*/*.traj.json")))
    if count == 0:
        count = len(list(output_dir.glob("*.json")))
    return count
def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download a model's trajectory files from the public S3 bucket.

    Returns a (status_message, analyze_button_update) pair for the UI.
    Skips the download when files are already present locally.
    """
    if not folder:
        return "โŒ No model selected", gr.update(visible=False)
    model, error = get_model_details(folder)
    if error:
        return f"โŒ {error}", gr.update(visible=False)
    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_traj_files(output_dir)
        return f"โœ… Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)
    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)
    progress(0, desc="Starting S3 download...")
    try:
        # --no-sign-request: the bucket is public, no AWS credentials needed.
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if result.returncode != 0:
            return f"โŒ S3 download failed:\n{result.stderr}", gr.update(visible=False)
        file_count = _count_traj_files(output_dir)
        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)
        # Guard: per_instance_details may be missing or empty; without this the
        # percentage computation raises ZeroDivisionError.
        pct = f" ({100*resolved_count/total_count:.1f}%)" if total_count else ""
        status = f"โœ… Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count}{pct}"
        return status, gr.update(visible=True)
    except subprocess.TimeoutExpired:
        return "โŒ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        return "โŒ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"โŒ Error: {e}", gr.update(visible=False)
def parse_trajectory(traj_path: Path) -> dict:
    """Parse one trajectory JSON file into a flat dict of per-instance stats.

    Token counts are summed over every message that carries a usage record,
    either directly under 'usage' or nested at extra.response.usage.
    """
    with open(traj_path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)
    info = payload.get("info", {})
    stats = info.get("model_stats", {})
    model_cfg = info.get("config", {}).get("model", {})
    # Prefer the override name used for cost calculation when present.
    model_name = model_cfg.get("cost_calc_model_override", model_cfg.get("model_name", ""))
    parsed = {
        "instance_id": payload.get("instance_id", traj_path.stem),
        "model_name": model_name,
        "api_calls": stats.get("api_calls", 0),
        "instance_cost": stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }
    # (destination key in `parsed`, source key in the usage record)
    token_fields = (
        ("prompt_tokens", "prompt_tokens"),
        ("completion_tokens", "completion_tokens"),
        ("total_tokens", "total_tokens"),
        ("cache_read_tokens", "cache_read_input_tokens"),
        ("cache_creation_tokens", "cache_creation_input_tokens"),
    )
    for msg in payload.get("messages", []):
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif isinstance(msg.get("extra"), dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})
        if usage:
            for dest, src in token_fields:
                # `or 0` defends against explicit nulls in the usage record.
                parsed[dest] += usage.get(src, 0) or 0
    return parsed
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every downloaded trajectory for *folder* into a DataFrame (memoized)."""
    global _trajectories_cache
    cached = _trajectories_cache.get(folder)
    if cached is not None:
        return cached
    base = TRAJS_DIR / folder
    # Try layouts from most to least specific; stop at the first that matches.
    traj_files = []
    for pattern in ("*/*.traj.json", "*.traj.json", "*.json"):
        traj_files = list(base.glob(pattern))
        if traj_files:
            break
    records = []
    for path in traj_files:
        try:
            records.append(parse_trajectory(path))
        except Exception as exc:
            print(f"Error parsing {path}: {exc}")
    frame = pd.DataFrame(records)
    _trajectories_cache[folder] = frame
    return frame
def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the five overview figures for a set of parsed trajectories.

    Prices are in $ per 1M tokens. Returns a 5-tuple of plotly figures:
    (steps histogram, cost histogram, tokens-by-type bar, cost-by-type bar,
    stacked per-instance token chart), or five Nones when *df* is empty.
    """
    if df.empty:
        return None, None, None, None, None
    # Histogram: API calls (steps) per instance.
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        title="Distribution of API Calls (Steps) per Instance",
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_steps.add_annotation(
        text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Histogram: reported dollar cost per instance.
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        title="Distribution of Cost per Instance ($)",
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_cost.add_annotation(
        text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Aggregate token counts across all instances.
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    # Uncached input = prompt - cache_read - cache_creation (per instance, then sum).
    # Clipped at 0 so inconsistent usage records cannot produce negative counts.
    df_temp = df.copy()
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()
    # Bar chart: total tokens split by billing category.
    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens": [total_uncached_input, total_cache_read, total_cache_creation, total_completion],
    })
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Total Tokens",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    total_all = token_data["Total Tokens"].sum()
    fig_tokens.add_annotation(
        text=f"Total: {total_all:,.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Cost by token type (prices are $ per 1M tokens, hence the / 1e6).
    cost_uncached_input = total_uncached_input * input_price / 1e6
    cost_cache_read = total_cache_read * cache_read_price / 1e6
    cost_cache_creation = total_cache_creation * cache_creation_price / 1e6
    cost_completion = total_completion * completion_price / 1e6
    cost_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Cost ($)": [cost_uncached_input, cost_cache_read, cost_cache_creation, cost_completion],
    })
    fig_tokens_cost = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens_cost.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    total_cost = cost_uncached_input + cost_cache_read + cost_cache_creation + cost_completion
    fig_tokens_cost.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Stacked per-instance token chart, sorted by cache-read volume so heavy
    # cache users appear on the left.
    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    df_sorted["instance_idx"] = range(len(df_sorted))
    # Uncached input = prompt - cache_read - cache_creation
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
    fig_stacked = go.Figure()
    fig_stacked.add_trace(go.Bar(
        name="Uncached Input",
        x=df_sorted["instance_idx"],
        y=df_sorted["uncached_input_tokens"],
        marker_color="#EF553B",
        hovertemplate="Instance: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Read",
        x=df_sorted["instance_idx"],
        y=df_sorted["cache_read_tokens"],
        marker_color="#19D3F3",
        hovertemplate="Instance: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Creation",
        x=df_sorted["instance_idx"],
        y=df_sorted["cache_creation_tokens"],
        marker_color="#FFA15A",
        hovertemplate="Instance: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Completion",
        x=df_sorted["instance_idx"],
        y=df_sorted["completion_tokens"],
        marker_color="#AB63FA",
        hovertemplate="Instance: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
    ))
    fig_stacked.update_layout(
        barmode="stack",
        title="Billable Tokens per Instance (stacked)",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Tokens",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Stacked per-instance cost chart, split by token type.

    Prices are $ per 1M tokens. Returns None when *df* is empty.
    """
    if df.empty:
        return None
    d = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    d["instance_idx"] = range(len(d))
    # Uncached input = prompt - cache_read - cache_creation (never negative).
    d["uncached_input_tokens"] = (
        d["prompt_tokens"] - d["cache_read_tokens"] - d["cache_creation_tokens"]
    ).clip(lower=0)
    # (legend label, token column, derived cost column, $/1M price, bar color)
    segments = (
        ("Uncached Input", "uncached_input_tokens", "cost_uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read_tokens", "cost_cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "cost_cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion_tokens", "cost_completion", completion_price, "#AB63FA"),
    )
    fig = go.Figure()
    total_cost = 0.0
    for label, token_col, cost_col, price, color in segments:
        d[cost_col] = d[token_col] * price / 1e6
        total_cost += d[cost_col].sum()
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=d["instance_idx"],
            y=d[cost_col],
            marker_color=color,
            hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))
    fig.update_layout(
        barmode="stack",
        title="Cost Breakdown per Instance",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )
    return fig
def extract_model_from_folder(folder: str) -> str:
    """Return the model portion of a run folder name.

    Folder names look like '<date>_<agent>_<model...>', e.g.
    '20251124_mini-v1.16.0_claude-opus-4-5-20251101'; everything after the
    second underscore is the model. Names without that shape pass through
    unchanged, and an empty name yields "".
    """
    if not folder:
        return ""
    pieces = folder.split("_", 2)
    return pieces[2] if len(pieces) == 3 else folder
def get_prices_for_folder(folder: str) -> tuple[float, float, float, float, str]:
    """Get prices from litellm based on the folder name.

    Returns ($/1M input, cache_read, cache_creation, completion, model_name).
    All prices are 0 when the model cannot be resolved or has no price entry.
    """
    model_hint = extract_model_from_folder(folder)
    if not model_hint:
        return 0, 0, 0, 0, ""
    prices = get_model_prices(model_hint)
    if prices:
        # litellm entries may omit a key entirely OR carry an explicit null;
        # `or 0` covers both so we never compute None * 1e6.
        input_price = (prices.get("input_cost_per_token") or 0) * 1e6
        cache_read = (prices.get("cache_read_input_token_cost") or 0) * 1e6
        cache_creation = (prices.get("cache_creation_input_token_cost") or 0) * 1e6
        completion = (prices.get("output_cost_per_token") or 0) * 1e6
        return input_price, cache_read, cache_creation, completion, model_hint
    return 0, 0, 0, 0, model_hint
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """React to a leaderboard row click.

    Returns updates for: selected folder/name, download button, analyze
    button visibility, the four price fields (prefilled from litellm when
    known), and the detected-model textbox.
    """
    def _price_field(amount, label):
        # โœ… when litellm knew a positive price, โŒ when it fell back to 0.
        marker = "โœ…" if amount > 0 else "โŒ"
        return gr.update(value=amount, label=f"{marker} {label}")

    if evt.index is None:
        # Nothing selected: clear everything and disable the actions.
        blank_prices = [
            gr.update(value=0, label=f"๐Ÿ’ฒ {label}")
            for label in ("Input", "Cache Read", "Cache Creation", "Completion")
        ]
        return ("", "", gr.update(interactive=False), gr.update(visible=False), *blank_prices, "")

    idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    record = df.iloc[idx]
    folder = record["folder"]
    display_name = record["name"]
    input_price, cache_read, cache_creation, completion, model_hint = get_prices_for_folder(folder)
    return (
        folder,
        display_name,
        gr.update(interactive=True),
        gr.update(visible=check_trajectories_downloaded(folder)),
        _price_field(input_price, "Input"),
        _price_field(cache_read, "Cache Read"),
        _price_field(cache_creation, "Cache Creation"),
        _price_field(completion, "Completion"),
        model_hint,
    )
def build_app():
    """Assemble the Gradio Blocks UI and wire up all event handlers."""
    leaderboard_df = get_bash_only_df()
    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        # NOTE(review): never read or written below — appears unused.
        trajectories_state = gr.State(None)
        gr.Markdown("# ๐Ÿงฎ SWE-bench Bash-Only Leaderboard")
        gr.Markdown("Select a model to use as base for cost analysis")
        with gr.Row():
            with gr.Column(scale=3):
                leaderboard_table = gr.Dataframe(
                    value=leaderboard_df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                )
                # Hidden until "Load & Analyze" is pressed.
                with gr.Column(visible=False) as analysis_section:
                    gr.Markdown("## ๐Ÿ“Š Trajectory Analysis")
                    with gr.Row():
                        plot_steps = gr.Plot(label="API Calls Distribution")
                        plot_cost = gr.Plot(label="Cost Distribution")
                    with gr.Row():
                        plot_tokens = gr.Plot(label="Token Usage by Type")
                        plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
                    with gr.Row():
                        plot_stacked = gr.Plot(label="Billable Tokens per Instance")
                    with gr.Row():
                        plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")
            # Sidebar: selection details, download controls, price inputs.
            with gr.Column(scale=1):
                selected_folder = gr.State("")
                gr.Markdown("### Selected Model")
                selected_name = gr.Textbox(label="Model Name", interactive=False)
                download_btn = gr.Button("๐Ÿ“ฅ Download Trajectories", interactive=False)
                download_status = gr.Textbox(label="Status", interactive=False, lines=3)
                analyze_btn = gr.Button("๐Ÿ“Š Load & Analyze", visible=False, variant="primary")
                gr.Markdown("---")
                gr.Markdown("### ๐Ÿ’ฐ Token Prices ($/1M) ยท *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
                detected_model = gr.Textbox(label="Detected Model", interactive=False)
                price_input = gr.Number(label="๐Ÿ’ฒ Input", value=0, precision=2)
                price_cache_read = gr.Number(label="๐Ÿ’ฒ Cache Read", value=0, precision=2)
                price_cache_creation = gr.Number(label="๐Ÿ’ฒ Cache Creation", value=0, precision=2)
                price_completion = gr.Number(label="๐Ÿ’ฒ Completion", value=0, precision=2)
        # Row click: remember the folder, enable buttons, prefill litellm prices.
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model],
        )
        download_btn.click(
            fn=download_trajectories_from_s3,
            inputs=[selected_folder],
            outputs=[download_status, analyze_btn],
        )
        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price):
            """Generator handler: first yield reveals the (empty) analysis
            section for immediate feedback, second yield fills in the plots."""
            empty_result = (
                gr.update(visible=False),
                None, None, None, None, None, None,
            )
            if not folder:
                yield empty_result
                return
            # Show the section right away while trajectories are parsed.
            yield (
                gr.update(visible=True),
                None, None, None, None, None, None,
            )
            df = load_all_trajectories(folder)
            if df.empty:
                yield empty_result
                return
            fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
                df, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
            yield (
                gr.update(visible=True),
                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
            )
        analyze_btn.click(
            fn=load_and_analyze,
            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion],
            outputs=[
                analysis_section,
                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
            ],
        )
    return app
if __name__ == "__main__":
    # Build the UI, enable request queuing, and start the local server.
    demo = build_app()
    demo.queue()
    demo.launch()