# Exported from the Hugging Face file viewer.
# Commit 10ece01: "Add input price and fix uncached input calculation" (IgorSlinko), 23.7 kB.
import json
import os
import subprocess
from pathlib import Path
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
from src.download_swebench_leaderboard import download_leaderboard
# Local cache locations for all downloaded artifacts.
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"  # per-model trajectory dumps live under here
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
# Public S3 bucket holding the official SWE-bench "bash-only" experiment runs.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
# Upstream litellm price table (per-token prices keyed by model name).
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
# Process-lifetime memoization: the litellm price table and per-folder
# trajectory DataFrames (keyed by folder name).
_litellm_prices_cache = None
_trajectories_cache = {}
def get_litellm_prices() -> dict:
    """Return the litellm model-price table, cached in memory and on disk.

    Resolution order: in-process cache -> local JSON cache file -> network
    download (persisted to disk for next time). Any failure degrades to an
    empty dict rather than raising, since pricing is optional.
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache
    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE) as f:
                _litellm_prices_cache = json.load(f)
            return _litellm_prices_cache
        except (OSError, json.JSONDecodeError):
            # Corrupt or unreadable cache file: fall through to a fresh download
            # instead of crashing on startup.
            pass
    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        _litellm_prices_cache = response.json()
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as f:
            json.dump(_litellm_prices_cache, f)
    except Exception:
        # Best-effort: the UI still works without prices (fields show 0).
        _litellm_prices_cache = {}
    return _litellm_prices_cache
def get_model_prices(model_name: str) -> dict | None:
    """Look up the litellm price entry for *model_name*.

    Tries exact key candidates first (with/without provider prefixes), then
    falls back to a substring scan over every known model. Returns None when
    nothing matches or *model_name* is empty.
    """
    if not model_name:
        return None
    price_table = get_litellm_prices()
    bare = model_name.replace("anthropic/", "").replace("openai/", "")
    exact_candidates = (
        model_name,
        bare,
        f"anthropic/{bare}",
        f"openai/{bare}",
    )
    for candidate in exact_candidates:
        if candidate in price_table:
            return price_table[candidate]
    # Fuzzy fallback: first entry whose key contains the model name.
    return next(
        (entry for key, entry in price_table.items() if bare in key or model_name in key),
        None,
    )
def load_or_download_leaderboard():
    """Return the SWE-bench leaderboard JSON, downloading it on first use.

    The downloaded file is moved to LEADERBOARD_CACHE so subsequent calls
    read from disk only.
    """
    if not LEADERBOARD_CACHE.exists():
        filename = download_leaderboard(output_dir=str(DATA_DIR))
        # os.replace (not os.rename) overwrites the destination on every
        # platform; os.rename raises on Windows if the target exists.
        os.replace(filename, LEADERBOARD_CACHE)
    with open(LEADERBOARD_CACHE) as f:
        return json.load(f)
def get_bash_only_df():
    """Build the display DataFrame for the 'bash-only' leaderboard.

    Returns an empty DataFrame when that leaderboard is absent.
    """
    payload = load_or_download_leaderboard()
    bash_only = next(
        (lb for lb in payload.get("leaderboards", []) if lb["name"] == "bash-only"),
        None,
    )
    if not bash_only:
        return pd.DataFrame()
    records = [
        {
            "name": entry.get("name", ""),
            "date": entry.get("date", ""),
            "cost": round(entry.get("cost", 0), 2),
            "instance_cost": round(entry.get("instance_cost", 0), 4),
            "instance_calls": entry.get("instance_calls", 0),
            "folder": entry.get("folder", ""),
            "os_model": "โœ…" if entry.get("os_model") else "โŒ",
            "os_system": "โœ…" if entry.get("os_system") else "โŒ",
        }
        for entry in bash_only["results"]
    ]
    return pd.DataFrame(records)
def get_model_details(folder: str):
    """Return (model_dict, None) for *folder*, or (None, error_message) on failure."""
    if not folder:
        return None, "Select a model from the table"
    leaderboards = load_or_download_leaderboard().get("leaderboards", [])
    bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)
    if not bash_only:
        return None, "Leaderboard not found"
    for entry in bash_only["results"]:
        if entry.get("folder") == folder:
            return entry, None
    return None, f"Model with folder '{folder}' not found"
def check_trajectories_downloaded(folder: str) -> bool:
    """Return True when at least one trajectory file for *folder* exists locally."""
    if not folder:
        return False
    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    return any(target.iterdir())
def _count_traj_files(output_dir) -> int:
    """Count trajectory files, preferring the nested '*/*.traj.json' layout."""
    count = len(list(output_dir.glob("*/*.traj.json")))
    if count == 0:
        count = len(list(output_dir.glob("*.json")))
    return count
def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download a model's trajectory files from the public S3 bucket.

    Returns a (status_message, analyze_button_update) pair for the UI.
    Skips the download when files are already present locally.
    """
    if not folder:
        return "โŒ No model selected", gr.update(visible=False)
    model, error = get_model_details(folder)
    if error:
        return f"โŒ {error}", gr.update(visible=False)
    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_traj_files(output_dir)
        return f"โœ… Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)
    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)
    progress(0, desc="Starting S3 download...")
    try:
        # --no-sign-request: the bucket is public, no AWS credentials needed.
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )
        if result.returncode != 0:
            return f"โŒ S3 download failed:\n{result.stderr}", gr.update(visible=False)
        file_count = _count_traj_files(output_dir)
        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)
        # Guard: per_instance_details may be missing or empty; without this the
        # percentage computation raises ZeroDivisionError.
        pct = f" ({100*resolved_count/total_count:.1f}%)" if total_count else ""
        status = f"โœ… Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count}{pct}"
        return status, gr.update(visible=True)
    except subprocess.TimeoutExpired:
        return "โŒ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        return "โŒ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"โŒ Error: {e}", gr.update(visible=False)
def parse_trajectory(traj_path: Path) -> dict:
    """Parse one trajectory JSON file into a flat dict of per-instance stats.

    Token counts are summed over every message that carries a usage record,
    either directly under 'usage' or nested at extra.response.usage.
    """
    with open(traj_path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)
    info = payload.get("info", {})
    stats = info.get("model_stats", {})
    model_cfg = info.get("config", {}).get("model", {})
    # Prefer the override name used for cost calculation when present.
    model_name = model_cfg.get("cost_calc_model_override", model_cfg.get("model_name", ""))
    parsed = {
        "instance_id": payload.get("instance_id", traj_path.stem),
        "model_name": model_name,
        "api_calls": stats.get("api_calls", 0),
        "instance_cost": stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }
    # (destination key in `parsed`, source key in the usage record)
    token_fields = (
        ("prompt_tokens", "prompt_tokens"),
        ("completion_tokens", "completion_tokens"),
        ("total_tokens", "total_tokens"),
        ("cache_read_tokens", "cache_read_input_tokens"),
        ("cache_creation_tokens", "cache_creation_input_tokens"),
    )
    for msg in payload.get("messages", []):
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif isinstance(msg.get("extra"), dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})
        if usage:
            for dest, src in token_fields:
                # `or 0` defends against explicit nulls in the usage record.
                parsed[dest] += usage.get(src, 0) or 0
    return parsed
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every downloaded trajectory for *folder* into a DataFrame (memoized)."""
    global _trajectories_cache
    cached = _trajectories_cache.get(folder)
    if cached is not None:
        return cached
    base = TRAJS_DIR / folder
    # Try layouts from most to least specific; stop at the first that matches.
    traj_files = []
    for pattern in ("*/*.traj.json", "*.traj.json", "*.json"):
        traj_files = list(base.glob(pattern))
        if traj_files:
            break
    records = []
    for path in traj_files:
        try:
            records.append(parse_trajectory(path))
        except Exception as exc:
            print(f"Error parsing {path}: {exc}")
    frame = pd.DataFrame(records)
    _trajectories_cache[folder] = frame
    return frame
def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the five overview figures for a set of parsed trajectories.

    Prices are in $ per 1M tokens. Returns a 5-tuple of plotly figures:
    (steps histogram, cost histogram, tokens-by-type bar, cost-by-type bar,
    stacked per-instance token chart), or five Nones when *df* is empty.
    """
    if df.empty:
        return None, None, None, None, None
    # Histogram: API calls (steps) per instance.
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        title="Distribution of API Calls (Steps) per Instance",
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_steps.add_annotation(
        text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Histogram: reported dollar cost per instance.
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        title="Distribution of Cost per Instance ($)",
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_cost.add_annotation(
        text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Aggregate token counts across all instances.
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    # Uncached input = prompt - cache_read - cache_creation (per instance, then sum).
    # Clipped at 0 so inconsistent usage records cannot produce negative counts.
    df_temp = df.copy()
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()
    # Bar chart: total tokens split by billing category.
    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens": [total_uncached_input, total_cache_read, total_cache_creation, total_completion],
    })
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Total Tokens",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    total_all = token_data["Total Tokens"].sum()
    fig_tokens.add_annotation(
        text=f"Total: {total_all:,.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Cost by token type (prices are $ per 1M tokens, hence the / 1e6).
    cost_uncached_input = total_uncached_input * input_price / 1e6
    cost_cache_read = total_cache_read * cache_read_price / 1e6
    cost_cache_creation = total_cache_creation * cache_creation_price / 1e6
    cost_completion = total_completion * completion_price / 1e6
    cost_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Cost ($)": [cost_uncached_input, cost_cache_read, cost_cache_creation, cost_completion],
    })
    fig_tokens_cost = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens_cost.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    total_cost = cost_uncached_input + cost_cache_read + cost_cache_creation + cost_completion
    fig_tokens_cost.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )
    # Stacked per-instance token chart, sorted by cache-read volume so heavy
    # cache users appear on the left.
    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    df_sorted["instance_idx"] = range(len(df_sorted))
    # Uncached input = prompt - cache_read - cache_creation
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
    fig_stacked = go.Figure()
    fig_stacked.add_trace(go.Bar(
        name="Uncached Input",
        x=df_sorted["instance_idx"],
        y=df_sorted["uncached_input_tokens"],
        marker_color="#EF553B",
        hovertemplate="Instance: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Read",
        x=df_sorted["instance_idx"],
        y=df_sorted["cache_read_tokens"],
        marker_color="#19D3F3",
        hovertemplate="Instance: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Creation",
        x=df_sorted["instance_idx"],
        y=df_sorted["cache_creation_tokens"],
        marker_color="#FFA15A",
        hovertemplate="Instance: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Completion",
        x=df_sorted["instance_idx"],
        y=df_sorted["completion_tokens"],
        marker_color="#AB63FA",
        hovertemplate="Instance: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
    ))
    fig_stacked.update_layout(
        barmode="stack",
        title="Billable Tokens per Instance (stacked)",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Tokens",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Stacked per-instance cost chart, split by token type.

    Prices are $ per 1M tokens. Returns None when *df* is empty.
    """
    if df.empty:
        return None
    d = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    d["instance_idx"] = range(len(d))
    # Uncached input = prompt - cache_read - cache_creation (never negative).
    d["uncached_input_tokens"] = (
        d["prompt_tokens"] - d["cache_read_tokens"] - d["cache_creation_tokens"]
    ).clip(lower=0)
    # (legend label, token column, derived cost column, $/1M price, bar color)
    segments = (
        ("Uncached Input", "uncached_input_tokens", "cost_uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read_tokens", "cost_cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "cost_cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion_tokens", "cost_completion", completion_price, "#AB63FA"),
    )
    fig = go.Figure()
    total_cost = 0.0
    for label, token_col, cost_col, price, color in segments:
        d[cost_col] = d[token_col] * price / 1e6
        total_cost += d[cost_col].sum()
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=d["instance_idx"],
            y=d[cost_col],
            marker_color=color,
            hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))
    fig.update_layout(
        barmode="stack",
        title="Cost Breakdown per Instance",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )
    return fig
def extract_model_from_folder(folder: str) -> str:
    """Return the model portion of a run folder name.

    Folder names look like '<date>_<agent>_<model...>', e.g.
    '20251124_mini-v1.16.0_claude-opus-4-5-20251101'; everything after the
    second underscore is the model. Names without that shape pass through
    unchanged, and an empty name yields "".
    """
    if not folder:
        return ""
    pieces = folder.split("_", 2)
    return pieces[2] if len(pieces) == 3 else folder
def get_prices_for_folder(folder: str) -> tuple[float, float, float, float, str]:
    """Get prices from litellm based on the folder name.

    Returns ($/1M input, cache_read, cache_creation, completion, model_name).
    All prices are 0 when the model cannot be resolved or has no price entry.
    """
    model_hint = extract_model_from_folder(folder)
    if not model_hint:
        return 0, 0, 0, 0, ""
    prices = get_model_prices(model_hint)
    if prices:
        # litellm entries may omit a key entirely OR carry an explicit null;
        # `or 0` covers both so we never compute None * 1e6.
        input_price = (prices.get("input_cost_per_token") or 0) * 1e6
        cache_read = (prices.get("cache_read_input_token_cost") or 0) * 1e6
        cache_creation = (prices.get("cache_creation_input_token_cost") or 0) * 1e6
        completion = (prices.get("output_cost_per_token") or 0) * 1e6
        return input_price, cache_read, cache_creation, completion, model_hint
    return 0, 0, 0, 0, model_hint
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """React to a leaderboard row click.

    Returns updates for: selected folder/name, download button, analyze
    button visibility, the four price fields (prefilled from litellm when
    known), and the detected-model textbox.
    """
    def _price_field(amount, label):
        # โœ… when litellm knew a positive price, โŒ when it fell back to 0.
        marker = "โœ…" if amount > 0 else "โŒ"
        return gr.update(value=amount, label=f"{marker} {label}")

    if evt.index is None:
        # Nothing selected: clear everything and disable the actions.
        blank_prices = [
            gr.update(value=0, label=f"๐Ÿ’ฒ {label}")
            for label in ("Input", "Cache Read", "Cache Creation", "Completion")
        ]
        return ("", "", gr.update(interactive=False), gr.update(visible=False), *blank_prices, "")

    idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    record = df.iloc[idx]
    folder = record["folder"]
    display_name = record["name"]
    input_price, cache_read, cache_creation, completion, model_hint = get_prices_for_folder(folder)
    return (
        folder,
        display_name,
        gr.update(interactive=True),
        gr.update(visible=check_trajectories_downloaded(folder)),
        _price_field(input_price, "Input"),
        _price_field(cache_read, "Cache Read"),
        _price_field(cache_creation, "Cache Creation"),
        _price_field(completion, "Completion"),
        model_hint,
    )
def build_app():
    """Assemble the Gradio Blocks UI and wire up all event handlers."""
    leaderboard_df = get_bash_only_df()
    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        # NOTE(review): never read or written below — appears unused.
        trajectories_state = gr.State(None)
        gr.Markdown("# ๐Ÿงฎ SWE-bench Bash-Only Leaderboard")
        gr.Markdown("Select a model to use as base for cost analysis")
        with gr.Row():
            with gr.Column(scale=3):
                leaderboard_table = gr.Dataframe(
                    value=leaderboard_df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                )
                # Hidden until "Load & Analyze" is pressed.
                with gr.Column(visible=False) as analysis_section:
                    gr.Markdown("## ๐Ÿ“Š Trajectory Analysis")
                    with gr.Row():
                        plot_steps = gr.Plot(label="API Calls Distribution")
                        plot_cost = gr.Plot(label="Cost Distribution")
                    with gr.Row():
                        plot_tokens = gr.Plot(label="Token Usage by Type")
                        plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
                    with gr.Row():
                        plot_stacked = gr.Plot(label="Billable Tokens per Instance")
                    with gr.Row():
                        plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")
            # Sidebar: selection details, download controls, price inputs.
            with gr.Column(scale=1):
                selected_folder = gr.State("")
                gr.Markdown("### Selected Model")
                selected_name = gr.Textbox(label="Model Name", interactive=False)
                download_btn = gr.Button("๐Ÿ“ฅ Download Trajectories", interactive=False)
                download_status = gr.Textbox(label="Status", interactive=False, lines=3)
                analyze_btn = gr.Button("๐Ÿ“Š Load & Analyze", visible=False, variant="primary")
                gr.Markdown("---")
                gr.Markdown("### ๐Ÿ’ฐ Token Prices ($/1M) ยท *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
                detected_model = gr.Textbox(label="Detected Model", interactive=False)
                price_input = gr.Number(label="๐Ÿ’ฒ Input", value=0, precision=2)
                price_cache_read = gr.Number(label="๐Ÿ’ฒ Cache Read", value=0, precision=2)
                price_cache_creation = gr.Number(label="๐Ÿ’ฒ Cache Creation", value=0, precision=2)
                price_completion = gr.Number(label="๐Ÿ’ฒ Completion", value=0, precision=2)
        # Row click: remember the folder, enable buttons, prefill litellm prices.
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model],
        )
        download_btn.click(
            fn=download_trajectories_from_s3,
            inputs=[selected_folder],
            outputs=[download_status, analyze_btn],
        )
        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price):
            """Generator handler: first yield reveals the (empty) analysis
            section for immediate feedback, second yield fills in the plots."""
            empty_result = (
                gr.update(visible=False),
                None, None, None, None, None, None,
            )
            if not folder:
                yield empty_result
                return
            # Show the section right away while trajectories are parsed.
            yield (
                gr.update(visible=True),
                None, None, None, None, None, None,
            )
            df = load_all_trajectories(folder)
            if df.empty:
                yield empty_result
                return
            fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
                df, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
            yield (
                gr.update(visible=True),
                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
            )
        analyze_btn.click(
            fn=load_and_analyze,
            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion],
            outputs=[
                analysis_section,
                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
            ],
        )
    return app
if __name__ == "__main__":
    # Build the UI, enable request queuing, and start the local server.
    demo = build_app()
    demo.queue()
    demo.launch()