|
|
import json |
|
|
import os |
|
|
import subprocess |
|
|
from pathlib import Path |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
import requests |
|
|
|
|
|
from src.download_swebench_leaderboard import download_leaderboard |
|
|
|
|
|
# Local data layout: everything the app downloads or caches lives under ./data.
DATA_DIR = Path("data")

# Downloaded trajectory files, one subdirectory per leaderboard entry ("folder").
TRAJS_DIR = DATA_DIR / "swebench_trajs"

# Cached copy of the SWE-bench leaderboard JSON (avoids re-downloading on restart).
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"

# Cached copy of litellm's model price table.
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"

# Public S3 bucket holding the bash-only experiment trajectories.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"

# Upstream source for per-model token prices (per-token costs).
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"

# In-process caches: the litellm price table (dict or None until loaded) and
# parsed trajectory DataFrames keyed by leaderboard folder name.
_litellm_prices_cache = None

_trajectories_cache = {}
|
|
|
|
|
|
|
|
def get_litellm_prices() -> dict:
    """Return the litellm model price table, cached in memory and on disk.

    Resolution order: in-process cache -> local JSON cache file -> HTTP
    download from LITELLM_PRICES_URL. On download failure an empty dict is
    cached so the app keeps working (prices simply show as zero).
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    # Try the on-disk cache first. A corrupt/truncated cache file used to
    # crash here; now it falls through to a fresh download instead.
    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE) as f:
                _litellm_prices_cache = json.load(f)
            return _litellm_prices_cache
        except (json.JSONDecodeError, OSError):
            _litellm_prices_cache = None

    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        _litellm_prices_cache = response.json()

        # Persist for future runs; best-effort.
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as f:
            json.dump(_litellm_prices_cache, f)
    except Exception:
        # Offline / rate-limited: degrade gracefully with no price data.
        _litellm_prices_cache = {}

    return _litellm_prices_cache
|
|
|
|
|
|
|
|
def get_model_prices(model_name: str) -> dict | None:
    """Look up litellm pricing for *model_name*, trying several key variants.

    Tries exact keys first (raw name, provider-stripped name, and the name
    re-prefixed with anthropic/ or openai/), then falls back to a substring
    match anywhere in the price table. Returns None when nothing matches.
    """
    if not model_name:
        return None

    prices = get_litellm_prices()
    clean_name = model_name.replace("anthropic/", "").replace("openai/", "")

    # Exact-key candidates, most specific first.
    for candidate in (
        model_name,
        clean_name,
        f"anthropic/{clean_name}",
        f"openai/{clean_name}",
    ):
        if candidate in prices:
            return prices[candidate]

    # Loose fallback: first table entry whose key contains the model name.
    return next(
        (entry for key, entry in prices.items() if clean_name in key or model_name in key),
        None,
    )
|
|
|
|
|
|
|
|
def load_or_download_leaderboard():
    """Return the leaderboard JSON, downloading and caching it on first use."""
    if not LEADERBOARD_CACHE.exists():
        # Fetch a fresh copy, then move it to the stable cache path.
        downloaded = download_leaderboard(output_dir=str(DATA_DIR))
        os.rename(downloaded, LEADERBOARD_CACHE)

    with open(LEADERBOARD_CACHE) as f:
        return json.load(f)
|
|
|
|
|
|
|
|
def get_bash_only_df():
    """Build the summary DataFrame shown for the 'bash-only' leaderboard.

    Returns an empty DataFrame when that leaderboard is absent from the
    downloaded JSON.
    """
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    bash_only = next((lb for lb in leaderboards if lb["name"] == "bash-only"), None)

    if not bash_only:
        return pd.DataFrame()

    rows = []
    for r in bash_only["results"]:
        rows.append({
            "name": r.get("name", ""),
            "date": r.get("date", ""),
            # Costs rounded for display only; raw values stay in the JSON.
            "cost": round(r.get("cost", 0), 2),
            "instance_cost": round(r.get("instance_cost", 0), 4),
            "instance_calls": r.get("instance_calls", 0),
            # "folder" doubles as the key for S3 downloads and local caching.
            "folder": r.get("folder", ""),
            "os_model": "โ
" if r.get("os_model") else "โ",
            "os_system": "โ
" if r.get("os_system") else "โ",
        })

    return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
def get_model_details(folder: str):
    """Find a bash-only leaderboard entry by its folder name.

    Returns (model_dict, None) on success, or (None, error_message) when the
    folder is empty, the leaderboard is missing, or no entry matches.
    """
    if not folder:
        return None, "Select a model from the table"

    data = load_or_download_leaderboard()

    bash_only = None
    for lb in data.get("leaderboards", []):
        if lb["name"] == "bash-only":
            bash_only = lb
            break

    if not bash_only:
        return None, "Leaderboard not found"

    for entry in bash_only["results"]:
        if entry.get("folder") == folder:
            return entry, None

    return None, f"Model with folder '{folder}' not found"
|
|
|
|
|
|
|
|
def check_trajectories_downloaded(folder: str) -> bool:
    """Return True when the folder's trajectory directory exists and is non-empty."""
    if not folder:
        return False
    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    # Non-empty check: any() stops at the first directory entry.
    return any(target.iterdir())
|
|
|
|
|
|
|
|
def download_trajectories_from_s3(folder: str, progress=gr.Progress()): |
|
|
if not folder: |
|
|
return "โ No model selected", gr.update(visible=False) |
|
|
|
|
|
model, error = get_model_details(folder) |
|
|
if error: |
|
|
return f"โ {error}", gr.update(visible=False) |
|
|
|
|
|
output_dir = TRAJS_DIR / folder |
|
|
if output_dir.exists() and any(output_dir.iterdir()): |
|
|
file_count = len(list(output_dir.glob("*/*.traj.json"))) |
|
|
if file_count == 0: |
|
|
file_count = len(list(output_dir.glob("*.json"))) |
|
|
return f"โ
Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True) |
|
|
|
|
|
s3_path = f"{S3_BUCKET}/{folder}/trajs/" |
|
|
output_dir.mkdir(parents=True, exist_ok=True) |
|
|
|
|
|
progress(0, desc="Starting S3 download...") |
|
|
|
|
|
try: |
|
|
result = subprocess.run( |
|
|
["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"], |
|
|
capture_output=True, |
|
|
text=True, |
|
|
timeout=600, |
|
|
) |
|
|
|
|
|
if result.returncode != 0: |
|
|
return f"โ S3 download failed:\n{result.stderr}", gr.update(visible=False) |
|
|
|
|
|
file_count = len(list(output_dir.glob("*/*.traj.json"))) |
|
|
if file_count == 0: |
|
|
file_count = len(list(output_dir.glob("*.json"))) |
|
|
|
|
|
per_instance = model.get("per_instance_details", {}) |
|
|
resolved_count = sum(1 for v in per_instance.values() if v.get("resolved")) |
|
|
total_count = len(per_instance) |
|
|
|
|
|
status = f"โ
Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({100*resolved_count/total_count:.1f}%)" |
|
|
return status, gr.update(visible=True) |
|
|
|
|
|
except subprocess.TimeoutExpired: |
|
|
return "โ Download timed out (>10 min)", gr.update(visible=False) |
|
|
except FileNotFoundError: |
|
|
return "โ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False) |
|
|
except Exception as e: |
|
|
return f"โ Error: {e}", gr.update(visible=False) |
|
|
|
|
|
|
|
|
def parse_trajectory(traj_path: Path) -> dict:
    """Parse one trajectory JSON file into a flat stats record.

    Extracts the model name and per-instance call/cost stats from `info`,
    then aggregates token usage across all messages. Usage may sit at the
    top level of a message or be nested under extra.response, depending on
    the provider's layout.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    info = data.get("info", {})
    model_stats = info.get("model_stats", {})
    model_config = info.get("config", {}).get("model", {})
    model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))

    record = {
        "instance_id": data.get("instance_id", traj_path.stem),
        "model_name": model_name,
        "api_calls": model_stats.get("api_calls", 0),
        "instance_cost": model_stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    # Output counter -> key name inside a usage dict.
    token_fields = {
        "prompt_tokens": "prompt_tokens",
        "completion_tokens": "completion_tokens",
        "total_tokens": "total_tokens",
        "cache_read_tokens": "cache_read_input_tokens",
        "cache_creation_tokens": "cache_creation_input_tokens",
    }

    for msg in data.get("messages", []):
        if "usage" in msg:
            usage = msg["usage"]
        else:
            usage = None
            extra = msg.get("extra")
            if isinstance(extra, dict):
                response = extra.get("response", {})
                if isinstance(response, dict):
                    usage = response.get("usage", {})

        if usage:
            # `or 0` guards explicit nulls in the usage payload.
            for field, key in token_fields.items():
                record[field] += usage.get(key, 0) or 0

    return record
|
|
|
|
|
|
|
|
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory file for *folder* into a DataFrame, with caching.

    Unparseable files are logged and skipped rather than aborting the load.
    """
    global _trajectories_cache

    cached = _trajectories_cache.get(folder)
    if cached is not None:
        return cached

    output_dir = TRAJS_DIR / folder

    # Try layouts in order: nested per-instance dirs, flat *.traj.json, any JSON.
    traj_files = []
    for pattern in ("*/*.traj.json", "*.traj.json", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    rows = []
    for traj_path in traj_files:
        try:
            rows.append(parse_trajectory(traj_path))
        except Exception as e:
            print(f"Error parsing {traj_path}: {e}")

    df = pd.DataFrame(rows)
    _trajectories_cache[folder] = df
    return df
|
|
|
|
|
|
|
|
# Shared palette for the four billable-token categories, in display order.
_TOKEN_TYPES = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
_TOKEN_COLORS = ["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"]


def _annotate_top_right(fig, text):
    """Add a small summary annotation in the top-right corner of *fig*."""
    fig.add_annotation(
        text=text,
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )


def _uncached_input_tokens(df):
    """Per-instance input tokens billed at the full rate (prompt minus cache)."""
    return (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]).clip(lower=0)


def _steps_histogram(df):
    """Histogram of API calls (steps) per instance, annotated with mean/median."""
    fig = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        title="Distribution of API Calls (Steps) per Instance",
        color_discrete_sequence=["#636EFA"],
    )
    fig.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    _annotate_top_right(fig, f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}")
    return fig


def _cost_histogram(df):
    """Histogram of reported cost per instance, annotated with mean/total."""
    fig = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        title="Distribution of Cost per Instance ($)",
        color_discrete_sequence=["#00CC96"],
    )
    fig.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Instances",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    _annotate_top_right(fig, f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}")
    return fig


def _token_totals_bar(totals):
    """Bar chart of total tokens per category; *totals* aligns with _TOKEN_TYPES."""
    token_data = pd.DataFrame({"Token Type": _TOKEN_TYPES, "Total Tokens": totals})
    fig = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=_TOKEN_COLORS,
    )
    fig.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Total Tokens",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    _annotate_top_right(fig, f"Total: {sum(totals):,.0f}")
    return fig


def _token_costs_bar(costs):
    """Bar chart of total dollar cost per category; *costs* aligns with _TOKEN_TYPES."""
    cost_data = pd.DataFrame({"Token Type": _TOKEN_TYPES, "Cost ($)": costs})
    fig = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=_TOKEN_COLORS,
    )
    fig.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    _annotate_top_right(fig, f"Total: ${sum(costs):.2f}")
    return fig


def _stacked_tokens_figure(df):
    """Stacked per-instance bar chart of billable tokens by category."""
    df_sorted = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    df_sorted["instance_idx"] = range(len(df_sorted))
    df_sorted["uncached_input_tokens"] = _uncached_input_tokens(df_sorted)

    fig = go.Figure()
    series = [
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    ]
    for name, column, color in series:
        fig.add_trace(go.Bar(
            name=name,
            x=df_sorted["instance_idx"],
            y=df_sorted[column],
            marker_color=color,
            hovertemplate="Instance: %{x}<br>" + name + ": %{y:,.0f}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        title="Billable Tokens per Instance (stacked)",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Tokens",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )
    return fig


def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the five overview figures for a set of parsed trajectories.

    Prices are in $/1M tokens. Returns (steps_hist, cost_hist, token_totals,
    token_costs, stacked_tokens); all five are None when *df* is empty.
    """
    if df.empty:
        return None, None, None, None, None

    # Aggregate token totals once; order matches _TOKEN_TYPES.
    totals = [
        _uncached_input_tokens(df).sum(),
        df["cache_read_tokens"].sum(),
        df["cache_creation_tokens"].sum(),
        df["completion_tokens"].sum(),
    ]
    # Prices are per million tokens, hence the 1e6 divisor.
    prices = [input_price, cache_read_price, cache_creation_price, completion_price]
    costs = [total * price / 1e6 for total, price in zip(totals, prices)]

    return (
        _steps_histogram(df),
        _cost_histogram(df),
        _token_totals_bar(totals),
        _token_costs_bar(costs),
        _stacked_tokens_figure(df),
    )
|
|
|
|
|
|
|
|
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Stacked per-instance bar chart of dollar cost by token category.

    Prices are in $/1M tokens. Returns None when *df* is empty.
    """
    if df.empty:
        return None

    data = df.sort_values("cache_read_tokens", ascending=False).reset_index(drop=True)
    data["instance_idx"] = range(len(data))

    # Input tokens billed at the full rate: prompt minus anything cached.
    data["uncached_input_tokens"] = (
        data["prompt_tokens"] - data["cache_read_tokens"] - data["cache_creation_tokens"]
    ).clip(lower=0)

    # (legend label, token column, derived cost column, $/1M price, bar color)
    categories = [
        ("Uncached Input", "uncached_input_tokens", "cost_uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read_tokens", "cost_cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "cost_cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion_tokens", "cost_completion", completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    total_cost = 0.0
    for label, token_col, cost_col, price, color in categories:
        data[cost_col] = data[token_col] * price / 1e6
        total_cost += data[cost_col].sum()
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=data["instance_idx"],
            y=data[cost_col],
            marker_color=color,
            hovertemplate="Instance: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        title="Cost Breakdown per Instance",
        xaxis_title="Instance (sorted by cache read)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )

    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )

    return fig
|
|
|
|
|
|
|
|
def extract_model_from_folder(folder: str) -> str:
    """Extract model name from folder like '20251124_mini-v1.16.0_claude-opus-4-5-20251101'.

    Drops the first two underscore-separated parts (date, harness version);
    the remainder — underscores included — is the model name. Folders with
    fewer than three parts are returned unchanged.
    """
    if not folder:
        return ""
    # maxsplit=2 keeps any underscores inside the model name intact.
    pieces = folder.split("_", 2)
    if len(pieces) == 3:
        return pieces[2]
    return folder
|
|
|
|
|
|
|
|
def get_prices_for_folder(folder: str) -> tuple[float, float, float, float, str]:
    """Get prices from litellm based on folder name.

    Returns (input, cache_read, cache_creation, completion, model_name),
    with prices in $/1M tokens. All prices are 0 when the folder is empty
    or no pricing entry is found.
    """
    model_hint = extract_model_from_folder(folder)
    if not model_hint:
        return 0, 0, 0, 0, ""

    prices = get_model_prices(model_hint)
    if prices:
        # litellm stores per-token costs; convert to $/1M tokens.
        # `or 0` guards entries where a cost field is present but null,
        # which would otherwise raise TypeError on `None * 1e6`.
        input_price = (prices.get("input_cost_per_token", 0) or 0) * 1e6
        cache_read = (prices.get("cache_read_input_token_cost", 0) or 0) * 1e6
        cache_creation = (prices.get("cache_creation_input_token_cost", 0) or 0) * 1e6
        completion = (prices.get("output_cost_per_token", 0) or 0) * 1e6
        return input_price, cache_read, cache_creation, completion, model_hint

    return 0, 0, 0, 0, model_hint
|
|
|
|
|
|
|
|
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a leaderboard row click.

    Returns updates for (folder state, name box, download button, analyze
    button, the four price fields, detected-model box), in that order —
    matching the outputs list wired up in build_app.
    """
    if evt.index is None:
        # Nothing selected: reset everything to the disabled/unknown state.
        return (
            "", "",
            gr.update(interactive=False),
            gr.update(visible=False),
            gr.update(value=0, label="๐ฒ Input"),
            gr.update(value=0, label="๐ฒ Cache Read"),
            gr.update(value=0, label="๐ฒ Cache Creation"),
            gr.update(value=0, label="๐ฒ Completion"),
            ""
        )

    # evt.index may be a (row, col) pair or a bare row index — handle both.
    row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    row = df.iloc[row_idx]
    folder = row["folder"]
    name = row["name"]

    # Only reveal the analyze button when trajectories are already on disk.
    show_analyze = check_trajectories_downloaded(folder)

    input_price, cache_read, cache_creation, completion, model_hint = get_prices_for_folder(folder)

    def price_update(value, name):
        # The label marks whether a price was auto-detected (value > 0).
        if value > 0:
            return gr.update(value=value, label=f"โ
 {name}")
        else:
            return gr.update(value=value, label=f"โ {name}")

    return (
        folder, name,
        gr.update(interactive=True),
        gr.update(visible=show_analyze),
        price_update(input_price, "Input"),
        price_update(cache_read, "Cache Read"),
        price_update(cache_creation, "Cache Creation"),
        price_update(completion, "Completion"),
        model_hint
    )
|
|
|
|
|
|
|
|
def build_app():
    """Assemble the Gradio Blocks UI and wire up all event handlers."""
    leaderboard_df = get_bash_only_df()

    with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
        # Reserved for passing the loaded trajectory DataFrame between events.
        trajectories_state = gr.State(None)

        gr.Markdown("# ๐งฎ SWE-bench Bash-Only Leaderboard")
        gr.Markdown("Select a model to use as base for cost analysis")

        with gr.Row():
            with gr.Column(scale=3):
                leaderboard_table = gr.Dataframe(
                    value=leaderboard_df,
                    label="Bash-Only Leaderboard",
                    interactive=False,
                    wrap=True,
                )

                # Plot area: hidden until the user runs "Load & Analyze".
                with gr.Column(visible=False) as analysis_section:
                    gr.Markdown("## ๐ Trajectory Analysis")

                    with gr.Row():
                        plot_steps = gr.Plot(label="API Calls Distribution")
                        plot_cost = gr.Plot(label="Cost Distribution")

                    with gr.Row():
                        plot_tokens = gr.Plot(label="Token Usage by Type")
                        plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")

                    with gr.Row():
                        plot_stacked = gr.Plot(label="Billable Tokens per Instance")

                    with gr.Row():
                        plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")

            with gr.Column(scale=1):
                # Sidebar: selection state, download controls, price inputs.
                selected_folder = gr.State("")
                gr.Markdown("### Selected Model")
                selected_name = gr.Textbox(label="Model Name", interactive=False)

                download_btn = gr.Button("๐ฅ Download Trajectories", interactive=False)
                download_status = gr.Textbox(label="Status", interactive=False, lines=3)

                analyze_btn = gr.Button("๐ Load & Analyze", visible=False, variant="primary")

                gr.Markdown("---")
                gr.Markdown("### ๐ฐ Token Prices ($/1M) ยท *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*")
                detected_model = gr.Textbox(label="Detected Model", interactive=False)
                price_input = gr.Number(label="๐ฒ Input", value=0, precision=2)
                price_cache_read = gr.Number(label="๐ฒ Cache Read", value=0, precision=2)
                price_cache_creation = gr.Number(label="๐ฒ Cache Creation", value=0, precision=2)
                price_completion = gr.Number(label="๐ฒ Completion", value=0, precision=2)

        # Row click -> select model, toggle buttons, auto-fill detected prices.
        leaderboard_table.select(
            fn=on_row_select,
            inputs=[leaderboard_table],
            outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model],
        )

        download_btn.click(
            fn=download_trajectories_from_s3,
            inputs=[selected_folder],
            outputs=[download_status, analyze_btn],
        )

        def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price):
            # Generator handler: the first yield reveals the section right away,
            # the final yield fills in the plots once parsing completes.
            empty_result = (
                gr.update(visible=False),
                None, None, None, None, None, None,
            )

            if not folder:
                yield empty_result
                return

            yield (
                gr.update(visible=True),
                None, None, None, None, None, None,
            )

            df = load_all_trajectories(folder)
            if df.empty:
                yield empty_result
                return

            fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
                df, input_price, cache_read_price, cache_creation_price, completion_price
            )
            fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)

            yield (
                gr.update(visible=True),
                fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
            )

        analyze_btn.click(
            fn=load_and_analyze,
            inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion],
            outputs=[
                analysis_section,
                plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
            ],
        )

    return app
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    app = build_app()
    # queue() is required for generator event handlers (the streaming yields
    # in load_and_analyze) to work.
    app.queue()
    app.launch()
|
|
|