|
|
import json |
|
|
import os |
|
|
import re |
|
|
import subprocess |
|
|
from pathlib import Path |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
import requests |
|
|
import tiktoken |
|
|
|
|
|
from src.download_swebench_leaderboard import download_leaderboard |
|
|
|
|
|
|
|
|
# tiktoken encodings already loaded by get_tokenizer, keyed by encoding name.
_tokenizer_cache: dict = {}

# Local storage layout; all paths are relative to the current working directory.
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR.joinpath("swebench_trajs")
LEADERBOARD_CACHE = DATA_DIR.joinpath("swebench_leaderboard_latest.json")
LITELLM_PRICES_CACHE = DATA_DIR.joinpath("litellm_prices.json")

# Remote data sources.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"

# In-process memoization used by the loader functions below.
# None means "price table not loaded yet"; the dict caches are keyed per folder.
_litellm_prices_cache = None
_trajectories_cache: dict = {}
_calculated_tokens_cache: dict = {}
_trajectory_steps_cache: dict = {}
|
|
|
|
|
|
|
|
def parse_step_or_ratio(value: float, total_steps: int) -> int:
    """
    Interpret ``value`` as an absolute step number or as a fraction of the trajectory.

    A whole number that is at least 1 (e.g. 3.0, 5.0) is taken as a step number
    and returned unchanged (as an int). Every other value (e.g. 0, 0.5, 0.25)
    is multiplied by ``total_steps`` and truncated towards zero.

    Returns: step index (0-based)
    """
    if int(value) == value and value >= 1:
        step = value
    else:
        # Fractions (and 0) scale with the trajectory length.
        step = value * total_steps
    return int(step)
|
|
|
|
|
|
|
|
def get_routed_steps(total_steps: int, strategy: str, params: dict) -> set:
    """
    Pick the steps of a trajectory that are handed to the alternative model.

    Supported strategies and the ``params`` keys they read:
      - "Replace on random steps": ``percentage`` (default 50) of the steps,
        sampled at random without replacement.
      - "Replace every step k": ``k`` (default 2); steps 0, k, 2k, ...
      - "Replace part of trajectory": ``start`` (default 0) and ``end``
        (default 0.5), each parsed by ``parse_step_or_ratio``; the routed
        range is ``[start, min(end, total_steps))``.

    Any other strategy routes nothing.

    Returns set of step indices (0-based) that should use the routing model.
    """
    import random

    if strategy == "Replace on random steps":
        fraction = params.get("percentage", 50) / 100.0
        sample_size = int(total_steps * fraction)
        if sample_size <= 0:
            return set()
        # Cap at total_steps so percentages above 100 do not break sampling.
        return set(random.sample(range(total_steps), min(sample_size, total_steps)))

    if strategy == "Replace every step k":
        stride = int(params.get("k", 2))
        return set(range(0, total_steps, stride)) if stride > 0 else set()

    if strategy == "Replace part of trajectory":
        first = parse_step_or_ratio(params.get("start", 0), total_steps)
        last = parse_step_or_ratio(params.get("end", 0.5), total_steps)
        return set(range(first, min(last, total_steps)))

    return set()
|
|
|
|
|
|
|
|
def calculate_routing_tokens(steps: list[dict]) -> dict:
    """
    Simulate prompt caching over a step sequence and total the tokens per model.

    Each model keeps its own cache. On every step the acting model reads what
    it already has cached, pays uncached input for the rest of the
    conversation so far, and then adds that uncached input plus its completion
    to its cache.

    Args:
        steps: list of dicts with keys:
            - model: str (model name)
            - system_user: int (tokens for system/user message; only read for step 0)
            - completion: int (generated tokens)
            - observation: int or None (env response tokens, None for last step)

    Returns:
        dict with per-model totals:
        {model_name: {cache_read, uncached_input, completion, observation, cache_creation}}
    """
    cached_by_model = {}  # model -> tokens currently held in that model's cache
    totals_by_model = {}

    context_len = 0        # conversation length after the previous step, excluding its observation
    last_observation = 0   # observation produced by the previous step

    for index, step in enumerate(steps):
        model = step["model"]
        completion = step.get("completion", 0)
        observation = step.get("observation") or 0

        bucket = totals_by_model.setdefault(
            model,
            {
                "cache_read": 0,
                "uncached_input": 0,
                "completion": 0,
                "observation": 0,
                "cache_creation": 0,
            },
        )
        cache_read = cached_by_model.get(model, 0)

        if index == 0:
            # The very first request consists only of the system/user prompt.
            uncached_input = step.get("system_user", 0)
        else:
            # Everything in the conversation that this model has not cached yet.
            uncached_input = context_len + last_observation - cache_read

        cache_creation = uncached_input + completion
        cached_by_model[model] = cache_read + cache_creation

        for key, amount in (
            ("cache_read", cache_read),
            ("uncached_input", uncached_input),
            ("completion", completion),
            ("observation", observation),
            ("cache_creation", cache_creation),
        ):
            bucket[key] += amount

        context_len = cache_read + uncached_input + completion
        last_observation = observation

    return totals_by_model
|
|
|
|
|
|
|
|
def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
    """
    Parse trajectory file into step format for calculate_routing_tokens.

    Every assistant message becomes one step. System messages, and user
    messages that arrive before the first assistant message, are summed into
    the first step's ``system_user``. A user message after an assistant
    message is stored as that step's ``observation``; if several follow the
    same assistant message, the last one wins. Messages with any other role
    are ignored.

    Args:
        traj_path: path to a trajectory JSON file with a top-level "messages" list.
        model_name: model name stored on every step and used to pick the tokenizer.

    Returns list of steps with:
        - model: base model name
        - system_user: tokens for system + user message (step 0 only)
        - completion: assistant response tokens
        - observation: env response tokens (None for last step)
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    messages = data.get("messages", [])
    if not messages:
        return []

    count_tokens, _ = get_tokenizer(model_name)

    steps = []
    system_user_tokens = 0

    # A plain for-loop guarantees progress on every message. The previous
    # index-based while-loop only advanced inside the system/user/assistant
    # branches and therefore never terminated on any other role (e.g. "tool").
    for msg in messages:
        role = msg.get("role", "user")
        if role not in ("system", "user", "assistant"):
            continue

        content = msg.get("content", "")
        if isinstance(content, list):
            # Structured content is measured by its JSON serialization.
            content = json.dumps(content)
        tokens = count_tokens(str(content))

        if role == "system":
            system_user_tokens += tokens
        elif role == "user":
            if not steps:
                # Still part of the initial prompt.
                system_user_tokens += tokens
            else:
                # Environment response to the most recent assistant step.
                steps[-1]["observation"] = tokens
        else:  # assistant
            steps.append({
                "model": model_name,
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
            })
            system_user_tokens = 0

    return steps
|
|
|
|
|
|
|
|
def get_default_overhead(model_name: str) -> float:
    """Return the default tokenizer-overhead multiplier for the model's provider.

    The provider is recognised by substring match on the lower-cased model
    name; an empty/None name or an unrecognised provider gets 1.0.
    """
    lowered = (model_name or "").lower()

    # (substrings identifying the provider, overhead), checked in order.
    provider_overheads = (
        (("claude", "anthropic"), 1.24),
        (("gemini", "google"), 1.0),
        (("gpt", "openai", "o1", "o3"), 1.0),
    )
    for markers, overhead in provider_overheads:
        if any(marker in lowered for marker in markers):
            return overhead
    return 1.0
|
|
|
|
|
|
|
|
def get_tokenizer(model_name: str):
    """Pick a token counter for ``model_name``. Returns (tokenizer_func, name).

    ``tokenizer_func`` maps a string to a token count. Selection is by
    substring of the lower-cased model name:
      - "gpt-4o", "o1", "o3"           -> tiktoken "o200k_base"
      - "gpt", "claude", "anthropic"   -> tiktoken "cl100k_base"
      - "gemini", "google"             -> length-based estimate (len / 3.23),
                                          reported as "gemini_approx"
      - anything else (or empty/None)  -> tiktoken "cl100k_base"

    tiktoken encodings are loaded once and kept in ``_tokenizer_cache``.
    """
    global _tokenizer_cache

    lowered = (model_name or "").lower()

    def mentions(*markers):
        return any(marker in lowered for marker in markers)

    if mentions("gpt-4o", "o1", "o3"):
        encoding_name = "o200k_base"
    elif mentions("gpt", "claude", "anthropic"):
        encoding_name = "cl100k_base"
    elif mentions("gemini", "google"):
        # No tiktoken encoding is used here; estimate from character count.
        return (lambda text: int(len(text) / 3.23)), "gemini_approx"
    else:
        encoding_name = "cl100k_base"

    try:
        encoding = _tokenizer_cache[encoding_name]
    except KeyError:
        encoding = _tokenizer_cache[encoding_name] = tiktoken.get_encoding(encoding_name)

    return (lambda text: len(encoding.encode(text))), encoding_name
|
|
|
|
|
|
|
|
def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
    """Scale every token-count column by the ``overhead`` multiplier.

    ``prompt_tokens``, ``completion_tokens``, ``cache_read_tokens`` and
    ``cache_creation_tokens`` are multiplied and truncated to int, then
    ``total_tokens`` is recomputed as prompt + completion.

    The input frame is never modified. It is returned as-is (same object) when
    it is empty or ``overhead`` is exactly 1.0; otherwise a scaled copy is
    returned.
    """
    if df.empty or overhead == 1.0:
        return df

    scaled = df.copy()
    for column in (
        "prompt_tokens",
        "completion_tokens",
        "cache_read_tokens",
        "cache_creation_tokens",
    ):
        scaled[column] = (scaled[column] * overhead).astype(int)
    scaled["total_tokens"] = scaled["prompt_tokens"] + scaled["completion_tokens"]
    return scaled
|
|
|
|
|
|
|
|
def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with caching switched off.

    ``cache_read_tokens`` and ``cache_creation_tokens`` are zeroed; all other
    columns are left alone. An empty frame is returned unchanged (same
    object).
    """
    if df.empty:
        return df

    # assign() works on a copy, so the caller's frame keeps its cache columns.
    return df.assign(cache_read_tokens=0, cache_creation_tokens=0)
|
|
|
|
|
|
|
|
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
    """Load a folder's trajectories with token counts computed locally.

    The steps of each trajectory (from ``load_all_trajectory_steps``) are run
    through ``calculate_routing_tokens``; the totals of the first step's model
    become one row with the same columns that ``parse_trajectory`` produces.
    ``instance_cost`` is always 0 here and ``api_calls`` is the number of
    steps. Trajectories that fail are reported with ``print`` and skipped.

    The resulting DataFrame is memoized per folder in
    ``_calculated_tokens_cache``.
    """
    global _calculated_tokens_cache

    cache_key = f"calculated_{folder}"
    cached = _calculated_tokens_cache.get(cache_key)
    if cached is not None:
        return cached

    rows = []
    for instance_id, steps in load_all_trajectory_steps(folder).items():
        if not steps:
            continue

        try:
            per_model = calculate_routing_tokens(steps)
            model = steps[0].get("model", "")
            totals = per_model.get(model, {})

            cache_read = totals.get("cache_read", 0)
            completion = totals.get("completion", 0)
            # Prompt size = what was served from cache + what was sent uncached.
            prompt_tokens = cache_read + totals.get("uncached_input", 0)

            rows.append({
                "instance_id": instance_id,
                "model_name": model,
                "api_calls": len(steps),
                "instance_cost": 0,
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion,
                "total_tokens": prompt_tokens + completion,
                "cache_read_tokens": cache_read,
                "cache_creation_tokens": totals.get("cache_creation", 0),
            })
        except Exception as e:
            print(f"Error calculating tokens for {instance_id}: {e}")

    frame = pd.DataFrame(rows)
    _calculated_tokens_cache[cache_key] = frame
    return frame
|
|
|
|
|
|
|
|
def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load all trajectories as step sequences for routing calculations.

    Trajectory files are looked up under ``TRAJS_DIR / folder`` using several
    known layouts; the first glob pattern that matches anything is used. The
    model name is read once, from the first matched file
    (``info.config.model.cost_calc_model_override``, falling back to
    ``model_name``), and applied to every trajectory in the folder.

    Files that cannot be parsed are reported with ``print`` and skipped. The
    result is memoized per folder in ``_trajectory_steps_cache``.

    Returns:
        dict mapping instance_id -> list of steps for calculate_routing_tokens
    """
    global _trajectory_steps_cache

    cache_key = f"steps_{folder}"
    if cache_key in _trajectory_steps_cache:
        return _trajectory_steps_cache[cache_key]

    output_dir = TRAJS_DIR / folder

    # Most specific layout first; stop at the first pattern that finds files.
    traj_files = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    model_name = ""
    if traj_files:
        try:
            # UTF-8, matching how parse_trajectory_to_steps reads the same files.
            with open(traj_files[0], "r", encoding="utf-8") as f:
                first_data = json.load(f)
            config = first_data.get("info", {}).get("config", {}).get("model", {})
            # `or` so that an explicit null/empty override falls back to model_name.
            model_name = config.get("cost_calc_model_override") or config.get("model_name") or ""
        except Exception as e:
            # Best effort: continue with an empty model name, but say so.
            print(f"Could not read model name from {traj_files[0]}: {e}")

    result = {}
    for traj_path in traj_files:
        try:
            # "x.traj.json" has stem "x.traj"; strip the marker to get the instance id.
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_to_steps(traj_path, model_name)
            if steps:
                result[instance_id] = steps
        except Exception as e:
            print(f"Error parsing steps for {traj_path}: {e}")

    _trajectory_steps_cache[cache_key] = result
    return result
|
|
|
|
|
|
|
|
def get_litellm_model_list() -> list[str]:
    """Return all model names in the litellm price table, sorted ascending."""
    model_names = list(get_litellm_prices().keys())
    model_names.sort()
    return model_names
|
|
|
|
|
|
|
|
def get_litellm_prices() -> dict:
    """Return the litellm model price table, loading it at most once per process.

    Lookup order:
      1. the in-memory ``_litellm_prices_cache``;
      2. the on-disk cache ``LITELLM_PRICES_CACHE``;
      3. a download from ``LITELLM_PRICES_URL`` (30 s timeout), which is then
         written to the on-disk cache.

    An unreadable or corrupt on-disk cache is reported and ignored, and the
    table is downloaded instead. If the download itself fails, an empty dict
    is returned and remembered, so the download is not retried within this
    process. A failure to write the on-disk cache is reported but does not
    discard the downloaded prices.
    """
    global _litellm_prices_cache
    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    if LITELLM_PRICES_CACHE.exists():
        try:
            with open(LITELLM_PRICES_CACHE, encoding="utf-8") as f:
                prices = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Ignoring unreadable litellm price cache {LITELLM_PRICES_CACHE}: {e}")
        else:
            _litellm_prices_cache = prices
            return _litellm_prices_cache

    try:
        response = requests.get(LITELLM_PRICES_URL, timeout=30)
        response.raise_for_status()
        prices = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Failed to download litellm prices: {e}")
        _litellm_prices_cache = {}
        return _litellm_prices_cache

    _litellm_prices_cache = prices

    # Persisting is best effort: a read-only data directory must not cost us
    # the prices we just downloaded.
    try:
        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w", encoding="utf-8") as f:
            json.dump(prices, f)
    except OSError as e:
        print(f"Could not write litellm price cache {LITELLM_PRICES_CACHE}: {e}")

    return _litellm_prices_cache
|
|
|
|
|
|
|
|
def normalize_model_name(name: str) -> str:
    """Normalize a model name for comparison: lower-case it and drop the separators - _ . /"""
    return name.lower().translate(str.maketrans("", "", "-_./"))
|
|
|
|
|
|
|
|
def get_model_prices(model_name: str) -> dict | None: |
|
|
if not model_name: |
|
|
return None |
|
|
|
|
|
prices = get_litellm_prices() |
|
|
|
|
|
clean_name = model_name.replace("anthropic/", "").replace("openai/", "") |
|
|
|
|
|
name_without_date = re.sub(r'-\d{8}$', '', clean_name) |
|
|
|
|
|
candidates = [ |
|
|
model_name, |
|
|
clean_name, |
|
|
name_without_date, |
|
|
f"anthropic/{clean_name}", |
|
|
f"openai/{clean_name}", |
|
|
f"anthropic/{name_without_date}", |
|
|
f"openai/{name_without_date}", |
|
|
] |
|
|
|
|
|
for key in candidates: |
|
|
if key in prices: |
|
|
return prices[key] |
|
|
|
|
|
normalized_name = normalize_model_name(clean_name) |
|
|
normalized_no_date = normalize_model_name(name_without_date) |
|
|
|
|
|
for key, value in prices.items(): |
|
|
key_normalized = normalize_model_name(key) |
|
|
if normalized_name in key_normalized or normalized_no_date in key_normalized: |
|
|
return value |
|
|
key_last_part = key.split('/')[-1] if '/' in key else key |
|
|
key_last_normalized = normalize_model_name(key_last_part) |
|
|
if normalized_name == key_last_normalized or normalized_no_date == key_last_normalized: |
|
|
return value |
|
|
|
|
|
return None |
|
|
|
|
|
|
|
|
def load_or_download_leaderboard():
    """Return the parsed leaderboard JSON, downloading it first if not cached.

    When ``LEADERBOARD_CACHE`` does not exist yet, ``download_leaderboard`` is
    asked to fetch the data into ``DATA_DIR`` and the file it reports is
    renamed to the cache path. The cache file is then read and decoded.
    """
    if not LEADERBOARD_CACHE.exists():
        downloaded = download_leaderboard(output_dir=str(DATA_DIR))
        os.rename(downloaded, LEADERBOARD_CACHE)

    with open(LEADERBOARD_CACHE) as fh:
        return json.load(fh)
|
|
|
|
|
|
|
|
def get_bash_only_df():
    """Build the table of results for the "bash-only" leaderboard.

    The leaderboard JSON is obtained from ``load_or_download_leaderboard`` and
    the ``results`` of the leaderboard named "bash-only" are flattened into
    one row per entry.

    Returns:
        pd.DataFrame with the columns ``name``, ``% resolved`` (formatted
        string), ``date``, ``cost`` (rounded to 2 places), ``instance_cost``
        (rounded to 4 places), ``instance_calls``, ``folder`` and ``os_model``
        (check / cross mark). An empty DataFrame is returned when no
        "bash-only" leaderboard exists.
    """
    data = load_or_download_leaderboard()
    leaderboards = data.get("leaderboards", [])
    # .get(): a leaderboard entry without a "name" is skipped rather than raising KeyError.
    bash_only = next((lb for lb in leaderboards if lb.get("name") == "bash-only"), None)

    if not bash_only:
        return pd.DataFrame()

    def number_or_zero(value):
        # A JSON null arrives as None, which round() cannot handle.
        return 0 if value is None else value

    rows = []
    for r in bash_only.get("results", []):
        resolved_pct = r.get("resolved", 0)
        if isinstance(resolved_pct, (int, float)):
            resolved_str = f"{resolved_pct:.1f}%"
        else:
            resolved_str = str(resolved_pct)

        rows.append({
            "name": r.get("name", ""),
            "% resolved": resolved_str,
            "date": r.get("date", ""),
            "cost": round(number_or_zero(r.get("cost", 0)), 2),
            "instance_cost": round(number_or_zero(r.get("instance_cost", 0)), 4),
            "instance_calls": r.get("instance_calls", 0),
            "folder": r.get("folder", ""),
            "os_model": "✅" if r.get("os_model") else "❌",
        })

    return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
def get_model_details(folder: str):
    """Look up the "bash-only" leaderboard entry whose ``folder`` matches.

    Returns:
        ``(entry, None)`` on success, otherwise ``(None, message)`` where
        ``message`` explains what is missing (no folder given, no "bash-only"
        leaderboard, or no entry with that folder).
    """
    if not folder:
        return None, "Select a model from the table"

    leaderboards = load_or_download_leaderboard().get("leaderboards", [])

    for board in leaderboards:
        if board["name"] == "bash-only":
            break
    else:
        return None, "Leaderboard not found"

    for entry in board["results"]:
        if entry.get("folder") == folder:
            return entry, None

    return None, f"Model with folder '{folder}' not found"
|
|
|
|
|
|
|
|
def check_trajectories_downloaded(folder: str) -> bool:
    """Tell whether ``TRAJS_DIR / folder`` exists and contains at least one entry.

    An empty or missing ``folder`` argument yields False.
    """
    if not folder:
        return False

    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    return any(target.iterdir())
|
|
|
|
|
|
|
|
def _count_trajectory_files(output_dir: Path) -> int:
    """Count trajectory files in ``output_dir``, using the first known layout that matches."""
    for pattern in ("*/*.traj.json", "*/*.traj", "*.json"):
        count = len(list(output_dir.glob(pattern)))
        if count:
            return count
    return 0


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download a model's trajectories from the public S3 bucket via the AWS CLI.

    Files are copied from ``{S3_BUCKET}/{folder}/trajs/`` into
    ``TRAJS_DIR / folder`` with ``aws s3 cp --recursive --no-sign-request``
    (10 minute timeout). If the target directory already exists and is not
    empty, nothing is downloaded.

    When the CLI fails or times out, the target directory is removed again.
    It only ever holds files from this attempt (the download is skipped for a
    non-empty directory), and leaving a partial download behind would make the
    next call report it as "Already downloaded".

    Args:
        folder: leaderboard folder name of the model; must exist in the
            "bash-only" leaderboard (checked through ``get_model_details``).
        progress: Gradio progress tracker; called once before the download starts.

    Returns:
        ``(status_message, gr.update(visible=...))`` - the update is visible
        only when trajectories are available locally.
    """
    import shutil

    if not folder:
        return "❌ No model selected", gr.update(visible=False)

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}", gr.update(visible=False)

    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)

    progress(0, desc="Starting S3 download...")

    try:
        # Argument list (no shell), so `folder` is never interpreted by a shell.
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )

        if result.returncode != 0:
            # Drop whatever was copied so a retry starts from scratch.
            shutil.rmtree(output_dir, ignore_errors=True)
            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)

        file_count = _count_trajectory_files(output_dir)
        if file_count == 0:
            return f"❌ No trajectory files found on S3 for {folder}", gr.update(visible=False)

        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)

        if total_count > 0:
            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
        else:
            resolved_pct = "N/A"

        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
        return status, gr.update(visible=True)

    except subprocess.TimeoutExpired:
        # The CLI was killed mid-transfer; the directory content is incomplete.
        shutil.rmtree(output_dir, ignore_errors=True)
        return "❌ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        # Raised by subprocess.run when the `aws` executable is not found.
        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"❌ Error: {e}", gr.update(visible=False)
|
|
|
|
|
|
|
|
def parse_trajectory(traj_path: Path) -> dict:
    """Summarize one trajectory file using the token usage recorded in it.

    Usage is taken per message from ``msg["usage"]`` or, failing that, from
    ``msg["extra"]["response"]["usage"]``, and summed over all messages.
    Missing or null counters count as 0.

    Args:
        traj_path: path to a trajectory JSON file.

    Returns:
        dict with ``instance_id``, ``model_name``, ``api_calls``,
        ``instance_cost``, ``prompt_tokens``, ``completion_tokens``,
        ``total_tokens``, ``cache_read_tokens`` and ``cache_creation_tokens``.
        ``instance_id`` falls back to the file name without its
        ".traj.json" / ".traj" / ".json" ending when the file does not carry one.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    info = data.get("info", {})
    model_stats = info.get("model_stats", {})
    model_config = info.get("config", {}).get("model", {})
    # `or` so that an explicit null/empty override falls back to model_name.
    model_name = model_config.get("cost_calc_model_override") or model_config.get("model_name") or ""

    # "x.traj.json" has stem "x.traj"; strip the marker so the fallback id is
    # the same one load_all_trajectory_steps derives for the same file.
    fallback_id = traj_path.stem.replace(".traj", "")

    result = {
        "instance_id": data.get("instance_id", fallback_id),
        "model_name": model_name,
        "api_calls": model_stats.get("api_calls", 0),
        "instance_cost": model_stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    # result key -> key inside the usage record
    usage_fields = (
        ("prompt_tokens", "prompt_tokens"),
        ("completion_tokens", "completion_tokens"),
        ("total_tokens", "total_tokens"),
        ("cache_read_tokens", "cache_read_input_tokens"),
        ("cache_creation_tokens", "cache_creation_input_tokens"),
    )

    for msg in data.get("messages", []):
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif isinstance(msg.get("extra"), dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})

        # Skip messages without usage, and anything that is not a mapping.
        if not usage or not isinstance(usage, dict):
            continue

        for result_key, usage_key in usage_fields:
            result[result_key] += usage.get(usage_key, 0) or 0

    return result
|
|
|
|
|
|
|
|
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Load every trajectory of ``folder`` into one DataFrame (one row per file).

    Files are searched under ``TRAJS_DIR / folder`` using several known
    layouts; the first glob pattern that finds anything is used. Each file is
    summarized by ``parse_trajectory``; files that fail to parse are reported
    with ``print`` and skipped. The frame is memoized per folder in
    ``_trajectories_cache``.
    """
    global _trajectories_cache

    cached = _trajectories_cache.get(folder)
    if cached is not None:
        return cached

    output_dir = TRAJS_DIR / folder

    # Most specific layout first; stop at the first pattern that finds files.
    traj_files = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    rows = []
    for traj_path in traj_files:
        try:
            summary = parse_trajectory(traj_path)
        except Exception as e:
            print(f"Error parsing {traj_path}: {e}")
        else:
            rows.append(summary)

    frame = pd.DataFrame(rows)
    _trajectories_cache[folder] = frame
    return frame
|
|
|
|
|
|
|
|
def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create the "Total Cost by Token Type" bar chart (callable on its own for price updates).

    Token totals over all rows of ``df`` are priced per token type; each price
    is applied per 1,000,000 tokens. Uncached input is computed per row as
    prompt - cache read - cache creation, floored at 0.

    Returns:
        A Plotly figure with one bar per token type and a "Total: $..."
        annotation, or None when ``df`` is empty.
    """
    if df.empty:
        return None

    completion_tokens = df["completion_tokens"].sum()
    cache_read_tokens = df["cache_read_tokens"].sum()
    cache_creation_tokens = df["cache_creation_tokens"].sum()
    uncached_tokens = (
        (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"])
        .clip(lower=0)
        .sum()
    )

    uncached_cost = uncached_tokens * input_price / 1e6
    cache_read_cost = cache_read_tokens * cache_read_price / 1e6
    cache_creation_cost = cache_creation_tokens * cache_creation_price / 1e6
    completion_cost = completion_tokens * completion_price / 1e6

    chart_rows = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Cost ($)": [uncached_cost, cache_read_cost, cache_creation_cost, completion_cost],
    })

    fig = px.bar(
        chart_rows,
        x="Token Type",
        y="Cost ($)",
        title="Total Cost by Token Type ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin={"l": 40, "r": 20, "t": 40, "b": 40},
    )

    grand_total = uncached_cost + cache_read_cost + cache_creation_cost + completion_cost
    # Pin the total to the top-right corner of the plotting area.
    fig.add_annotation(
        text=f"Total: ${grand_total:.2f}",
        xref="paper",
        yref="paper",
        x=0.95,
        y=0.95,
        showarrow=False,
        font={"size": 12},
    )

    return fig
|
|
|
|
|
|
|
|
def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create only the token-related charts (used when the token source is switched).

    Returns:
        ``(fig_tokens, fig_tokens_cost, fig_stacked)``:
          - fig_tokens: total tokens (in millions) per token type;
          - fig_tokens_cost: the chart from ``create_cost_by_type_chart``;
          - fig_stacked: per-trajectory stacked bars, trajectories sorted by
            total tokens in descending order.
        ``(None, None, None)`` when ``df`` is empty.
    """
    if df.empty:
        return None, None, None

    corner_note = dict(xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=12))

    # ---- Totals per token type ----
    completion_tokens = df["completion_tokens"].sum()
    cache_read_tokens = df["cache_read_tokens"].sum()
    cache_creation_tokens = df["cache_creation_tokens"].sum()
    # Uncached input per row = prompt - cache read - cache creation, floored at 0.
    uncached_tokens = (
        (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"])
        .clip(lower=0)
        .sum()
    )
    ordered_totals = [uncached_tokens, cache_read_tokens, cache_creation_tokens, completion_tokens]

    fig_tokens = px.bar(
        pd.DataFrame({
            "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
            "Total Tokens (M)": [amount / 1e6 for amount in ordered_totals],
        }),
        x="Token Type",
        y="Total Tokens (M)",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    grand_total = uncached_tokens + cache_read_tokens + cache_creation_tokens + completion_tokens
    fig_tokens.add_annotation(text=f"Total: {grand_total/1e6:.2f}M", **corner_note)

    # ---- Cost per token type ----
    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # ---- Stacked tokens per trajectory ----
    ranked = df.copy()
    ranked["uncached_input_tokens"] = (
        ranked["prompt_tokens"] - ranked["cache_read_tokens"] - ranked["cache_creation_tokens"]
    ).clip(lower=0)
    ranked["total_stacked"] = (
        ranked["uncached_input_tokens"]
        + ranked["cache_read_tokens"]
        + ranked["cache_creation_tokens"]
        + ranked["completion_tokens"]
    )
    ranked = ranked.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    ranked["trajectory_idx"] = range(len(ranked))

    fig_stacked = go.Figure()
    # (legend name, source column, bar color) in stacking order.
    for trace_name, column, color in (
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    ):
        fig_stacked.add_trace(go.Bar(
            name=trace_name,
            x=ranked["trajectory_idx"],
            y=ranked[column] / 1e6,
            marker_color=color,
            hovertemplate=f"Trajectory: %{{x}}<br>{trace_name}: %{{y:.2f}}M<extra></extra>",
        ))
    fig_stacked.update_layout(
        barmode="stack",
        title="Tokens per Trajectory (stacked)",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )

    return fig_tokens, fig_tokens_cost, fig_stacked
|
|
|
|
|
|
|
|
def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create the full set of overview charts for a set of trajectories.

    Returns:
        ``(fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked)``:
          - fig_steps: histogram of ``api_calls`` per trajectory;
          - fig_cost: histogram of the reported ``instance_cost``;
          - fig_tokens: total tokens (in millions) per token type;
          - fig_tokens_cost: the chart from ``create_cost_by_type_chart``;
          - fig_stacked: per-trajectory stacked bars, trajectories sorted by
            total tokens in descending order.
        A tuple of five None values when ``df`` is empty.
    """
    if df.empty:
        return None, None, None, None, None

    corner_note = dict(xref="paper", yref="paper", x=0.95, y=0.95, showarrow=False, font=dict(size=12))
    compact_margin = dict(l=40, r=20, t=40, b=40)

    # ---- Steps histogram ----
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        title="Distribution of API Calls (Steps) per Trajectory",
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(compact_margin),
    )
    fig_steps.add_annotation(
        text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
        **corner_note,
    )

    # ---- Reported cost histogram ----
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        title="Distribution of Cost Reported by Leaderboard ($)",
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(compact_margin),
    )
    fig_cost.add_annotation(
        text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
        **corner_note,
    )

    # ---- Totals per token type ----
    completion_tokens = df["completion_tokens"].sum()
    cache_read_tokens = df["cache_read_tokens"].sum()
    cache_creation_tokens = df["cache_creation_tokens"].sum()
    # Uncached input per row = prompt - cache read - cache creation, floored at 0.
    uncached_tokens = (
        (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"])
        .clip(lower=0)
        .sum()
    )
    ordered_totals = [uncached_tokens, cache_read_tokens, cache_creation_tokens, completion_tokens]

    fig_tokens = px.bar(
        pd.DataFrame({
            "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
            "Tokens (M)": [amount / 1e6 for amount in ordered_totals],
        }),
        x="Token Type",
        y="Tokens (M)",
        title="Total Tokens by Type",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="Token Type",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(compact_margin),
    )
    grand_total = uncached_tokens + cache_read_tokens + cache_creation_tokens + completion_tokens
    fig_tokens.add_annotation(text=f"Total: {grand_total/1e6:.2f}M", **corner_note)

    # ---- Cost per token type ----
    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # ---- Stacked tokens per trajectory ----
    ranked = df.copy()
    ranked["uncached_input_tokens"] = (
        ranked["prompt_tokens"] - ranked["cache_read_tokens"] - ranked["cache_creation_tokens"]
    ).clip(lower=0)
    ranked["total_stacked"] = (
        ranked["uncached_input_tokens"]
        + ranked["cache_read_tokens"]
        + ranked["cache_creation_tokens"]
        + ranked["completion_tokens"]
    )
    ranked = ranked.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    ranked["trajectory_idx"] = range(len(ranked))

    fig_stacked = go.Figure()
    # (legend name, source column, bar color) in stacking order.
    for trace_name, column, color in (
        ("Uncached Input", "uncached_input_tokens", "#EF553B"),
        ("Cache Read", "cache_read_tokens", "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "#FFA15A"),
        ("Completion", "completion_tokens", "#AB63FA"),
    ):
        fig_stacked.add_trace(go.Bar(
            name=trace_name,
            x=ranked["trajectory_idx"],
            y=ranked[column] / 1e6,
            marker_color=color,
            hovertemplate=f"Trajectory: %{{x}}<br>{trace_name}: %{{y:.3f}}M<extra></extra>",
        ))

    fig_stacked.update_layout(
        barmode="stack",
        title="Tokens per Trajectory (stacked)",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )

    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked
|
|
|
|
|
|
|
|
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build a stacked bar chart of each trajectory's cost, split by token type.

    Args:
        df: Frame with ``prompt_tokens``, ``cache_read_tokens``,
            ``cache_creation_tokens`` and ``completion_tokens`` columns.
        input_price: Price of uncached input tokens, in $ per 1M tokens.
        cache_read_price: Price of cache-read tokens, in $ per 1M tokens.
        cache_creation_price: Price of cache-creation tokens, in $ per 1M tokens.
        completion_price: Price of completion tokens, in $ per 1M tokens.

    Returns:
        A plotly figure with one stacked bar per row of ``df`` (rows ordered
        by total token count, largest first) and a "Total" annotation, or
        ``None`` when ``df`` is empty.
    """
    if df.empty:
        return None

    frame = df.copy()

    # Prompt tokens left after subtracting cache reads and cache writes; never negative.
    frame["uncached_input_tokens"] = (
        frame["prompt_tokens"] - frame["cache_read_tokens"] - frame["cache_creation_tokens"]
    ).clip(lower=0)
    frame["total_stacked"] = (
        frame["uncached_input_tokens"]
        + frame["cache_read_tokens"]
        + frame["cache_creation_tokens"]
        + frame["completion_tokens"]
    )

    # The x axis is the rank of the trajectory by total tokens, not its original index.
    frame = frame.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    frame["trajectory_idx"] = range(len(frame))

    # (legend label, token column, cost column, $/1M price, bar colour), in stacking order.
    components = (
        ("Uncached Input", "uncached_input_tokens", "cost_uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read_tokens", "cost_cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", "cost_cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion_tokens", "cost_completion", completion_price, "#AB63FA"),
    )

    for _, token_col, cost_col, price, _ in components:
        frame[cost_col] = frame[token_col] * price / 1e6

    fig = go.Figure()
    for label, _, cost_col, price, colour in components:
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=frame["trajectory_idx"],
            y=frame[cost_col],
            marker_color=colour,
            hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))

    total_cost = sum(frame[cost_col].sum() for _, _, cost_col, _, _ in components)

    fig.update_layout(
        barmode="stack",
        title="Cost per Trajectory",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=60, b=40),
    )

    # Overall cost, pinned near the top-right corner of the plot area.
    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )

    return fig
|
|
|
|
|
|
|
|
def extract_model_from_folder(folder: str) -> str:
    """Return the model part of a run folder name.

    Everything after the second underscore is treated as the model, e.g.
    '20251124_mini-v1.16.0_claude-opus-4-5-20251101' gives
    'claude-opus-4-5-20251101'. A falsy folder gives "", and a name with
    fewer than three underscore-separated parts is returned unchanged.
    """
    if not folder:
        return ""

    # Split at most twice so underscores inside the model name survive intact.
    pieces = folder.split("_", 2)
    return pieces[2] if len(pieces) == 3 else folder
|
|
|
|
|
|
|
|
def get_prices_for_folder(folder: str) -> tuple[dict, str]:
    """Look up $/1M-token prices for the model encoded in a folder name.

    Returns:
        ``(prices_dict, model_name)``. ``prices_dict`` maps 'input',
        'cache_read', 'cache_creation' and 'completion' to
        ``{"value": ..., "found": ...}``; ``found`` is True only when the
        looked-up price is positive. When a price is missing, ``value`` may
        hold an estimate derived from the input price (or, failing that,
        from the completion price). ``model_name`` is "" when no model can
        be extracted from ``folder``.
    """
    # Result key -> key in the price record returned by get_model_prices().
    price_keys = {
        "input": "input_cost_per_token",
        "cache_read": "cache_read_input_token_cost",
        "cache_creation": "cache_creation_input_token_cost",
        "completion": "output_cost_per_token",
    }
    result = {kind: {"value": 0, "found": False} for kind in price_keys}

    model_hint = extract_model_from_folder(folder)
    if not model_hint:
        return result, ""

    prices = get_model_prices(model_hint)
    if not prices:
        return result, model_hint

    # Per-token prices are scaled to per-million-token prices.
    for kind, key in price_keys.items():
        per_million = prices.get(key, 0) * 1e6
        result[kind] = {"value": per_million, "found": per_million > 0}

    input_price = result["input"]["value"]
    completion = result["completion"]["value"]

    # Estimates for missing prices, using fixed ratios relative to the input price:
    # cache read 0.1x, cache creation 1.25x, completion 5x.
    if input_price > 0:
        estimates = {
            "cache_read": input_price * 0.1,
            "cache_creation": input_price * 1.25,
            "completion": input_price * 5,
        }
    elif completion > 0:
        # No input price: back it out of the completion price with the same 5x ratio.
        estimated_input = completion / 5
        estimates = {
            "input": estimated_input,
            "cache_read": estimated_input * 0.1,
            "cache_creation": estimated_input * 1.25,
        }
    else:
        estimates = {}

    # Only fill gaps; looked-up prices are never overwritten and "found" stays False.
    for kind, estimate in estimates.items():
        if not result[kind]["found"]:
            result[kind]["value"] = estimate

    return result, model_hint
|
|
|
|
|
|
|
|
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a row selection in the leaderboard table.

    Returns a 9-tuple: folder, name, a visibility update, four price-field
    updates (Input, Cache Read, Cache Creation, Completion), the detected
    model hint, and an update carrying the default overhead value. When the
    event has no index, the tuple holds blank/zero values, the visibility
    update hides its target, and the overhead update is set to 1.0.
    """
    price_fields = (
        ("input", "Input"),
        ("cache_read", "Cache Read"),
        ("cache_creation", "Cache Creation"),
        ("completion", "Completion"),
    )

    if evt.index is None:
        return (
            "", "",
            gr.update(visible=False),
            *[gr.update(value=0, label=title) for _, title in price_fields],
            "",
            gr.update(value=1.0),
        )

    # The event index may be a (row, col) pair or a bare row number.
    selection = evt.index
    row_idx = selection[0] if isinstance(selection, (list, tuple)) else selection
    row = df.iloc[row_idx]
    folder = row["folder"]
    name = row["name"]

    prices_dict, model_hint = get_prices_for_folder(folder)
    default_overhead = get_default_overhead(model_hint)

    def labelled_price(info, title):
        """Build the update for one price field, marking found vs. estimated values."""
        amount = info["value"]
        if info["found"]:
            return gr.update(value=amount, label=f"✅ {title}")
        if amount > 0:
            return gr.update(value=amount, label=f"❌ {title} (est.)")
        return gr.update(value=0, label=f"❌ {title}")

    return (
        folder, name,
        gr.update(visible=True),
        *[labelled_price(prices_dict[key], title) for key, title in price_fields],
        model_hint,
        gr.update(value=default_overhead),
    )
|
|
|
|
|
|
|
|
def create_routed_token_chart(base_tokens: dict, additional_models: list):
    """
    Build a grouped bar chart of token counts by type, base model vs. routed models.

    Args:
        base_tokens: dict with uncached_input, cache_read, cache_creation, completion
        additional_models: list of (model_name, tokens_dict) tuples

    Returns:
        A plotly figure with one bar group per token type (values in millions)
        and an annotation listing the grand total and each model's total.
    """
    import plotly.graph_objects as go

    token_keys = ("uncached_input", "cache_read", "cache_creation", "completion")
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    # The base model comes first so it always takes the first palette colour;
    # unnamed routed models fall back to "Model <n>".
    series = [("Base Model", base_tokens)]
    for position, (model_name, tokens) in enumerate(additional_models, start=1):
        series.append((model_name or f"Model {position}", tokens))

    fig = go.Figure()
    model_totals = []
    for slot, (label, tokens) in enumerate(series):
        model_totals.append((label, sum(tokens.get(key, 0) for key in token_keys)))
        fig.add_trace(go.Bar(
            name=label,
            x=categories,
            y=[tokens.get(key, 0) / 1e6 for key in token_keys],
            marker_color=palette[slot % len(palette)],  # colours repeat once the palette runs out
        ))

    grand_total = sum(total for _, total in model_totals)
    annotation_lines = [f"<b>Total: {grand_total/1e6:.2f}M</b>"]
    annotation_lines.extend(f"{label}: {total/1e6:.2f}M" for label, total in model_totals)

    fig.update_layout(
        title="Tokens by Type (per Model)",
        yaxis_title="Tokens (M)",
        barmode="group",
        margin=dict(l=40, r=40, t=80, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
|
|
|
|
|
|
|
|
def create_routed_cost_chart(base_costs: dict, additional_models: list):
    """
    Build a grouped bar chart of dollar cost by token type, base model vs. routed models.

    Args:
        base_costs: dict with uncached_input, cache_read, cache_creation, completion
        additional_models: list of (model_name, costs_dict) tuples

    Returns:
        A plotly figure with one bar group per token type and an annotation
        listing the grand total and each model's total, in dollars.
    """
    import plotly.graph_objects as go

    cost_keys = ("uncached_input", "cache_read", "cache_creation", "completion")
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    palette = ["#636EFA", "#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    def breakdown(costs):
        """Return (total, per-category values) for one model's cost dict."""
        total = sum(costs.get(key, 0) for key in cost_keys)
        return total, [costs.get(key, 0) for key in cost_keys]

    fig = go.Figure()

    base_total, base_values = breakdown(base_costs)
    fig.add_trace(go.Bar(name="Base Model", x=categories, y=base_values, marker_color=palette[0]))
    model_totals = [("Base Model", base_total)]

    for index, (model_name, costs) in enumerate(additional_models):
        # Unnamed routed models fall back to a 1-based "Model <n>" label.
        label = model_name or f"Model {index+1}"
        model_total, values = breakdown(costs)
        model_totals.append((label, model_total))
        # Offset by one because the base model already took the first colour.
        fig.add_trace(go.Bar(
            name=label,
            x=categories,
            y=values,
            marker_color=palette[(index + 1) % len(palette)],
        ))

    grand_total = sum(total for _, total in model_totals)
    annotation_lines = [f"<b>Total: ${grand_total:.2f}</b>"] + [
        f"{label}: ${total:.2f}" for label, total in model_totals
    ]

    fig.update_layout(
        title="Cost by Type (per Model) ($)",
        yaxis_title="Cost ($)",
        barmode="group",
        margin=dict(l=40, r=40, t=80, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
|
|
|
|
|
|
|
|
def build_app(): |
|
|
leaderboard_df = get_bash_only_df() |
|
|
|
|
|
with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app: |
|
|
trajectories_state = gr.State(None) |
|
|
|
|
|
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard") |
|
|
gr.Markdown("Select a model to use as base for cost analysis") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=3): |
|
|
leaderboard_table = gr.Dataframe( |
|
|
value=leaderboard_df, |
|
|
label="Bash-Only Leaderboard", |
|
|
interactive=False, |
|
|
wrap=True, |
|
|
) |
|
|
|
|
|
with gr.Column(visible=False) as analysis_section: |
|
|
gr.Markdown("## 📊 Trajectory Analysis") |
|
|
|
|
|
with gr.Row(): |
|
|
plot_steps = gr.Plot(label="API Calls Distribution") |
|
|
plot_cost = gr.Plot(label="Cost Distribution") |
|
|
|
|
|
with gr.Row(): |
|
|
plot_tokens = gr.Plot(label="Token Usage by Type") |
|
|
plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)") |
|
|
|
|
|
with gr.Row(visible=False) as routing_plots_row: |
|
|
routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)") |
|
|
routing_cost_plot = gr.Plot(label="Cost by Type (per Model)") |
|
|
|
|
|
with gr.Row(): |
|
|
plot_stacked = gr.Plot(label="Tokens per Trajectory") |
|
|
plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)") |
|
|
|
|
|
with gr.Column(scale=1): |
|
|
selected_folder = gr.State("") |
|
|
gr.Markdown("### Selected Model") |
|
|
selected_name = gr.Textbox(label="Model Name", interactive=False) |
|
|
|
|
|
analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary") |
|
|
download_status = gr.Textbox(label="Status", interactive=False, lines=3) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*") |
|
|
detected_model = gr.Textbox(label="Detected Model", interactive=False) |
|
|
with gr.Row(): |
|
|
price_input = gr.Number(label="Input", value=0, precision=2, scale=1) |
|
|
price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1) |
|
|
price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1) |
|
|
price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### 📊 Token Count Source") |
|
|
token_source = gr.Radio( |
|
|
choices=["Metadata", "Calculated"], |
|
|
value="Metadata", |
|
|
) |
|
|
thinking_overhead = gr.Number( |
|
|
label="🔢 Tokenizer Overhead", |
|
|
value=1.21, |
|
|
precision=2, |
|
|
info="Multiplier for Calculated tokens (tiktoken → native)", |
|
|
visible=False, |
|
|
) |
|
|
use_cache = gr.Checkbox( |
|
|
label="Use Cache", |
|
|
value=True, |
|
|
info="If disabled, all tokens are Uncached Input or Completion", |
|
|
visible=False, |
|
|
) |
|
|
|
|
|
gr.Markdown("---") |
|
|
add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as routing_section: |
|
|
gr.Markdown("### 🔀 Routing Models") |
|
|
|
|
|
STRATEGY_CHOICES = [ |
|
|
"Replace on random steps", |
|
|
"Replace every step k", |
|
|
"Replace part of trajectory", |
|
|
] |
|
|
|
|
|
with gr.Column(): |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 1") |
|
|
routing_model_1 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_1_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
strategy_1 = gr.Dropdown( |
|
|
label="Strategy", |
|
|
choices=STRATEGY_CHOICES, |
|
|
value="Replace on random steps", |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(visible=True) as random_params_1: |
|
|
random_pct_1 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True) |
|
|
with gr.Row(visible=False) as every_k_params_1: |
|
|
step_k_1 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True) |
|
|
with gr.Row(visible=False) as part_params_1: |
|
|
start_step_1 = gr.Number(label="Start (int=step; 0,0-1,0=ratio)", value=0, minimum=0, precision=2, interactive=True) |
|
|
end_step_1 = gr.Number(label="End (int=step; 0,0-1,0=ratio)", value=0.5, minimum=0, precision=2, interactive=True) |
|
|
|
|
|
add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as routing_block_2: |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 2") |
|
|
routing_model_2 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_2_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
strategy_2 = gr.Dropdown( |
|
|
label="Strategy", |
|
|
choices=STRATEGY_CHOICES, |
|
|
value="Replace on random steps", |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(visible=True) as random_params_2: |
|
|
random_pct_2 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True) |
|
|
with gr.Row(visible=False) as every_k_params_2: |
|
|
step_k_2 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True) |
|
|
with gr.Row(visible=False) as part_params_2: |
|
|
start_step_2 = gr.Number(label="Start (int=step; 0,0-1,0=ratio)", value=0, minimum=0, precision=2, interactive=True) |
|
|
end_step_2 = gr.Number(label="End (int=step; 0,0-1,0=ratio)", value=0.5, minimum=0, precision=2, interactive=True) |
|
|
|
|
|
add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as routing_block_3: |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 3") |
|
|
routing_model_3 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_3_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
strategy_3 = gr.Dropdown( |
|
|
label="Strategy", |
|
|
choices=STRATEGY_CHOICES, |
|
|
value="Replace on random steps", |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(visible=True) as random_params_3: |
|
|
random_pct_3 = gr.Number(label="Percentage (%)", value=50, minimum=0, maximum=100, precision=0, interactive=True) |
|
|
with gr.Row(visible=False) as every_k_params_3: |
|
|
step_k_3 = gr.Number(label="k", value=2, minimum=1, precision=0, interactive=True) |
|
|
with gr.Row(visible=False) as part_params_3: |
|
|
start_step_3 = gr.Number(label="Start (int=step; 0,0-1,0=ratio)", value=0, minimum=0, precision=2, interactive=True) |
|
|
end_step_3 = gr.Number(label="End (int=step; 0,0-1,0=ratio)", value=0.5, minimum=0, precision=2, interactive=True) |
|
|
|
|
|
gr.Markdown("---") |
|
|
route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False) |
|
|
routing_result = gr.Markdown(visible=False) |
|
|
|
|
|
|
|
|
def on_strategy_change(strategy): |
|
|
return ( |
|
|
gr.update(visible=strategy == "Replace on random steps"), |
|
|
gr.update(visible=strategy == "Replace every step k"), |
|
|
gr.update(visible=strategy == "Replace part of trajectory"), |
|
|
) |
|
|
|
|
|
def toggle_routing_section(): |
|
|
return gr.update(visible=True) |
|
|
|
|
|
add_routing_btn.click( |
|
|
fn=toggle_routing_section, |
|
|
outputs=[routing_section], |
|
|
) |
|
|
|
|
|
strategy_1.change( |
|
|
fn=on_strategy_change, |
|
|
inputs=[strategy_1], |
|
|
outputs=[random_params_1, every_k_params_1, part_params_1], |
|
|
) |
|
|
|
|
|
strategy_2.change( |
|
|
fn=on_strategy_change, |
|
|
inputs=[strategy_2], |
|
|
outputs=[random_params_2, every_k_params_2, part_params_2], |
|
|
) |
|
|
|
|
|
strategy_3.change( |
|
|
fn=on_strategy_change, |
|
|
inputs=[strategy_3], |
|
|
outputs=[random_params_3, every_k_params_3, part_params_3], |
|
|
) |
|
|
|
|
|
def filter_models(query): |
|
|
"""Filter models based on search query (starts at 3 chars)""" |
|
|
if not query or len(query) < 3: |
|
|
return gr.update(choices=[]) |
|
|
all_models = get_litellm_model_list() |
|
|
query_lower = query.lower() |
|
|
filtered = [m for m in all_models if query_lower in m.lower()][:50] |
|
|
return gr.update(choices=filtered) |
|
|
|
|
|
routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1]) |
|
|
routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2]) |
|
|
routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3]) |
|
|
|
|
|
def get_routing_prices_with_labels(model_name): |
|
|
"""Get all 4 prices for a routing model with found/estimated labels""" |
|
|
if not model_name: |
|
|
return ( |
|
|
gr.update(value=0, label="Input"), |
|
|
gr.update(value=0, label="Cache Read"), |
|
|
gr.update(value=0, label="Cache Creation"), |
|
|
gr.update(value=0, label="Completion"), |
|
|
) |
|
|
|
|
|
prices = get_litellm_prices() |
|
|
model_prices = prices.get(model_name, {}) |
|
|
|
|
|
input_price = model_prices.get("input_cost_per_token", 0) * 1e6 |
|
|
cache_read = model_prices.get("cache_read_input_token_cost", 0) * 1e6 |
|
|
cache_creation = model_prices.get("cache_creation_input_token_cost", 0) * 1e6 |
|
|
completion = model_prices.get("output_cost_per_token", 0) * 1e6 |
|
|
|
|
|
input_found = input_price > 0 |
|
|
cache_read_found = cache_read > 0 |
|
|
cache_creation_found = cache_creation > 0 |
|
|
completion_found = completion > 0 |
|
|
|
|
|
if not cache_read_found and input_price > 0: |
|
|
cache_read = input_price * 0.1 |
|
|
if not cache_creation_found and input_price > 0: |
|
|
cache_creation = input_price * 1.25 |
|
|
|
|
|
def label(name, found): |
|
|
return f"✅ {name}" if found else f"❌ {name}" |
|
|
|
|
|
return ( |
|
|
gr.update(value=input_price, label=label("Input", input_found)), |
|
|
gr.update(value=cache_read, label=label("Cache Read", cache_read_found)), |
|
|
gr.update(value=cache_creation, label=label("Cache Creation", cache_creation_found)), |
|
|
gr.update(value=completion, label=label("Completion", completion_found)), |
|
|
) |
|
|
|
|
|
def on_routing_model_1_select(model_name): |
|
|
prices = get_routing_prices_with_labels(model_name) |
|
|
show_btn = bool(model_name) |
|
|
return *prices, gr.update(visible=show_btn), gr.update(interactive=show_btn) |
|
|
|
|
|
def on_routing_model_2_select(model_name): |
|
|
prices = get_routing_prices_with_labels(model_name) |
|
|
show_btn = bool(model_name) |
|
|
return *prices, gr.update(visible=show_btn) |
|
|
|
|
|
def on_routing_model_3_select(model_name): |
|
|
return get_routing_prices_with_labels(model_name) |
|
|
|
|
|
routing_model_1.change( |
|
|
fn=on_routing_model_1_select, |
|
|
inputs=[routing_model_1], |
|
|
outputs=[routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn], |
|
|
) |
|
|
|
|
|
add_model_2_btn.click( |
|
|
fn=lambda: (gr.update(visible=True), gr.update(visible=False)), |
|
|
outputs=[routing_block_2, add_model_2_btn], |
|
|
) |
|
|
|
|
|
routing_model_2.change( |
|
|
fn=on_routing_model_2_select, |
|
|
inputs=[routing_model_2], |
|
|
outputs=[routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn], |
|
|
) |
|
|
|
|
|
add_model_3_btn.click( |
|
|
fn=lambda: (gr.update(visible=True), gr.update(visible=False)), |
|
|
outputs=[routing_block_3, add_model_3_btn], |
|
|
) |
|
|
|
|
|
routing_model_3.change( |
|
|
fn=on_routing_model_3_select, |
|
|
inputs=[routing_model_3], |
|
|
outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion], |
|
|
) |
|
|
|
|
|
def run_routing( |
|
|
state_data, |
|
|
base_input, base_cache_read, base_cache_creation, base_completion, |
|
|
routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion, |
|
|
strategy_1_val, random_pct_1_val, step_k_1_val, start_1_val, end_1_val, |
|
|
source, overhead, with_cache |
|
|
): |
|
|
if state_data is None: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ No trajectories loaded. Click 'Load & Analyze' first."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
if not routing_model_1_val: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ Please select at least one routing model."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
trajectory_steps = state_data.get("steps", {}) |
|
|
if not trajectory_steps: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ No trajectory steps data available."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
df_key = "meta" if source == "Metadata" else "calculated" |
|
|
df = state_data.get(df_key) |
|
|
if df is not None and not df.empty: |
|
|
if source == "Calculated": |
|
|
df = apply_thinking_overhead(df.copy(), overhead) |
|
|
if not with_cache: |
|
|
df = apply_no_cache(df) |
|
|
df_temp = df.copy() |
|
|
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0) |
|
|
total_original_cost_from_df = ( |
|
|
df_temp["uncached_input"].sum() * base_input / 1e6 + |
|
|
df["cache_read_tokens"].sum() * base_cache_read / 1e6 + |
|
|
df["cache_creation_tokens"].sum() * base_cache_creation / 1e6 + |
|
|
df["completion_tokens"].sum() * base_completion / 1e6 |
|
|
) |
|
|
else: |
|
|
total_original_cost_from_df = None |
|
|
|
|
|
base_prices = { |
|
|
"input": base_input, |
|
|
"cache_read": base_cache_read, |
|
|
"cache_creation": base_cache_creation, |
|
|
"completion": base_completion, |
|
|
} |
|
|
routing_prices = { |
|
|
"input": r1_input, |
|
|
"cache_read": r1_cache_read, |
|
|
"cache_creation": r1_cache_creation, |
|
|
"completion": r1_completion, |
|
|
} |
|
|
|
|
|
strategy_params = {} |
|
|
if strategy_1_val == "Replace on random steps": |
|
|
strategy_params["percentage"] = random_pct_1_val |
|
|
elif strategy_1_val == "Replace every step k": |
|
|
strategy_params["k"] = step_k_1_val |
|
|
elif strategy_1_val == "Replace part of trajectory": |
|
|
strategy_params["start"] = start_1_val |
|
|
strategy_params["end"] = end_1_val |
|
|
|
|
|
total_base_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} |
|
|
total_routing_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} |
|
|
total_original_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} |
|
|
|
|
|
BASE_MODEL = "__base__" |
|
|
ROUTING_MODEL = "__routing__" |
|
|
|
|
|
for instance_id, steps in trajectory_steps.items(): |
|
|
if not steps: |
|
|
continue |
|
|
|
|
|
total_steps = len(steps) |
|
|
routed_step_indices = get_routed_steps(total_steps, strategy_1_val, strategy_params) |
|
|
|
|
|
modified_steps = [] |
|
|
for i, step in enumerate(steps): |
|
|
model = ROUTING_MODEL if i in routed_step_indices else BASE_MODEL |
|
|
modified_steps.append({ |
|
|
"model": model, |
|
|
"system_user": step.get("system_user", 0), |
|
|
"completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)), |
|
|
"observation": step.get("observation"), |
|
|
}) |
|
|
|
|
|
model_totals = calculate_routing_tokens(modified_steps) |
|
|
|
|
|
base_totals = model_totals.get(BASE_MODEL, { |
|
|
"cache_read": 0, "uncached_input": 0, "completion": 0, "cache_creation": 0 |
|
|
}) |
|
|
routing_totals = model_totals.get(ROUTING_MODEL, { |
|
|
"cache_read": 0, "uncached_input": 0, "completion": 0, "cache_creation": 0 |
|
|
}) |
|
|
|
|
|
total_base_tokens["cache_read"] += base_totals.get("cache_read", 0) |
|
|
total_base_tokens["uncached_input"] += base_totals.get("uncached_input", 0) |
|
|
total_base_tokens["completion"] += base_totals.get("completion", 0) |
|
|
total_base_tokens["cache_creation"] += base_totals.get("cache_creation", 0) |
|
|
|
|
|
total_routing_tokens["cache_read"] += routing_totals.get("cache_read", 0) |
|
|
total_routing_tokens["uncached_input"] += routing_totals.get("uncached_input", 0) |
|
|
total_routing_tokens["completion"] += routing_totals.get("completion", 0) |
|
|
total_routing_tokens["cache_creation"] += routing_totals.get("cache_creation", 0) |
|
|
|
|
|
original_steps = [] |
|
|
for step in steps: |
|
|
original_steps.append({ |
|
|
"model": BASE_MODEL, |
|
|
"system_user": step.get("system_user", 0), |
|
|
"completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)), |
|
|
"observation": step.get("observation"), |
|
|
}) |
|
|
original_totals = calculate_routing_tokens(original_steps) |
|
|
orig = original_totals.get(BASE_MODEL, {}) |
|
|
total_original_tokens["cache_read"] += orig.get("cache_read", 0) |
|
|
total_original_tokens["uncached_input"] += orig.get("uncached_input", 0) |
|
|
total_original_tokens["completion"] += orig.get("completion", 0) |
|
|
total_original_tokens["cache_creation"] += orig.get("cache_creation", 0) |
|
|
|
|
|
def calc_cost(tokens: dict, prices: dict) -> float: |
|
|
return ( |
|
|
tokens["uncached_input"] * prices["input"] / 1e6 + |
|
|
tokens["cache_read"] * prices["cache_read"] / 1e6 + |
|
|
tokens["cache_creation"] * prices["cache_creation"] / 1e6 + |
|
|
tokens["completion"] * prices["completion"] / 1e6 |
|
|
) |
|
|
|
|
|
base_costs = {k: total_base_tokens[k] * base_prices[{"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}[k]] / 1e6 for k in total_base_tokens} |
|
|
routing_costs = {k: total_routing_tokens[k] * routing_prices[{"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}[k]] / 1e6 for k in total_routing_tokens} |
|
|
|
|
|
total_base_cost = calc_cost(total_base_tokens, base_prices) |
|
|
total_routing_cost = calc_cost(total_routing_tokens, routing_prices) |
|
|
|
|
|
if total_original_cost_from_df is not None: |
|
|
total_original_cost = total_original_cost_from_df |
|
|
else: |
|
|
total_original_cost = calc_cost(total_original_tokens, base_prices) |
|
|
|
|
|
total_routed_cost = total_base_cost + total_routing_cost |
|
|
savings = total_original_cost - total_routed_cost |
|
|
savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0 |
|
|
|
|
|
result_text = f""" |
|
|
## 🚀 Routing Results |
|
|
|
|
|
| Metric | Value | |
|
|
|--------|-------| |
|
|
| **Original Cost (base model only)** | ${total_original_cost:.2f} | |
|
|
| **Routed Cost** | ${total_routed_cost:.2f} | |
|
|
| ↳ Base model portion | ${total_base_cost:.2f} | |
|
|
| ↳ Routing model portion | ${total_routing_cost:.2f} | |
|
|
| **Savings** | ${savings:.2f} ({savings_pct:+.1f}%) | |
|
|
|
|
|
*Strategy: {strategy_1_val}* |
|
|
*Routing model: {routing_model_1_val}* |
|
|
""" |
|
|
|
|
|
additional_token_models = [(routing_model_1_val, total_routing_tokens)] |
|
|
additional_cost_models = [(routing_model_1_val, routing_costs)] |
|
|
|
|
|
yield ( |
|
|
gr.update(visible=True, value="⏳ Creating charts..."), |
|
|
gr.update(visible=True), |
|
|
None, |
|
|
None, |
|
|
) |
|
|
|
|
|
tokens_chart = create_routed_token_chart(total_base_tokens, additional_token_models) |
|
|
cost_chart = create_routed_cost_chart(base_costs, additional_cost_models) |
|
|
|
|
|
yield ( |
|
|
gr.update(visible=True, value=result_text), |
|
|
gr.update(visible=True), |
|
|
tokens_chart, |
|
|
cost_chart, |
|
|
) |
|
|
|
|
|
route_btn.click( |
|
|
fn=run_routing, |
|
|
inputs=[ |
|
|
trajectories_state, |
|
|
price_input, price_cache_read, price_cache_creation, price_completion, |
|
|
routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, |
|
|
strategy_1, random_pct_1, step_k_1, start_step_1, end_step_1, |
|
|
token_source, thinking_overhead, use_cache, |
|
|
], |
|
|
outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot], |
|
|
) |
|
|
|
|
|
def update_calculated_options_visibility(source):
    """Show the calculated-token options only for the "Calculated" source.

    Returns a pair of ``gr.update`` objects, one per output component wired to
    this handler; both carry the same ``visible`` flag.
    """
    show_options = source == "Calculated"
    first_update = gr.update(visible=show_options)
    second_update = gr.update(visible=show_options)
    return first_update, second_update
|
|
|
|
|
# Toggle the two calculated-token option widgets whenever the token source
# selection changes.
token_source.change(
    fn=update_calculated_options_visibility,
    inputs=[token_source],
    outputs=[thinking_overhead, use_cache],
)

# Selecting a leaderboard row lets `on_row_select` update the selection
# fields, the analyze button, the four price inputs, the detected model and
# the thinking overhead.
leaderboard_table.select(
    fn=on_row_select,
    inputs=[leaderboard_table],
    outputs=[
        selected_folder,
        selected_name,
        analyze_btn,
        price_input,
        price_cache_read,
        price_cache_creation,
        price_completion,
        detected_model,
        thinking_overhead,
    ],
)
|
|
|
|
|
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache, progress=gr.Progress()):
    """Download (if needed), load and chart the trajectories of one folder.

    This is a generator handler: every yielded value is a 10-tuple of
    (status text, analysis-section update, six figures, trajectories state,
    add-routing-button update), so intermediate yields act as progress
    messages in the UI.

    Args:
        folder: Trajectory folder identifier; a falsy value yields an empty
            status and stops.
        input_price, cache_read_price, cache_creation_price, completion_price:
            Prices forwarded unchanged to the chart builders.
        source: "Metadata" charts the metadata frame; any other value charts
            the calculated frame after applying the thinking overhead.
        overhead: Forwarded to ``apply_thinking_overhead``.
        with_cache: When falsy, ``apply_no_cache`` is applied to the
            calculated frame.
        progress: Gradio progress tracker; not referenced in the body.

    Yields:
        Status-only tuples while working, then either an error tuple or the
        final tuple carrying the figures and the state dictionary.
    """

    def status_only(message, show_analysis=False):
        """Build an output tuple that carries a status message and no data."""
        return (
            message,
            gr.update(visible=show_analysis),
            None, None, None, None, None, None,
            None,
            gr.update(visible=False),
        )

    if not folder:
        yield status_only("")
        return

    if not check_trajectories_downloaded(folder):
        yield status_only("⏳ Downloading trajectories...")
        status, _ = download_trajectories_from_s3(folder)
        if "❌" in status:
            yield status_only(status)
            return

    yield status_only("⏳ Loading trajectories...", show_analysis=True)

    df_meta = load_all_trajectories(folder)
    # Check for "nothing loaded" BEFORE touching df_meta's columns: indexing
    # an empty frame for "api_calls" (or assigning a zero-length array to a
    # populated frame) would raise and hide the friendly message below.
    if df_meta.empty:
        yield status_only("❌ No trajectories found")
        return

    df_calc = load_all_trajectories_calculated(folder)
    # Positional copy (``.values``): relies on both loaders returning rows in
    # the same order.
    df_calc["api_calls"] = df_meta["api_calls"].values
    df_calc["instance_cost"] = df_meta["instance_cost"].values
    trajectory_steps = load_all_trajectory_steps(folder)

    state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}

    if source == "Metadata":
        df = df_meta
    else:
        # Work on a copy so the frame stored in ``state_data`` stays pristine.
        df = apply_thinking_overhead(df_calc.copy(), overhead)
        if not with_cache:
            df = apply_no_cache(df)

    if df.empty:
        yield status_only("❌ No trajectories found")
        return

    fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked = create_basic_histograms(
        df, input_price, cache_read_price, cache_creation_price, completion_price
    )
    fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)

    yield (
        f"✅ Loaded {len(df)} trajectories",
        gr.update(visible=True),
        fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
        state_data,
        gr.update(visible=True),
    )
|
|
|
|
|
# Run the load/analyze generator when the analyze button is clicked. The
# `outputs` order mirrors the 10-tuples yielded by `load_and_analyze`.
analyze_btn.click(
    fn=load_and_analyze,
    inputs=[
        selected_folder,
        price_input,
        price_cache_read,
        price_cache_creation,
        price_completion,
        token_source,
        thinking_overhead,
        use_cache,
    ],
    outputs=[
        download_status,
        analysis_section,
        plot_steps,
        plot_cost,
        plot_tokens,
        plot_tokens_cost,
        plot_stacked,
        plot_cost_breakdown,
        trajectories_state,
        add_routing_btn,
    ],
)
|
|
|
|
|
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
    """Rebuild the two price-dependent charts from the already loaded state.

    Returns ``(cost-by-type figure, cost-breakdown figure)``, or
    ``(None, None)`` when no state is loaded or the selected frame is empty.
    """
    if state_data is None:
        return None, None

    from_metadata = source == "Metadata"
    frame = (
        state_data["meta"]
        if from_metadata
        else apply_thinking_overhead(state_data["calculated"].copy(), overhead)
    )
    # The no-cache transformation only applies to the calculated frame.
    if not from_metadata and not with_cache:
        frame = apply_no_cache(frame)

    if frame.empty:
        return None, None

    prices = (input_price, cache_read_price, cache_creation_price, completion_price)
    by_type_figure = create_cost_by_type_chart(frame, *prices)
    breakdown_figure = create_cost_breakdown(frame, *prices)
    return by_type_figure, breakdown_figure
|
|
|
|
|
price_inputs = [
    trajectories_state,
    price_input,
    price_cache_read,
    price_cache_creation,
    price_completion,
    token_source,
    thinking_overhead,
    use_cache,
]
price_outputs = [plot_tokens_cost, plot_cost_breakdown]

# Every price field triggers the same recalculation; register them in the
# same order as before.
for price_component in (price_input, price_cache_read, price_cache_creation, price_completion):
    price_component.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
|
|
|
|
def on_source_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
    """Recalculate only the token-dependent charts when the source changes.

    Returns ``(tokens, tokens-cost, stacked, cost-breakdown)`` figures, or four
    ``None`` values when no state is loaded or the selected frame is empty.
    """
    nothing = (None, None, None, None)
    if state_data is None:
        return nothing

    from_metadata = source == "Metadata"
    frame = (
        state_data["meta"]
        if from_metadata
        else apply_thinking_overhead(state_data["calculated"].copy(), overhead)
    )
    # The no-cache transformation only applies to the calculated frame.
    if not from_metadata and not with_cache:
        frame = apply_no_cache(frame)

    if frame.empty:
        return nothing

    prices = (input_price, cache_read_price, cache_creation_price, completion_price)
    tokens_figure, tokens_cost_figure, stacked_figure = create_token_charts(frame, *prices)
    breakdown_figure = create_cost_breakdown(frame, *prices)
    return tokens_figure, tokens_cost_figure, stacked_figure, breakdown_figure
|
|
|
|
|
source_change_inputs = [
    trajectories_state,
    price_input,
    price_cache_read,
    price_cache_creation,
    price_completion,
    token_source,
    thinking_overhead,
    use_cache,
]
source_change_outputs = [plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown]

# The three token-source options all refresh the same set of charts; register
# them in the same order as before.
for option_component in (token_source, thinking_overhead, use_cache):
    option_component.change(
        fn=on_source_change,
        inputs=source_change_inputs,
        outputs=source_change_outputs,
    )
|
|
|
|
|
return app |
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, enable the request queue, then serve.
    app = build_app()
    app.queue().launch()
|
|
|