|
|
import json |
|
|
import logging |
|
|
import os |
|
|
import random |
|
|
import re |
|
|
import subprocess |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import plotly.express as px |
|
|
import plotly.graph_objects as go |
|
|
import requests |
|
|
import tiktoken |
|
|
|
|
|
from src.download_swebench_leaderboard import download_leaderboard |
|
|
|
|
|
|
|
|
# Cache of tiktoken encoders, keyed by encoding name (populated by get_tokenizer).
_tokenizer_cache = {}

# On-disk layout: trajectories, leaderboard snapshot and the price table all
# live under data/.
DATA_DIR = Path("data")
TRAJS_DIR = DATA_DIR / "swebench_trajs"
LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
# Source bucket for the SWE-bench "bash-only" experiment artifacts.
S3_BUCKET = "s3://swe-bench-experiments/bash-only"
# Upstream price / context-window table maintained by the litellm project.
LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
LOG_DIR = Path("logs")

# Models offered as quick picks in the UI. Entries are either a plain model id
# or an (id, display_label) pair.
# NOTE(review): only the last entry is a tuple — confirm all consumers handle
# both shapes.
QUICK_SELECT_MODELS = [
    "openrouter/anthropic/claude-opus-4.5",
    "openrouter/anthropic/claude-sonnet-4.5",
    "openrouter/google/gemini-3-pro-preview",
    "openrouter/openai/gpt-5-codex",
    "openrouter/openai/gpt-oss-120b",
    "deepinfra/Qwen/Qwen3-14B",
    "deepinfra/Qwen/Qwen3-32B",
    "deepinfra/Qwen/Qwen3-73B",
    "deepinfra/Qwen/Qwen3-235B-A22B",
    "deepinfra/Qwen/Qwen3-30B-A3B",
    ("deepinfra/Qwen/Qwen3-Coder-480B-A35B-Instruct", "Qwen3-Coder-480B-A35B"),
]

LOG_DIR.mkdir(parents=True, exist_ok=True)
LOG_FILE = LOG_DIR / "app.log"

# Log to both a file and stdout; force=True replaces any handlers that
# imported libraries may have installed before this point.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
        logging.StreamHandler(sys.stdout),
    ],
    force=True,
)
|
|
|
|
|
|
|
|
def _log_unhandled(exc_type, exc_value, exc_traceback):
    """Global excepthook: route uncaught exceptions through logging.

    KeyboardInterrupt is forwarded to the default hook so Ctrl-C keeps its
    normal traceback behavior instead of being swallowed into the log file.
    """
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return
    logging.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))


# Install the hook so crashes end up in logs/app.log as well as stdout.
sys.excepthook = _log_unhandled
|
|
|
|
|
# Process-wide memoization caches.
_litellm_prices_cache = None       # raw litellm price table (all modes)
_litellm_chat_prices_cache = None  # price table filtered to mode == "chat"
_trajectories_cache = {}           # presumably loaded trajectory data — usage not visible in this chunk
_calculated_tokens_cache = {}      # per-folder recomputed token DataFrames
_trajectory_steps_cache = {}       # per-folder parsed step sequences
|
|
|
|
|
|
|
|
def calculate_routing_tokens(steps: list[dict]) -> dict:
    """
    Simulate prompt caching across a step sequence and total tokens per model.

    Args:
        steps: list of dicts with keys:
            - model: str (model name)
            - system_user: int (tokens for system/user message, usually only step 0)
            - completion: int (generated tokens)
            - observation: int or None (env response tokens, None for last step)

    Returns:
        dict with per-model totals:
        {model_name: {cache_read, uncached_input, completion, observation, cache_creation}}
    """
    caches: dict[str, int] = {}
    totals: dict[str, dict] = {}

    context_so_far = 0
    last_observation = 0

    for idx, step in enumerate(steps):
        model = step["model"]
        system_user = step.get("system_user", 0)
        completion = step.get("completion", 0)
        observation = step.get("observation") or 0

        caches.setdefault(model, 0)
        bucket = totals.setdefault(model, {
            "cache_read": 0,
            "uncached_input": 0,
            "completion": 0,
            "observation": 0,
            "cache_creation": 0,
        })

        cache_read = caches[model]

        # The first step pays for the system+user prompt; later steps replay
        # the accumulated context plus the previous observation, minus what
        # this model already holds in cache.
        if idx == 0:
            uncached_input = system_user
        else:
            uncached_input = (context_so_far + last_observation) - cache_read

        # Everything newly sent plus the new completion becomes cached.
        cache_creation = uncached_input + completion
        caches[model] = cache_read + cache_creation

        bucket["cache_read"] += cache_read
        bucket["uncached_input"] += uncached_input
        bucket["completion"] += completion
        bucket["observation"] += observation
        bucket["cache_creation"] += cache_creation

        context_so_far = cache_read + uncached_input + completion
        last_observation = observation

    return totals
|
|
|
|
|
|
|
|
def calculate_per_step_tokens(steps: list[dict]) -> list[dict]:
    """
    Simulate prompt caching over a step sequence and return a per-step breakdown.

    Returns list of per-step data:
    [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
    """
    breakdown: list[dict] = []
    cached_tokens = 0
    context_so_far = 0
    last_observation = 0

    for idx, step in enumerate(steps):
        completion = step.get("completion", 0)
        observation = step.get("observation") or 0

        cache_read = cached_tokens

        # Step 0 pays only for the system/user prompt; later steps replay the
        # whole prior context plus the last observation, minus the cache hit.
        if idx == 0:
            uncached_input = step.get("system_user", 0)
        else:
            uncached_input = (context_so_far + last_observation) - cache_read

        cache_creation = uncached_input + completion
        cached_tokens = cache_read + cache_creation

        breakdown.append({
            "step": idx,
            "cache_read": cache_read,
            "uncached_input": uncached_input,
            "completion": completion,
            "cache_creation": cache_creation,
        })

        context_so_far = cache_read + uncached_input + completion
        last_observation = observation

    return breakdown
|
|
|
|
|
|
|
|
def _parse_usage_from_log_line(line: str) -> dict | None: |
|
|
""" |
|
|
Parse usage info from log line containing ModelResponse or similar format. |
|
|
Returns dict with prompt_tokens, completion_tokens, cached_tokens, etc. |
|
|
""" |
|
|
if "usage=" not in line: |
|
|
return None |
|
|
|
|
|
result = {} |
|
|
|
|
|
for field in ["completion_tokens", "prompt_tokens", "total_tokens"]: |
|
|
match = re.search(rf'{field}=(\d+)', line) |
|
|
if match: |
|
|
result[field] = int(match.group(1)) |
|
|
|
|
|
cached_match = re.search(r'cached_tokens=(\d+)', line) |
|
|
if cached_match: |
|
|
result["cached_tokens"] = int(cached_match.group(1)) |
|
|
|
|
|
return result if result else None |
|
|
|
|
|
|
|
|
def _parse_old_format_log(log_path: Path) -> list[dict]:
    """
    Parse an old SWE-agent format .info.log file into per-step token usage.

    Each line containing "usage=Usage(" yields one step. Best-effort: read or
    parse failures are logged at debug level and whatever was collected so far
    is returned (possibly an empty list).
    """
    parsed_steps: list[dict] = []

    try:
        with open(log_path, "r", encoding="utf-8") as fh:
            for raw_line in fh:
                if "usage=Usage(" not in raw_line:
                    continue

                usage = _parse_usage_from_log_line(raw_line)
                if not usage:
                    continue

                cached = usage.get("cached_tokens", 0)
                prompt = usage.get("prompt_tokens", 0)

                parsed_steps.append({
                    "step": len(parsed_steps),
                    "cache_read": cached,
                    # Old logs don't separate cache-creation; everything not
                    # cached counts as plain input.
                    "uncached_input": max(0, prompt - cached),
                    "completion": usage.get("completion_tokens", 0),
                    "cache_creation": 0,
                })
    except Exception as e:
        logging.debug("Error parsing log file %s: %s", log_path, e)

    return parsed_steps
|
|
|
|
|
|
|
|
def parse_trajectory_metadata_per_step(traj_path: Path) -> list[dict]:
    """
    Extract per-step token metadata from a trajectory file's usage fields.

    Supports both new format (.traj.json with messages[].extra.response.usage)
    and old format (.traj with separate .info.log file).

    Returns list of per-step data:
    [{step: 0, cache_read: X, uncached_input: Y, completion: Z, cache_creation: W}, ...]
    """
    with open(traj_path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)

    per_step: list[dict] = []

    for msg in payload.get("messages", []):
        if msg.get("role") != "assistant":
            continue

        # Usage may live directly on the message or nested under
        # extra.response (litellm-style responses).
        if "usage" in msg:
            usage = msg["usage"]
        elif isinstance(msg.get("extra"), dict):
            response = msg["extra"].get("response", {})
            usage = response.get("usage", {}) if isinstance(response, dict) else None
        else:
            usage = None

        if not usage:
            continue

        prompt = usage.get("prompt_tokens", 0) or 0
        completion = usage.get("completion_tokens", 0) or 0
        cache_read = usage.get("cache_read_input_tokens", 0) or 0
        cache_creation = usage.get("cache_creation_input_tokens", 0) or 0

        # OpenAI-style responses report cache hits under prompt_tokens_details.
        details = usage.get("prompt_tokens_details", {})
        if isinstance(details, dict):
            detail_cached = details.get("cached_tokens", 0) or 0
            if detail_cached > 0 and cache_read == 0:
                cache_read = detail_cached

        per_step.append({
            "step": len(per_step),
            "cache_read": cache_read,
            "uncached_input": max(0, prompt - cache_read - cache_creation),
            "completion": completion,
            "cache_creation": cache_creation,
        })

    if per_step:
        return per_step

    # Old format fallback: token usage lives in a sibling .info.log file.
    log_path = traj_path.with_suffix(".info.log")
    if not log_path.exists():
        base_name = traj_path.stem.replace(".traj", "")
        log_path = traj_path.parent / f"{base_name}.info.log"

    if log_path.exists():
        return _parse_old_format_log(log_path)

    return per_step
|
|
|
|
|
|
|
|
def load_all_trajectory_metadata_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load per-step metadata for all trajectories in a folder.

    Returns:
        dict mapping instance_id -> list of per-step metadata
    """
    output_dir = TRAJS_DIR / folder

    # Try glob patterns from most to least specific; first non-empty match wins.
    traj_files: list[Path] = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    parsed: dict[str, list[dict]] = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_metadata_per_step(traj_path)
            if steps:
                parsed[instance_id] = steps
        except Exception as e:
            logging.error("Error parsing metadata steps for %s: %s", traj_path, e, exc_info=True)

    return parsed
|
|
|
|
|
|
|
|
def create_single_trajectory_meta_chart(steps: list[dict]):
    """Create stacked bar chart for a single trajectory showing metadata tokens per step.

    Args:
        steps: per-step dicts with step / cache_read / uncached_input /
            completion / cache_creation keys (token counts).

    Returns:
        plotly Figure, or None when steps is empty.
    """
    # NOTE: the redundant local `import plotly.graph_objects as go` was
    # removed — it shadowed the identical module-level import.
    if not steps:
        return None

    x_labels = [f"Step {d['step']}" for d in steps]

    # (trace label, source key, bar color) — one stacked segment per category.
    series = [
        ("Uncached Input", "uncached_input", "#EF553B"),
        ("Cache Read", "cache_read", "#19D3F3"),
        ("Cache Creation", "cache_creation", "#FFA15A"),
        ("Completion", "completion", "#AB63FA"),
    ]

    fig = go.Figure()
    for label, key, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[d[key] / 1e3 for d in steps],  # tokens in thousands
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: %{{y:.2f}}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
|
|
|
|
|
|
|
def create_single_trajectory_meta_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create stacked bar chart for a single trajectory showing metadata cost per step.

    Args:
        steps: per-step dicts with cache_read / uncached_input / completion /
            cache_creation token counts.
        input_price, cache_read_price, cache_creation_price, completion_price:
            prices in $ per 1M tokens.

    Returns:
        plotly Figure, or None when steps is empty.
    """
    # NOTE: the redundant local `import plotly.graph_objects as go` was
    # removed — it shadowed the identical module-level import.
    if not steps:
        return None

    x_labels = [f"Step {d['step']}" for d in steps]

    # (trace label, source key, $/1M price, bar color)
    series = [
        ("Uncached Input", "uncached_input", input_price, "#EF553B"),
        ("Cache Read", "cache_read", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation", cache_creation_price, "#FFA15A"),
        ("Completion", "completion", completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, key, price, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[d[key] * price / 1e6 for d in steps],  # dollars
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: $%{{y:.4f}}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
|
|
|
|
|
|
|
def create_single_trajectory_chart(steps: list[dict], overhead: float = 1.0, with_cache: bool = True):
    """Create stacked bar chart for a single trajectory showing tokens per step.

    Args:
        steps: raw step dicts understood by calculate_per_step_tokens.
        overhead: tokenizer overhead multiplier applied to every count.
        with_cache: when False, all prompt tokens are shown as uncached input.

    Returns:
        plotly Figure, or None when steps is empty.
    """
    # NOTE: the redundant local `import plotly.graph_objects as go` was
    # removed — it shadowed the identical module-level import.
    if not steps:
        return None

    per_step_data = calculate_per_step_tokens(steps)

    x_labels = [f"Step {d['step']}" for d in per_step_data]
    cache_read_raw = [d["cache_read"] * overhead for d in per_step_data]
    cache_creation_raw = [d["cache_creation"] * overhead for d in per_step_data]
    completion_raw = [d["completion"] * overhead for d in per_step_data]
    prompt_raw = [(d["cache_read"] + d["uncached_input"]) * overhead for d in per_step_data]

    if with_cache:
        uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_raw, cache_read_raw, cache_creation_raw)]
        cache_read = cache_read_raw
        cache_creation = cache_creation_raw
    else:
        # Without caching, every prompt token is plain uncached input.
        uncached = prompt_raw
        cache_read = [0] * len(per_step_data)
        cache_creation = [0] * len(per_step_data)

    # (trace label, token counts, bar color)
    series = [
        ("Uncached Input", uncached, "#EF553B"),
        ("Cache Read", cache_read, "#19D3F3"),
        ("Cache Creation", cache_creation, "#FFA15A"),
        ("Completion", completion_raw, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, counts, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[c / 1e3 for c in counts],  # tokens in thousands
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: %{{y:.2f}}K<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Tokens (K)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
|
|
|
|
|
|
|
def create_single_trajectory_cost_chart(steps: list[dict], input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float, overhead: float = 1.0, with_cache: bool = True):
    """Create stacked bar chart for a single trajectory showing cost per step.

    Args:
        steps: raw step dicts understood by calculate_per_step_tokens.
        input_price, cache_read_price, cache_creation_price, completion_price:
            prices in $ per 1M tokens.
        overhead: tokenizer overhead multiplier applied to every count.
        with_cache: when False, all prompt tokens are billed at the input rate.

    Returns:
        plotly Figure, or None when steps is empty.
    """
    # NOTE: the redundant local `import plotly.graph_objects as go` was
    # removed — it shadowed the identical module-level import.
    if not steps:
        return None

    per_step_data = calculate_per_step_tokens(steps)

    x_labels = [f"Step {d['step']}" for d in per_step_data]
    cache_read_raw = [d["cache_read"] * overhead for d in per_step_data]
    cache_creation_raw = [d["cache_creation"] * overhead for d in per_step_data]
    completion_raw = [d["completion"] * overhead for d in per_step_data]
    prompt_raw = [(d["cache_read"] + d["uncached_input"]) * overhead for d in per_step_data]

    if with_cache:
        uncached = [max(0, p - cr - cc) for p, cr, cc in zip(prompt_raw, cache_read_raw, cache_creation_raw)]
        cache_read = cache_read_raw
        cache_creation = cache_creation_raw
    else:
        # Without caching, every prompt token is billed at the input rate.
        uncached = prompt_raw
        cache_read = [0] * len(per_step_data)
        cache_creation = [0] * len(per_step_data)

    # (trace label, token counts, $/1M price, bar color)
    series = [
        ("Uncached Input", uncached, input_price, "#EF553B"),
        ("Cache Read", cache_read, cache_read_price, "#19D3F3"),
        ("Cache Creation", cache_creation, cache_creation_price, "#FFA15A"),
        ("Completion", completion_raw, completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    for label, counts, price, color in series:
        fig.add_trace(go.Bar(
            name=label,
            x=x_labels,
            y=[c * price / 1e6 for c in counts],  # dollars
            marker_color=color,
            hovertemplate=f"Step %{{x}}<br>{label}: $%{{y:.4f}}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Step",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig
|
|
|
|
|
|
|
|
def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
    """
    Parse trajectory file into step format for calculate_routing_tokens.

    Returns list of steps with:
    - model: base model name
    - system_user: tokens for system + user message (step 0 only)
    - completion: assistant response tokens
    - observation: env response tokens (None for last step)

    Token counts are estimated from message text with the model's tokenizer.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    messages = data.get("messages", [])
    trajectory_data = data.get("trajectory", [])

    # Some dumps carry a "trajectory" array instead of chat messages.
    if not messages and trajectory_data:
        return _parse_trajectory_format_to_steps(trajectory_data, model_name)

    if not messages:
        return []

    count_tokens, _ = get_tokenizer(model_name)

    steps = []
    system_user_tokens = 0

    # BUG FIX: the original while-loop only advanced its index for the
    # system/user/assistant roles, so any other role (e.g. "tool") hung the
    # parser in an infinite loop. Iterating the list directly removes that
    # failure mode; unknown roles are now simply skipped. Also dropped the
    # unused current_completion/pending_observation locals.
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        if isinstance(content, list):
            content = json.dumps(content)
        tokens = count_tokens(str(content))

        if role == "system":
            system_user_tokens += tokens
        elif role == "user":
            if not steps:
                # Pre-assistant user turns count toward the initial prompt.
                system_user_tokens += tokens
            else:
                # Later user turns are the environment's observation for the
                # previous assistant step.
                steps[-1]["observation"] = tokens
        elif role == "assistant":
            steps.append({
                "model": model_name,
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
                "content": str(content),
            })
            system_user_tokens = 0

    return steps
|
|
|
|
|
|
|
|
def _parse_trajectory_format_to_steps(trajectory_data: list, model_name: str) -> list[dict]:
    """
    Parse the alternative trajectory format (a "trajectory" array) into steps.

    Token counts are estimated with the model's tokenizer, since this format
    carries raw text rather than usage metadata.
    """
    count_tokens, _ = get_tokenizer(model_name)

    steps = []
    for idx, entry in enumerate(trajectory_data):
        response_text = entry.get("response", "")
        observation_text = entry.get("observation", "")

        # Only the first step carries the system/user prompt.
        prompt_tokens = 0
        if idx == 0:
            for message in entry.get("query", []):
                content = message.get("content", "")
                if isinstance(content, list):
                    content = json.dumps(content)
                prompt_tokens += count_tokens(str(content))

        steps.append({
            "model": model_name,
            "system_user": prompt_tokens,
            "completion": count_tokens(str(response_text)) if response_text else 0,
            "observation": count_tokens(str(observation_text)) if observation_text else None,
            "content": str(response_text) if response_text else "",
        })

    return steps
|
|
|
|
|
|
|
|
def get_default_overhead(model_name: str) -> float:
    """Return the default tokenizer-overhead multiplier for a model's provider."""
    lowered = (model_name or "").lower()

    # Anthropic models get a correction factor; Gemini, OpenAI and every
    # other provider use the counts as-is.
    if "claude" in lowered or "anthropic" in lowered:
        return 1.24
    return 1.0
|
|
|
|
|
|
|
|
def get_tokenizer(model_name: str):
    """Return (count_fn, tokenizer_name) suited to *model_name*.

    count_fn maps a string to an estimated token count. Gemini models use a
    chars-per-token approximation; everything else uses a tiktoken encoding
    (memoized in _tokenizer_cache).
    """
    global _tokenizer_cache

    lowered = model_name.lower() if model_name else ""

    if "gpt-4o" in lowered or "o1" in lowered or "o3" in lowered:
        encoding_name = "o200k_base"
    elif "gpt" in lowered or "claude" in lowered or "anthropic" in lowered:
        encoding_name = "cl100k_base"
    elif "gemini" in lowered or "google" in lowered:
        # No public Gemini tokenizer; approximate by character ratio.
        return lambda text: int(len(text) / 3.23), "gemini_approx"
    else:
        # Unknown providers fall back to the cl100k encoding.
        encoding_name = "cl100k_base"

    encoder = _tokenizer_cache.get(encoding_name)
    if encoder is None:
        encoder = tiktoken.get_encoding(encoding_name)
        _tokenizer_cache[encoding_name] = encoder

    return lambda text: len(encoder.encode(text)), encoding_name
|
|
|
|
|
|
|
|
def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
    """Scale every token-count column by *overhead* and recompute total_tokens.

    Returns the input DataFrame untouched when it is empty or the multiplier
    is exactly 1.0; otherwise returns a scaled copy.
    """
    if df.empty or overhead == 1.0:
        return df

    scaled = df.copy()
    for col in ("prompt_tokens", "completion_tokens", "cache_read_tokens", "cache_creation_tokens"):
        scaled[col] = (scaled[col] * overhead).astype(int)
    scaled["total_tokens"] = scaled["prompt_tokens"] + scaled["completion_tokens"]
    return scaled
|
|
|
|
|
|
|
|
def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
    """Zero out cache columns so every token is treated as uncached input.

    Returns the input DataFrame untouched when empty, otherwise a copy.
    """
    if df.empty:
        return df

    uncached = df.copy()
    uncached["cache_read_tokens"] = 0
    uncached["cache_creation_tokens"] = 0
    return uncached
|
|
|
|
|
|
|
|
def ensure_token_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure the token-count columns exist and hold integers.

    Missing columns are added as 0; existing ones (and total_tokens, if
    present) are coerced to numeric with NaNs replaced by 0. None/empty
    frames pass through unchanged.
    """
    if df is None or df.empty:
        return df

    out = df.copy()
    for col in ("prompt_tokens", "completion_tokens", "cache_read_tokens", "cache_creation_tokens"):
        if col not in out.columns:
            out[col] = 0
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0).astype(int)

    if "total_tokens" in out.columns:
        out["total_tokens"] = pd.to_numeric(out["total_tokens"], errors="coerce").fillna(0).astype(int)

    return out
|
|
|
|
|
|
|
|
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
    """Load trajectories with self-calculated token counts via calculate_routing_tokens.

    Results are memoized per folder in _calculated_tokens_cache.
    """
    global _calculated_tokens_cache

    cache_key = f"calculated_{folder}"
    if cache_key in _calculated_tokens_cache:
        return ensure_token_columns(_calculated_tokens_cache[cache_key])

    rows = []
    for instance_id, steps in load_all_trajectory_steps(folder).items():
        if not steps:
            continue

        try:
            model = steps[0].get("model", "")
            totals = calculate_routing_tokens(steps).get(model, {})

            cache_read = totals.get("cache_read", 0)
            completion = totals.get("completion", 0)
            prompt = cache_read + totals.get("uncached_input", 0)

            rows.append({
                "instance_id": instance_id,
                "model_name": model,
                "api_calls": len(steps),
                "instance_cost": 0,
                "prompt_tokens": prompt,
                "completion_tokens": completion,
                "total_tokens": prompt + completion,
                "cache_read_tokens": cache_read,
                "cache_creation_tokens": totals.get("cache_creation", 0),
            })
        except Exception as e:
            logging.error("Error calculating tokens for %s: %s", instance_id, e, exc_info=True)

    df = ensure_token_columns(pd.DataFrame(rows))
    _calculated_tokens_cache[cache_key] = df
    return df
|
|
|
|
|
|
|
|
def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load all trajectories as step sequences for routing calculations.

    Results are memoized per folder in _trajectory_steps_cache.

    Returns:
        dict mapping instance_id -> list of steps for calculate_routing_tokens
    """
    global _trajectory_steps_cache

    cache_key = f"steps_{folder}"
    if cache_key in _trajectory_steps_cache:
        return _trajectory_steps_cache[cache_key]

    output_dir = TRAJS_DIR / folder

    # Try glob patterns from most to least specific; first non-empty match wins.
    traj_files: list[Path] = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    # The model name is read once from the first trajectory's config.
    model_name = ""
    if traj_files:
        try:
            # FIX: added encoding="utf-8", consistent with every other file
            # open in this module.
            with open(traj_files[0], "r", encoding="utf-8") as f:
                first_data = json.load(f)
            config = first_data.get("info", {}).get("config", {}).get("model", {})
            model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
        except Exception:
            # Best-effort: fall back to an empty model name.
            pass

    result = {}
    for traj_path in traj_files:
        try:
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_to_steps(traj_path, model_name)
            if steps:
                result[instance_id] = steps
        except Exception as e:
            logging.error("Error parsing steps for %s: %s", traj_path, e, exc_info=True)

    _trajectory_steps_cache[cache_key] = result
    return result
|
|
|
|
|
|
|
|
def refresh_litellm_prices() -> bool:
    """Force refresh litellm prices from remote. Returns True if successful.

    On success the in-memory caches are updated (the derived chat-only view
    is invalidated) and the raw table is written to disk.
    """
    global _litellm_prices_cache, _litellm_chat_prices_cache

    try:
        resp = requests.get(LITELLM_PRICES_URL, timeout=30)
        resp.raise_for_status()
        _litellm_prices_cache = resp.json()
        # Invalidate the filtered view so it is rebuilt from fresh data.
        _litellm_chat_prices_cache = None

        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as fh:
            json.dump(_litellm_prices_cache, fh)
        logging.info("Successfully refreshed litellm prices")
        return True
    except Exception as e:
        logging.warning(f"Failed to refresh litellm prices: {e}")
        return False
|
|
|
|
|
|
|
|
def get_litellm_prices_raw() -> dict:
    """Get raw litellm prices (all modes, unfiltered).

    Resolution order: in-memory cache, on-disk cache, then the network.
    Degrades to an empty dict when offline with no cached copy.
    """
    global _litellm_prices_cache

    if _litellm_prices_cache is not None:
        return _litellm_prices_cache

    # Prefer the on-disk cache before hitting the network.
    if LITELLM_PRICES_CACHE.exists():
        with open(LITELLM_PRICES_CACHE) as fh:
            _litellm_prices_cache = json.load(fh)
        return _litellm_prices_cache

    try:
        resp = requests.get(LITELLM_PRICES_URL, timeout=30)
        resp.raise_for_status()
        _litellm_prices_cache = resp.json()

        DATA_DIR.mkdir(exist_ok=True)
        with open(LITELLM_PRICES_CACHE, "w") as fh:
            json.dump(_litellm_prices_cache, fh)
    except Exception:
        # Offline and no cache: degrade to an empty price table.
        _litellm_prices_cache = {}

    return _litellm_prices_cache
|
|
|
|
|
|
|
|
def get_litellm_prices() -> dict:
    """Get litellm prices filtered to chat models only (memoized)."""
    global _litellm_chat_prices_cache

    if _litellm_chat_prices_cache is None:
        _litellm_chat_prices_cache = {
            name: info
            for name, info in get_litellm_prices_raw().items()
            if isinstance(info, dict) and info.get("mode") == "chat"
        }
    return _litellm_chat_prices_cache
|
|
|
|
|
|
|
|
def get_litellm_model_list() -> list[str]:
    """Get the sorted list of chat model names from litellm prices."""
    # Sorting the dict iterates its keys.
    return sorted(get_litellm_prices())
|
|
|
|
|
|
|
|
def normalize_model_name(name: str) -> str:
    """Normalize model name for comparison: lowercase, remove separators."""
    # Strip the separator characters "-", "_", "." and "/" in one pass.
    return name.lower().translate(str.maketrans("", "", "-_./"))
|
|
|
|
|
|
|
|
def _search_model_in_prices(model_name: str, prices: dict) -> dict | None:
    """Search for a model in the prices dict using several name variations.

    Tries exact keys first (with/without provider prefix and trailing date),
    then falls back to fuzzy matching on separator-stripped names.
    """
    clean = model_name.replace("anthropic/", "").replace("openai/", "")
    undated = re.sub(r'-\d{8}$', '', clean)

    # Exact-key candidates, most specific first.
    for candidate in (
        model_name,
        clean,
        undated,
        f"anthropic/{clean}",
        f"openai/{clean}",
        f"anthropic/{undated}",
        f"openai/{undated}",
    ):
        if candidate in prices:
            return prices[candidate]

    norm_clean = normalize_model_name(clean)
    norm_undated = normalize_model_name(undated)

    # Fuzzy pass: substring match on the full key, then exact match against
    # the provider-less tail of the key.
    for key, entry in prices.items():
        norm_key = normalize_model_name(key)
        if norm_clean in norm_key or norm_undated in norm_key:
            return entry
        tail = key.rsplit('/', 1)[-1]
        norm_tail = normalize_model_name(tail)
        if norm_clean == norm_tail or norm_undated == norm_tail:
            return entry

    return None
|
|
|
|
|
|
|
|
def get_model_prices(model_name: str) -> dict | None:
    """Resolve the litellm price entry for *model_name*.

    If the model is missing from the cached price table, the on-disk
    litellm cache is refreshed once and the lookup retried — new models
    may have been added upstream since the cache was written.
    Returns the price dict, or None when the model cannot be found.
    """
    if not model_name:
        return None

    result = _search_model_in_prices(model_name, get_litellm_prices())

    # One retry after refreshing the cached price table from upstream.
    if result is None and LITELLM_PRICES_CACHE.exists():
        logging.info(f"Model '{model_name}' not found in litellm prices, refreshing cache...")
        if refresh_litellm_prices():
            result = _search_model_in_prices(model_name, get_litellm_prices())
            if result is None:
                logging.warning(f"Model '{model_name}' still not found after refresh")

    return result
|
|
|
|
|
|
|
|
def _read_leaderboard_cache():
    """Load and parse the on-disk leaderboard JSON cache."""
    with open(LEADERBOARD_CACHE) as f:
        return json.load(f)


def load_or_download_leaderboard(force_refresh: bool = False):
    """Return leaderboard data, downloading a fresh copy when needed.

    Args:
        force_refresh: when True, bypass the cache and re-download.

    Returns:
        The parsed leaderboard JSON.

    Raises:
        Re-raises the download error when the download fails and no
        cached copy exists to fall back on.
    """
    if not force_refresh and LEADERBOARD_CACHE.exists():
        return _read_leaderboard_cache()

    try:
        filename = download_leaderboard(output_dir=str(DATA_DIR))
        # os.replace (not os.rename): overwrites an existing cache file,
        # which os.rename refuses to do on Windows (FileExistsError) —
        # exactly the force_refresh=True case.
        os.replace(filename, LEADERBOARD_CACHE)
        logging.info("Successfully downloaded fresh leaderboard data")
    except Exception as e:
        logging.warning(f"Failed to download leaderboard: {e}")
        # Best effort: serve stale data rather than fail, when available.
        if LEADERBOARD_CACHE.exists():
            logging.info("Using cached leaderboard data")
            return _read_leaderboard_cache()
        raise

    return _read_leaderboard_cache()
|
|
|
|
|
|
|
|
def get_bash_only_df():
    """Build a display DataFrame for the 'bash-only' leaderboard.

    Returns an empty DataFrame when that leaderboard section is missing.
    """
    data = load_or_download_leaderboard()
    bash_only = next(
        (lb for lb in data.get("leaderboards", []) if lb["name"] == "bash-only"),
        None,
    )
    if not bash_only:
        return pd.DataFrame()

    def _fmt_resolved(value):
        # Numeric scores get one-decimal percent formatting; anything else verbatim.
        if isinstance(value, (int, float)):
            return f"{value:.1f}%"
        return str(value)

    rows = [
        {
            "name": entry.get("name", ""),
            "% resolved": _fmt_resolved(entry.get("resolved", 0)),
            "date": entry.get("date", ""),
            "cost": round(entry.get("cost") or 0, 2),
            "instance_cost": round(entry.get("instance_cost") or 0, 4),
            "instance_calls": entry.get("instance_calls") or 0,
            "folder": entry.get("folder", ""),
            "os_model": "✅" if entry.get("os_model") else "❌",
        }
        for entry in bash_only["results"]
    ]
    return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
def get_model_details(folder: str):
    """Find the bash-only leaderboard entry whose folder matches *folder*.

    Returns (model_dict, None) on success, or (None, error_message) when
    the selection, leaderboard, or model cannot be resolved.
    """
    if not folder:
        return None, "Select a model from the table"

    data = load_or_download_leaderboard()
    bash_only = next(
        (lb for lb in data.get("leaderboards", []) if lb["name"] == "bash-only"),
        None,
    )
    if not bash_only:
        return None, "Leaderboard not found"

    for entry in bash_only["results"]:
        if entry.get("folder") == folder:
            return entry, None
    return None, f"Model with folder '{folder}' not found"
|
|
|
|
|
|
|
|
def check_trajectories_downloaded(folder: str) -> bool:
    """True if a non-empty local trajectory directory exists for *folder*."""
    if not folder:
        return False
    target = TRAJS_DIR / folder
    if not target.exists():
        return False
    # any() on the iterator is a cheap "directory has at least one entry" check.
    return any(target.iterdir())
|
|
|
|
|
|
|
|
def _count_trajectory_files(output_dir: Path) -> int:
    """Count trajectory files under *output_dir*, trying known layouts in order."""
    for pattern in ("*/*.traj.json", "*/*.traj", "*.json"):
        count = len(list(output_dir.glob(pattern)))
        if count:
            return count
    return 0


def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
    """Download SWE-bench trajectories for *folder* from the public S3 bucket.

    Skips the download when a non-empty local copy already exists.

    Returns:
        A (status_message, gr.update) tuple; the update toggles visibility
        of the downstream analysis UI.
    """
    if not folder:
        return "❌ No model selected", gr.update(visible=False)

    model, error = get_model_details(folder)
    if error:
        return f"❌ {error}", gr.update(visible=False)

    output_dir = TRAJS_DIR / folder
    if output_dir.exists() and any(output_dir.iterdir()):
        file_count = _count_trajectory_files(output_dir)
        return f"✅ Already downloaded: {output_dir}\n\n{file_count} trajectory files", gr.update(visible=True)

    s3_path = f"{S3_BUCKET}/{folder}/trajs/"
    output_dir.mkdir(parents=True, exist_ok=True)

    progress(0, desc="Starting S3 download...")

    try:
        # --no-sign-request: the bucket is public; no AWS credentials needed.
        result = subprocess.run(
            ["aws", "s3", "cp", "--recursive", s3_path, str(output_dir), "--no-sign-request"],
            capture_output=True,
            text=True,
            timeout=600,
        )

        if result.returncode != 0:
            return f"❌ S3 download failed:\n{result.stderr}", gr.update(visible=False)

        file_count = _count_trajectory_files(output_dir)
        if file_count == 0:
            return f"❌ No trajectory files found on S3 for {folder}", gr.update(visible=False)

        # Summarize resolution stats from the leaderboard entry.
        per_instance = model.get("per_instance_details", {})
        resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
        total_count = len(per_instance)

        if total_count > 0:
            resolved_pct = f"{100*resolved_count/total_count:.1f}%"
        else:
            resolved_pct = "N/A"

        status = f"✅ Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
        return status, gr.update(visible=True)

    except subprocess.TimeoutExpired:
        return "❌ Download timed out (>10 min)", gr.update(visible=False)
    except FileNotFoundError:
        # `aws` executable missing from PATH.
        return "❌ AWS CLI not found. Install with: pip install awscli", gr.update(visible=False)
    except Exception as e:
        return f"❌ Error: {e}", gr.update(visible=False)
|
|
|
|
|
|
|
|
def parse_trajectory(traj_path: Path) -> dict:
    """Parse one trajectory file into a flat stats dict.

    Handles two on-disk formats: newer files with a top-level "messages"
    list carrying per-message token usage, and older files with a
    "trajectory" step list (whose token counts may only exist in a
    sibling ``.info.log`` file).

    Returns a dict with instance_id, model_name, api_calls, instance_cost,
    and prompt/completion/total/cache token counts.
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    info = data.get("info", {})
    model_stats = info.get("model_stats", {})
    config = info.get("config", {})
    model_config = config.get("model", {})
    # Prefer the cost-calculation override when present, else the raw model name.
    model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))

    trajectory_steps = data.get("trajectory", [])
    # Old format: has a "trajectory" step list and no top-level "messages".
    is_trajectory_format = len(trajectory_steps) > 0 and "messages" not in data

    # Heuristic model detection for old-format files with no recorded name:
    # scan system prompts for a llama/meta mention.
    if is_trajectory_format and not model_name:
        for step in trajectory_steps:
            query = step.get("query", [])
            for q in query:
                if q.get("role") == "system":
                    content = q.get("content", "")
                    if "llama" in content.lower() or "meta" in content.lower():
                        model_name = "llama"
                        break
            if model_name:
                break

    api_calls = model_stats.get("api_calls", 0)
    # Old format lacks an api_calls stat; approximate with the step count.
    if api_calls == 0 and is_trajectory_format:
        api_calls = len(trajectory_steps)

    result = {
        "instance_id": data.get("instance_id", traj_path.stem),
        "model_name": model_name,
        "api_calls": api_calls,
        "instance_cost": model_stats.get("instance_cost", 0),
        "prompt_tokens": 0,
        "completion_tokens": 0,
        "total_tokens": 0,
        "cache_read_tokens": 0,
        "cache_creation_tokens": 0,
    }

    # Accumulate token usage from the messages list (new format).
    messages = data.get("messages", [])
    for msg in messages:
        # Usage may live directly on the message or nested under extra.response.
        usage = None
        if "usage" in msg:
            usage = msg["usage"]
        elif "extra" in msg and isinstance(msg["extra"], dict):
            response = msg["extra"].get("response", {})
            if isinstance(response, dict):
                usage = response.get("usage", {})

        if usage:
            # "or 0" guards against explicit nulls in the JSON.
            result["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
            result["completion_tokens"] += usage.get("completion_tokens", 0) or 0
            result["total_tokens"] += usage.get("total_tokens", 0) or 0

            # Anthropic-style cache fields.
            cache_read = usage.get("cache_read_input_tokens", 0) or 0
            cache_creation = usage.get("cache_creation_input_tokens", 0) or 0

            # OpenAI-style cache accounting under prompt_tokens_details;
            # only used when the Anthropic-style field was absent/zero.
            prompt_tokens_details = usage.get("prompt_tokens_details", {})
            if isinstance(prompt_tokens_details, dict):
                cached_from_details = prompt_tokens_details.get("cached_tokens", 0) or 0
                if cached_from_details > 0 and cache_read == 0:
                    cache_read = cached_from_details

            result["cache_read_tokens"] += cache_read
            result["cache_creation_tokens"] += cache_creation

    # No usage data in the JSON: fall back to the sibling .info.log
    # produced by the old pipeline.
    if result["prompt_tokens"] == 0 and result["completion_tokens"] == 0:
        log_path = traj_path.with_suffix(".info.log")
        if not log_path.exists():
            # Some layouts name the log after the stem without ".traj".
            base_name = traj_path.stem.replace(".traj", "")
            log_path = traj_path.parent / f"{base_name}.info.log"

        if log_path.exists():
            steps = _parse_old_format_log(log_path)
            for step_data in steps:
                result["prompt_tokens"] += step_data["cache_read"] + step_data["uncached_input"]
                result["completion_tokens"] += step_data["completion"]
                result["cache_read_tokens"] += step_data["cache_read"]
            result["total_tokens"] = result["prompt_tokens"] + result["completion_tokens"]
            if result["api_calls"] == 0:
                result["api_calls"] = len(steps)

    return result
|
|
|
|
|
|
|
|
def load_all_trajectories(folder: str) -> pd.DataFrame:
    """Parse every trajectory file under TRAJS_DIR/<folder> into a DataFrame.

    Results are memoized in the module-level _trajectories_cache.
    Individual files that fail to parse are logged and skipped.
    """
    global _trajectories_cache

    if folder in _trajectories_cache:
        return ensure_token_columns(_trajectories_cache[folder])

    output_dir = TRAJS_DIR / folder

    # Different uploads use different layouts; take the first glob pattern
    # that matches anything instead of a chain of repeated if-empty checks.
    traj_files = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    rows = []
    for traj_path in traj_files:
        try:
            rows.append(parse_trajectory(traj_path))
        except Exception as e:
            logging.error("Error parsing %s: %s", traj_path, e, exc_info=True)

    df = ensure_token_columns(pd.DataFrame(rows))
    _trajectories_cache[folder] = df
    return df
|
|
|
|
|
|
|
|
def create_cost_by_type_chart(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create Total Cost by Token Type chart (can be called separately for price updates)"""
    if df.empty:
        return None

    # Uncached input = prompt tokens minus cache reads/writes, floored at 0.
    uncached = (df["prompt_tokens"] - df["cache_read_tokens"] - df["cache_creation_tokens"]).clip(lower=0)

    token_totals = {
        "Uncached Input": uncached.sum(),
        "Cache Read": df["cache_read_tokens"].sum(),
        "Cache Creation": df["cache_creation_tokens"].sum(),
        "Completion": df["completion_tokens"].sum(),
    }
    # Price per 1M tokens for each type, in the same order.
    per_million = {
        "Uncached Input": input_price,
        "Cache Read": cache_read_price,
        "Cache Creation": cache_creation_price,
        "Completion": completion_price,
    }
    costs = [token_totals[kind] * per_million[kind] / 1e6 for kind in token_totals]

    cost_data = pd.DataFrame({
        "Token Type": list(token_totals),
        "Cost ($)": costs,
    })

    fig = px.bar(
        cost_data,
        x="Token Type",
        y="Cost ($)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig.update_layout(
        xaxis_title="",
        yaxis_title="Cost ($)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )

    # Overall total shown in the chart corner.
    fig.add_annotation(
        text=f"Total: ${sum(costs):.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    return fig
|
|
|
|
|
|
|
|
def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Create only token-related charts (for source switching)

    Returns a (token-totals bar, cost-by-type bar, per-trajectory stacked
    bar) tuple, or (None, None, None) when df is empty.
    """
    if df.empty:
        return None, None, None

    # Aggregate token counts per type across all trajectories.
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()
    df_temp = df.copy()
    # Uncached input = prompt tokens not covered by cache reads/writes; clip guards against negatives.
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()

    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Total Tokens (M)": [total_uncached_input / 1e6, total_cache_read / 1e6, total_cache_creation / 1e6, total_completion / 1e6],
    })

    # Chart 1: total tokens per type.
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Total Tokens (M)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )
    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(
        text=f"Total: {total_all/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Chart 2: cost by token type, shared with the price-update path.
    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Chart 3: per-trajectory stacked bars, sorted by total tokens (largest first).
    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    df_sorted["trajectory_idx"] = range(len(df_sorted))

    fig_stacked = go.Figure()
    fig_stacked.add_trace(go.Bar(
        name="Uncached Input", x=df_sorted["trajectory_idx"], y=df_sorted["uncached_input_tokens"] / 1e6,
        marker_color="#EF553B", hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:.2f}M<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Read", x=df_sorted["trajectory_idx"], y=df_sorted["cache_read_tokens"] / 1e6,
        marker_color="#19D3F3", hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:.2f}M<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Cache Creation", x=df_sorted["trajectory_idx"], y=df_sorted["cache_creation_tokens"] / 1e6,
        marker_color="#FFA15A", hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:.2f}M<extra></extra>",
    ))
    fig_stacked.add_trace(go.Bar(
        name="Completion", x=df_sorted["trajectory_idx"], y=df_sorted["completion_tokens"] / 1e6,
        marker_color="#AB63FA", hovertemplate="Trajectory: %{x}<br>Completion: %{y:.2f}M<extra></extra>",
    ))
    fig_stacked.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig_tokens, fig_tokens_cost, fig_stacked
|
|
|
|
|
|
|
|
def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Build the main dashboard charts for a trajectory DataFrame.

    Returns a 5-tuple: (steps histogram, cost histogram, token-totals bar,
    cost-by-type bar, per-trajectory stacked bar), or five Nones when df
    is empty.
    """
    if df.empty:
        return None, None, None, None, None

    # Chart 1: histogram of API-call counts per trajectory.
    fig_steps = px.histogram(
        df,
        x="api_calls",
        nbins=30,
        color_discrete_sequence=["#636EFA"],
    )
    fig_steps.update_layout(
        xaxis_title="API Calls (Steps)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_steps.add_annotation(
        text=f"Mean: {df['api_calls'].mean():.1f} | Median: {df['api_calls'].median():.0f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Chart 2: histogram of per-trajectory cost.
    fig_cost = px.histogram(
        df,
        x="instance_cost",
        nbins=30,
        color_discrete_sequence=["#00CC96"],
    )
    fig_cost.update_layout(
        xaxis_title="Cost ($)",
        yaxis_title="Number of Trajectories",
        showlegend=False,
        margin=dict(l=40, r=20, t=40, b=40),
    )
    fig_cost.add_annotation(
        text=f"Mean: ${df['instance_cost'].mean():.4f} | Total: ${df['instance_cost'].sum():.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Aggregate token counts per type across all trajectories.
    total_completion = df["completion_tokens"].sum()
    total_cache_read = df["cache_read_tokens"].sum()
    total_cache_creation = df["cache_creation_tokens"].sum()

    df_temp = df.copy()
    # Uncached input = prompt tokens not covered by cache reads/writes; clip guards against negatives.
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    total_uncached_input = df_temp["uncached_input"].sum()

    token_data = pd.DataFrame({
        "Token Type": ["Uncached Input", "Cache Read", "Cache Creation", "Completion"],
        "Tokens (M)": [total_uncached_input / 1e6, total_cache_read / 1e6, total_cache_creation / 1e6, total_completion / 1e6],
    })

    # Chart 3: total tokens per type.
    fig_tokens = px.bar(
        token_data,
        x="Token Type",
        y="Tokens (M)",
        color="Token Type",
        color_discrete_sequence=["#EF553B", "#19D3F3", "#FFA15A", "#AB63FA"],
    )
    fig_tokens.update_layout(
        xaxis_title="",
        yaxis_title="Tokens (M)",
        showlegend=False,
        margin=dict(l=60, r=20, t=20, b=40),
    )

    total_all = total_uncached_input + total_cache_read + total_cache_creation + total_completion
    fig_tokens.add_annotation(
        text=f"Total: {total_all/1e6:.2f}M",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=12),
    )

    # Chart 4: cost by token type, shared with the price-update path.
    fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)

    # Chart 5: per-trajectory stacked bars, sorted by total tokens (largest first).
    df_sorted = df.copy()
    df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
    df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
    df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    df_sorted["trajectory_idx"] = range(len(df_sorted))

    fig_stacked = go.Figure()

    fig_stacked.add_trace(go.Bar(
        name="Uncached Input",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["uncached_input_tokens"] / 1e6,
        marker_color="#EF553B",
        hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:.3f}M<extra></extra>",
    ))

    fig_stacked.add_trace(go.Bar(
        name="Cache Read",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["cache_read_tokens"] / 1e6,
        marker_color="#19D3F3",
        hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:.3f}M<extra></extra>",
    ))

    fig_stacked.add_trace(go.Bar(
        name="Cache Creation",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["cache_creation_tokens"] / 1e6,
        marker_color="#FFA15A",
        hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:.3f}M<extra></extra>",
    ))

    fig_stacked.add_trace(go.Bar(
        name="Completion",
        x=df_sorted["trajectory_idx"],
        y=df_sorted["completion_tokens"] / 1e6,
        marker_color="#AB63FA",
        hovertemplate="Trajectory: %{x}<br>Completion: %{y:.3f}M<extra></extra>",
    ))

    fig_stacked.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Tokens (M)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    return fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked
|
|
|
|
|
|
|
|
def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price: float, cache_creation_price: float, completion_price: float):
    """Stacked per-trajectory cost chart, one bar segment per token type.

    Trajectories are sorted by their total token count (descending) so
    the most token-hungry runs appear first. Returns None when df is empty.
    """
    if df.empty:
        return None

    plot_df = df.copy()
    # Uncached input = prompt tokens minus cache reads/writes, floored at 0.
    plot_df["uncached_input_tokens"] = (plot_df["prompt_tokens"] - plot_df["cache_read_tokens"] - plot_df["cache_creation_tokens"]).clip(lower=0)
    plot_df["total_stacked"] = (
        plot_df["uncached_input_tokens"]
        + plot_df["cache_read_tokens"]
        + plot_df["cache_creation_tokens"]
        + plot_df["completion_tokens"]
    )
    plot_df = plot_df.sort_values("total_stacked", ascending=False).reset_index(drop=True)
    plot_df["trajectory_idx"] = range(len(plot_df))

    # (label, token column, $/1M price, bar color) for each stacked segment.
    segments = [
        ("Uncached Input", "uncached_input_tokens", input_price, "#EF553B"),
        ("Cache Read", "cache_read_tokens", cache_read_price, "#19D3F3"),
        ("Cache Creation", "cache_creation_tokens", cache_creation_price, "#FFA15A"),
        ("Completion", "completion_tokens", completion_price, "#AB63FA"),
    ]

    fig = go.Figure()
    total_cost = 0.0
    for label, column, price, color in segments:
        segment_cost = plot_df[column] * price / 1e6
        total_cost += segment_cost.sum()
        fig.add_trace(go.Bar(
            name=f"{label} (${price:.2f}/1M)",
            x=plot_df["trajectory_idx"],
            y=segment_cost,
            marker_color=color,
            hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
        ))

    fig.update_layout(
        barmode="stack",
        xaxis_title="Trajectory (sorted by total tokens)",
        yaxis_title="Cost ($)",
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        margin=dict(l=50, r=20, t=40, b=40),
    )

    fig.add_annotation(
        text=f"Total: ${total_cost:.2f}",
        xref="paper", yref="paper",
        x=0.95, y=0.95, showarrow=False,
        font=dict(size=14),
        bgcolor="white",
    )

    return fig
|
|
|
|
|
|
|
|
def extract_model_from_folder(folder: str) -> str:
    """Extract model name from folder like '20251124_mini-v1.16.0_claude-opus-4-5-20251101'"""
    if not folder:
        return ""
    # Folder layout is <date>_<agent-version>_<model...>; everything after
    # the second underscore-delimited part is the model name.
    prefix_count = 2
    parts = folder.split("_")
    if len(parts) > prefix_count:
        return "_".join(parts[prefix_count:])
    return folder
|
|
|
|
|
|
|
|
def get_prices_for_folder(folder: str) -> tuple[dict, str]:
    """Get prices from litellm based on folder name.

    Returns (prices_dict, model_name) where prices_dict has 'value' and 'found' for each price type."""
    model_hint = extract_model_from_folder(folder)

    result = {
        key: {"value": 0, "found": False}
        for key in ("input", "cache_read", "cache_creation", "completion")
    }

    if not model_hint:
        return result, ""

    prices = get_model_prices(model_hint)
    if prices:
        scale = 1e6  # litellm stores $/token; the UI works in $/1M tokens.
        known = {
            "input": prices.get("input_cost_per_token", 0) * scale,
            "cache_read": prices.get("cache_read_input_token_cost", 0) * scale,
            "cache_creation": prices.get("cache_creation_input_token_cost", 0) * scale,
            "completion": prices.get("output_cost_per_token", 0) * scale,
        }
        for key, value in known.items():
            result[key] = {"value": value, "found": value > 0}

        input_price = known["input"]
        completion = known["completion"]

        # Fill gaps with rule-of-thumb ratios: cache read ≈ 10% of input,
        # cache creation ≈ 125% of input, completion ≈ 5x input.
        if input_price > 0:
            if not result["cache_read"]["found"]:
                result["cache_read"]["value"] = input_price * 0.1
            if not result["cache_creation"]["found"]:
                result["cache_creation"]["value"] = input_price * 1.25
            if not result["completion"]["found"]:
                result["completion"]["value"] = input_price * 5
        elif completion > 0:
            # No input price available: back out an estimate from completion.
            estimated_input = completion / 5
            if not result["input"]["found"]:
                result["input"]["value"] = estimated_input
            if not result["cache_read"]["found"]:
                result["cache_read"]["value"] = estimated_input * 0.1
            if not result["cache_creation"]["found"]:
                result["cache_creation"]["value"] = estimated_input * 1.25

    return result, model_hint
|
|
|
|
|
|
|
|
def _build_selection_payload(row_idx: int | None, df: pd.DataFrame):
    """Build the Gradio component updates for a leaderboard row selection.

    Returns (folder, name, visibility update, four price-field updates,
    model hint, overhead update); a cleared payload when nothing is selected.
    """
    if df is None or df.empty or row_idx is None:
        # Nothing selected: clear every field and hide the detail panel.
        return (
            "", "",
            gr.update(visible=False),
            gr.update(value=0, label="Input"),
            gr.update(value=0, label="Cache Read"),
            gr.update(value=0, label="Cache Creation"),
            gr.update(value=0, label="Completion"),
            "",
            gr.update(value=1.0),
        )

    selected = df.iloc[row_idx]
    folder = selected["folder"]
    display_name = selected["name"]

    prices_dict, model_hint = get_prices_for_folder(folder)
    default_overhead = get_default_overhead(model_hint)

    def _price_field(info, label):
        # ✅ = price came from litellm; ❌ (est.) = heuristic fallback; ❌ = unknown.
        if info["found"]:
            return gr.update(value=info["value"], label=f"✅ {label}")
        if info["value"] > 0:
            return gr.update(value=info["value"], label=f"❌ {label} (est.)")
        return gr.update(value=0, label=f"❌ {label}")

    return (
        folder, display_name,
        gr.update(visible=True),
        _price_field(prices_dict["input"], "Input"),
        _price_field(prices_dict["cache_read"], "Cache Read"),
        _price_field(prices_dict["cache_creation"], "Cache Creation"),
        _price_field(prices_dict["completion"], "Completion"),
        model_hint,
        gr.update(value=default_overhead),
    )
|
|
|
|
|
|
|
|
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
    """Handle a table row-click event and forward the row index to the payload builder."""
    row_idx = None
    if evt is not None and evt.index is not None:
        idx = evt.index
        # Gradio may deliver the index as (row, col) or as a bare int.
        row_idx = idx[0] if isinstance(idx, (list, tuple)) else idx
    return _build_selection_payload(row_idx, df)
|
|
|
|
|
|
|
|
def select_first_row(df: pd.DataFrame):
    """Auto-select the first leaderboard row, or clear the selection if the table is empty."""
    if df is None or df.empty:
        return _build_selection_payload(None, df)
    return _build_selection_payload(0, df)
|
|
|
|
|
|
|
|
def create_routed_token_chart(original_tokens: dict, base_tokens: dict, additional_models: list, base_model_name: str = "Base"):
    """
    Create grouped+stacked bar chart comparing Calculated vs Routed tokens.

    Two offset groups share each x category: "calculated" holds the single
    no-routing reference bar (hatched), "routed" stacks the base model's
    share plus each additional model's share.

    Args:
        original_tokens: dict with uncached_input, cache_read, cache_creation, completion (from Calculated)
        base_tokens: dict with uncached_input, cache_read, cache_creation, completion (base portion in routing)
        additional_models: list of (model_name, tokens_dict) tuples
        base_model_name: name of the base model
    """
    # Uses the module-level `import plotly.graph_objects as go`; the previous
    # function-local re-import and the unused base_color_light were removed.
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    token_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
    base_color_dark = "#636EFA"
    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    fig = go.Figure()

    # No-routing reference bar: translucent fill plus hatch so it reads as a baseline.
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [no routing]",
        x=categories,
        y=[original_tokens.get(k, 0) / 1e6 for k in token_keys],
        marker_color="rgba(99, 110, 250, 0.3)",
        marker_line_color=base_color_dark,
        marker_line_width=1,
        marker_pattern_shape="/",
        marker_pattern_fgcolor=base_color_dark,
        offsetgroup="calculated",
        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: %{y:.3f}M<extra></extra>",
    ))

    # Base model's share of routed traffic (bottom of the routed stack).
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [with routing]",
        x=categories,
        y=[base_tokens.get(k, 0) / 1e6 for k in token_keys],
        marker_color=base_color_dark,
        offsetgroup="routed",
        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: %{y:.3f}M<extra></extra>",
    ))

    # Each routed-to model stacks on top of the base bar.
    for i, (model_name, tokens) in enumerate(additional_models):
        fig.add_trace(go.Bar(
            name=model_name or f"Model {i+1}",
            x=categories,
            y=[tokens.get(k, 0) / 1e6 for k in token_keys],
            marker_color=model_colors[i % len(model_colors)],
            offsetgroup="routed",
            hovertemplate="%{x}<br>" + (model_name or f"Model {i+1}") + ": %{y:.3f}M<extra></extra>",
        ))

    original_total = sum(original_tokens.get(k, 0) for k in token_keys)
    routed_total = sum(base_tokens.get(k, 0) for k in token_keys) + sum(
        sum(m[1].get(k, 0) for k in token_keys) for m in additional_models
    )

    annotation_lines = [
        f"<b>No routing: {original_total/1e6:.2f}M</b>",
        f"<b>With routing: {routed_total/1e6:.2f}M</b>",
    ]

    fig.update_layout(
        yaxis_title="Tokens (M)",
        barmode="stack",
        bargroupgap=0.1,
        margin=dict(l=40, r=40, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
|
|
|
|
|
|
|
|
def create_routed_cost_chart(original_costs: dict, base_costs: dict, additional_models: list, base_model_name: str = "Base"):
    """
    Create a grouped+stacked bar chart comparing Calculated vs Routed costs.

    Two offset groups share each x category: "calculated" holds the base
    model's cost without routing (hatched outline), while "routed" stacks the
    base model's remaining share together with every additional routing model.
    A text annotation summarizes both totals.

    Args:
        original_costs: dict with uncached_input, cache_read, cache_creation, completion (from Calculated)
        base_costs: dict with uncached_input, cache_read, cache_creation, completion (base portion in routing)
        additional_models: list of (model_name, costs_dict) tuples
        base_model_name: name of the base model

    Returns:
        A plotly Figure with one bar trace per model plus the totals annotation.
    """
    # NOTE: uses the module-level `import plotly.graph_objects as go`; the
    # previous function-local re-import was redundant and has been removed.
    categories = ["Uncached Input", "Cache Read", "Cache Creation", "Completion"]
    cost_keys = ["uncached_input", "cache_read", "cache_creation", "completion"]
    base_color_dark = "#636EFA"
    model_colors = ["#EF553B", "#00CC96", "#AB63FA", "#FFA15A"]

    fig = go.Figure()

    # "No routing" reference bar: translucent fill + hatch so it reads as a baseline.
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [no routing]",
        x=categories,
        y=[original_costs.get(k, 0) for k in cost_keys],
        marker_color="rgba(99, 110, 250, 0.3)",
        marker_line_color=base_color_dark,
        marker_line_width=1,
        marker_pattern_shape="/",
        marker_pattern_fgcolor=base_color_dark,
        offsetgroup="calculated",
        hovertemplate="%{x}<br>" + base_model_name + " [no routing]: $%{y:.2f}<extra></extra>",
    ))

    # Base model's share of the routed configuration (bottom of the stack).
    fig.add_trace(go.Bar(
        name=f"{base_model_name} [with routing]",
        x=categories,
        y=[base_costs.get(k, 0) for k in cost_keys],
        marker_color=base_color_dark,
        offsetgroup="routed",
        hovertemplate="%{x}<br>" + base_model_name + " [with routing]: $%{y:.2f}<extra></extra>",
    ))

    # One stacked segment per routing model; colors cycle if > 4 models.
    for i, (model_name, costs) in enumerate(additional_models):
        display_name = model_name or f"Model {i+1}"
        fig.add_trace(go.Bar(
            name=display_name,
            x=categories,
            y=[costs.get(k, 0) for k in cost_keys],
            marker_color=model_colors[i % len(model_colors)],
            offsetgroup="routed",
            hovertemplate="%{x}<br>" + display_name + ": $%{y:.2f}<extra></extra>",
        ))

    original_total = sum(original_costs.get(k, 0) for k in cost_keys)
    routed_total = sum(base_costs.get(k, 0) for k in cost_keys) + sum(
        sum(m[1].get(k, 0) for k in cost_keys) for m in additional_models
    )

    annotation_lines = [
        f"<b>No routing: ${original_total:.2f}</b>",
        f"<b>With routing: ${routed_total:.2f}</b>",
    ]

    fig.update_layout(
        yaxis_title="Cost ($)",
        barmode="stack",
        bargroupgap=0.1,
        margin=dict(l=40, r=40, t=40, b=40),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1, traceorder="normal"),
    )
    fig.add_annotation(
        text="<br>".join(annotation_lines),
        xref="paper", yref="paper",
        x=0.02, y=0.98, showarrow=False,
        font=dict(size=11),
        align="left",
        bgcolor="rgba(255,255,255,0.8)",
        bordercolor="gray",
        borderwidth=1,
    )
    return fig
|
|
|
|
|
|
|
|
def build_app(): |
|
|
leaderboard_df = get_bash_only_df() |
|
|
|
|
|
with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app: |
|
|
gr.HTML(""" |
|
|
<style> |
|
|
.quick-select-row { |
|
|
flex-wrap: wrap !important; |
|
|
gap: 6px !important; |
|
|
margin-bottom: 8px !important; |
|
|
} |
|
|
.quick-select-row button { |
|
|
background: white !important; |
|
|
color: #333 !important; |
|
|
border: 1px solid #ccc !important; |
|
|
border-radius: 4px !important; |
|
|
padding: 4px 10px !important; |
|
|
font-size: 12px !important; |
|
|
transition: all 0.15s ease !important; |
|
|
} |
|
|
.quick-select-row button:hover { |
|
|
background: #f0f0f0 !important; |
|
|
border-color: #999 !important; |
|
|
} |
|
|
</style> |
|
|
""") |
|
|
trajectories_state = gr.State(None) |
|
|
|
|
|
gr.Markdown("# 🧮 SWE-bench Costs Calculator `v0.3.46`") |
|
|
gr.Markdown("### *Calculate cost savings with different routing strategies.*") |
|
|
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=3): |
|
|
leaderboard_table = gr.Dataframe( |
|
|
value=leaderboard_df, |
|
|
label="Bash-Only Leaderboard", |
|
|
interactive=False, |
|
|
wrap=True, |
|
|
elem_id="leaderboard-table", |
|
|
) |
|
|
|
|
|
with gr.Column(visible=False) as analysis_section: |
|
|
gr.Markdown("## 📊 Trajectory Analysis") |
|
|
|
|
|
with gr.Accordion("Leaderboard data", open=True): |
|
|
with gr.Row(): |
|
|
plot_steps = gr.Plot(label="Distribution of API Calls (Steps) per Trajectory") |
|
|
plot_cost = gr.Plot(label="Distribution of Cost Reported by Leaderboard ($)") |
|
|
|
|
|
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [AGGREGATED ALL]", open=True): |
|
|
with gr.Row(): |
|
|
plot_tokens_meta = gr.Plot(label="Total Tokens by Type") |
|
|
plot_tokens_cost_meta = gr.Plot(label="Total Cost by Token Type ($)") |
|
|
|
|
|
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [AGGREGATED BY TRAJECTORY]", open=True): |
|
|
with gr.Row(): |
|
|
plot_stacked_meta = gr.Plot(label="Tokens per Trajectory (stacked)") |
|
|
with gr.Row(): |
|
|
plot_cost_breakdown_meta = gr.Plot(label="Cost per Trajectory") |
|
|
|
|
|
with gr.Accordion("Token counts REPORTED in the metadata of .traj files [ONE TRAJECTORY]", open=True, visible=False) as single_traj_meta_accordion: |
|
|
with gr.Row(): |
|
|
single_traj_meta_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True) |
|
|
with gr.Row(): |
|
|
single_traj_meta_plot = gr.Plot(label="Tokens per Step (stacked)") |
|
|
with gr.Row(): |
|
|
single_traj_meta_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)") |
|
|
|
|
|
with gr.Accordion("Token counts CALCULATED from .traj files [AGGREGATED ALL]", open=True): |
|
|
with gr.Row(): |
|
|
plot_tokens_calc = gr.Plot(label="Total Tokens by Type") |
|
|
plot_tokens_cost_calc = gr.Plot(label="Total Cost by Token Type ($)") |
|
|
|
|
|
with gr.Accordion("Token counts CALCULATED from .traj files [AGGREGATED BY TRAJECTORY]", open=True): |
|
|
with gr.Row(): |
|
|
plot_stacked_calc = gr.Plot(label="Tokens per Trajectory (stacked)") |
|
|
with gr.Row(): |
|
|
plot_cost_breakdown_calc = gr.Plot(label="Cost per Trajectory") |
|
|
|
|
|
with gr.Accordion("Token counts CALCULATED from .traj files [ONE TRAJECTORY]", open=True, visible=False) as single_traj_accordion: |
|
|
with gr.Row(): |
|
|
single_traj_dropdown = gr.Dropdown(label="Select Issue", choices=[], interactive=True) |
|
|
with gr.Row(): |
|
|
single_traj_plot = gr.Plot(label="Tokens per Step (stacked)") |
|
|
with gr.Row(): |
|
|
single_traj_cost_plot = gr.Plot(label="Cost per Step (stacked) ($)") |
|
|
|
|
|
with gr.Accordion("Token counts CALCULATED from .traj files, with ROUTING [AGGREGATED ALL]", open=True, visible=False) as routing_plots_row: |
|
|
with gr.Row(): |
|
|
routing_tokens_plot = gr.Plot(label="Tokens by Type (per Model)") |
|
|
routing_cost_plot = gr.Plot(label="Cost by Type (per Model) ($)") |
|
|
gr.Markdown("*With routing all messages in the trajectory remain as they are, but messages that match the selected filters are assigned to selected models for routing to.*") |
|
|
|
|
|
with gr.Column(scale=1): |
|
|
selected_folder = gr.State("") |
|
|
gr.Markdown("### Selected Model") |
|
|
selected_name = gr.Textbox(label="Model Name", interactive=False) |
|
|
|
|
|
analyze_btn = gr.Button("📊 Load & Analyze", visible=False, variant="primary") |
|
|
download_status = gr.Textbox(label="Status", interactive=False, lines=3) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### 💰 Token Prices ($/1M) · *[litellm](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)*") |
|
|
detected_model = gr.Textbox(label="Detected Model", interactive=False) |
|
|
with gr.Row(): |
|
|
price_input = gr.Number(label="Input", value=0, precision=2, scale=1) |
|
|
price_cache_read = gr.Number(label="Cache Read", value=0, precision=2, scale=1) |
|
|
price_cache_creation = gr.Number(label="Cache Creation", value=0, precision=2, scale=1) |
|
|
price_completion = gr.Number(label="Completion", value=0, precision=2, scale=1) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### 🔢 Calculated Token Options") |
|
|
thinking_overhead = gr.Number( |
|
|
label="Tokenizer Overhead", |
|
|
value=1.21, |
|
|
precision=2, |
|
|
info="Multiplier for Calculated tokens (tiktoken → native)", |
|
|
) |
|
|
use_cache = gr.Checkbox( |
|
|
label="Use Cache", |
|
|
value=True, |
|
|
info="If disabled, all tokens are Uncached Input or Completion", |
|
|
) |
|
|
|
|
|
gr.Markdown("---") |
|
|
add_routing_btn = gr.Button("➕ Add Routing", variant="primary", visible=False) |
|
|
gr.Markdown("*With routing all messages in the trajectory remain as they are, but messages that match the selected filters are assigned to selected models for routing to.*") |
|
|
|
|
|
with gr.Column(visible=False) as routing_section: |
|
|
gr.Markdown("### 🔀 Routing Models") |
|
|
|
|
|
with gr.Column(): |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 1") |
|
|
with gr.Row(elem_classes=["quick-select-row"]): |
|
|
quick_btns_1 = [] |
|
|
for item in QUICK_SELECT_MODELS: |
|
|
if isinstance(item, tuple): |
|
|
model, short_name = item |
|
|
else: |
|
|
model = item |
|
|
short_name = model.split("/")[-1] |
|
|
btn = gr.Button(short_name, size="sm", scale=0, min_width=80) |
|
|
quick_btns_1.append((btn, model)) |
|
|
routing_model_1 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_1_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_1_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_1_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_1_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
|
|
|
add_model_2_btn = gr.Button("+ Add another model", size="sm", visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as routing_block_2: |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 2") |
|
|
with gr.Row(elem_classes=["quick-select-row"]): |
|
|
quick_btns_2 = [] |
|
|
for item in QUICK_SELECT_MODELS: |
|
|
if isinstance(item, tuple): |
|
|
model, short_name = item |
|
|
else: |
|
|
model = item |
|
|
short_name = model.split("/")[-1] |
|
|
btn = gr.Button(short_name, size="sm", scale=0, min_width=80) |
|
|
quick_btns_2.append((btn, model)) |
|
|
routing_model_2 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_2_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_2_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_2_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_2_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
|
|
|
add_model_3_btn = gr.Button("+ Add another model", size="sm", visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as routing_block_3: |
|
|
with gr.Group(): |
|
|
gr.Markdown("#### Route to Model 3") |
|
|
with gr.Row(elem_classes=["quick-select-row"]): |
|
|
quick_btns_3 = [] |
|
|
for item in QUICK_SELECT_MODELS: |
|
|
if isinstance(item, tuple): |
|
|
model, short_name = item |
|
|
else: |
|
|
model = item |
|
|
short_name = model.split("/")[-1] |
|
|
btn = gr.Button(short_name, size="sm", scale=0, min_width=80) |
|
|
quick_btns_3.append((btn, model)) |
|
|
routing_model_3 = gr.Dropdown( |
|
|
label="Model (type 3+ chars to search)", |
|
|
choices=[], |
|
|
allow_custom_value=True, |
|
|
interactive=True, |
|
|
) |
|
|
with gr.Row(): |
|
|
routing_price_3_input = gr.Number(label="Input", precision=3, scale=1) |
|
|
routing_price_3_cache_read = gr.Number(label="Cache Read", precision=3, scale=1) |
|
|
routing_price_3_cache_creation = gr.Number(label="Cache Creation", precision=3, scale=1) |
|
|
routing_price_3_completion = gr.Number(label="Completion", precision=3, scale=1) |
|
|
|
|
|
gr.Markdown("---") |
|
|
gr.Markdown("### 🎯 Router Strategy") |
|
|
|
|
|
selected_strategy = gr.Radio( |
|
|
choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Resolved/Unresolved", "Replace part of trajectory"], |
|
|
value="Random router", |
|
|
label="", |
|
|
interactive=True, |
|
|
) |
|
|
num_routing_models = gr.State(1) |
|
|
|
|
|
with gr.Column(visible=True) as random_block: |
|
|
random_hint = gr.Markdown("*Weights must sum to 1.0*") |
|
|
weight_base = gr.Number(label="Base weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True) |
|
|
weight_model_1 = gr.Number(label="Model 1 weight", value=0.5, minimum=0, maximum=1, precision=2, interactive=True) |
|
|
weight_model_2 = gr.Number(label="Model 2 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False) |
|
|
weight_model_3 = gr.Number(label="Model 3 weight", value=0, minimum=0, maximum=1, precision=2, interactive=True, visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as every_k_block: |
|
|
every_k_hint = gr.Markdown("*First model has priority on overlaps*") |
|
|
k_model_1 = gr.Number(label="k₁ (Model 1)", value=2, minimum=1, precision=0, interactive=True) |
|
|
k_model_2 = gr.Number(label="k₂ (Model 2)", value=3, minimum=1, precision=0, interactive=True, visible=False) |
|
|
k_model_3 = gr.Number(label="k₃ (Model 3)", value=5, minimum=1, precision=0, interactive=True, visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as slice_block: |
|
|
slice_hint = gr.Markdown("*First model has priority on overlaps*") |
|
|
slice_model_1 = gr.Textbox(label="M1 slice", value="[0::3]", interactive=True) |
|
|
slice_model_2 = gr.Textbox(label="M2 slice", value="[1::3]", interactive=True, visible=False) |
|
|
slice_model_3 = gr.Textbox(label="M3 slice", value="[2::3]", interactive=True, visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as grep_block: |
|
|
grep_hint = gr.Markdown("*Use `|` for OR, `&` for AND (don't mix). First model has priority on overlaps*") |
|
|
grep_model_1 = gr.Textbox(label="M1 grep", value="ls|find", interactive=True) |
|
|
grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False) |
|
|
grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False) |
|
|
|
|
|
with gr.Column(visible=False) as resolved_block: |
|
|
resolved_hint = gr.Markdown("*Route all steps based on trajectory resolution status*") |
|
|
resolved_model = gr.Dropdown( |
|
|
label="Model for resolved trajectories", |
|
|
choices=["Base", "M1", "M2", "M3"], |
|
|
value="Base", |
|
|
interactive=True, |
|
|
) |
|
|
unresolved_model = gr.Dropdown( |
|
|
label="Model for unresolved trajectories", |
|
|
choices=["Base", "M1", "M2", "M3"], |
|
|
value="M1", |
|
|
interactive=True, |
|
|
) |
|
|
|
|
|
with gr.Column(visible=False) as part_block: |
|
|
part_hint = gr.Markdown("*Ranges must not overlap*") |
|
|
part_mode = gr.Radio( |
|
|
choices=["Indexes", "Percentages"], |
|
|
value="Percentages", |
|
|
label="Mode", |
|
|
interactive=True, |
|
|
) |
|
|
start_1 = gr.Number(label="M1 Start", value=0, minimum=0, precision=0, interactive=True) |
|
|
end_1 = gr.Number(label="M1 End", value=30, minimum=0, precision=0, interactive=True) |
|
|
start_2 = gr.Number(label="M2 Start", value=30, minimum=0, precision=0, interactive=True, visible=False) |
|
|
end_2 = gr.Number(label="M2 End", value=60, minimum=0, precision=0, interactive=True, visible=False) |
|
|
start_3 = gr.Number(label="M3 Start", value=60, minimum=0, precision=0, interactive=True, visible=False) |
|
|
end_3 = gr.Number(label="M3 End", value=100, minimum=0, precision=0, interactive=True, visible=False) |
|
|
|
|
|
gr.Markdown("---") |
|
|
route_btn = gr.Button("🚀 Let's ROUTE!!", variant="primary", size="lg", interactive=False) |
|
|
routing_result = gr.Markdown(visible=False) |
|
|
|
|
|
|
|
|
def toggle_routing_section():
    """Make the routing configuration column visible."""
    reveal = gr.update(visible=True)
    return reveal
|
|
|
|
|
add_routing_btn.click( |
|
|
fn=toggle_routing_section, |
|
|
outputs=[routing_section], |
|
|
) |
|
|
|
|
|
def on_strategy_change(strategy, num_models):
    """Toggle visibility of the per-strategy option blocks and their widgets.

    Returns 34 gr.update(visible=...) values in the fixed order expected by
    the `selected_strategy.change` outputs list: 6 strategy blocks, then the
    widgets inside each block (Model 2/3 widgets only appear when that many
    routing models are configured).
    """
    random_on = strategy == "Random router"
    every_k_on = strategy == "Every k-th step"
    slice_on = strategy == "Python list slices"
    grep_on = strategy == "Grep"
    resolved_on = strategy == "Resolved/Unresolved"
    part_on = strategy == "Replace part of trajectory"
    m2 = num_models >= 2
    m3 = num_models >= 3

    visibility_flags = [
        # the six strategy container blocks
        random_on, every_k_on, slice_on, grep_on, resolved_on, part_on,
        # random: hint, base weight, M1 weight, M2 weight, M3 weight
        random_on, random_on, random_on, random_on and m2, random_on and m3,
        # every-k: hint, k1, k2, k3
        every_k_on, every_k_on, every_k_on and m2, every_k_on and m3,
        # slices: hint, s1, s2, s3
        slice_on, slice_on, slice_on and m2, slice_on and m3,
        # grep: hint, g1, g2, g3
        grep_on, grep_on, grep_on and m2, grep_on and m3,
        # resolved: hint, resolved dropdown, unresolved dropdown
        resolved_on, resolved_on, resolved_on,
        # part: hint, mode, start1, end1, start2, end2, start3, end3
        part_on, part_on, part_on, part_on,
        part_on and m2, part_on and m2, part_on and m3, part_on and m3,
    ]
    return [gr.update(visible=flag) for flag in visibility_flags]
|
|
|
|
|
selected_strategy.change( |
|
|
fn=on_strategy_change, |
|
|
inputs=[selected_strategy, num_routing_models], |
|
|
outputs=[ |
|
|
random_block, every_k_block, slice_block, grep_block, resolved_block, part_block, |
|
|
random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3, |
|
|
every_k_hint, k_model_1, k_model_2, k_model_3, |
|
|
slice_hint, slice_model_1, slice_model_2, slice_model_3, |
|
|
grep_hint, grep_model_1, grep_model_2, grep_model_3, |
|
|
resolved_hint, resolved_model, unresolved_model, |
|
|
part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3, |
|
|
], |
|
|
) |
|
|
|
|
|
def filter_models(query):
    """Filter models based on search query (starts at 3 chars)"""
    if not query or len(query) < 3:
        return gr.update(choices=[])
    needle = query.lower()
    matches = []
    for candidate in get_litellm_model_list():
        if needle in candidate.lower():
            matches.append(candidate)
            # Cap the dropdown at 50 suggestions.
            if len(matches) == 50:
                break
    return gr.update(choices=matches)
|
|
|
|
|
routing_model_1.input(fn=filter_models, inputs=[routing_model_1], outputs=[routing_model_1]) |
|
|
routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2]) |
|
|
routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3]) |
|
|
|
|
|
def make_quick_select_fn_1(full_model_name):
    """Build a click handler that fills the Model-1 slot with *full_model_name*."""
    def handler():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (
            gr.update(value=full_model_name),
            *price_updates,
            gr.update(visible=True),      # reveal "+ Add another model"
            gr.update(interactive=True),  # enable the ROUTE button
        )
    return handler
|
|
|
|
|
def make_quick_select_fn_2(full_model_name):
    """Build a click handler that fills the Model-2 slot with *full_model_name*."""
    def handler():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (
            gr.update(value=full_model_name),
            *price_updates,
            gr.update(visible=True),  # reveal the "add Model 3" button
        )
    return handler
|
|
|
|
|
def make_quick_select_fn_3(full_model_name):
    """Build a click handler that fills the Model-3 slot with *full_model_name*."""
    def handler():
        price_updates = get_routing_prices_with_labels(full_model_name)
        return (gr.update(value=full_model_name), *price_updates)
    return handler
|
|
|
|
|
for btn, full_model in quick_btns_1: |
|
|
btn.click( |
|
|
fn=make_quick_select_fn_1(full_model), |
|
|
outputs=[routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn] |
|
|
) |
|
|
|
|
|
for btn, full_model in quick_btns_2: |
|
|
btn.click( |
|
|
fn=make_quick_select_fn_2(full_model), |
|
|
outputs=[routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn] |
|
|
) |
|
|
|
|
|
for btn, full_model in quick_btns_3: |
|
|
btn.click( |
|
|
fn=make_quick_select_fn_3(full_model), |
|
|
outputs=[routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion] |
|
|
) |
|
|
|
|
|
def get_routing_prices_with_labels(model_name):
    """Get all 4 prices for a routing model with found/estimated labels"""
    field_names = ("Input", "Cache Read", "Cache Creation", "Completion")

    # No model selected: reset every field to zero with a plain label.
    if not model_name:
        return tuple(gr.update(value=0, label=name) for name in field_names)

    model_prices = get_litellm_prices().get(model_name, {})

    def per_million(key):
        # litellm stores $/token; the UI shows $/1M tokens.
        return model_prices.get(key, 0) * 1e6

    input_price = per_million("input_cost_per_token")
    cache_read = per_million("cache_read_input_token_cost")
    cache_creation = per_million("cache_creation_input_token_cost")
    completion = per_million("output_cost_per_token")

    input_found = input_price > 0
    cache_read_found = cache_read > 0
    cache_creation_found = cache_creation > 0
    completion_found = completion > 0

    # Missing cache prices are estimated from the input price
    # (10% for reads, 125% for creation); their labels keep the ❌ marker.
    if input_found:
        if not cache_read_found:
            cache_read = input_price * 0.1
        if not cache_creation_found:
            cache_creation = input_price * 1.25

    values = (input_price, cache_read, cache_creation, completion)
    found_flags = (input_found, cache_read_found, cache_creation_found, completion_found)
    return tuple(
        gr.update(value=value, label=("✅ " if found else "❌ ") + name)
        for value, found, name in zip(values, found_flags, field_names)
    )
|
|
|
|
|
def on_routing_model_1_select(model_name):
    """Refresh Model-1 prices; enable follow-up controls once a model is set."""
    has_model = bool(model_name)
    return (
        *get_routing_prices_with_labels(model_name),
        gr.update(visible=has_model),      # "+ Add another model" button
        gr.update(interactive=has_model),  # ROUTE button
    )
|
|
|
|
|
def on_routing_model_2_select(model_name):
    """Refresh Model-2 prices; show the "add Model 3" button once a model is set."""
    has_model = bool(model_name)
    return (
        *get_routing_prices_with_labels(model_name),
        gr.update(visible=has_model),
    )
|
|
|
|
|
def on_routing_model_3_select(model_name):
    """Refresh Model-3 price fields for the selected model."""
    price_updates = get_routing_prices_with_labels(model_name)
    return price_updates
|
|
|
|
|
routing_model_1.change( |
|
|
fn=on_routing_model_1_select, |
|
|
inputs=[routing_model_1], |
|
|
outputs=[routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn], |
|
|
) |
|
|
|
|
|
def show_model_2(strategy):
    """Reveal the Model-2 block and its strategy inputs; set model count to 2.

    Output order: routing_block_2, add_model_2_btn, weight_model_2, k_model_2,
    slice_model_2, grep_model_2, start_2, end_2, num_routing_models.
    """
    is_part = strategy == "Replace part of trajectory"
    per_strategy_flags = [
        strategy == "Random router",       # weight_model_2
        strategy == "Every k-th step",     # k_model_2
        strategy == "Python list slices",  # slice_model_2
        strategy == "Grep",                # grep_model_2
    ]
    updates = [gr.update(visible=True), gr.update(visible=False)]
    updates.extend(gr.update(visible=flag) for flag in per_strategy_flags)
    updates.extend([gr.update(visible=is_part), gr.update(visible=is_part)])
    return (*updates, 2)
|
|
|
|
|
add_model_2_btn.click( |
|
|
fn=show_model_2, |
|
|
inputs=[selected_strategy], |
|
|
outputs=[routing_block_2, add_model_2_btn, weight_model_2, k_model_2, slice_model_2, grep_model_2, start_2, end_2, num_routing_models], |
|
|
) |
|
|
|
|
|
routing_model_2.change( |
|
|
fn=on_routing_model_2_select, |
|
|
inputs=[routing_model_2], |
|
|
outputs=[routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn], |
|
|
) |
|
|
|
|
|
def show_model_3(strategy):
    """Reveal the Model-3 block and its strategy inputs; set model count to 3.

    Output order: routing_block_3, add_model_3_btn, weight_model_3, k_model_3,
    slice_model_3, grep_model_3, start_3, end_3, num_routing_models.
    """
    is_part = strategy == "Replace part of trajectory"
    per_strategy_flags = [
        strategy == "Random router",       # weight_model_3
        strategy == "Every k-th step",     # k_model_3
        strategy == "Python list slices",  # slice_model_3
        strategy == "Grep",                # grep_model_3
    ]
    updates = [gr.update(visible=True), gr.update(visible=False)]
    updates.extend(gr.update(visible=flag) for flag in per_strategy_flags)
    updates.extend([gr.update(visible=is_part), gr.update(visible=is_part)])
    return (*updates, 3)
|
|
|
|
|
add_model_3_btn.click( |
|
|
fn=show_model_3, |
|
|
inputs=[selected_strategy], |
|
|
outputs=[routing_block_3, add_model_3_btn, weight_model_3, k_model_3, slice_model_3, grep_model_3, start_3, end_3, num_routing_models], |
|
|
) |
|
|
|
|
|
routing_model_3.change( |
|
|
fn=on_routing_model_3_select, |
|
|
inputs=[routing_model_3], |
|
|
outputs=[routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion], |
|
|
) |
|
|
|
|
|
def run_routing( |
|
|
state_data, |
|
|
base_input, base_cache_read, base_cache_creation, base_completion, |
|
|
routing_model_1_val, r1_input, r1_cache_read, r1_cache_creation, r1_completion, |
|
|
routing_model_2_val, r2_input, r2_cache_read, r2_cache_creation, r2_completion, |
|
|
routing_model_3_val, r3_input, r3_cache_read, r3_cache_creation, r3_completion, |
|
|
strategy_val, |
|
|
weight_base_val, weight_1_val, weight_2_val, weight_3_val, |
|
|
k_1_val, k_2_val, k_3_val, |
|
|
slice_1_val, slice_2_val, slice_3_val, |
|
|
grep_1_val, grep_2_val, grep_3_val, |
|
|
resolved_model_val, unresolved_model_val, |
|
|
part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val, |
|
|
overhead, with_cache, |
|
|
detected_model_val |
|
|
): |
|
|
if state_data is None: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ No trajectories loaded. Click 'Load & Analyze' first."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
if not routing_model_1_val: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ Please select at least one routing model."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
trajectory_steps = state_data.get("steps", {}) |
|
|
resolved_instances = state_data.get("resolved", {}) |
|
|
if not trajectory_steps: |
|
|
yield ( |
|
|
gr.update(visible=True, value="❌ No trajectory steps data available."), |
|
|
gr.update(visible=False), |
|
|
None, None, |
|
|
) |
|
|
return |
|
|
|
|
|
|
|
|
df_calc = state_data.get("calculated") |
|
|
if df_calc is not None and not df_calc.empty: |
|
|
df_for_cost = apply_thinking_overhead(df_calc.copy(), overhead) |
|
|
if not with_cache: |
|
|
df_for_cost = apply_no_cache(df_for_cost) |
|
|
df_temp = df_for_cost.copy() |
|
|
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0) |
|
|
total_original_cost_from_df = ( |
|
|
df_temp["uncached_input"].sum() * base_input / 1e6 + |
|
|
df_for_cost["cache_read_tokens"].sum() * base_cache_read / 1e6 + |
|
|
df_for_cost["cache_creation_tokens"].sum() * base_cache_creation / 1e6 + |
|
|
df_for_cost["completion_tokens"].sum() * base_completion / 1e6 |
|
|
) |
|
|
else: |
|
|
total_original_cost_from_df = None |
|
|
|
|
|
base_prices = { |
|
|
"input": base_input, |
|
|
"cache_read": base_cache_read, |
|
|
"cache_creation": base_cache_creation, |
|
|
"completion": base_completion, |
|
|
} |
|
|
|
|
|
routing_models = [] |
|
|
if routing_model_1_val: |
|
|
routing_models.append({ |
|
|
"name": routing_model_1_val, |
|
|
"prices": {"input": r1_input, "cache_read": r1_cache_read, "cache_creation": r1_cache_creation, "completion": r1_completion}, |
|
|
}) |
|
|
if routing_model_2_val: |
|
|
routing_models.append({ |
|
|
"name": routing_model_2_val, |
|
|
"prices": {"input": r2_input, "cache_read": r2_cache_read, "cache_creation": r2_cache_creation, "completion": r2_completion}, |
|
|
}) |
|
|
if routing_model_3_val: |
|
|
routing_models.append({ |
|
|
"name": routing_model_3_val, |
|
|
"prices": {"input": r3_input, "cache_read": r3_cache_read, "cache_creation": r3_cache_creation, "completion": r3_completion}, |
|
|
}) |
|
|
|
|
|
if strategy_val == "Replace part of trajectory": |
|
|
ranges = [(start_1_val, end_1_val)] |
|
|
if len(routing_models) > 1: |
|
|
ranges.append((start_2_val, end_2_val)) |
|
|
if len(routing_models) > 2: |
|
|
ranges.append((start_3_val, end_3_val)) |
|
|
for i, (s, e) in enumerate(ranges): |
|
|
if s >= e: |
|
|
yield (gr.update(visible=True, value=f"❌ Model {i+1}: Start must be less than End"), gr.update(visible=False), None, None) |
|
|
return |
|
|
for i in range(len(ranges)): |
|
|
for j in range(i+1, len(ranges)): |
|
|
s1, e1 = ranges[i] |
|
|
s2, e2 = ranges[j] |
|
|
if not (e1 <= s2 or e2 <= s1): |
|
|
yield (gr.update(visible=True, value=f"❌ Model {i+1} and Model {j+1} ranges overlap"), gr.update(visible=False), None, None) |
|
|
return |
|
|
|
|
|
weights = None |
|
|
if strategy_val == "Random router": |
|
|
weights = [weight_base_val, weight_1_val] |
|
|
if len(routing_models) > 1: |
|
|
weights.append(weight_2_val) |
|
|
if len(routing_models) > 2: |
|
|
weights.append(weight_3_val) |
|
|
total_weight = sum(weights) |
|
|
if abs(total_weight - 1.0) > 0.01: |
|
|
yield (gr.update(visible=True, value=f"❌ Weights must sum to 1.0 (current: {total_weight:.2f})"), gr.update(visible=False), None, None) |
|
|
return |
|
|
|
|
|
k_values = [k_1_val, k_2_val, k_3_val][:len(routing_models)] |
|
|
slice_values = [slice_1_val, slice_2_val, slice_3_val][:len(routing_models)] |
|
|
grep_values = [grep_1_val, grep_2_val, grep_3_val][:len(routing_models)] |
|
|
part_ranges = [(start_1_val, end_1_val), (start_2_val, end_2_val), (start_3_val, end_3_val)][:len(routing_models)] |
|
|
|
|
|
if strategy_val == "Grep": |
|
|
for i, gv in enumerate(grep_values): |
|
|
if gv and "|" in gv and "&" in gv: |
|
|
yield (gr.update(visible=True, value=f"❌ M{i+1} grep: cannot mix | and & operators"), gr.update(visible=False), None, None) |
|
|
return |
|
|
|
|
|
def grep_matches(text, pattern):
    """Check if text matches grep pattern (whole words joined by | for OR, & for AND)."""
    if not pattern or not text:
        return False
    pattern = pattern.strip()

    def has_word(word):
        # Whole-word match: the term must sit on word boundaries.
        return re.search(r'\b' + re.escape(word) + r'\b', text) is not None

    if "|" in pattern:
        terms = [t.strip() for t in pattern.split("|") if t.strip()]
        return any(has_word(t) for t in terms)
    if "&" in pattern:
        terms = [t.strip() for t in pattern.split("&") if t.strip()]
        return all(has_word(t) for t in terms)
    return has_word(pattern)
|
|
|
|
|
def parse_slice(slice_str, length):
    """Translate Python slice notation (e.g. "[0::3]" or "1:5") into the
    list of indices it selects from a sequence of *length* items.

    Surrounding brackets are optional.  Anything that is not a 2- or
    3-part colon expression yields an empty list.  Non-integer parts
    raise ValueError (callers guard with try/except).
    """
    text = slice_str.strip()
    if text.startswith("[") and text.endswith("]"):
        text = text[1:-1]
    fields = text.split(":")
    if len(fields) not in (2, 3):
        return []
    # Empty fields mean "use the default", exactly like a[::] syntax.
    start, stop, *rest = (int(f) if f else None for f in fields)
    step = rest[0] if rest else None
    return list(range(length))[slice(start, stop, step)]
|
|
|
|
|
# Sentinel key for tokens left on the base model; routing models are
# keyed "__routing_0__" .. "__routing_2__".
BASE_MODEL = "__base__"
model_keys = [BASE_MODEL] + [f"__routing_{i}__" for i in range(len(routing_models))]

# Token totals per model under the routed scenario, plus the baseline
# totals with every step billed to the base model.
all_tokens = {key: {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0} for key in model_keys}
total_original_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}

for instance_id, steps in trajectory_steps.items():
    if not steps:
        continue

    total_steps = len(steps)

    # step index -> model key; unassigned steps fall back to BASE_MODEL.
    step_to_model = {}

    if strategy_val == "Random router":
        # Sample a model per step using the validated weight list.
        model_choices = [BASE_MODEL] + [f"__routing_{j}__" for j in range(len(routing_models))]
        for i in range(total_steps):
            step_to_model[i] = random.choices(model_choices, weights=weights)[0]

    elif strategy_val == "Every k-th step":
        # Steps are counted 1-based; the lowest-index model that claims
        # a step keeps it (earlier models win conflicts).
        for j, k_val in enumerate(k_values):
            if k_val and k_val > 0:
                for i in range(total_steps):
                    if (i + 1) % int(k_val) == 0:
                        if i not in step_to_model:
                            step_to_model[i] = f"__routing_{j}__"

    elif strategy_val == "Python list slices":
        for j, slice_val in enumerate(slice_values):
            if slice_val:
                try:
                    indices = parse_slice(slice_val, total_steps)
                    for i in indices:
                        if i not in step_to_model:
                            step_to_model[i] = f"__routing_{j}__"
                except Exception:
                    # Malformed slice strings are silently ignored for
                    # that model slot (best-effort what-if UI).
                    pass

    elif strategy_val == "Grep":
        # First matching pattern (lowest model index) claims the step.
        for i, step in enumerate(steps):
            content = step.get("content", "")
            for j, grep_val in enumerate(grep_values):
                if grep_val and i not in step_to_model:
                    if grep_matches(content, grep_val):
                        step_to_model[i] = f"__routing_{j}__"

    elif strategy_val == "Resolved/Unresolved":
        # Route the whole trajectory based on whether the instance was
        # resolved in the submission's results.
        is_resolved = resolved_instances.get(instance_id, False)
        target_model = resolved_model_val if is_resolved else unresolved_model_val
        if target_model and target_model != "Base":
            model_idx = {"M1": 0, "M2": 1, "M3": 2}.get(target_model)
            if model_idx is not None and model_idx < len(routing_models):
                for i in range(total_steps):
                    step_to_model[i] = f"__routing_{model_idx}__"

    elif strategy_val == "Replace part of trajectory":
        # Ranges were validated as non-overlapping earlier, so each
        # model writes only its own step window.
        for j, (start_val, end_val) in enumerate(part_ranges):
            if part_mode_val == "Percentages":
                start_idx = int(total_steps * start_val / 100)
                end_idx = int(total_steps * end_val / 100)
            else:
                start_idx = int(start_val)
                end_idx = min(int(end_val), total_steps)
            for i in range(start_idx, end_idx):
                step_to_model[i] = f"__routing_{j}__"

    # Re-tag every step with its assigned model and apply the thinking
    # overhead multiplier to completion tokens.
    modified_steps = []
    for i, step in enumerate(steps):
        model = step_to_model.get(i, BASE_MODEL)
        modified_steps.append({
            "model": model,
            "system_user": step.get("system_user", 0),
            "completion": int(step.get("completion", 0) * overhead),
            "observation": step.get("observation"),
        })

    model_totals = calculate_routing_tokens(modified_steps)

    # Accumulate each model's token buckets across instances.
    for key in model_keys:
        totals = model_totals.get(key, {})
        all_tokens[key]["cache_read"] += totals.get("cache_read", 0)
        all_tokens[key]["uncached_input"] += totals.get("uncached_input", 0)
        all_tokens[key]["completion"] += totals.get("completion", 0)
        all_tokens[key]["cache_creation"] += totals.get("cache_creation", 0)

    # Baseline run: identical steps, all attributed to the base model,
    # so savings are computed against the same overhead-adjusted counts.
    original_steps = []
    for step in steps:
        original_steps.append({
            "model": BASE_MODEL,
            "system_user": step.get("system_user", 0),
            "completion": int(step.get("completion", 0) * overhead),
            "observation": step.get("observation"),
        })
    original_totals = calculate_routing_tokens(original_steps)
    orig = original_totals.get(BASE_MODEL, {})
    total_original_tokens["cache_read"] += orig.get("cache_read", 0)
    total_original_tokens["uncached_input"] += orig.get("uncached_input", 0)
    total_original_tokens["completion"] += orig.get("completion", 0)
    total_original_tokens["cache_creation"] += orig.get("cache_creation", 0)
|
|
|
|
|
def calc_cost(tokens: dict, prices: dict) -> float:
    """Total dollar cost of a token-count dict, with prices per 1M tokens."""
    # Pairs each token bucket with its key on the price side; only
    # "uncached_input" is named differently ("input").
    buckets = (
        ("uncached_input", "input"),
        ("cache_read", "cache_read"),
        ("cache_creation", "cache_creation"),
        ("completion", "completion"),
    )
    return sum(tokens[t] * prices[p] / 1e6 for t, p in buckets)
|
|
|
|
|
def tokens_to_costs(tokens: dict, prices: dict) -> dict:
    """Convert each token bucket into its dollar cost (prices per 1M tokens)."""
    # Only "uncached_input" has a different name on the price side.
    bucket_to_price = {
        "uncached_input": "input",
        "cache_read": "cache_read",
        "cache_creation": "cache_creation",
        "completion": "completion",
    }
    return {
        bucket: count * prices[bucket_to_price[bucket]] / 1e6
        for bucket, count in tokens.items()
    }
|
|
|
|
|
# Cost of the base-model share under the routed scenario.
total_base_tokens = all_tokens[BASE_MODEL]
base_costs = tokens_to_costs(total_base_tokens, base_prices)
total_base_cost = calc_cost(total_base_tokens, base_prices)

# Per-routing-model costs, each priced with its own price set.
routing_costs_list = []
total_routing_cost = 0
for i, rm in enumerate(routing_models):
    key = f"__routing_{i}__"
    tokens = all_tokens[key]
    costs = tokens_to_costs(tokens, rm["prices"])
    cost = calc_cost(tokens, rm["prices"])
    routing_costs_list.append({"name": rm["name"], "tokens": tokens, "costs": costs, "cost": cost})
    total_routing_cost += cost

# Baseline: everything billed to the base model at base prices.
total_original_cost = calc_cost(total_original_tokens, base_prices)

total_routed_cost = total_base_cost + total_routing_cost
savings = total_original_cost - total_routed_cost
# Guard against division by zero when there is no baseline cost.
savings_pct = (savings / total_original_cost * 100) if total_original_cost > 0 else 0

# Render the comparison as a markdown table for the results panel.
result_lines = [
    "## 🚀 Routing Results",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    f"| **Original Cost (base model only)** | ${total_original_cost:.2f} |",
    f"| **Routed Cost** | ${total_routed_cost:.2f} |",
    f"| ↳ Base model portion | ${total_base_cost:.2f} |",
]
for rc in routing_costs_list:
    result_lines.append(f"| ↳ {rc['name']} | ${rc['cost']:.2f} |")
savings_color = "green" if savings >= 0 else "red"
result_lines.append(f'| **Savings** | <span style="color: {savings_color}; font-weight: bold;">${savings:.2f} · {savings_pct:.1f}%</span> |')
result_text = "\n".join(result_lines)
|
|
|
|
|
def apply_display_formula(tokens: dict) -> dict:
    """Reshape raw token buckets for display.

    With caching enabled (enclosing-scope ``with_cache``), the uncached
    bucket is re-derived from the total prompt minus the cache-read and
    cache-creation portions, floored at zero.  Without caching, the
    whole prompt is folded into a single uncached bucket.
    """
    total_prompt = tokens["cache_read"] + tokens["uncached_input"]
    if not with_cache:
        # No-cache view: the entire prompt counts as uncached input.
        return {
            "uncached_input": total_prompt,
            "cache_read": 0,
            "cache_creation": 0,
            "completion": tokens["completion"],
        }
    shown_uncached = max(0, total_prompt - tokens["cache_read"] - tokens["cache_creation"])
    return {
        "uncached_input": shown_uncached,
        "cache_read": tokens["cache_read"],
        "cache_creation": tokens["cache_creation"],
        "completion": tokens["completion"],
    }
|
|
|
|
|
# Re-shape token totals for charting (collapses cache buckets when the
# cache toggle is off).
total_base_tokens_display = apply_display_formula(total_base_tokens)
# NOTE(review): this rebinds base_costs with display-formula values,
# leaving the earlier tokens_to_costs(total_base_tokens, ...) result
# unused — confirm that is intentional.
base_costs = tokens_to_costs(total_base_tokens_display, base_prices)

additional_token_models = [(rc["name"], apply_display_formula(rc["tokens"])) for rc in routing_costs_list]
additional_cost_models = []
for i, rc in enumerate(routing_costs_list):
    model_prices = routing_models[i]["prices"]
    additional_cost_models.append((rc["name"], tokens_to_costs(apply_display_formula(rc["tokens"]), model_prices)))

# Prefer the recalculated DataFrame for the "original" bar when
# available; otherwise fall back to the per-step aggregate.
# NOTE(review): the guard checks df_calc but the body reads df_for_cost
# — verify the two are always None/empty together.
if df_calc is not None and not df_calc.empty:
    df_temp = df_for_cost.copy()
    df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
    original_tokens_from_df = {
        "uncached_input": df_temp["uncached_input"].sum(),
        "cache_read": df_for_cost["cache_read_tokens"].sum(),
        "cache_creation": df_for_cost["cache_creation_tokens"].sum(),
        "completion": df_for_cost["completion_tokens"].sum(),
    }
else:
    original_tokens_from_df = apply_display_formula(total_original_tokens)

original_costs = tokens_to_costs(original_tokens_from_df, base_prices)

# Build the original-vs-routed comparison charts and emit the final
# (result markdown, plots-row visibility, tokens chart, cost chart).
base_model_name = detected_model_val or "Base"
tokens_chart = create_routed_token_chart(original_tokens_from_df, total_base_tokens_display, additional_token_models, base_model_name)
cost_chart = create_routed_cost_chart(original_costs, base_costs, additional_cost_models, base_model_name)

yield (
    gr.update(visible=True, value=result_text),
    gr.update(visible=True),
    tokens_chart,
    cost_chart,
)
|
|
|
|
|
# Kick off the routing simulation.  The inputs must stay aligned, in
# order, with run_routing's positional parameters (the same list is
# duplicated as routing_inputs further down for the option-change chains).
route_btn.click(
    fn=run_routing,
    inputs=[
        trajectories_state,
        # Base-model prices ($ per 1M tokens).
        price_input, price_cache_read, price_cache_creation, price_completion,
        # Routing model slots 1-3: model name plus its four prices.
        routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
        routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
        routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
        selected_strategy,
        # Per-strategy options.
        weight_base, weight_model_1, weight_model_2, weight_model_3,
        k_model_1, k_model_2, k_model_3,
        slice_model_1, slice_model_2, slice_model_3,
        grep_model_1, grep_model_2, grep_model_3,
        resolved_model, unresolved_model,
        part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
        thinking_overhead, use_cache,
        detected_model,
    ],
    outputs=[routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot],
)
|
|
|
|
|
# Clicking a leaderboard row stores the selected submission and
# pre-fills the price fields / detected model for it.
leaderboard_table.select(
    fn=on_row_select,
    inputs=[leaderboard_table],
    outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
)
|
|
|
|
|
# On startup, select the first leaderboard row server-side and mirror
# the selection in the browser by clicking the first table row via JS
# (keeps the DataFrame's visual highlight in sync with the state).
app.load(
    fn=select_first_row,
    inputs=[leaderboard_table],
    outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
    js="""
    (data) => {
        const row = gradioApp()?.querySelector('#leaderboard-table table tbody tr');
        if (row) {
            row.click();
        }
        return data;
    }
    """,
)
|
|
|
|
|
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache, progress=gr.Progress()):
    """Download (if needed), load, and chart every trajectory for *folder*.

    Generator wired to ``analyze_btn.click``: each ``yield`` is a tuple
    whose shape matches that handler's outputs list (status text,
    visibility updates, chart figures, the shared state dict, dropdown
    updates).  Intermediate yields stream download/loading status; the
    final yield carries the complete analysis.
    """
    progress(0, desc="Ready")
    # Blank tuple shaped like the outputs list, used for early exits.
    empty_result = (
        "",
        gr.update(visible=False),
        None, None,
        None, None, None, None,
        None, None, None, None,
        None,
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(),
        gr.update(),
        gr.update(),
        gr.update(visible=False),
        gr.update(),
        gr.update(),
        gr.update(),
    )

    if not folder:
        progress(1, desc="No folder selected")
        yield empty_result
        return

    # Fetch the trajectories from S3 on first use, streaming a status
    # message while the download runs.
    if not check_trajectories_downloaded(folder):
        progress(0.1, desc="Preparing download")
        yield (
            "⏳ Downloading trajectories...",
            gr.update(visible=False),
            None, None,
            None, None, None, None,
            None, None, None, None,
            None,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
        )
        progress(0.3, desc="Downloading")
        status, _ = download_trajectories_from_s3(folder)
        # The download helper reports failure by embedding ❌ in status.
        if "❌" in status:
            progress(1, desc="Download failed")
            yield (
                status,
                gr.update(visible=False),
                None, None,
                None, None, None, None,
                None, None, None, None,
                None,
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(),
                gr.update(),
                gr.update(),
                gr.update(visible=False),
                gr.update(),
                gr.update(),
                gr.update(),
            )
            return
    progress(0.45, desc="Loading trajectories")

    # Show the (still empty) analysis section while files are parsed.
    yield (
        "⏳ Loading trajectories...",
        gr.update(visible=True),
        None, None,
        None, None, None, None,
        None, None, None, None,
        None,
        gr.update(visible=False),
        gr.update(visible=False),
        gr.update(),
        gr.update(),
        gr.update(),
        gr.update(visible=False),
        gr.update(),
        gr.update(),
        gr.update(),
    )

    progress(0.6, desc="Reading metadata")
    df_meta = ensure_token_columns(load_all_trajectories(folder))
    progress(0.7, desc="Reading calculated")
    df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
    # NOTE(review): copying via .values assumes both DataFrames have the
    # same length and row order — confirm the loaders guarantee this.
    df_calc["api_calls"] = df_meta["api_calls"].values
    df_calc["instance_cost"] = df_meta["instance_cost"].values
    progress(0.75, desc="Reading steps")
    trajectory_steps = load_all_trajectory_steps(folder)
    progress(0.8, desc="Reading metadata steps")
    metadata_steps = load_all_trajectory_metadata_steps(folder)

    # Map instance id -> resolved flag from the submission's results.
    model_details, _ = get_model_details(folder)
    resolved_instances = {}
    if model_details:
        per_instance = model_details.get("per_instance_details", {})
        for inst_id, details in per_instance.items():
            resolved_instances[inst_id] = details.get("resolved", False)

    # Shared state consumed by the recalculation / routing callbacks.
    state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "metadata_steps": metadata_steps, "resolved": resolved_instances}

    if df_meta.empty:
        progress(1, desc="No trajectories found")
        yield (
            "❌ No trajectories found",
            gr.update(visible=False),
            None, None,
            None, None, None, None,
            None, None, None, None,
            None,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
            gr.update(visible=False),
            gr.update(),
            gr.update(),
            gr.update(),
        )
        return

    progress(0.9, desc="Building charts")
    fig_steps, fig_cost, _, _, _ = create_basic_histograms(
        df_meta, input_price, cache_read_price, cache_creation_price, completion_price
    )

    # Charts built from provider-reported (metadata) token counts.
    fig_tokens_meta, fig_tokens_cost_meta, fig_stacked_meta = create_token_charts(
        df_meta, input_price, cache_read_price, cache_creation_price, completion_price
    )
    fig_cost_breakdown_meta = create_cost_breakdown(
        df_meta, input_price, cache_read_price, cache_creation_price, completion_price
    )

    # Charts built from locally recalculated counts, with the thinking
    # overhead multiplier and optional no-cache adjustment applied.
    df_calc_processed = apply_thinking_overhead(df_calc.copy(), overhead)
    if not with_cache:
        df_calc_processed = apply_no_cache(df_calc_processed)

    fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc = create_token_charts(
        df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
    )
    fig_cost_breakdown_calc = create_cost_breakdown(
        df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
    )

    # Pre-select the first issue in each single-trajectory dropdown.
    issue_ids = sorted(trajectory_steps.keys())
    first_issue = issue_ids[0] if issue_ids else None

    meta_issue_ids = sorted(metadata_steps.keys())
    first_meta_issue = meta_issue_ids[0] if meta_issue_ids else None
    has_meta_steps = len(meta_issue_ids) > 0

    fig_single_traj = None
    fig_single_traj_cost = None
    if first_issue and first_issue in trajectory_steps:
        calc_steps = trajectory_steps[first_issue]
        fig_single_traj = create_single_trajectory_chart(calc_steps, overhead, with_cache)
        fig_single_traj_cost = create_single_trajectory_cost_chart(calc_steps, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache)

    fig_single_traj_meta = None
    fig_single_traj_meta_cost = None
    if first_meta_issue and first_meta_issue in metadata_steps:
        meta_steps = metadata_steps[first_meta_issue]
        fig_single_traj_meta = create_single_trajectory_meta_chart(meta_steps)
        fig_single_traj_meta_cost = create_single_trajectory_meta_cost_chart(meta_steps, input_price, cache_read_price, cache_creation_price, completion_price)

    progress(1, desc="Done")
    # Final payload: everything visible, all charts populated.
    yield (
        f"✅ Loaded {len(df_meta)} trajectories",
        gr.update(visible=True),
        fig_steps, fig_cost,
        fig_tokens_meta, fig_tokens_cost_meta, fig_stacked_meta, fig_cost_breakdown_meta,
        fig_tokens_calc, fig_tokens_cost_calc, fig_stacked_calc, fig_cost_breakdown_calc,
        state_data,
        gr.update(visible=True),
        gr.update(visible=True),
        gr.update(choices=issue_ids, value=first_issue),
        fig_single_traj,
        fig_single_traj_cost,
        gr.update(visible=has_meta_steps),
        gr.update(choices=meta_issue_ids, value=first_meta_issue),
        fig_single_traj_meta,
        fig_single_traj_meta_cost,
    )
|
|
|
|
|
def on_single_traj_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Redraw the per-step token and cost charts for the chosen issue.

    Returns (tokens_chart, cost_chart), or (None, None) when no state
    has been loaded yet or the issue id is unknown.
    """
    if state_data is None or not issue_id:
        return None, None
    steps_by_issue = state_data.get("steps", {})
    if issue_id not in steps_by_issue:
        return None, None
    chosen_steps = steps_by_issue[issue_id]
    return (
        create_single_trajectory_chart(chosen_steps, overhead, with_cache),
        create_single_trajectory_cost_chart(chosen_steps, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache),
    )
|
|
|
|
|
def on_single_traj_meta_select(state_data, issue_id, input_price, cache_read_price, cache_creation_price, completion_price):
    """Redraw the metadata-based per-step charts for the chosen issue.

    Returns (tokens_chart, cost_chart), or (None, None) when no state is
    loaded or the issue id is not present in the metadata steps.
    """
    if state_data is None or not issue_id:
        return None, None
    meta_by_issue = state_data.get("metadata_steps", {})
    if issue_id not in meta_by_issue:
        return None, None
    chosen_steps = meta_by_issue[issue_id]
    return (
        create_single_trajectory_meta_chart(chosen_steps),
        create_single_trajectory_meta_cost_chart(chosen_steps, input_price, cache_read_price, cache_creation_price, completion_price),
    )
|
|
|
|
|
# Main analysis trigger: load_and_analyze streams its yields into these
# outputs (the order must match the function's yield tuples exactly).
analyze_btn.click(
    fn=load_and_analyze,
    inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
    outputs=[
        download_status,
        analysis_section,
        plot_steps, plot_cost,
        # Metadata-derived charts.
        plot_tokens_meta, plot_tokens_cost_meta, plot_stacked_meta, plot_cost_breakdown_meta,
        # Locally recalculated charts.
        plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc,
        trajectories_state,
        add_routing_btn,
        # Single-trajectory (calculated) viewer.
        single_traj_accordion,
        single_traj_dropdown,
        single_traj_plot,
        single_traj_cost_plot,
        # Single-trajectory (metadata) viewer.
        single_traj_meta_accordion,
        single_traj_meta_dropdown,
        single_traj_meta_plot,
        single_traj_meta_cost_plot,
    ],
)
|
|
|
|
|
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Rebuild the four price-sensitive cost charts after a price edit.

    Returns (meta cost-by-type, meta breakdown, calc cost-by-type,
    calc breakdown) figures, or four Nones when nothing is loaded.
    """
    if state_data is None:
        return None, None, None, None

    df_meta = state_data["meta"]
    df_calc = state_data["calculated"]

    if df_meta.empty:
        return None, None, None, None

    prices = (input_price, cache_read_price, cache_creation_price, completion_price)

    # Metadata charts use the provider-reported numbers unchanged.
    meta_cost_fig = create_cost_by_type_chart(df_meta, *prices)
    meta_breakdown_fig = create_cost_breakdown(df_meta, *prices)

    # Calculated charts honour the thinking-overhead and cache toggles.
    df_adjusted = apply_thinking_overhead(df_calc.copy(), overhead)
    if not with_cache:
        df_adjusted = apply_no_cache(df_adjusted)

    calc_cost_fig = create_cost_by_type_chart(df_adjusted, *prices)
    calc_breakdown_fig = create_cost_breakdown(df_adjusted, *prices)

    return meta_cost_fig, meta_breakdown_fig, calc_cost_fig, calc_breakdown_fig
|
|
|
|
|
# Price edits only affect the cost charts; recompute those four charts
# on any change to any of the base-model price fields.
price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
price_outputs = [plot_tokens_cost_meta, plot_cost_breakdown_meta, plot_tokens_cost_calc, plot_cost_breakdown_calc]

price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_read.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
|
|
|
|
def on_calc_options_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache):
    """Recalculate only the calculated-token charts when the thinking
    overhead or cache toggle changes.

    Returns (tokens, cost-by-type, stacked, breakdown) figures, or four
    Nones when no data has been loaded yet.
    """
    if state_data is None:
        return None, None, None, None

    df_calc = state_data["calculated"]
    if df_calc.empty:
        return None, None, None, None

    # Apply the current what-if toggles before charting.
    df_adjusted = apply_thinking_overhead(df_calc.copy(), overhead)
    if not with_cache:
        df_adjusted = apply_no_cache(df_adjusted)

    prices = (input_price, cache_read_price, cache_creation_price, completion_price)
    tokens_fig, tokens_cost_fig, stacked_fig = create_token_charts(df_adjusted, *prices)
    breakdown_fig = create_cost_breakdown(df_adjusted, *prices)

    return tokens_fig, tokens_cost_fig, stacked_fig, breakdown_fig
|
|
|
|
|
# Overhead/cache toggles feed the calculated-token charts.
calc_options_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
calc_options_outputs = [plot_tokens_calc, plot_tokens_cost_calc, plot_stacked_calc, plot_cost_breakdown_calc]

# Redraw the per-issue (calculated) charts when a new issue is picked.
single_traj_dropdown.change(
    fn=on_single_traj_select,
    inputs=[trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache],
    outputs=[single_traj_plot, single_traj_cost_plot],
)

# Same for the metadata-based per-issue charts.
single_traj_meta_dropdown.change(
    fn=on_single_traj_meta_select,
    inputs=[trajectories_state, single_traj_meta_dropdown, price_input, price_cache_read, price_cache_creation, price_completion],
    outputs=[single_traj_meta_plot, single_traj_meta_cost_plot],
)
|
|
|
|
|
# Reusable input/output lists shared by the option-change chains below.
single_traj_inputs = [trajectories_state, single_traj_dropdown, price_input, price_cache_read, price_cache_creation, price_completion, thinking_overhead, use_cache]
single_traj_outputs = [single_traj_plot, single_traj_cost_plot]

# Must mirror run_routing's parameter order (same list as route_btn.click).
routing_inputs = [
    trajectories_state,
    price_input, price_cache_read, price_cache_creation, price_completion,
    routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion,
    routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion,
    routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion,
    selected_strategy,
    weight_base, weight_model_1, weight_model_2, weight_model_3,
    k_model_1, k_model_2, k_model_3,
    slice_model_1, slice_model_2, slice_model_3,
    grep_model_1, grep_model_2, grep_model_3,
    resolved_model, unresolved_model,
    part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
    thinking_overhead, use_cache,
    detected_model,
]
routing_outputs = [routing_result, routing_plots_row, routing_tokens_plot, routing_cost_plot]
|
|
|
|
|
# Changing the overhead multiplier refreshes, in order: the calculated
# charts, the single-trajectory charts, then the routing simulation.
thinking_overhead.change(
    fn=on_calc_options_change,
    inputs=calc_options_inputs,
    outputs=calc_options_outputs,
).then(
    fn=on_single_traj_select,
    inputs=single_traj_inputs,
    outputs=single_traj_outputs,
).then(
    fn=run_routing,
    inputs=routing_inputs,
    outputs=routing_outputs,
)
|
|
|
|
|
# Toggling cache simulation triggers the same refresh chain as the
# thinking-overhead slider above.
use_cache.change(
    fn=on_calc_options_change,
    inputs=calc_options_inputs,
    outputs=calc_options_outputs,
).then(
    fn=on_single_traj_select,
    inputs=single_traj_inputs,
    outputs=single_traj_outputs,
).then(
    fn=run_routing,
    inputs=routing_inputs,
    outputs=routing_outputs,
)
|
|
|
|
|
return app  # hand the fully wired Blocks application back to the caller
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Refresh the cached leaderboard before building the UI so the
    # table renders with current data on first load.
    logging.info("Refreshing leaderboard data on startup...")
    load_or_download_leaderboard(force_refresh=True)
    app = build_app()
    # Queueing is required for generator callbacks to stream updates.
    app.queue()
    app.launch()
|
|
|