Commit f7c61dd · 1 Parent(s): 81a982c

Unify token calculation using calculate_routing_tokens

- Rewrite load_all_trajectories_calculated() to use calculate_routing_tokens
- Remove obsolete calculate_tokens_from_trajectory() function
- Remove obsolete calculate_routed_cost() function
- Single source of truth for token calculation logic
- prompt_tokens = cache_read + uncached_input (mathematically equivalent; see the sketch below)
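As a quick sanity check of the last bullet, here is a toy recombination of per-model totals in the shape the new loop reads out of calculate_routing_tokens (the numbers are illustrative, not from a real trajectory):

# Every prompt token is either served from the cache or sent uncached,
# so summing the two recovers the API-style cumulative prompt_tokens.
totals = {"cache_read": 9_000, "uncached_input": 1_500,
          "cache_creation": 1_500, "completion": 800}
prompt_tokens = totals["cache_read"] + totals["uncached_input"]  # 10_500
total_tokens = prompt_tokens + totals["completion"]              # 11_300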
app.py CHANGED

@@ -73,85 +73,6 @@ def get_routed_steps(total_steps: int, strategy: str, params: dict) -> set:
     return routed
 
 
-def calculate_routed_cost(
-    trajectory_tokens: dict,
-    routed_steps: set,
-    base_prices: dict,
-    routing_prices: dict,
-) -> dict:
-    """
-    Calculate cost for a trajectory with routing.
-
-    Each model maintains its own independent cache.
-    When switching back to a model, its cache is still available.
-
-    Args:
-        trajectory_tokens: dict with per-step token counts
-        routed_steps: set of step indices using routing model
-        base_prices: {input, cache_read, cache_creation, completion} for base model
-        routing_prices: same for routing model
-
-    Returns:
-        dict with base_cost, routing_cost, total_cost
-    """
-    total_steps = trajectory_tokens.get("api_calls", 0)
-    if total_steps == 0:
-        return {"base_cost": 0, "routing_cost": 0, "total_cost": 0}
-
-    prompt_tokens = trajectory_tokens.get("prompt_tokens", 0)
-    completion_tokens = trajectory_tokens.get("completion_tokens", 0)
-    cache_read = trajectory_tokens.get("cache_read_tokens", 0)
-    cache_creation = trajectory_tokens.get("cache_creation_tokens", 0)
-
-    avg_prompt_per_step = prompt_tokens / total_steps if total_steps > 0 else 0
-    avg_completion_per_step = completion_tokens / total_steps if total_steps > 0 else 0
-    avg_cache_read_per_step = cache_read / total_steps if total_steps > 0 else 0
-    avg_cache_creation_per_step = cache_creation / total_steps if total_steps > 0 else 0
-
-    base_cost = 0
-    routing_cost = 0
-
-    base_cache_context = 0
-    routing_cache_context = 0
-
-    for step in range(total_steps):
-        is_routed = step in routed_steps
-        prices = routing_prices if is_routed else base_prices
-
-        if is_routed:
-            cache_ctx = routing_cache_context
-        else:
-            cache_ctx = base_cache_context
-
-        uncached_input = avg_prompt_per_step - avg_cache_read_per_step
-        if cache_ctx == 0:
-            step_cache_read = 0
-            step_uncached = avg_prompt_per_step
-        else:
-            step_cache_read = avg_cache_read_per_step
-            step_uncached = uncached_input
-
-        step_cost = (
-            step_uncached * prices["input"] / 1e6 +
-            step_cache_read * prices["cache_read"] / 1e6 +
-            avg_cache_creation_per_step * prices["cache_creation"] / 1e6 +
-            avg_completion_per_step * prices["completion"] / 1e6
-        )
-
-        if is_routed:
-            routing_cost += step_cost
-            routing_cache_context += avg_prompt_per_step + avg_completion_per_step
-        else:
-            base_cost += step_cost
-            base_cache_context += avg_prompt_per_step + avg_completion_per_step
-
-    return {
-        "base_cost": base_cost,
-        "routing_cost": routing_cost,
-        "total_cost": base_cost + routing_cost,
-    }
-
-
 def calculate_routing_tokens(steps: list[dict]) -> dict:
     """
     Calculate token breakdown per model with proper caching simulation.
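The removed calculate_routed_cost collapsed a trajectory into per-step averages and priced each step against whichever model handled it, keeping a separate cache-warm state per model. A condensed re-run of that same loop with made-up prices (in dollars per million tokens) shows each model paying one cold-cache step:

base_prices = {"input": 3.0, "cache_read": 0.3, "cache_creation": 3.75, "completion": 15.0}
routing_prices = {"input": 0.8, "cache_read": 0.08, "cache_creation": 1.0, "completion": 4.0}

avg_prompt, avg_read, avg_create, avg_out = 10_000, 8_000, 2_000, 500
routed_steps = {1, 3}                    # steps handled by the routing model
cache_ctx = {"base": 0, "routing": 0}    # each model warms its own cache
cost = {"base": 0.0, "routing": 0.0}

for step in range(4):
    who = "routing" if step in routed_steps else "base"
    prices = routing_prices if who == "routing" else base_prices
    # A cold cache means the whole prompt is billed at the full input price.
    read = avg_read if cache_ctx[who] else 0
    uncached = avg_prompt - avg_read if cache_ctx[who] else avg_prompt
    cost[who] += (uncached * prices["input"] + read * prices["cache_read"]
                  + avg_create * prices["cache_creation"]
                  + avg_out * prices["completion"]) / 1e6
    cache_ctx[who] += avg_prompt + avg_out

print(cost["base"], cost["routing"], cost["base"] + cost["routing"])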
@@ -309,78 +230,6 @@ def get_tokenizer(model_name: str):
     return lambda text: len(enc.encode(text)), tokenizer_name
 
 
-def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
-    """
-    Calculate tokens from trajectory messages simulating API behavior.
-
-    API counts prompt_tokens cumulatively for each call (full context each time).
-    With caching: cache_read = previous context, cache_creation = new content.
-
-    Returns dict with:
-    - prompt_tokens: total input tokens (cumulative across all API calls)
-    - completion_tokens: total output tokens
-    - cache_read_tokens: tokens read from cache
-    - cache_creation_tokens: tokens written to cache
-    - api_calls: number of assistant responses
-    """
-    with open(traj_path, "r", encoding="utf-8") as f:
-        data = json.load(f)
-
-    messages = data.get("messages", [])
-    if not messages:
-        return {"prompt_tokens": 0, "completion_tokens": 0, "cache_read_tokens": 0, "cache_creation_tokens": 0, "api_calls": 0}
-
-    count_tokens, _ = get_tokenizer(model_name)
-
-    message_tokens = []
-    for msg in messages:
-        content = msg.get("content", "")
-        if isinstance(content, list):
-            content = json.dumps(content)
-        tokens = count_tokens(str(content))
-        message_tokens.append({
-            "role": msg.get("role", "user"),
-            "tokens": tokens
-        })
-
-    # Simulate API behavior: each call sends full context
-    # LLM APIs cache full context including assistant responses
-    prompt_tokens = 0  # Cumulative prompt tokens across all API calls
-    completion_tokens = 0
-    cache_read_tokens = 0
-    cache_creation_tokens = 0
-    api_calls = 0
-
-    context_so_far = 0  # Total tokens in context (including assistant responses)
-    cached_context = 0  # Tokens that are cached from previous API calls
-
-    for i, mt in enumerate(message_tokens):
-        if mt["role"] == "assistant":
-            completion_tokens += mt["tokens"]
-            api_calls += 1
-            context_so_far += mt["tokens"]
-        else:
-            context_so_far += mt["tokens"]
-
-        next_is_assistant = (i + 1 < len(message_tokens) and message_tokens[i + 1]["role"] == "assistant")
-
-        if next_is_assistant:
-            prompt_tokens += context_so_far
-            cache_read_tokens += cached_context
-
-            assistant_tokens = message_tokens[i + 1]["tokens"]
-            cache_creation_tokens += (context_so_far - cached_context) + assistant_tokens
-            cached_context = context_so_far + assistant_tokens
-
-    return {
-        "prompt_tokens": prompt_tokens,
-        "completion_tokens": completion_tokens,
-        "cache_read_tokens": cache_read_tokens,
-        "cache_creation_tokens": cache_creation_tokens,
-        "api_calls": api_calls,
-    }
-
-
 def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
     """Apply tokenizer overhead multiplier to all token counts"""
     if df.empty or overhead == 1.0:
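The accounting the removed function implemented is easiest to see on a toy message list with pre-counted sizes (hand-picked numbers, no tokenizer involved); this re-runs the same simulation standalone:

msgs = [("system", 100), ("user", 50), ("assistant", 20),
        ("user", 30), ("assistant", 10)]

prompt = completion = cache_read = cache_creation = 0
context = cached = 0
for i, (role, toks) in enumerate(msgs):
    context += toks
    if role == "assistant":
        completion += toks
    if i + 1 < len(msgs) and msgs[i + 1][0] == "assistant":
        prompt += context              # each API call resends the full context
        cache_read += cached           # the previously cached prefix is reused
        nxt = msgs[i + 1][1]
        cache_creation += (context - cached) + nxt
        cached = context + nxt

print(prompt, completion, cache_read, cache_creation)  # 350 30 170 210

Note that prompt equals cache_read plus the uncached input here (350 = 170 + 180), which is exactly the identity the commit message relies on to drop this function.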
@@ -407,15 +256,16 @@ def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
-    """Load trajectories with self-calculated token counts"""
+    """Load trajectories with self-calculated token counts using calculate_routing_tokens"""
     global _calculated_tokens_cache
-
+
     cache_key = f"calculated_{folder}"
     if cache_key in _calculated_tokens_cache:
         return _calculated_tokens_cache[cache_key]
-
+
+    trajectory_steps = load_all_trajectory_steps(folder)
+
     output_dir = TRAJS_DIR / folder
-
     traj_files = list(output_dir.glob("*/*.traj.json"))
     if not traj_files:
         traj_files = list(output_dir.glob("*/*.traj"))
@@ -423,10 +273,7 @@ def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
         traj_files = list(output_dir.glob("*.traj.json"))
     if not traj_files:
         traj_files = list(output_dir.glob("*.traj"))
-
-    traj_files = list(output_dir.glob("*.json"))
-
-    # Get model name from first trajectory
+
     model_name = ""
     if traj_files:
         try:
@@ -436,26 +283,37 @@ def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
             model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
         except Exception:
             pass
-
+
     rows = []
-    for traj_file in traj_files:
+    for instance_id, steps in trajectory_steps.items():
+        if not steps:
+            continue
+
         try:
-            tokens = calculate_tokens_from_trajectory(traj_file, model_name)
-
+            model_totals = calculate_routing_tokens(steps)
+            totals = model_totals.get(model_name, {})
+
+            cache_read = totals.get("cache_read", 0)
+            uncached_input = totals.get("uncached_input", 0)
+            completion = totals.get("completion", 0)
+            cache_creation = totals.get("cache_creation", 0)
+
+            prompt_tokens = cache_read + uncached_input
+
             rows.append({
-                "instance_id": traj_file.stem,
+                "instance_id": instance_id,
                 "model_name": model_name,
-                "api_calls": tokens["api_calls"],
-                "instance_cost": 0,
-                "prompt_tokens": tokens["prompt_tokens"],
-                "completion_tokens": tokens["completion_tokens"],
-                "total_tokens": tokens["prompt_tokens"] + tokens["completion_tokens"],
-                "cache_read_tokens": tokens["cache_read_tokens"],
-                "cache_creation_tokens": tokens["cache_creation_tokens"],
+                "api_calls": len(steps),
+                "instance_cost": 0,
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion,
+                "total_tokens": prompt_tokens + completion,
+                "cache_read_tokens": cache_read,
+                "cache_creation_tokens": cache_creation,
             })
         except Exception as e:
-            print(f"Error calculating tokens for {traj_file}: {e}")
-
+            print(f"Error calculating tokens for {instance_id}: {e}")
+
     df = pd.DataFrame(rows)
     _calculated_tokens_cache[cache_key] = df
     return df
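For reference, this is the shape the new loop assumes: calculate_routing_tokens returns totals keyed by model name, and one DataFrame row is derived per instance. The step format and the ids below are placeholders, since neither load_all_trajectory_steps nor the construction of the totals appears in this diff:

model_name = "claude-sonnet-4"              # hypothetical model id
steps = [{}, {}]                            # two steps -> api_calls == 2
model_totals = {model_name: {"cache_read": 170, "uncached_input": 180,
                             "cache_creation": 210, "completion": 30}}

totals = model_totals.get(model_name, {})
prompt_tokens = totals.get("cache_read", 0) + totals.get("uncached_input", 0)
row = {
    "instance_id": "example__instance-1",   # hypothetical instance id
    "model_name": model_name,
    "api_calls": len(steps),
    "instance_cost": 0,
    "prompt_tokens": prompt_tokens,                               # 350
    "completion_tokens": totals.get("completion", 0),             # 30
    "total_tokens": prompt_tokens + totals.get("completion", 0),  # 380
    "cache_read_tokens": totals.get("cache_read", 0),
    "cache_creation_tokens": totals.get("cache_creation", 0),
}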