IgorSlinko committed on
Commit
81a982c
·
1 Parent(s): 5c06e74

Integrate calculate_routing_tokens for accurate multi-model caching

Browse files

- Add calculate_routing_tokens() function for precise token tracking
- Add parse_trajectory_to_steps() to extract step data from trajectories
- Add load_all_trajectory_steps() with caching for routing calculations
- Rewrite run_routing() to use step-by-step token calculation
- Each model maintains independent cache context
- Proper handling of system/user, completion, and observation tokens
- Accurate uncached_input and cache_creation per step

Files changed (1) hide show
  1. app.py +245 -75
app.py CHANGED
@@ -26,6 +26,7 @@ LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/mod
26
  _litellm_prices_cache = None
27
  _trajectories_cache = {}
28
  _calculated_tokens_cache = {}
 
29
 
30
 
31
  def parse_step_or_ratio(value: float, total_steps: int) -> int:
@@ -151,6 +152,127 @@ def calculate_routed_cost(
151
  }
152
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def get_default_overhead(model_name: str) -> float:
155
  """Get default tokenizer overhead for model provider"""
156
  model_lower = model_name.lower() if model_name else ""
@@ -339,6 +461,55 @@ def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
339
  return df
340
 
341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  def get_litellm_model_list() -> list[str]:
343
  """Get list of model names from litellm prices"""
344
  prices = get_litellm_prices()
@@ -1468,21 +1639,15 @@ def build_app():
1468
  )
1469
  return
1470
 
1471
- df_key = "meta" if source == "Metadata" else "calculated"
1472
- df = state_data.get(df_key)
1473
- if df is None or df.empty:
1474
  yield (
1475
- gr.update(visible=True, value="❌ No trajectory data available."),
1476
  gr.update(visible=False),
1477
  None, None,
1478
  )
1479
  return
1480
 
1481
- if source == "Calculated":
1482
- df = apply_thinking_overhead(df.copy(), overhead)
1483
- if not with_cache:
1484
- df = apply_no_cache(df)
1485
-
1486
  base_prices = {
1487
  "input": base_input,
1488
  "cache_read": base_cache_read,
@@ -1505,74 +1670,78 @@ def build_app():
1505
  strategy_params["start"] = start_1_val
1506
  strategy_params["end"] = end_1_val
1507
 
1508
- total_base_cost = 0
1509
- total_routing_cost = 0
1510
- total_original_cost = 0
1511
 
1512
- base_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
1513
- routing_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
1514
- base_costs = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
1515
- routing_costs = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
1516
 
1517
- for _, row in df.iterrows():
1518
- total_steps = int(row.get("api_calls", 0))
1519
- if total_steps == 0:
1520
  continue
1521
 
1522
- routed_steps = get_routed_steps(total_steps, strategy_1_val, strategy_params)
1523
- num_base_steps = total_steps - len(routed_steps)
1524
- num_routing_steps = len(routed_steps)
1525
-
1526
- prompt_tokens = row.get("prompt_tokens", 0)
1527
- completion_tokens = row.get("completion_tokens", 0)
1528
- cache_read_tokens = row.get("cache_read_tokens", 0)
1529
- cache_creation_tokens = row.get("cache_creation_tokens", 0)
1530
- uncached_input_tokens = prompt_tokens - cache_read_tokens - cache_creation_tokens
1531
- if uncached_input_tokens < 0:
1532
- uncached_input_tokens = 0
1533
-
1534
- base_ratio = num_base_steps / total_steps if total_steps > 0 else 0
1535
- routing_ratio = num_routing_steps / total_steps if total_steps > 0 else 0
1536
-
1537
- base_tokens["uncached_input"] += uncached_input_tokens * base_ratio
1538
- base_tokens["cache_read"] += cache_read_tokens * base_ratio
1539
- base_tokens["cache_creation"] += cache_creation_tokens * base_ratio
1540
- base_tokens["completion"] += completion_tokens * base_ratio
1541
-
1542
- routing_tokens["uncached_input"] += uncached_input_tokens * routing_ratio
1543
- routing_tokens["cache_read"] += cache_read_tokens * routing_ratio
1544
- routing_tokens["cache_creation"] += cache_creation_tokens * routing_ratio
1545
- routing_tokens["completion"] += completion_tokens * routing_ratio
1546
-
1547
- base_costs["uncached_input"] += uncached_input_tokens * base_ratio * base_prices["input"] / 1e6
1548
- base_costs["cache_read"] += cache_read_tokens * base_ratio * base_prices["cache_read"] / 1e6
1549
- base_costs["cache_creation"] += cache_creation_tokens * base_ratio * base_prices["cache_creation"] / 1e6
1550
- base_costs["completion"] += completion_tokens * base_ratio * base_prices["completion"] / 1e6
1551
-
1552
- routing_costs["uncached_input"] += uncached_input_tokens * routing_ratio * routing_prices["input"] / 1e6
1553
- routing_costs["cache_read"] += cache_read_tokens * routing_ratio * routing_prices["cache_read"] / 1e6
1554
- routing_costs["cache_creation"] += cache_creation_tokens * routing_ratio * routing_prices["cache_creation"] / 1e6
1555
- routing_costs["completion"] += completion_tokens * routing_ratio * routing_prices["completion"] / 1e6
1556
-
1557
- traj_tokens = {
1558
- "api_calls": total_steps,
1559
- "prompt_tokens": prompt_tokens,
1560
- "completion_tokens": completion_tokens,
1561
- "cache_read_tokens": cache_read_tokens,
1562
- "cache_creation_tokens": cache_creation_tokens,
1563
- }
1564
-
1565
- result = calculate_routed_cost(traj_tokens, routed_steps, base_prices, routing_prices)
1566
- total_base_cost += result["base_cost"]
1567
- total_routing_cost += result["routing_cost"]
1568
-
1569
- original_cost = (
1570
- uncached_input_tokens * base_prices["input"] / 1e6 +
1571
- cache_read_tokens * base_prices["cache_read"] / 1e6 +
1572
- cache_creation_tokens * base_prices["cache_creation"] / 1e6 +
1573
- completion_tokens * base_prices["completion"] / 1e6
 
1574
  )
1575
- total_original_cost += original_cost
 
 
 
 
 
 
1576
 
1577
  total_routed_cost = total_base_cost + total_routing_cost
1578
  savings = total_original_cost - total_routed_cost
@@ -1593,7 +1762,7 @@ def build_app():
1593
  *Routing model: {routing_model_1_val}*
1594
  """
1595
 
1596
- additional_token_models = [(routing_model_1_val, routing_tokens)]
1597
  additional_cost_models = [(routing_model_1_val, routing_costs)]
1598
 
1599
  yield (
@@ -1603,7 +1772,7 @@ def build_app():
1603
  None,
1604
  )
1605
 
1606
- tokens_chart = create_routed_token_chart(base_tokens, additional_token_models)
1607
  cost_chart = create_routed_cost_chart(base_costs, additional_cost_models)
1608
 
1609
  yield (
@@ -1685,8 +1854,9 @@ def build_app():
1685
  df_calc = load_all_trajectories_calculated(folder)
1686
  df_calc["api_calls"] = df_meta["api_calls"].values
1687
  df_calc["instance_cost"] = df_meta["instance_cost"].values
 
1688
 
1689
- state_data = {"meta": df_meta, "calculated": df_calc}
1690
 
1691
  if source == "Metadata":
1692
  df = df_meta
 
26
  _litellm_prices_cache = None
27
  _trajectories_cache = {}
28
  _calculated_tokens_cache = {}
29
+ _trajectory_steps_cache = {}
30
 
31
 
32
  def parse_step_or_ratio(value: float, total_steps: int) -> int:
 
152
  }
153
 
154
 
155
def calculate_routing_tokens(steps: list[dict]) -> dict:
    """
    Calculate token breakdown per model with proper caching simulation.

    Each model maintains an independent prompt cache: when a step is served
    by a model, everything that model has not yet cached is sent as uncached
    input and then written into that model's cache (Anthropic-style caching,
    where the new input plus the completion become cache-creation tokens).

    Args:
        steps: list of dicts with keys:
            - model: str (model name)
            - system_user: int (tokens for system/user message, usually only step 0)
            - completion: int (generated tokens)
            - observation: int or None (env response tokens, None for last step)

    Returns:
        dict with per-model totals:
        {model_name: {cache_read, uncached_input, completion, observation, cache_creation}}
    """
    model_caches: dict[str, int] = {}   # tokens currently cached, per model
    model_totals: dict[str, dict] = {}  # accumulated per-model breakdown

    total_context = 0      # full conversation context after the previous step
    prev_observation = 0   # observation produced by the previous step

    for i, step in enumerate(steps):
        model = step["model"]
        system_user = step.get("system_user", 0)
        completion = step.get("completion", 0)
        # None (last step has no env response) counts as zero tokens.
        observation = step.get("observation") or 0

        if model not in model_caches:
            model_caches[model] = 0
        if model not in model_totals:
            model_totals[model] = {
                "cache_read": 0,
                "uncached_input": 0,
                "completion": 0,
                "observation": 0,
                "cache_creation": 0,
            }

        # Everything this model cached on earlier steps is read back cheaply.
        cache_read = model_caches[model]

        if i == 0:
            # First step: only the system/user prompt goes in.
            uncached_input = system_user
        else:
            # Later steps must carry the whole prior context plus the last
            # observation; whatever this model already cached is subtracted.
            full_context_needed = total_context + prev_observation
            # Clamp defensively so malformed step data can never produce a
            # negative token count (with non-negative, monotonically growing
            # contexts the difference is always >= 0).
            uncached_input = max(full_context_needed - cache_read, 0)

        # New input plus the completion are written into this model's cache
        # for the next call routed to it.
        cache_creation = uncached_input + completion

        model_caches[model] = cache_read + cache_creation

        totals = model_totals[model]
        totals["cache_read"] += cache_read
        totals["uncached_input"] += uncached_input
        totals["completion"] += completion
        totals["observation"] += observation
        totals["cache_creation"] += cache_creation

        total_context = cache_read + uncached_input + completion
        prev_observation = observation

    return model_totals
215
+
216
+
217
def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
    """
    Parse trajectory file into step format for calculate_routing_tokens.

    Returns list of steps with:
        - model: base model name
        - system_user: tokens for system + user message (step 0 only)
        - completion: assistant response tokens
        - observation: env response tokens (None for last step)
    """
    with open(traj_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    messages = data.get("messages", [])
    if not messages:
        return []

    count_tokens, _ = get_tokenizer(model_name)

    steps: list[dict] = []
    system_user_tokens = 0

    # NOTE: a plain `for` loop replaces the original manual-index `while`
    # loop, which only advanced the index inside the three role branches and
    # therefore hung forever on any message with an unexpected role
    # (e.g. "tool"). Unknown roles are now simply skipped.
    for msg in messages:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        if isinstance(content, list):
            # Structured content blocks are tokenized via their JSON form.
            content = json.dumps(content)
        tokens = count_tokens(str(content))

        if role == "system":
            system_user_tokens += tokens
        elif role == "user":
            if not steps:
                # The initial user message belongs to the system/user prompt.
                system_user_tokens += tokens
            else:
                # A user message after an assistant turn is the environment
                # observation for the most recent step (last one wins if
                # several user messages follow one assistant turn).
                steps[-1]["observation"] = tokens
        elif role == "assistant":
            steps.append({
                "model": model_name,
                # Only the very first assistant step carries the prompt cost.
                "system_user": system_user_tokens if not steps else 0,
                "completion": tokens,
                "observation": None,
            })
            system_user_tokens = 0
        # Other roles contribute no tokens and are ignored.

    return steps
274
+
275
+
276
  def get_default_overhead(model_name: str) -> float:
277
  """Get default tokenizer overhead for model provider"""
278
  model_lower = model_name.lower() if model_name else ""
 
461
  return df
462
 
463
 
464
def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
    """
    Load all trajectories as step sequences for routing calculations.

    Results are memoized in the module-level _trajectory_steps_cache so the
    tokenizer-heavy parsing runs at most once per folder.

    Args:
        folder: subdirectory of TRAJS_DIR holding the trajectory files.

    Returns:
        dict mapping instance_id -> list of steps for calculate_routing_tokens
    """
    global _trajectory_steps_cache

    cache_key = f"steps_{folder}"
    if cache_key in _trajectory_steps_cache:
        return _trajectory_steps_cache[cache_key]

    output_dir = TRAJS_DIR / folder

    # Probe layouts from most to least specific; the first pattern that
    # matches anything wins (replaces five copy-pasted fallback branches).
    traj_files: list[Path] = []
    for pattern in ("*/*.traj.json", "*/*.traj", "*.traj.json", "*.traj", "*.json"):
        traj_files = list(output_dir.glob(pattern))
        if traj_files:
            break

    # The model name is read from the first trajectory's config. Best-effort
    # only: an unreadable or differently-shaped file leaves the name empty.
    model_name = ""
    if traj_files:
        try:
            with open(traj_files[0], "r", encoding="utf-8") as f:
                first_data = json.load(f)
            config = first_data.get("info", {}).get("config", {}).get("model", {})
            model_name = config.get("cost_calc_model_override", config.get("model_name", ""))
        except Exception:
            pass

    result = {}
    for traj_path in traj_files:
        try:
            # "foo.traj.json" -> stem "foo.traj" -> instance id "foo".
            instance_id = traj_path.stem.replace(".traj", "")
            steps = parse_trajectory_to_steps(traj_path, model_name)
            if steps:
                result[instance_id] = steps
        except Exception as e:
            # Keep going: one corrupt trajectory must not sink the folder.
            print(f"Error parsing steps for {traj_path}: {e}")

    _trajectory_steps_cache[cache_key] = result
    return result
511
+
512
+
513
  def get_litellm_model_list() -> list[str]:
514
  """Get list of model names from litellm prices"""
515
  prices = get_litellm_prices()
 
1639
  )
1640
  return
1641
 
1642
+ trajectory_steps = state_data.get("steps", {})
1643
+ if not trajectory_steps:
 
1644
  yield (
1645
+ gr.update(visible=True, value="❌ No trajectory steps data available."),
1646
  gr.update(visible=False),
1647
  None, None,
1648
  )
1649
  return
1650
 
 
 
 
 
 
1651
  base_prices = {
1652
  "input": base_input,
1653
  "cache_read": base_cache_read,
 
1670
  strategy_params["start"] = start_1_val
1671
  strategy_params["end"] = end_1_val
1672
 
1673
+ total_base_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
1674
+ total_routing_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
1675
+ total_original_tokens = {"uncached_input": 0, "cache_read": 0, "cache_creation": 0, "completion": 0}
1676
 
1677
+ BASE_MODEL = "__base__"
1678
+ ROUTING_MODEL = "__routing__"
 
 
1679
 
1680
+ for instance_id, steps in trajectory_steps.items():
1681
+ if not steps:
 
1682
  continue
1683
 
1684
+ total_steps = len(steps)
1685
+ routed_step_indices = get_routed_steps(total_steps, strategy_1_val, strategy_params)
1686
+
1687
+ modified_steps = []
1688
+ for i, step in enumerate(steps):
1689
+ model = ROUTING_MODEL if i in routed_step_indices else BASE_MODEL
1690
+ modified_steps.append({
1691
+ "model": model,
1692
+ "system_user": step.get("system_user", 0),
1693
+ "completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)),
1694
+ "observation": step.get("observation"),
1695
+ })
1696
+
1697
+ model_totals = calculate_routing_tokens(modified_steps)
1698
+
1699
+ base_totals = model_totals.get(BASE_MODEL, {
1700
+ "cache_read": 0, "uncached_input": 0, "completion": 0, "cache_creation": 0
1701
+ })
1702
+ routing_totals = model_totals.get(ROUTING_MODEL, {
1703
+ "cache_read": 0, "uncached_input": 0, "completion": 0, "cache_creation": 0
1704
+ })
1705
+
1706
+ total_base_tokens["cache_read"] += base_totals.get("cache_read", 0)
1707
+ total_base_tokens["uncached_input"] += base_totals.get("uncached_input", 0)
1708
+ total_base_tokens["completion"] += base_totals.get("completion", 0)
1709
+ total_base_tokens["cache_creation"] += base_totals.get("cache_creation", 0)
1710
+
1711
+ total_routing_tokens["cache_read"] += routing_totals.get("cache_read", 0)
1712
+ total_routing_tokens["uncached_input"] += routing_totals.get("uncached_input", 0)
1713
+ total_routing_tokens["completion"] += routing_totals.get("completion", 0)
1714
+ total_routing_tokens["cache_creation"] += routing_totals.get("cache_creation", 0)
1715
+
1716
+ original_steps = []
1717
+ for step in steps:
1718
+ original_steps.append({
1719
+ "model": BASE_MODEL,
1720
+ "system_user": step.get("system_user", 0),
1721
+ "completion": int(step.get("completion", 0) * (overhead if source == "Calculated" else 1)),
1722
+ "observation": step.get("observation"),
1723
+ })
1724
+ original_totals = calculate_routing_tokens(original_steps)
1725
+ orig = original_totals.get(BASE_MODEL, {})
1726
+ total_original_tokens["cache_read"] += orig.get("cache_read", 0)
1727
+ total_original_tokens["uncached_input"] += orig.get("uncached_input", 0)
1728
+ total_original_tokens["completion"] += orig.get("completion", 0)
1729
+ total_original_tokens["cache_creation"] += orig.get("cache_creation", 0)
1730
+
1731
+ def calc_cost(tokens: dict, prices: dict) -> float:
1732
+ return (
1733
+ tokens["uncached_input"] * prices["input"] / 1e6 +
1734
+ tokens["cache_read"] * prices["cache_read"] / 1e6 +
1735
+ tokens["cache_creation"] * prices["cache_creation"] / 1e6 +
1736
+ tokens["completion"] * prices["completion"] / 1e6
1737
  )
1738
+
1739
+ base_costs = {k: total_base_tokens[k] * base_prices[{"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}[k]] / 1e6 for k in total_base_tokens}
1740
+ routing_costs = {k: total_routing_tokens[k] * routing_prices[{"uncached_input": "input", "cache_read": "cache_read", "cache_creation": "cache_creation", "completion": "completion"}[k]] / 1e6 for k in total_routing_tokens}
1741
+
1742
+ total_base_cost = calc_cost(total_base_tokens, base_prices)
1743
+ total_routing_cost = calc_cost(total_routing_tokens, routing_prices)
1744
+ total_original_cost = calc_cost(total_original_tokens, base_prices)
1745
 
1746
  total_routed_cost = total_base_cost + total_routing_cost
1747
  savings = total_original_cost - total_routed_cost
 
1762
  *Routing model: {routing_model_1_val}*
1763
  """
1764
 
1765
+ additional_token_models = [(routing_model_1_val, total_routing_tokens)]
1766
  additional_cost_models = [(routing_model_1_val, routing_costs)]
1767
 
1768
  yield (
 
1772
  None,
1773
  )
1774
 
1775
+ tokens_chart = create_routed_token_chart(total_base_tokens, additional_token_models)
1776
  cost_chart = create_routed_cost_chart(base_costs, additional_cost_models)
1777
 
1778
  yield (
 
1854
  df_calc = load_all_trajectories_calculated(folder)
1855
  df_calc["api_calls"] = df_meta["api_calls"].values
1856
  df_calc["instance_cost"] = df_meta["instance_cost"].values
1857
+ trajectory_steps = load_all_trajectory_steps(folder)
1858
 
1859
+ state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}
1860
 
1861
  if source == "Metadata":
1862
  df = df_meta