Commit
·
9399ab7
1
Parent(s):
723fdc8
Major improvements to token calculation and UX
Browse files
- Pre-calculate both Metadata and Calculated tokens on Load & Analyze
- Instant switching between token sources (no reload needed)
- Fix token calculation: only count user messages that have assistant response
- Provider-specific tokenizer overhead: Claude=1.24, Gemini/OpenAI=1.0
- Case-insensitive model name matching for litellm prices
- Add '% resolved' column, remove 'os_system' from leaderboard table
- Rename 'Instance' to 'Trajectory' throughout UI
- Sort bottom charts by total tokens (not cache_read)
- Make last two charts side-by-side
- Add 'Use Cache' checkbox for Calculated mode
- Fix division by zero when total_count=0
- Copy api_calls and instance_cost from metadata to calculated data
app.py
CHANGED
|
@@ -28,22 +28,33 @@ _trajectories_cache = {}
|
|
| 28 |
_calculated_tokens_cache = {}
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
def get_tokenizer(model_name: str):
|
| 32 |
"""Get appropriate tokenizer for model. Returns (tokenizer_func, name)"""
|
| 33 |
global _tokenizer_cache
|
| 34 |
-
|
| 35 |
model_lower = model_name.lower() if model_name else ""
|
| 36 |
-
|
| 37 |
-
# Determine tokenizer type
|
| 38 |
if "gpt-4o" in model_lower or "o1" in model_lower or "o3" in model_lower:
|
| 39 |
tokenizer_name = "o200k_base"
|
| 40 |
elif "gpt" in model_lower or "claude" in model_lower or "anthropic" in model_lower:
|
| 41 |
tokenizer_name = "cl100k_base"
|
| 42 |
elif "gemini" in model_lower or "google" in model_lower:
|
| 43 |
-
# Gemini uses ~3.23 chars per token (calculated from actual API responses)
|
| 44 |
return lambda text: int(len(text) / 3.23), "gemini_approx"
|
| 45 |
else:
|
| 46 |
-
# Default to cl100k_base for unknown models
|
| 47 |
tokenizer_name = "cl100k_base"
|
| 48 |
|
| 49 |
if tokenizer_name not in _tokenizer_cache:
|
|
@@ -75,8 +86,7 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
|
|
| 75 |
return {"prompt_tokens": 0, "completion_tokens": 0, "cache_read_tokens": 0, "cache_creation_tokens": 0, "api_calls": 0}
|
| 76 |
|
| 77 |
count_tokens, _ = get_tokenizer(model_name)
|
| 78 |
-
|
| 79 |
-
# Calculate tokens for each message
|
| 80 |
message_tokens = []
|
| 81 |
for msg in messages:
|
| 82 |
content = msg.get("content", "")
|
|
@@ -108,7 +118,6 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
|
|
| 108 |
context_so_far += mt["tokens"]
|
| 109 |
|
| 110 |
next_is_assistant = (i + 1 < len(message_tokens) and message_tokens[i + 1]["role"] == "assistant")
|
| 111 |
-
is_last = (i == len(message_tokens) - 1)
|
| 112 |
|
| 113 |
if next_is_assistant:
|
| 114 |
prompt_tokens += context_so_far
|
|
@@ -117,10 +126,6 @@ def calculate_tokens_from_trajectory(traj_path: Path, model_name: str) -> dict:
|
|
| 117 |
assistant_tokens = message_tokens[i + 1]["tokens"]
|
| 118 |
cache_creation_tokens += (context_so_far - cached_context) + assistant_tokens
|
| 119 |
cached_context = context_so_far + assistant_tokens
|
| 120 |
-
elif is_last:
|
| 121 |
-
prompt_tokens += context_so_far
|
| 122 |
-
cache_read_tokens += cached_context
|
| 123 |
-
cache_creation_tokens += context_so_far - cached_context
|
| 124 |
|
| 125 |
return {
|
| 126 |
"prompt_tokens": prompt_tokens,
|
|
@@ -145,6 +150,17 @@ def apply_thinking_overhead(df: pd.DataFrame, overhead: float) -> pd.DataFrame:
|
|
| 145 |
return df
|
| 146 |
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
|
| 149 |
"""Load trajectories with self-calculated token counts"""
|
| 150 |
global _calculated_tokens_cache
|
|
@@ -220,6 +236,11 @@ def get_litellm_prices() -> dict:
|
|
| 220 |
return _litellm_prices_cache
|
| 221 |
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
def get_model_prices(model_name: str) -> dict | None:
|
| 224 |
if not model_name:
|
| 225 |
return None
|
|
@@ -227,8 +248,7 @@ def get_model_prices(model_name: str) -> dict | None:
|
|
| 227 |
prices = get_litellm_prices()
|
| 228 |
|
| 229 |
clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
|
| 230 |
-
|
| 231 |
-
# Try without date suffix (e.g., gemini-3-pro-preview-20251118 -> gemini-3-pro-preview)
|
| 232 |
name_without_date = re.sub(r'-\d{8}$', '', clean_name)
|
| 233 |
|
| 234 |
candidates = [
|
|
@@ -245,9 +265,16 @@ def get_model_prices(model_name: str) -> dict | None:
|
|
| 245 |
if key in prices:
|
| 246 |
return prices[key]
|
| 247 |
|
| 248 |
-
|
|
|
|
|
|
|
| 249 |
for key, value in prices.items():
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
return value
|
| 252 |
|
| 253 |
return None
|
|
@@ -274,15 +301,21 @@ def get_bash_only_df():
|
|
| 274 |
|
| 275 |
rows = []
|
| 276 |
for r in bash_only["results"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
rows.append({
|
| 278 |
"name": r.get("name", ""),
|
|
|
|
| 279 |
"date": r.get("date", ""),
|
| 280 |
"cost": round(r.get("cost", 0), 2),
|
| 281 |
"instance_cost": round(r.get("instance_cost", 0), 4),
|
| 282 |
"instance_calls": r.get("instance_calls", 0),
|
| 283 |
"folder": r.get("folder", ""),
|
| 284 |
"os_model": "β
" if r.get("os_model") else "β",
|
| 285 |
-
"os_system": "β
" if r.get("os_system") else "β",
|
| 286 |
})
|
| 287 |
|
| 288 |
return pd.DataFrame(rows)
|
|
@@ -352,7 +385,12 @@ def download_trajectories_from_s3(folder: str, progress=gr.Progress()):
|
|
| 352 |
resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
|
| 353 |
total_count = len(per_instance)
|
| 354 |
|
| 355 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
return status, gr.update(visible=True)
|
| 357 |
|
| 358 |
except subprocess.TimeoutExpired:
|
|
@@ -520,32 +558,34 @@ def create_token_charts(df: pd.DataFrame, input_price: float, cache_read_price:
|
|
| 520 |
|
| 521 |
fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
|
| 522 |
|
| 523 |
-
# Stacked bar chart
|
| 524 |
-
df_sorted = df.
|
| 525 |
-
df_sorted["instance_idx"] = range(len(df_sorted))
|
| 526 |
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
|
|
|
|
|
|
|
|
|
|
| 527 |
|
| 528 |
fig_stacked = go.Figure()
|
| 529 |
fig_stacked.add_trace(go.Bar(
|
| 530 |
-
name="Uncached Input", x=df_sorted["
|
| 531 |
-
marker_color="#EF553B", hovertemplate="
|
| 532 |
))
|
| 533 |
fig_stacked.add_trace(go.Bar(
|
| 534 |
-
name="Cache Read", x=df_sorted["
|
| 535 |
-
marker_color="#19D3F3", hovertemplate="
|
| 536 |
))
|
| 537 |
fig_stacked.add_trace(go.Bar(
|
| 538 |
-
name="Cache Creation", x=df_sorted["
|
| 539 |
-
marker_color="#FFA15A", hovertemplate="
|
| 540 |
))
|
| 541 |
fig_stacked.add_trace(go.Bar(
|
| 542 |
-
name="Completion", x=df_sorted["
|
| 543 |
-
marker_color="#AB63FA", hovertemplate="
|
| 544 |
))
|
| 545 |
fig_stacked.update_layout(
|
| 546 |
barmode="stack",
|
| 547 |
-
title="
|
| 548 |
-
xaxis_title="
|
| 549 |
yaxis_title="Tokens",
|
| 550 |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 551 |
margin=dict(l=50, r=20, t=60, b=40),
|
|
@@ -562,12 +602,12 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
|
|
| 562 |
df,
|
| 563 |
x="api_calls",
|
| 564 |
nbins=30,
|
| 565 |
-
title="Distribution of API Calls (Steps) per
|
| 566 |
color_discrete_sequence=["#636EFA"],
|
| 567 |
)
|
| 568 |
fig_steps.update_layout(
|
| 569 |
xaxis_title="API Calls (Steps)",
|
| 570 |
-
yaxis_title="Number of
|
| 571 |
showlegend=False,
|
| 572 |
margin=dict(l=40, r=20, t=40, b=40),
|
| 573 |
)
|
|
@@ -587,7 +627,7 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
|
|
| 587 |
)
|
| 588 |
fig_cost.update_layout(
|
| 589 |
xaxis_title="Cost ($)",
|
| 590 |
-
yaxis_title="Number of
|
| 591 |
showlegend=False,
|
| 592 |
margin=dict(l=40, r=20, t=40, b=40),
|
| 593 |
)
|
|
@@ -601,7 +641,7 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
|
|
| 601 |
total_completion = df["completion_tokens"].sum()
|
| 602 |
total_cache_read = df["cache_read_tokens"].sum()
|
| 603 |
total_cache_creation = df["cache_creation_tokens"].sum()
|
| 604 |
-
# Uncached input = prompt - cache_read - cache_creation (per
|
| 605 |
df_temp = df.copy()
|
| 606 |
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
|
| 607 |
total_uncached_input = df_temp["uncached_input"].sum()
|
|
@@ -637,49 +677,51 @@ def create_basic_histograms(df: pd.DataFrame, input_price: float, cache_read_pri
|
|
| 637 |
# Cost by token type (use separate function)
|
| 638 |
fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
|
| 639 |
|
| 640 |
-
|
| 641 |
-
df_sorted
|
| 642 |
-
# Uncached input = prompt - cache_read - cache_creation
|
| 643 |
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
|
|
|
|
|
|
|
|
|
|
| 644 |
|
| 645 |
fig_stacked = go.Figure()
|
| 646 |
|
| 647 |
fig_stacked.add_trace(go.Bar(
|
| 648 |
name="Uncached Input",
|
| 649 |
-
x=df_sorted["
|
| 650 |
y=df_sorted["uncached_input_tokens"],
|
| 651 |
marker_color="#EF553B",
|
| 652 |
-
hovertemplate="
|
| 653 |
))
|
| 654 |
|
| 655 |
fig_stacked.add_trace(go.Bar(
|
| 656 |
name="Cache Read",
|
| 657 |
-
x=df_sorted["
|
| 658 |
y=df_sorted["cache_read_tokens"],
|
| 659 |
marker_color="#19D3F3",
|
| 660 |
-
hovertemplate="
|
| 661 |
))
|
| 662 |
|
| 663 |
fig_stacked.add_trace(go.Bar(
|
| 664 |
name="Cache Creation",
|
| 665 |
-
x=df_sorted["
|
| 666 |
y=df_sorted["cache_creation_tokens"],
|
| 667 |
marker_color="#FFA15A",
|
| 668 |
-
hovertemplate="
|
| 669 |
))
|
| 670 |
|
| 671 |
fig_stacked.add_trace(go.Bar(
|
| 672 |
name="Completion",
|
| 673 |
-
x=df_sorted["
|
| 674 |
y=df_sorted["completion_tokens"],
|
| 675 |
marker_color="#AB63FA",
|
| 676 |
-
hovertemplate="
|
| 677 |
))
|
| 678 |
|
| 679 |
fig_stacked.update_layout(
|
| 680 |
barmode="stack",
|
| 681 |
-
title="
|
| 682 |
-
xaxis_title="
|
| 683 |
yaxis_title="Tokens",
|
| 684 |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 685 |
margin=dict(l=50, r=20, t=60, b=40),
|
|
@@ -692,11 +734,12 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
|
|
| 692 |
if df.empty:
|
| 693 |
return None
|
| 694 |
|
| 695 |
-
|
| 696 |
-
df_sorted
|
| 697 |
-
|
| 698 |
-
# Uncached input = prompt - cache_read - cache_creation
|
| 699 |
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
|
|
|
|
|
|
|
|
|
|
| 700 |
|
| 701 |
df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6
|
| 702 |
df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6
|
|
@@ -707,34 +750,34 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
|
|
| 707 |
|
| 708 |
fig.add_trace(go.Bar(
|
| 709 |
name=f"Uncached Input (${input_price:.2f}/1M)",
|
| 710 |
-
x=df_sorted["
|
| 711 |
y=df_sorted["cost_uncached_input"],
|
| 712 |
marker_color="#EF553B",
|
| 713 |
-
hovertemplate="
|
| 714 |
))
|
| 715 |
|
| 716 |
fig.add_trace(go.Bar(
|
| 717 |
name=f"Cache Read (${cache_read_price:.2f}/1M)",
|
| 718 |
-
x=df_sorted["
|
| 719 |
y=df_sorted["cost_cache_read"],
|
| 720 |
marker_color="#19D3F3",
|
| 721 |
-
hovertemplate="
|
| 722 |
))
|
| 723 |
|
| 724 |
fig.add_trace(go.Bar(
|
| 725 |
name=f"Cache Creation (${cache_creation_price:.2f}/1M)",
|
| 726 |
-
x=df_sorted["
|
| 727 |
y=df_sorted["cost_cache_creation"],
|
| 728 |
marker_color="#FFA15A",
|
| 729 |
-
hovertemplate="
|
| 730 |
))
|
| 731 |
|
| 732 |
fig.add_trace(go.Bar(
|
| 733 |
name=f"Completion (${completion_price:.2f}/1M)",
|
| 734 |
-
x=df_sorted["
|
| 735 |
y=df_sorted["cost_completion"],
|
| 736 |
marker_color="#AB63FA",
|
| 737 |
-
hovertemplate="
|
| 738 |
))
|
| 739 |
|
| 740 |
total_cost = (
|
|
@@ -746,8 +789,8 @@ def create_cost_breakdown(df: pd.DataFrame, input_price: float, cache_read_price
|
|
| 746 |
|
| 747 |
fig.update_layout(
|
| 748 |
barmode="stack",
|
| 749 |
-
title="Cost
|
| 750 |
-
xaxis_title="
|
| 751 |
yaxis_title="Cost ($)",
|
| 752 |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 753 |
margin=dict(l=50, r=20, t=60, b=40),
|
|
@@ -829,14 +872,15 @@ def get_prices_for_folder(folder: str) -> tuple[dict, str]:
|
|
| 829 |
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
|
| 830 |
if evt.index is None:
|
| 831 |
return (
|
| 832 |
-
"", "",
|
| 833 |
-
gr.update(interactive=False),
|
| 834 |
-
gr.update(visible=False),
|
| 835 |
gr.update(value=0, label="π² Input"),
|
| 836 |
gr.update(value=0, label="π² Cache Read"),
|
| 837 |
gr.update(value=0, label="π² Cache Creation"),
|
| 838 |
gr.update(value=0, label="π² Completion"),
|
| 839 |
-
""
|
|
|
|
| 840 |
)
|
| 841 |
|
| 842 |
row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
|
|
@@ -847,6 +891,7 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
|
|
| 847 |
show_analyze = check_trajectories_downloaded(folder)
|
| 848 |
|
| 849 |
prices_dict, model_hint = get_prices_for_folder(folder)
|
|
|
|
| 850 |
|
| 851 |
def price_update(price_info, name):
|
| 852 |
value = price_info["value"]
|
|
@@ -858,14 +903,15 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
|
|
| 858 |
return gr.update(value=0, label=f"β {name}")
|
| 859 |
|
| 860 |
return (
|
| 861 |
-
folder, name,
|
| 862 |
-
gr.update(interactive=True),
|
| 863 |
-
gr.update(visible=show_analyze),
|
| 864 |
price_update(prices_dict["input"], "Input"),
|
| 865 |
price_update(prices_dict["cache_read"], "Cache Read"),
|
| 866 |
price_update(prices_dict["cache_creation"], "Cache Creation"),
|
| 867 |
price_update(prices_dict["completion"], "Completion"),
|
| 868 |
-
model_hint
|
|
|
|
| 869 |
)
|
| 870 |
|
| 871 |
|
|
@@ -899,10 +945,8 @@ def build_app():
|
|
| 899 |
plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
|
| 900 |
|
| 901 |
with gr.Row():
|
| 902 |
-
plot_stacked = gr.Plot(label="
|
| 903 |
-
|
| 904 |
-
with gr.Row():
|
| 905 |
-
plot_cost_breakdown = gr.Plot(label="Cost Breakdown per Instance ($)")
|
| 906 |
|
| 907 |
with gr.Column(scale=1):
|
| 908 |
selected_folder = gr.State("")
|
|
@@ -935,20 +979,27 @@ def build_app():
|
|
| 935 |
info="Multiplier for Calculated tokens (tiktoken β native)",
|
| 936 |
visible=False,
|
| 937 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 938 |
|
| 939 |
-
def
|
| 940 |
-
|
|
|
|
| 941 |
|
| 942 |
token_source.change(
|
| 943 |
-
fn=
|
| 944 |
inputs=[token_source],
|
| 945 |
-
outputs=[thinking_overhead],
|
| 946 |
)
|
| 947 |
|
| 948 |
leaderboard_table.select(
|
| 949 |
fn=on_row_select,
|
| 950 |
inputs=[leaderboard_table],
|
| 951 |
-
outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model],
|
| 952 |
)
|
| 953 |
|
| 954 |
download_btn.click(
|
|
@@ -957,7 +1008,7 @@ def build_app():
|
|
| 957 |
outputs=[download_status, analyze_btn],
|
| 958 |
)
|
| 959 |
|
| 960 |
-
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead):
|
| 961 |
empty_result = (
|
| 962 |
gr.update(visible=False),
|
| 963 |
None, None, None, None, None, None,
|
|
@@ -974,11 +1025,19 @@ def build_app():
|
|
| 974 |
None,
|
| 975 |
)
|
| 976 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 977 |
if source == "Metadata":
|
| 978 |
-
df =
|
| 979 |
else:
|
| 980 |
-
df =
|
| 981 |
-
|
|
|
|
| 982 |
|
| 983 |
if df.empty:
|
| 984 |
yield empty_result
|
|
@@ -992,12 +1051,12 @@ def build_app():
|
|
| 992 |
yield (
|
| 993 |
gr.update(visible=True),
|
| 994 |
fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
|
| 995 |
-
|
| 996 |
)
|
| 997 |
|
| 998 |
analyze_btn.click(
|
| 999 |
fn=load_and_analyze,
|
| 1000 |
-
inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead],
|
| 1001 |
outputs=[
|
| 1002 |
analysis_section,
|
| 1003 |
plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
|
|
@@ -1005,14 +1064,25 @@ def build_app():
|
|
| 1005 |
],
|
| 1006 |
)
|
| 1007 |
|
| 1008 |
-
def recalculate_costs(
|
| 1009 |
-
if
|
| 1010 |
return None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1011 |
fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
|
| 1012 |
fig_cost_breakdown = create_cost_breakdown(df, input_price, cache_read_price, cache_creation_price, completion_price)
|
| 1013 |
return fig_tokens_cost, fig_cost_breakdown
|
| 1014 |
|
| 1015 |
-
price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion]
|
| 1016 |
price_outputs = [plot_tokens_cost, plot_cost_breakdown]
|
| 1017 |
|
| 1018 |
price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
|
@@ -1020,16 +1090,17 @@ def build_app():
|
|
| 1020 |
price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
| 1021 |
price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
| 1022 |
|
| 1023 |
-
def on_source_change(
|
| 1024 |
"""Recalculate only token-dependent charts when source changes"""
|
| 1025 |
-
if
|
| 1026 |
return None, None, None, None
|
| 1027 |
|
| 1028 |
if source == "Metadata":
|
| 1029 |
-
df =
|
| 1030 |
else:
|
| 1031 |
-
df =
|
| 1032 |
-
|
|
|
|
| 1033 |
|
| 1034 |
if df.empty:
|
| 1035 |
return None, None, None, None
|
|
@@ -1041,16 +1112,25 @@ def build_app():
|
|
| 1041 |
|
| 1042 |
return fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown
|
| 1043 |
|
|
|
|
|
|
|
|
|
|
| 1044 |
token_source.change(
|
| 1045 |
fn=on_source_change,
|
| 1046 |
-
inputs=
|
| 1047 |
-
outputs=
|
| 1048 |
)
|
| 1049 |
|
| 1050 |
thinking_overhead.change(
|
| 1051 |
fn=on_source_change,
|
| 1052 |
-
inputs=
|
| 1053 |
-
outputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1054 |
)
|
| 1055 |
|
| 1056 |
return app
|
|
|
|
| 28 |
_calculated_tokens_cache = {}
|
| 29 |
|
| 30 |
|
| 31 |
+
def get_default_overhead(model_name: str) -> float:
    """Return the default tokenizer-overhead multiplier for a model's provider.

    The provider is guessed from case-insensitive substrings of ``model_name``.
    Provider groups are checked in order and the first match wins. A falsy
    name (``None`` or ``""``) or a name matching no group yields ``1.0``.
    """
    lowered = (model_name or "").lower()

    # Ordered (substring markers, multiplier) pairs; order matters because the
    # first group with any marker present in the name decides the result.
    provider_overheads = (
        (("claude", "anthropic"), 1.24),
        (("gemini", "google"), 1.0),
        (("gpt", "openai", "o1", "o3"), 1.0),
    )
    for markers, overhead in provider_overheads:
        if any(marker in lowered for marker in markers):
            return overhead

    # Unknown provider: no overhead adjustment.
    return 1.0
|
| 43 |
+
|
| 44 |
+
|
| 45 |
def get_tokenizer(model_name: str):
|
| 46 |
"""Get appropriate tokenizer for model. Returns (tokenizer_func, name)"""
|
| 47 |
global _tokenizer_cache
|
| 48 |
+
|
| 49 |
model_lower = model_name.lower() if model_name else ""
|
| 50 |
+
|
|
|
|
| 51 |
if "gpt-4o" in model_lower or "o1" in model_lower or "o3" in model_lower:
|
| 52 |
tokenizer_name = "o200k_base"
|
| 53 |
elif "gpt" in model_lower or "claude" in model_lower or "anthropic" in model_lower:
|
| 54 |
tokenizer_name = "cl100k_base"
|
| 55 |
elif "gemini" in model_lower or "google" in model_lower:
|
|
|
|
| 56 |
return lambda text: int(len(text) / 3.23), "gemini_approx"
|
| 57 |
else:
|
|
|
|
| 58 |
tokenizer_name = "cl100k_base"
|
| 59 |
|
| 60 |
if tokenizer_name not in _tokenizer_cache:
|
|
|
|
| 86 |
return {"prompt_tokens": 0, "completion_tokens": 0, "cache_read_tokens": 0, "cache_creation_tokens": 0, "api_calls": 0}
|
| 87 |
|
| 88 |
count_tokens, _ = get_tokenizer(model_name)
|
| 89 |
+
|
|
|
|
| 90 |
message_tokens = []
|
| 91 |
for msg in messages:
|
| 92 |
content = msg.get("content", "")
|
|
|
|
| 118 |
context_so_far += mt["tokens"]
|
| 119 |
|
| 120 |
next_is_assistant = (i + 1 < len(message_tokens) and message_tokens[i + 1]["role"] == "assistant")
|
|
|
|
| 121 |
|
| 122 |
if next_is_assistant:
|
| 123 |
prompt_tokens += context_so_far
|
|
|
|
| 126 |
assistant_tokens = message_tokens[i + 1]["tokens"]
|
| 127 |
cache_creation_tokens += (context_so_far - cached_context) + assistant_tokens
|
| 128 |
cached_context = context_so_far + assistant_tokens
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
return {
|
| 131 |
"prompt_tokens": prompt_tokens,
|
|
|
|
| 150 |
return df
|
| 151 |
|
| 152 |
|
| 153 |
+
def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
    """Return the token table with caching disabled.

    For a non-empty frame, a copy is returned whose ``cache_read_tokens`` and
    ``cache_creation_tokens`` columns are set to 0; the input frame is left
    unmodified. An empty frame is returned as-is (same object, no copy).
    """
    if not df.empty:
        # Work on a copy so the caller's frame keeps its cache columns.
        df = df.copy()
        for column in ("cache_read_tokens", "cache_creation_tokens"):
            df[column] = 0
    return df
|
| 162 |
+
|
| 163 |
+
|
| 164 |
def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
|
| 165 |
"""Load trajectories with self-calculated token counts"""
|
| 166 |
global _calculated_tokens_cache
|
|
|
|
| 236 |
return _litellm_prices_cache
|
| 237 |
|
| 238 |
|
| 239 |
+
def normalize_model_name(name: str) -> str:
    """Return ``name`` lowercased with every '-', '_', '.' and '/' removed.

    The result is meant for loose comparison of model names that differ only
    in letter case or separator characters.
    """
    # Deletion table: each listed separator character maps to nothing.
    separators = str.maketrans("", "", "-_./")
    return name.lower().translate(separators)
|
| 242 |
+
|
| 243 |
+
|
| 244 |
def get_model_prices(model_name: str) -> dict | None:
|
| 245 |
if not model_name:
|
| 246 |
return None
|
|
|
|
| 248 |
prices = get_litellm_prices()
|
| 249 |
|
| 250 |
clean_name = model_name.replace("anthropic/", "").replace("openai/", "")
|
| 251 |
+
|
|
|
|
| 252 |
name_without_date = re.sub(r'-\d{8}$', '', clean_name)
|
| 253 |
|
| 254 |
candidates = [
|
|
|
|
| 265 |
if key in prices:
|
| 266 |
return prices[key]
|
| 267 |
|
| 268 |
+
normalized_name = normalize_model_name(clean_name)
|
| 269 |
+
normalized_no_date = normalize_model_name(name_without_date)
|
| 270 |
+
|
| 271 |
for key, value in prices.items():
|
| 272 |
+
key_normalized = normalize_model_name(key)
|
| 273 |
+
if normalized_name in key_normalized or normalized_no_date in key_normalized:
|
| 274 |
+
return value
|
| 275 |
+
key_last_part = key.split('/')[-1] if '/' in key else key
|
| 276 |
+
key_last_normalized = normalize_model_name(key_last_part)
|
| 277 |
+
if normalized_name == key_last_normalized or normalized_no_date == key_last_normalized:
|
| 278 |
return value
|
| 279 |
|
| 280 |
return None
|
|
|
|
| 301 |
|
| 302 |
rows = []
|
| 303 |
for r in bash_only["results"]:
|
| 304 |
+
resolved_pct = r.get("resolved", 0)
|
| 305 |
+
if isinstance(resolved_pct, (int, float)):
|
| 306 |
+
resolved_str = f"{resolved_pct:.1f}%"
|
| 307 |
+
else:
|
| 308 |
+
resolved_str = str(resolved_pct)
|
| 309 |
+
|
| 310 |
rows.append({
|
| 311 |
"name": r.get("name", ""),
|
| 312 |
+
"% resolved": resolved_str,
|
| 313 |
"date": r.get("date", ""),
|
| 314 |
"cost": round(r.get("cost", 0), 2),
|
| 315 |
"instance_cost": round(r.get("instance_cost", 0), 4),
|
| 316 |
"instance_calls": r.get("instance_calls", 0),
|
| 317 |
"folder": r.get("folder", ""),
|
| 318 |
"os_model": "β
" if r.get("os_model") else "β",
|
|
|
|
| 319 |
})
|
| 320 |
|
| 321 |
return pd.DataFrame(rows)
|
|
|
|
| 385 |
resolved_count = sum(1 for v in per_instance.values() if v.get("resolved"))
|
| 386 |
total_count = len(per_instance)
|
| 387 |
|
| 388 |
+
if total_count > 0:
|
| 389 |
+
resolved_pct = f"{100*resolved_count/total_count:.1f}%"
|
| 390 |
+
else:
|
| 391 |
+
resolved_pct = "N/A"
|
| 392 |
+
|
| 393 |
+
status = f"β
Downloaded to {output_dir}\n\n{file_count} trajectory files\nResolved: {resolved_count}/{total_count} ({resolved_pct})"
|
| 394 |
return status, gr.update(visible=True)
|
| 395 |
|
| 396 |
except subprocess.TimeoutExpired:
|
|
|
|
| 558 |
|
| 559 |
fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
|
| 560 |
|
| 561 |
+
# Stacked bar chart - sort by total tokens (sum of all stacked)
|
| 562 |
+
df_sorted = df.copy()
|
|
|
|
| 563 |
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
|
| 564 |
+
df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
|
| 565 |
+
df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
|
| 566 |
+
df_sorted["trajectory_idx"] = range(len(df_sorted))
|
| 567 |
|
| 568 |
fig_stacked = go.Figure()
|
| 569 |
fig_stacked.add_trace(go.Bar(
|
| 570 |
+
name="Uncached Input", x=df_sorted["trajectory_idx"], y=df_sorted["uncached_input_tokens"],
|
| 571 |
+
marker_color="#EF553B", hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
|
| 572 |
))
|
| 573 |
fig_stacked.add_trace(go.Bar(
|
| 574 |
+
name="Cache Read", x=df_sorted["trajectory_idx"], y=df_sorted["cache_read_tokens"],
|
| 575 |
+
marker_color="#19D3F3", hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
|
| 576 |
))
|
| 577 |
fig_stacked.add_trace(go.Bar(
|
| 578 |
+
name="Cache Creation", x=df_sorted["trajectory_idx"], y=df_sorted["cache_creation_tokens"],
|
| 579 |
+
marker_color="#FFA15A", hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
|
| 580 |
))
|
| 581 |
fig_stacked.add_trace(go.Bar(
|
| 582 |
+
name="Completion", x=df_sorted["trajectory_idx"], y=df_sorted["completion_tokens"],
|
| 583 |
+
marker_color="#AB63FA", hovertemplate="Trajectory: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
|
| 584 |
))
|
| 585 |
fig_stacked.update_layout(
|
| 586 |
barmode="stack",
|
| 587 |
+
title="Tokens per Trajectory (stacked)",
|
| 588 |
+
xaxis_title="Trajectory (sorted by total tokens)",
|
| 589 |
yaxis_title="Tokens",
|
| 590 |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 591 |
margin=dict(l=50, r=20, t=60, b=40),
|
|
|
|
| 602 |
df,
|
| 603 |
x="api_calls",
|
| 604 |
nbins=30,
|
| 605 |
+
title="Distribution of API Calls (Steps) per Trajectory",
|
| 606 |
color_discrete_sequence=["#636EFA"],
|
| 607 |
)
|
| 608 |
fig_steps.update_layout(
|
| 609 |
xaxis_title="API Calls (Steps)",
|
| 610 |
+
yaxis_title="Number of Trajectories",
|
| 611 |
showlegend=False,
|
| 612 |
margin=dict(l=40, r=20, t=40, b=40),
|
| 613 |
)
|
|
|
|
| 627 |
)
|
| 628 |
fig_cost.update_layout(
|
| 629 |
xaxis_title="Cost ($)",
|
| 630 |
+
yaxis_title="Number of Trajectories",
|
| 631 |
showlegend=False,
|
| 632 |
margin=dict(l=40, r=20, t=40, b=40),
|
| 633 |
)
|
|
|
|
| 641 |
total_completion = df["completion_tokens"].sum()
|
| 642 |
total_cache_read = df["cache_read_tokens"].sum()
|
| 643 |
total_cache_creation = df["cache_creation_tokens"].sum()
|
| 644 |
+
# Uncached input = prompt - cache_read - cache_creation (per trajectory, then sum)
|
| 645 |
df_temp = df.copy()
|
| 646 |
df_temp["uncached_input"] = (df_temp["prompt_tokens"] - df_temp["cache_read_tokens"] - df_temp["cache_creation_tokens"]).clip(lower=0)
|
| 647 |
total_uncached_input = df_temp["uncached_input"].sum()
|
|
|
|
| 677 |
# Cost by token type (use separate function)
|
| 678 |
fig_tokens_cost = create_cost_by_type_chart(df, input_price, cache_read_price, cache_creation_price, completion_price)
|
| 679 |
|
| 680 |
+
# Sort by total tokens (sum of all stacked)
|
| 681 |
+
df_sorted = df.copy()
|
|
|
|
| 682 |
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
|
| 683 |
+
df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
|
| 684 |
+
df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
|
| 685 |
+
df_sorted["trajectory_idx"] = range(len(df_sorted))
|
| 686 |
|
| 687 |
fig_stacked = go.Figure()
|
| 688 |
|
| 689 |
fig_stacked.add_trace(go.Bar(
|
| 690 |
name="Uncached Input",
|
| 691 |
+
x=df_sorted["trajectory_idx"],
|
| 692 |
y=df_sorted["uncached_input_tokens"],
|
| 693 |
marker_color="#EF553B",
|
| 694 |
+
hovertemplate="Trajectory: %{x}<br>Uncached Input: %{y:,.0f}<extra></extra>",
|
| 695 |
))
|
| 696 |
|
| 697 |
fig_stacked.add_trace(go.Bar(
|
| 698 |
name="Cache Read",
|
| 699 |
+
x=df_sorted["trajectory_idx"],
|
| 700 |
y=df_sorted["cache_read_tokens"],
|
| 701 |
marker_color="#19D3F3",
|
| 702 |
+
hovertemplate="Trajectory: %{x}<br>Cache Read: %{y:,.0f}<extra></extra>",
|
| 703 |
))
|
| 704 |
|
| 705 |
fig_stacked.add_trace(go.Bar(
|
| 706 |
name="Cache Creation",
|
| 707 |
+
x=df_sorted["trajectory_idx"],
|
| 708 |
y=df_sorted["cache_creation_tokens"],
|
| 709 |
marker_color="#FFA15A",
|
| 710 |
+
hovertemplate="Trajectory: %{x}<br>Cache Creation: %{y:,.0f}<extra></extra>",
|
| 711 |
))
|
| 712 |
|
| 713 |
fig_stacked.add_trace(go.Bar(
|
| 714 |
name="Completion",
|
| 715 |
+
x=df_sorted["trajectory_idx"],
|
| 716 |
y=df_sorted["completion_tokens"],
|
| 717 |
marker_color="#AB63FA",
|
| 718 |
+
hovertemplate="Trajectory: %{x}<br>Completion: %{y:,.0f}<extra></extra>",
|
| 719 |
))
|
| 720 |
|
| 721 |
fig_stacked.update_layout(
|
| 722 |
barmode="stack",
|
| 723 |
+
title="Tokens per Trajectory (stacked)",
|
| 724 |
+
xaxis_title="Trajectory (sorted by total tokens)",
|
| 725 |
yaxis_title="Tokens",
|
| 726 |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 727 |
margin=dict(l=50, r=20, t=60, b=40),
|
|
|
|
| 734 |
if df.empty:
|
| 735 |
return None
|
| 736 |
|
| 737 |
+
# Sort by total tokens (sum of all stacked)
|
| 738 |
+
df_sorted = df.copy()
|
|
|
|
|
|
|
| 739 |
df_sorted["uncached_input_tokens"] = (df_sorted["prompt_tokens"] - df_sorted["cache_read_tokens"] - df_sorted["cache_creation_tokens"]).clip(lower=0)
|
| 740 |
+
df_sorted["total_stacked"] = df_sorted["uncached_input_tokens"] + df_sorted["cache_read_tokens"] + df_sorted["cache_creation_tokens"] + df_sorted["completion_tokens"]
|
| 741 |
+
df_sorted = df_sorted.sort_values("total_stacked", ascending=False).reset_index(drop=True)
|
| 742 |
+
df_sorted["trajectory_idx"] = range(len(df_sorted))
|
| 743 |
|
| 744 |
df_sorted["cost_uncached_input"] = df_sorted["uncached_input_tokens"] * input_price / 1e6
|
| 745 |
df_sorted["cost_cache_read"] = df_sorted["cache_read_tokens"] * cache_read_price / 1e6
|
|
|
|
| 750 |
|
| 751 |
fig.add_trace(go.Bar(
|
| 752 |
name=f"Uncached Input (${input_price:.2f}/1M)",
|
| 753 |
+
x=df_sorted["trajectory_idx"],
|
| 754 |
y=df_sorted["cost_uncached_input"],
|
| 755 |
marker_color="#EF553B",
|
| 756 |
+
hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
|
| 757 |
))
|
| 758 |
|
| 759 |
fig.add_trace(go.Bar(
|
| 760 |
name=f"Cache Read (${cache_read_price:.2f}/1M)",
|
| 761 |
+
x=df_sorted["trajectory_idx"],
|
| 762 |
y=df_sorted["cost_cache_read"],
|
| 763 |
marker_color="#19D3F3",
|
| 764 |
+
hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
|
| 765 |
))
|
| 766 |
|
| 767 |
fig.add_trace(go.Bar(
|
| 768 |
name=f"Cache Creation (${cache_creation_price:.2f}/1M)",
|
| 769 |
+
x=df_sorted["trajectory_idx"],
|
| 770 |
y=df_sorted["cost_cache_creation"],
|
| 771 |
marker_color="#FFA15A",
|
| 772 |
+
hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
|
| 773 |
))
|
| 774 |
|
| 775 |
fig.add_trace(go.Bar(
|
| 776 |
name=f"Completion (${completion_price:.2f}/1M)",
|
| 777 |
+
x=df_sorted["trajectory_idx"],
|
| 778 |
y=df_sorted["cost_completion"],
|
| 779 |
marker_color="#AB63FA",
|
| 780 |
+
hovertemplate="Trajectory: %{x}<br>Cost: $%{y:.4f}<extra></extra>",
|
| 781 |
))
|
| 782 |
|
| 783 |
total_cost = (
|
|
|
|
| 789 |
|
| 790 |
fig.update_layout(
|
| 791 |
barmode="stack",
|
| 792 |
+
title="Cost per Trajectory",
|
| 793 |
+
xaxis_title="Trajectory (sorted by total tokens)",
|
| 794 |
yaxis_title="Cost ($)",
|
| 795 |
legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
|
| 796 |
margin=dict(l=50, r=20, t=60, b=40),
|
|
|
|
| 872 |
def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
|
| 873 |
if evt.index is None:
|
| 874 |
return (
|
| 875 |
+
"", "",
|
| 876 |
+
gr.update(interactive=False),
|
| 877 |
+
gr.update(visible=False),
|
| 878 |
gr.update(value=0, label="π² Input"),
|
| 879 |
gr.update(value=0, label="π² Cache Read"),
|
| 880 |
gr.update(value=0, label="π² Cache Creation"),
|
| 881 |
gr.update(value=0, label="π² Completion"),
|
| 882 |
+
"",
|
| 883 |
+
gr.update(value=1.0),
|
| 884 |
)
|
| 885 |
|
| 886 |
row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
|
|
|
|
| 891 |
show_analyze = check_trajectories_downloaded(folder)
|
| 892 |
|
| 893 |
prices_dict, model_hint = get_prices_for_folder(folder)
|
| 894 |
+
default_overhead = get_default_overhead(model_hint)
|
| 895 |
|
| 896 |
def price_update(price_info, name):
|
| 897 |
value = price_info["value"]
|
|
|
|
| 903 |
return gr.update(value=0, label=f"β {name}")
|
| 904 |
|
| 905 |
return (
|
| 906 |
+
folder, name,
|
| 907 |
+
gr.update(interactive=True),
|
| 908 |
+
gr.update(visible=show_analyze),
|
| 909 |
price_update(prices_dict["input"], "Input"),
|
| 910 |
price_update(prices_dict["cache_read"], "Cache Read"),
|
| 911 |
price_update(prices_dict["cache_creation"], "Cache Creation"),
|
| 912 |
price_update(prices_dict["completion"], "Completion"),
|
| 913 |
+
model_hint,
|
| 914 |
+
gr.update(value=default_overhead),
|
| 915 |
)
|
| 916 |
|
| 917 |
|
|
|
|
| 945 |
plot_tokens_cost = gr.Plot(label="Cost by Token Type ($)")
|
| 946 |
|
| 947 |
with gr.Row():
|
| 948 |
+
plot_stacked = gr.Plot(label="Tokens per Trajectory")
|
| 949 |
+
plot_cost_breakdown = gr.Plot(label="Cost per Trajectory ($)")
|
|
|
|
|
|
|
| 950 |
|
| 951 |
with gr.Column(scale=1):
|
| 952 |
selected_folder = gr.State("")
|
|
|
|
| 979 |
info="Multiplier for Calculated tokens (tiktoken β native)",
|
| 980 |
visible=False,
|
| 981 |
)
|
| 982 |
+
use_cache = gr.Checkbox(
|
| 983 |
+
label="Use Cache",
|
| 984 |
+
value=True,
|
| 985 |
+
info="If disabled, all tokens are Uncached Input or Completion",
|
| 986 |
+
visible=False,
|
| 987 |
+
)
|
| 988 |
|
| 989 |
+
def update_calculated_options_visibility(source):
    """Show the Calculated-only controls only for the "Calculated" source.

    Wired to ``token_source.change`` with ``thinking_overhead`` and
    ``use_cache`` as outputs, so the returned pair is applied to those two
    components in that order.

    Args:
        source: Current value of the token-source selector.

    Returns:
        Two ``gr.update`` objects whose ``visible`` flag is True exactly
        when ``source == "Calculated"``.
    """
    show = source == "Calculated"
    overhead_update = gr.update(visible=show)
    cache_update = gr.update(visible=show)
    return overhead_update, cache_update
|
| 992 |
|
| 993 |
token_source.change(
|
| 994 |
+
fn=update_calculated_options_visibility,
|
| 995 |
inputs=[token_source],
|
| 996 |
+
outputs=[thinking_overhead, use_cache],
|
| 997 |
)
|
| 998 |
|
| 999 |
leaderboard_table.select(
|
| 1000 |
fn=on_row_select,
|
| 1001 |
inputs=[leaderboard_table],
|
| 1002 |
+
outputs=[selected_folder, selected_name, download_btn, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
|
| 1003 |
)
|
| 1004 |
|
| 1005 |
download_btn.click(
|
|
|
|
| 1008 |
outputs=[download_status, analyze_btn],
|
| 1009 |
)
|
| 1010 |
|
| 1011 |
+
def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
|
| 1012 |
empty_result = (
|
| 1013 |
gr.update(visible=False),
|
| 1014 |
None, None, None, None, None, None,
|
|
|
|
| 1025 |
None,
|
| 1026 |
)
|
| 1027 |
|
| 1028 |
+
df_meta = load_all_trajectories(folder)
|
| 1029 |
+
df_calc = load_all_trajectories_calculated(folder)
|
| 1030 |
+
df_calc["api_calls"] = df_meta["api_calls"].values
|
| 1031 |
+
df_calc["instance_cost"] = df_meta["instance_cost"].values
|
| 1032 |
+
|
| 1033 |
+
state_data = {"meta": df_meta, "calculated": df_calc}
|
| 1034 |
+
|
| 1035 |
if source == "Metadata":
|
| 1036 |
+
df = df_meta
|
| 1037 |
else:
|
| 1038 |
+
df = apply_thinking_overhead(df_calc.copy(), overhead)
|
| 1039 |
+
if not with_cache:
|
| 1040 |
+
df = apply_no_cache(df)
|
| 1041 |
|
| 1042 |
if df.empty:
|
| 1043 |
yield empty_result
|
|
|
|
| 1051 |
yield (
|
| 1052 |
gr.update(visible=True),
|
| 1053 |
fig_steps, fig_cost, fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown,
|
| 1054 |
+
state_data,
|
| 1055 |
)
|
| 1056 |
|
| 1057 |
analyze_btn.click(
|
| 1058 |
fn=load_and_analyze,
|
| 1059 |
+
inputs=[selected_folder, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache],
|
| 1060 |
outputs=[
|
| 1061 |
analysis_section,
|
| 1062 |
plot_steps, plot_cost, plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown,
|
|
|
|
| 1064 |
],
|
| 1065 |
)
|
| 1066 |
|
| 1067 |
+
def recalculate_costs(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
    """Rebuild the two price-dependent charts from data already held in state.

    Registered as the ``.change`` handler of the price fields, so it can run
    while a field is being edited.

    Args:
        state_data: Dict holding the ``"meta"`` and ``"calculated"``
            DataFrames produced by ``load_and_analyze``, or None when nothing
            has been loaded yet.
        input_price: Price for uncached input tokens ($ per 1M tokens).
        cache_read_price: Price for cache-read tokens ($ per 1M tokens).
        cache_creation_price: Price for cache-creation tokens ($ per 1M tokens).
        completion_price: Price for completion tokens ($ per 1M tokens).
        source: ``"Metadata"`` selects the metadata frame; any other value
            selects the calculated frame.
        overhead: Multiplier handed to ``apply_thinking_overhead`` for the
            calculated frame.
        with_cache: When falsy, the calculated frame is additionally passed
            through ``apply_no_cache``.

    Returns:
        ``(fig_tokens_cost, fig_cost_breakdown)``, or ``(None, None)`` when
        there is no state or the selected frame is empty.
    """
    if state_data is None:
        return None, None

    if source == "Metadata":
        df = state_data["meta"]
    else:
        # Operate on a copy so nothing done by the helpers can touch the
        # frame kept in state; it is reused on every subsequent change event.
        df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
        if not with_cache:
            df = apply_no_cache(df)

    if df.empty:
        return None, None

    # NOTE(review): a cleared price field presumably arrives as None - confirm
    # against the component type. The chart builders multiply by the price, so
    # None would raise TypeError; fall back to 0, the same placeholder the row
    # selection handler uses for an unknown price.
    prices = [
        0 if price is None else price
        for price in (input_price, cache_read_price, cache_creation_price, completion_price)
    ]

    fig_tokens_cost = create_cost_by_type_chart(df, *prices)
    fig_cost_breakdown = create_cost_breakdown(df, *prices)
    return fig_tokens_cost, fig_cost_breakdown
|
| 1084 |
|
| 1085 |
+
price_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
|
| 1086 |
price_outputs = [plot_tokens_cost, plot_cost_breakdown]
|
| 1087 |
|
| 1088 |
price_input.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
|
|
|
| 1090 |
price_cache_creation.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
| 1091 |
price_completion.change(fn=recalculate_costs, inputs=price_inputs, outputs=price_outputs)
|
| 1092 |
|
| 1093 |
+
def on_source_change(state_data, input_price, cache_read_price, cache_creation_price, completion_price, source, overhead, with_cache):
|
| 1094 |
"""Recalculate only token-dependent charts when source changes"""
|
| 1095 |
+
if state_data is None:
|
| 1096 |
return None, None, None, None
|
| 1097 |
|
| 1098 |
if source == "Metadata":
|
| 1099 |
+
df = state_data["meta"]
|
| 1100 |
else:
|
| 1101 |
+
df = apply_thinking_overhead(state_data["calculated"].copy(), overhead)
|
| 1102 |
+
if not with_cache:
|
| 1103 |
+
df = apply_no_cache(df)
|
| 1104 |
|
| 1105 |
if df.empty:
|
| 1106 |
return None, None, None, None
|
|
|
|
| 1112 |
|
| 1113 |
return fig_tokens, fig_tokens_cost, fig_stacked, fig_cost_breakdown
|
| 1114 |
|
| 1115 |
+
source_change_inputs = [trajectories_state, price_input, price_cache_read, price_cache_creation, price_completion, token_source, thinking_overhead, use_cache]
|
| 1116 |
+
source_change_outputs = [plot_tokens, plot_tokens_cost, plot_stacked, plot_cost_breakdown]
|
| 1117 |
+
|
| 1118 |
token_source.change(
|
| 1119 |
fn=on_source_change,
|
| 1120 |
+
inputs=source_change_inputs,
|
| 1121 |
+
outputs=source_change_outputs,
|
| 1122 |
)
|
| 1123 |
|
| 1124 |
thinking_overhead.change(
|
| 1125 |
fn=on_source_change,
|
| 1126 |
+
inputs=source_change_inputs,
|
| 1127 |
+
outputs=source_change_outputs,
|
| 1128 |
+
)
|
| 1129 |
+
|
| 1130 |
+
use_cache.change(
|
| 1131 |
+
fn=on_source_change,
|
| 1132 |
+
inputs=source_change_inputs,
|
| 1133 |
+
outputs=source_change_outputs,
|
| 1134 |
)
|
| 1135 |
|
| 1136 |
return app
|