IgorSlinko committed
Commit bca0945 · 1 Parent(s): 745add3

v0.3.14: Add logging, support alternative trajectory format, fix token columns

- Add logging setup with file handler (logs/app.log) and console output
- Log all parsing errors with full traceback (exc_info=True)
- Add unhandled exception hook to capture crashes
- Add logs/ to .gitignore
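
  A minimal sketch of the resulting logging behavior (log path and format string as in the diff below; the failing parse is hypothetical):

    import json
    import logging
    from pathlib import Path

    Path("logs").mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.FileHandler("logs/app.log", encoding="utf-8")],
    )

    try:
        json.loads("{not valid json")  # hypothetical failing parse
    except Exception as e:
        # exc_info=True appends the full traceback to logs/app.log
        logging.error("Error parsing %s: %s", "demo.json", e, exc_info=True)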

- Support alternative trajectory format (Llama 4 Scout style)
- Detect 'trajectory' array format vs 'messages' format
- New _parse_trajectory_format_to_steps() for trajectory format
- Extract api_calls from trajectory length when model_stats missing
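
  For reference, a minimal sketch of the two record shapes the parser now distinguishes (field names taken from the parsing code in the diff; real files may carry extra keys, and the instance ids here are hypothetical):

    # Existing format: top-level "messages" list.
    messages_style = {
        "instance_id": "example__repo-123",
        "messages": [
            {"role": "system", "content": "..."},
            {"role": "assistant", "content": "..."},
        ],
    }

    # Alternative format (Llama 4 Scout style): top-level "trajectory" array,
    # one entry per API call, so api_calls can fall back to len(trajectory)
    # when model_stats is missing.
    trajectory_style = {
        "instance_id": "example__repo-123",
        "trajectory": [
            {
                "query": [{"role": "system", "content": "..."}],
                "response": "...",
                "observation": "...",
            },
        ],
    }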

- Add ensure_token_columns() to guarantee token columns exist
- Apply ensure_token_columns in load_all_trajectories, load_all_trajectories_calculated
- Fix KeyError: 'completion_tokens' for models with missing token data
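
  A minimal sketch of the guard, assuming a frame that lacks the completion/cache columns (this inlines the same normalization ensure_token_columns applies in the app.py diff below):

    import pandas as pd

    df = pd.DataFrame([{"instance_id": "demo", "prompt_tokens": 1200}])

    for col in ["prompt_tokens", "completion_tokens",
                "cache_read_tokens", "cache_creation_tokens"]:
        if col not in df.columns:
            df[col] = 0
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

    print(df["completion_tokens"].sum())  # 0 instead of KeyError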

Files changed (2):
  1. .gitignore  +1 -0
  2. app.py  +157 -15
.gitignore CHANGED
@@ -3,6 +3,7 @@
 __pycache__/
 *.pyc
 data/
+logs/
 .DS_Store
 
 
app.py CHANGED
@@ -1,8 +1,10 @@
 import json
+import logging
 import os
 import random
 import re
 import subprocess
+import sys
 from pathlib import Path
 
 import gradio as gr
@@ -23,6 +25,29 @@ LEADERBOARD_CACHE = DATA_DIR / "swebench_leaderboard_latest.json"
 LITELLM_PRICES_CACHE = DATA_DIR / "litellm_prices.json"
 S3_BUCKET = "s3://swe-bench-experiments/bash-only"
 LITELLM_PRICES_URL = "https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json"
+LOG_DIR = Path("logs")
+LOG_DIR.mkdir(parents=True, exist_ok=True)
+LOG_FILE = LOG_DIR / "app.log"
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[
+        logging.FileHandler(LOG_FILE, encoding="utf-8"),
+        logging.StreamHandler(sys.stdout),
+    ],
+    force=True,
+)
+
+
+def _log_unhandled(exc_type, exc_value, exc_traceback):
+    if issubclass(exc_type, KeyboardInterrupt):
+        sys.__excepthook__(exc_type, exc_value, exc_traceback)
+        return
+    logging.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))
+
+
+sys.excepthook = _log_unhandled
 
 _litellm_prices_cache = None
 _trajectories_cache = {}
@@ -106,6 +131,11 @@ def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
         data = json.load(f)
 
     messages = data.get("messages", [])
+    trajectory_data = data.get("trajectory", [])
+
+    if not messages and trajectory_data:
+        return _parse_trajectory_format_to_steps(trajectory_data, model_name)
+
     if not messages:
         return []
 
@@ -151,6 +181,40 @@ def parse_trajectory_to_steps(traj_path: Path, model_name: str) -> list[dict]:
     return steps
 
 
+def _parse_trajectory_format_to_steps(trajectory_data: list, model_name: str) -> list[dict]:
+    """
+    Parse alternative trajectory format (with "trajectory" array) into steps.
+    """
+    count_tokens, _ = get_tokenizer(model_name)
+
+    steps = []
+    for i, traj_step in enumerate(trajectory_data):
+        query = traj_step.get("query", [])
+        response_text = traj_step.get("response", "")
+        observation_text = traj_step.get("observation", "")
+
+        system_user_tokens = 0
+        if i == 0:
+            for q in query:
+                content = q.get("content", "")
+                if isinstance(content, list):
+                    content = json.dumps(content)
+                system_user_tokens += count_tokens(str(content))
+
+        completion_tokens = count_tokens(str(response_text)) if response_text else 0
+        observation_tokens = count_tokens(str(observation_text)) if observation_text else None
+
+        step = {
+            "model": model_name,
+            "system_user": system_user_tokens,
+            "completion": completion_tokens,
+            "observation": observation_tokens,
+        }
+        steps.append(step)
+
+    return steps
+
+
 def get_default_overhead(model_name: str) -> float:
     """Get default tokenizer overhead for model provider"""
     model_lower = model_name.lower() if model_name else ""
@@ -212,13 +276,33 @@ def apply_no_cache(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
+def ensure_token_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Ensure token-related columns exist and are numeric."""
+    if df is None or df.empty:
+        return df
+    df = df.copy()
+    required = [
+        "prompt_tokens",
+        "completion_tokens",
+        "cache_read_tokens",
+        "cache_creation_tokens",
+    ]
+    for col in required:
+        if col not in df.columns:
+            df[col] = 0
+        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)
+    if "total_tokens" in df.columns:
+        df["total_tokens"] = pd.to_numeric(df["total_tokens"], errors="coerce").fillna(0).astype(int)
+    return df
+
+
 def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
     """Load trajectories with self-calculated token counts using calculate_routing_tokens"""
     global _calculated_tokens_cache
 
     cache_key = f"calculated_{folder}"
     if cache_key in _calculated_tokens_cache:
-        return _calculated_tokens_cache[cache_key]
+        return ensure_token_columns(_calculated_tokens_cache[cache_key])
 
     trajectory_steps = load_all_trajectory_steps(folder)
 
@@ -251,9 +335,9 @@ def load_all_trajectories_calculated(folder: str) -> pd.DataFrame:
                 "cache_creation_tokens": cache_creation,
             })
         except Exception as e:
-            print(f"Error calculating tokens for {instance_id}: {e}")
+            logging.error("Error calculating tokens for %s: %s", instance_id, e, exc_info=True)
 
-    df = pd.DataFrame(rows)
+    df = ensure_token_columns(pd.DataFrame(rows))
     _calculated_tokens_cache[cache_key] = df
     return df
 
@@ -301,7 +385,7 @@ def load_all_trajectory_steps(folder: str) -> dict[str, list[dict]]:
             if steps:
                 result[instance_id] = steps
         except Exception as e:
-            print(f"Error parsing steps for {traj_path}: {e}")
+            logging.error("Error parsing steps for %s: %s", traj_path, e, exc_info=True)
 
     _trajectory_steps_cache[cache_key] = result
     return result
@@ -519,10 +603,29 @@ def parse_trajectory(traj_path: Path) -> dict:
     model_config = config.get("model", {})
     model_name = model_config.get("cost_calc_model_override", model_config.get("model_name", ""))
 
+    trajectory_steps = data.get("trajectory", [])
+    is_trajectory_format = len(trajectory_steps) > 0 and "messages" not in data
+
+    if is_trajectory_format and not model_name:
+        for step in trajectory_steps:
+            query = step.get("query", [])
+            for q in query:
+                if q.get("role") == "system":
+                    content = q.get("content", "")
+                    if "llama" in content.lower() or "meta" in content.lower():
+                        model_name = "llama"
+                        break
+            if model_name:
+                break
+
+    api_calls = model_stats.get("api_calls", 0)
+    if api_calls == 0 and is_trajectory_format:
+        api_calls = len(trajectory_steps)
+
     result = {
         "instance_id": data.get("instance_id", traj_path.stem),
         "model_name": model_name,
-        "api_calls": model_stats.get("api_calls", 0),
+        "api_calls": api_calls,
         "instance_cost": model_stats.get("instance_cost", 0),
         "prompt_tokens": 0,
         "completion_tokens": 0,
@@ -555,7 +658,7 @@ def load_all_trajectories(folder: str) -> pd.DataFrame:
     global _trajectories_cache
 
     if folder in _trajectories_cache:
-        return _trajectories_cache[folder]
+        return ensure_token_columns(_trajectories_cache[folder])
 
     output_dir = TRAJS_DIR / folder
 
@@ -574,9 +677,9 @@ def load_all_trajectories(folder: str) -> pd.DataFrame:
         try:
             rows.append(parse_trajectory(traj_path))
         except Exception as e:
-            print(f"Error parsing {traj_path}: {e}")
+            logging.error("Error parsing %s: %s", traj_path, e, exc_info=True)
 
-    df = pd.DataFrame(rows)
+    df = ensure_token_columns(pd.DataFrame(rows))
     _trajectories_cache[folder] = df
     return df
 
@@ -981,8 +1084,8 @@ def get_prices_for_folder(folder: str) -> tuple[dict, str]:
     return result, model_hint
 
 
-def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
-    if evt.index is None:
+def _build_selection_payload(row_idx: int | None, df: pd.DataFrame):
+    if df is None or df.empty or row_idx is None:
         return (
             "", "",
             gr.update(visible=False),
@@ -994,7 +1097,6 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
             gr.update(value=1.0),
         )
 
-    row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
     row = df.iloc[row_idx]
    folder = row["folder"]
    name = row["name"]
@@ -1023,6 +1125,18 @@ def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
     )
 
 
+def on_row_select(evt: gr.SelectData, df: pd.DataFrame):
+    row_idx = None
+    if evt is not None and evt.index is not None:
+        row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
+    return _build_selection_payload(row_idx, df)
+
+
+def select_first_row(df: pd.DataFrame):
+    default_idx = 0 if df is not None and not df.empty else None
+    return _build_selection_payload(default_idx, df)
+
+
 def create_routed_token_chart(base_tokens: dict, additional_models: list):
     """
     Create grouped bar chart for tokens by type, comparing base vs additional models.
@@ -1155,8 +1269,8 @@ def build_app():
     with gr.Blocks(title="SWE-bench Routing Cost Calculator") as app:
         trajectories_state = gr.State(None)
 
-        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.9`")
-        gr.Markdown("Select a model to use as base for cost analysis")
+        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.14`")
+        gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
 
         with gr.Row():
             with gr.Column(scale=3):
@@ -1165,6 +1279,7 @@ def build_app():
                     label="Bash-Only Leaderboard",
                     interactive=False,
                     wrap=True,
+                    elem_id="leaderboard-table",
                 )
 
             with gr.Column(visible=False) as analysis_section:
@@ -1777,7 +1892,23 @@ def build_app():
             outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
         )
 
+        app.load(
+            fn=select_first_row,
+            inputs=[leaderboard_table],
+            outputs=[selected_folder, selected_name, analyze_btn, price_input, price_cache_read, price_cache_creation, price_completion, detected_model, thinking_overhead],
+            js="""
+            (data) => {
+                const row = gradioApp()?.querySelector('#leaderboard-table table tbody tr');
+                if (row) {
+                    row.click();
+                }
+                return data;
+            }
+            """,
+        )
+
         def load_and_analyze(folder, input_price, cache_read_price, cache_creation_price, completion_price, overhead, with_cache, progress=gr.Progress()):
+            progress(0, desc="Ready")
             empty_result = (
                 "",
                 gr.update(visible=False),
@@ -1789,10 +1920,12 @@ def build_app():
             )
 
             if not folder:
+                progress(1, desc="No folder selected")
                 yield empty_result
                 return
 
             if not check_trajectories_downloaded(folder):
+                progress(0.1, desc="Preparing download")
                 yield (
                     "⏳ Downloading trajectories...",
                     gr.update(visible=False),
@@ -1802,8 +1935,10 @@ def build_app():
                     None,
                     gr.update(visible=False),
                 )
+                progress(0.3, desc="Downloading")
                 status, _ = download_trajectories_from_s3(folder)
                 if "❌" in status:
+                    progress(1, desc="Download failed")
                     yield (
                         status,
                         gr.update(visible=False),
@@ -1814,6 +1949,7 @@ def build_app():
                         gr.update(visible=False),
                     )
                     return
+                progress(0.45, desc="Loading trajectories")
 
             yield (
                 "⏳ Loading trajectories...",
@@ -1825,15 +1961,19 @@ def build_app():
                 gr.update(visible=False),
             )
 
-            df_meta = load_all_trajectories(folder)
-            df_calc = load_all_trajectories_calculated(folder)
+            progress(0.6, desc="Reading metadata")
+            df_meta = ensure_token_columns(load_all_trajectories(folder))
+            progress(0.7, desc="Reading calculated")
+            df_calc = ensure_token_columns(load_all_trajectories_calculated(folder))
             df_calc["api_calls"] = df_meta["api_calls"].values
             df_calc["instance_cost"] = df_meta["instance_cost"].values
+            progress(0.8, desc="Reading steps")
             trajectory_steps = load_all_trajectory_steps(folder)
 
             state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}
 
             if df_meta.empty:
+                progress(1, desc="No trajectories found")
                 yield (
                     "❌ No trajectories found",
                     gr.update(visible=False),
@@ -1845,6 +1985,7 @@ def build_app():
                 )
                 return
 
+            progress(0.9, desc="Building charts")
            fig_steps, fig_cost, _, _, _ = create_basic_histograms(
                df_meta, input_price, cache_read_price, cache_creation_price, completion_price
            )
@@ -1867,6 +2008,7 @@ def build_app():
                 df_calc_processed, input_price, cache_read_price, cache_creation_price, completion_price
             )
 
+            progress(1, desc="Done")
             yield (
                 f"✅ Loaded {len(df_meta)} trajectories",
                 gr.update(visible=True),
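
Usage note: with both loaders now routed through ensure_token_columns, a quick smoke check would be (folder name hypothetical, assuming it contains parseable trajectories):

    df_meta = load_all_trajectories("example_folder")
    df_calc = load_all_trajectories_calculated("example_folder")
    for col in ("prompt_tokens", "completion_tokens",
                "cache_read_tokens", "cache_creation_tokens"):
        assert col in df_meta.columns and col in df_calc.columns

The new app.load hook preselects the first leaderboard row on page load: select_first_row builds the selection payload server-side, while the js callback clicks the first #leaderboard-table row so the visible table selection matches.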