IgorSlinko committed on
Commit
26e685e
·
1 Parent(s): bb3fde6

Add Resolved/Unresolved routing strategy (v0.3.18)

Browse files

- Route all steps based on trajectory resolution status
- Select model for resolved and unresolved trajectories
- Choices: Base, M1, M2, M3 (static dropdown)
- Loads resolved status from leaderboard per_instance_details

Files changed (1) hide show
  1. app.py +73 -11
app.py CHANGED
@@ -1323,7 +1323,7 @@ def build_app():
1323
  """)
1324
  trajectories_state = gr.State(None)
1325
 
1326
- gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.17`")
1327
  gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
1328
 
1329
  with gr.Row():
@@ -1491,7 +1491,7 @@ def build_app():
1491
  gr.Markdown("### 🎯 Router Strategy")
1492
 
1493
  selected_strategy = gr.Radio(
1494
- choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Replace part of trajectory"],
1495
  value="Random router",
1496
  label="",
1497
  interactive=True,
@@ -1523,6 +1523,21 @@ def build_app():
1523
  grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
1524
  grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
1525
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1526
  with gr.Column(visible=False) as part_block:
1527
  part_hint = gr.Markdown("*Ranges must not overlap*")
1528
  part_mode = gr.Radio(
@@ -1556,6 +1571,7 @@ def build_app():
1556
  show_every_k = strategy == "Every k-th step"
1557
  show_slice = strategy == "Python list slices"
1558
  show_grep = strategy == "Grep"
 
1559
  show_part = strategy == "Replace part of trajectory"
1560
  has_m2 = num_models >= 2
1561
  has_m3 = num_models >= 3
@@ -1564,6 +1580,7 @@ def build_app():
1564
  gr.update(visible=show_every_k), # every_k_block
1565
  gr.update(visible=show_slice), # slice_block
1566
  gr.update(visible=show_grep), # grep_block
 
1567
  gr.update(visible=show_part), # part_block
1568
  gr.update(visible=show_random), # random_hint
1569
  gr.update(visible=show_random), # weight_base
@@ -1582,6 +1599,7 @@ def build_app():
1582
  gr.update(visible=show_grep), # grep_model_1
1583
  gr.update(visible=show_grep and has_m2), # grep_model_2
1584
  gr.update(visible=show_grep and has_m3), # grep_model_3
 
1585
  gr.update(visible=show_part), # part_hint
1586
  gr.update(visible=show_part), # part_mode
1587
  gr.update(visible=show_part), # start_1
@@ -1596,11 +1614,12 @@ def build_app():
1596
  fn=on_strategy_change,
1597
  inputs=[selected_strategy, num_routing_models],
1598
  outputs=[
1599
- random_block, every_k_block, slice_block, grep_block, part_block,
1600
  random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
1601
  every_k_hint, k_model_1, k_model_2, k_model_3,
1602
  slice_hint, slice_model_1, slice_model_2, slice_model_3,
1603
  grep_hint, grep_model_1, grep_model_2, grep_model_3,
 
1604
  part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
1605
  ],
1606
  )
@@ -1618,19 +1637,43 @@ def build_app():
1618
  routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
1619
  routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
1620
 
1621
- def make_quick_select_fn(full_model_name):
1622
  def fn():
1623
- return gr.update(value=full_model_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1624
  return fn
1625
 
1626
  for btn, full_model in quick_btns_1:
1627
- btn.click(fn=make_quick_select_fn(full_model), outputs=[routing_model_1])
 
 
 
1628
 
1629
  for btn, full_model in quick_btns_2:
1630
- btn.click(fn=make_quick_select_fn(full_model), outputs=[routing_model_2])
 
 
 
1631
 
1632
  for btn, full_model in quick_btns_3:
1633
- btn.click(fn=make_quick_select_fn(full_model), outputs=[routing_model_3])
 
 
 
1634
 
1635
  def get_routing_prices_with_labels(model_name):
1636
  """Get all 4 prices for a routing model with found/estimated labels"""
@@ -1673,12 +1716,12 @@ def build_app():
1673
  def on_routing_model_1_select(model_name):
1674
  prices = get_routing_prices_with_labels(model_name)
1675
  show_btn = bool(model_name)
1676
- return *prices, gr.update(visible=show_btn), gr.update(interactive=show_btn)
1677
 
1678
  def on_routing_model_2_select(model_name):
1679
  prices = get_routing_prices_with_labels(model_name)
1680
  show_btn = bool(model_name)
1681
- return *prices, gr.update(visible=show_btn)
1682
 
1683
  def on_routing_model_3_select(model_name):
1684
  return get_routing_prices_with_labels(model_name)
@@ -1760,6 +1803,7 @@ def build_app():
1760
  k_1_val, k_2_val, k_3_val,
1761
  slice_1_val, slice_2_val, slice_3_val,
1762
  grep_1_val, grep_2_val, grep_3_val,
 
1763
  part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
1764
  overhead, with_cache
1765
  ):
@@ -1780,6 +1824,7 @@ def build_app():
1780
  return
1781
 
1782
  trajectory_steps = state_data.get("steps", {})
 
1783
  if not trajectory_steps:
1784
  yield (
1785
  gr.update(visible=True, value="❌ No trajectory steps data available."),
@@ -1954,6 +1999,15 @@ def build_app():
1954
  if grep_matches(content, grep_val):
1955
  step_to_model[i] = f"__routing_{j}__"
1956
 
 
 
 
 
 
 
 
 
 
1957
  elif strategy_val == "Replace part of trajectory":
1958
  for j, (start_val, end_val) in enumerate(part_ranges):
1959
  if part_mode_val == "Percentages":
@@ -2081,6 +2135,7 @@ def build_app():
2081
  k_model_1, k_model_2, k_model_3,
2082
  slice_model_1, slice_model_2, slice_model_3,
2083
  grep_model_1, grep_model_2, grep_model_3,
 
2084
  part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
2085
  thinking_overhead, use_cache,
2086
  ],
@@ -2171,7 +2226,14 @@ def build_app():
2171
  progress(0.8, desc="Reading steps")
2172
  trajectory_steps = load_all_trajectory_steps(folder)
2173
 
2174
- state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}
 
 
 
 
 
 
 
2175
 
2176
  if df_meta.empty:
2177
  progress(1, desc="No trajectories found")
 
1323
  """)
1324
  trajectories_state = gr.State(None)
1325
 
1326
+ gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.18`")
1327
  gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
1328
 
1329
  with gr.Row():
 
1491
  gr.Markdown("### 🎯 Router Strategy")
1492
 
1493
  selected_strategy = gr.Radio(
1494
+ choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Resolved/Unresolved", "Replace part of trajectory"],
1495
  value="Random router",
1496
  label="",
1497
  interactive=True,
 
1523
  grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
1524
  grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
1525
 
1526
+ with gr.Column(visible=False) as resolved_block:
1527
+ resolved_hint = gr.Markdown("*Route all steps based on trajectory resolution status*")
1528
+ resolved_model = gr.Dropdown(
1529
+ label="Model for resolved trajectories",
1530
+ choices=["Base", "M1", "M2", "M3"],
1531
+ value="Base",
1532
+ interactive=True,
1533
+ )
1534
+ unresolved_model = gr.Dropdown(
1535
+ label="Model for unresolved trajectories",
1536
+ choices=["Base", "M1", "M2", "M3"],
1537
+ value="M1",
1538
+ interactive=True,
1539
+ )
1540
+
1541
  with gr.Column(visible=False) as part_block:
1542
  part_hint = gr.Markdown("*Ranges must not overlap*")
1543
  part_mode = gr.Radio(
 
1571
  show_every_k = strategy == "Every k-th step"
1572
  show_slice = strategy == "Python list slices"
1573
  show_grep = strategy == "Grep"
1574
+ show_resolved = strategy == "Resolved/Unresolved"
1575
  show_part = strategy == "Replace part of trajectory"
1576
  has_m2 = num_models >= 2
1577
  has_m3 = num_models >= 3
 
1580
  gr.update(visible=show_every_k), # every_k_block
1581
  gr.update(visible=show_slice), # slice_block
1582
  gr.update(visible=show_grep), # grep_block
1583
+ gr.update(visible=show_resolved), # resolved_block
1584
  gr.update(visible=show_part), # part_block
1585
  gr.update(visible=show_random), # random_hint
1586
  gr.update(visible=show_random), # weight_base
 
1599
  gr.update(visible=show_grep), # grep_model_1
1600
  gr.update(visible=show_grep and has_m2), # grep_model_2
1601
  gr.update(visible=show_grep and has_m3), # grep_model_3
1602
+ gr.update(visible=show_resolved), # resolved_hint
1603
  gr.update(visible=show_part), # part_hint
1604
  gr.update(visible=show_part), # part_mode
1605
  gr.update(visible=show_part), # start_1
 
1614
  fn=on_strategy_change,
1615
  inputs=[selected_strategy, num_routing_models],
1616
  outputs=[
1617
+ random_block, every_k_block, slice_block, grep_block, resolved_block, part_block,
1618
  random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
1619
  every_k_hint, k_model_1, k_model_2, k_model_3,
1620
  slice_hint, slice_model_1, slice_model_2, slice_model_3,
1621
  grep_hint, grep_model_1, grep_model_2, grep_model_3,
1622
+ resolved_hint,
1623
  part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
1624
  ],
1625
  )
 
1637
  routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
1638
  routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
1639
 
1640
+ def make_quick_select_fn_1(full_model_name):
1641
  def fn():
1642
+ prices = get_routing_prices_with_labels(full_model_name)
1643
+ return (gr.update(value=full_model_name), *prices,
1644
+ gr.update(visible=True), gr.update(interactive=True))
1645
+ return fn
1646
+
1647
+ def make_quick_select_fn_2(full_model_name):
1648
+ def fn():
1649
+ prices = get_routing_prices_with_labels(full_model_name)
1650
+ return (gr.update(value=full_model_name), *prices,
1651
+ gr.update(visible=True))
1652
+ return fn
1653
+
1654
+ def make_quick_select_fn_3(full_model_name):
1655
+ def fn():
1656
+ prices = get_routing_prices_with_labels(full_model_name)
1657
+ return (gr.update(value=full_model_name), *prices)
1658
  return fn
1659
 
1660
  for btn, full_model in quick_btns_1:
1661
+ btn.click(
1662
+ fn=make_quick_select_fn_1(full_model),
1663
+ outputs=[routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn]
1664
+ )
1665
 
1666
  for btn, full_model in quick_btns_2:
1667
+ btn.click(
1668
+ fn=make_quick_select_fn_2(full_model),
1669
+ outputs=[routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn]
1670
+ )
1671
 
1672
  for btn, full_model in quick_btns_3:
1673
+ btn.click(
1674
+ fn=make_quick_select_fn_3(full_model),
1675
+ outputs=[routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion]
1676
+ )
1677
 
1678
  def get_routing_prices_with_labels(model_name):
1679
  """Get all 4 prices for a routing model with found/estimated labels"""
 
1716
  def on_routing_model_1_select(model_name):
1717
  prices = get_routing_prices_with_labels(model_name)
1718
  show_btn = bool(model_name)
1719
+ return (*prices, gr.update(visible=show_btn), gr.update(interactive=show_btn))
1720
 
1721
  def on_routing_model_2_select(model_name):
1722
  prices = get_routing_prices_with_labels(model_name)
1723
  show_btn = bool(model_name)
1724
+ return (*prices, gr.update(visible=show_btn))
1725
 
1726
  def on_routing_model_3_select(model_name):
1727
  return get_routing_prices_with_labels(model_name)
 
1803
  k_1_val, k_2_val, k_3_val,
1804
  slice_1_val, slice_2_val, slice_3_val,
1805
  grep_1_val, grep_2_val, grep_3_val,
1806
+ resolved_model_val, unresolved_model_val,
1807
  part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
1808
  overhead, with_cache
1809
  ):
 
1824
  return
1825
 
1826
  trajectory_steps = state_data.get("steps", {})
1827
+ resolved_instances = state_data.get("resolved", {})
1828
  if not trajectory_steps:
1829
  yield (
1830
  gr.update(visible=True, value="❌ No trajectory steps data available."),
 
1999
  if grep_matches(content, grep_val):
2000
  step_to_model[i] = f"__routing_{j}__"
2001
 
2002
+ elif strategy_val == "Resolved/Unresolved":
2003
+ is_resolved = resolved_instances.get(instance_id, False)
2004
+ target_model = resolved_model_val if is_resolved else unresolved_model_val
2005
+ if target_model and target_model != "Base":
2006
+ model_idx = {"M1": 0, "M2": 1, "M3": 2}.get(target_model)
2007
+ if model_idx is not None and model_idx < len(routing_models):
2008
+ for i in range(total_steps):
2009
+ step_to_model[i] = f"__routing_{model_idx}__"
2010
+
2011
  elif strategy_val == "Replace part of trajectory":
2012
  for j, (start_val, end_val) in enumerate(part_ranges):
2013
  if part_mode_val == "Percentages":
 
2135
  k_model_1, k_model_2, k_model_3,
2136
  slice_model_1, slice_model_2, slice_model_3,
2137
  grep_model_1, grep_model_2, grep_model_3,
2138
+ resolved_model, unresolved_model,
2139
  part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
2140
  thinking_overhead, use_cache,
2141
  ],
 
2226
  progress(0.8, desc="Reading steps")
2227
  trajectory_steps = load_all_trajectory_steps(folder)
2228
 
2229
+ model_details, _ = get_model_details(folder)
2230
+ resolved_instances = {}
2231
+ if model_details:
2232
+ per_instance = model_details.get("per_instance_details", {})
2233
+ for inst_id, details in per_instance.items():
2234
+ resolved_instances[inst_id] = details.get("resolved", False)
2235
+
2236
+ state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "resolved": resolved_instances}
2237
 
2238
  if df_meta.empty:
2239
  progress(1, desc="No trajectories found")