Commit
·
26e685e
1
Parent(s):
bb3fde6
Add Resolved/Unresolved routing strategy (v0.3.18)
Browse files
- Route all steps based on trajectory resolution status
- Select model for resolved and unresolved trajectories
- Choices: Base, M1, M2, M3 (static dropdown)
- Loads resolved status from leaderboard per_instance_details
app.py
CHANGED
|
@@ -1323,7 +1323,7 @@ def build_app():
|
|
| 1323 |
""")
|
| 1324 |
trajectories_state = gr.State(None)
|
| 1325 |
|
| 1326 |
-
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.
|
| 1327 |
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
|
| 1328 |
|
| 1329 |
with gr.Row():
|
|
@@ -1491,7 +1491,7 @@ def build_app():
|
|
| 1491 |
gr.Markdown("### 🎯 Router Strategy")
|
| 1492 |
|
| 1493 |
selected_strategy = gr.Radio(
|
| 1494 |
-
choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Replace part of trajectory"],
|
| 1495 |
value="Random router",
|
| 1496 |
label="",
|
| 1497 |
interactive=True,
|
|
@@ -1523,6 +1523,21 @@ def build_app():
|
|
| 1523 |
grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
|
| 1524 |
grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
|
| 1525 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1526 |
with gr.Column(visible=False) as part_block:
|
| 1527 |
part_hint = gr.Markdown("*Ranges must not overlap*")
|
| 1528 |
part_mode = gr.Radio(
|
|
@@ -1556,6 +1571,7 @@ def build_app():
|
|
| 1556 |
show_every_k = strategy == "Every k-th step"
|
| 1557 |
show_slice = strategy == "Python list slices"
|
| 1558 |
show_grep = strategy == "Grep"
|
|
|
|
| 1559 |
show_part = strategy == "Replace part of trajectory"
|
| 1560 |
has_m2 = num_models >= 2
|
| 1561 |
has_m3 = num_models >= 3
|
|
@@ -1564,6 +1580,7 @@ def build_app():
|
|
| 1564 |
gr.update(visible=show_every_k), # every_k_block
|
| 1565 |
gr.update(visible=show_slice), # slice_block
|
| 1566 |
gr.update(visible=show_grep), # grep_block
|
|
|
|
| 1567 |
gr.update(visible=show_part), # part_block
|
| 1568 |
gr.update(visible=show_random), # random_hint
|
| 1569 |
gr.update(visible=show_random), # weight_base
|
|
@@ -1582,6 +1599,7 @@ def build_app():
|
|
| 1582 |
gr.update(visible=show_grep), # grep_model_1
|
| 1583 |
gr.update(visible=show_grep and has_m2), # grep_model_2
|
| 1584 |
gr.update(visible=show_grep and has_m3), # grep_model_3
|
|
|
|
| 1585 |
gr.update(visible=show_part), # part_hint
|
| 1586 |
gr.update(visible=show_part), # part_mode
|
| 1587 |
gr.update(visible=show_part), # start_1
|
|
@@ -1596,11 +1614,12 @@ def build_app():
|
|
| 1596 |
fn=on_strategy_change,
|
| 1597 |
inputs=[selected_strategy, num_routing_models],
|
| 1598 |
outputs=[
|
| 1599 |
-
random_block, every_k_block, slice_block, grep_block, part_block,
|
| 1600 |
random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
|
| 1601 |
every_k_hint, k_model_1, k_model_2, k_model_3,
|
| 1602 |
slice_hint, slice_model_1, slice_model_2, slice_model_3,
|
| 1603 |
grep_hint, grep_model_1, grep_model_2, grep_model_3,
|
|
|
|
| 1604 |
part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
|
| 1605 |
],
|
| 1606 |
)
|
|
@@ -1618,19 +1637,43 @@ def build_app():
|
|
| 1618 |
routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
|
| 1619 |
routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
|
| 1620 |
|
| 1621 |
-
def
|
| 1622 |
def fn():
|
| 1623 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1624 |
return fn
|
| 1625 |
|
| 1626 |
for btn, full_model in quick_btns_1:
|
| 1627 |
-
btn.click(
|
|
|
|
|
|
|
|
|
|
| 1628 |
|
| 1629 |
for btn, full_model in quick_btns_2:
|
| 1630 |
-
btn.click(
|
|
|
|
|
|
|
|
|
|
| 1631 |
|
| 1632 |
for btn, full_model in quick_btns_3:
|
| 1633 |
-
btn.click(
|
|
|
|
|
|
|
|
|
|
| 1634 |
|
| 1635 |
def get_routing_prices_with_labels(model_name):
|
| 1636 |
"""Get all 4 prices for a routing model with found/estimated labels"""
|
|
@@ -1673,12 +1716,12 @@ def build_app():
|
|
| 1673 |
def on_routing_model_1_select(model_name):
|
| 1674 |
prices = get_routing_prices_with_labels(model_name)
|
| 1675 |
show_btn = bool(model_name)
|
| 1676 |
-
return *prices, gr.update(visible=show_btn), gr.update(interactive=show_btn)
|
| 1677 |
|
| 1678 |
def on_routing_model_2_select(model_name):
|
| 1679 |
prices = get_routing_prices_with_labels(model_name)
|
| 1680 |
show_btn = bool(model_name)
|
| 1681 |
-
return *prices, gr.update(visible=show_btn)
|
| 1682 |
|
| 1683 |
def on_routing_model_3_select(model_name):
|
| 1684 |
return get_routing_prices_with_labels(model_name)
|
|
@@ -1760,6 +1803,7 @@ def build_app():
|
|
| 1760 |
k_1_val, k_2_val, k_3_val,
|
| 1761 |
slice_1_val, slice_2_val, slice_3_val,
|
| 1762 |
grep_1_val, grep_2_val, grep_3_val,
|
|
|
|
| 1763 |
part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
|
| 1764 |
overhead, with_cache
|
| 1765 |
):
|
|
@@ -1780,6 +1824,7 @@ def build_app():
|
|
| 1780 |
return
|
| 1781 |
|
| 1782 |
trajectory_steps = state_data.get("steps", {})
|
|
|
|
| 1783 |
if not trajectory_steps:
|
| 1784 |
yield (
|
| 1785 |
gr.update(visible=True, value="❌ No trajectory steps data available."),
|
|
@@ -1954,6 +1999,15 @@ def build_app():
|
|
| 1954 |
if grep_matches(content, grep_val):
|
| 1955 |
step_to_model[i] = f"__routing_{j}__"
|
| 1956 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1957 |
elif strategy_val == "Replace part of trajectory":
|
| 1958 |
for j, (start_val, end_val) in enumerate(part_ranges):
|
| 1959 |
if part_mode_val == "Percentages":
|
|
@@ -2081,6 +2135,7 @@ def build_app():
|
|
| 2081 |
k_model_1, k_model_2, k_model_3,
|
| 2082 |
slice_model_1, slice_model_2, slice_model_3,
|
| 2083 |
grep_model_1, grep_model_2, grep_model_3,
|
|
|
|
| 2084 |
part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
|
| 2085 |
thinking_overhead, use_cache,
|
| 2086 |
],
|
|
@@ -2171,7 +2226,14 @@ def build_app():
|
|
| 2171 |
progress(0.8, desc="Reading steps")
|
| 2172 |
trajectory_steps = load_all_trajectory_steps(folder)
|
| 2173 |
|
| 2174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2175 |
|
| 2176 |
if df_meta.empty:
|
| 2177 |
progress(1, desc="No trajectories found")
|
|
|
|
| 1323 |
""")
|
| 1324 |
trajectories_state = gr.State(None)
|
| 1325 |
|
| 1326 |
+
gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.18`")
|
| 1327 |
gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
|
| 1328 |
|
| 1329 |
with gr.Row():
|
|
|
|
| 1491 |
gr.Markdown("### 🎯 Router Strategy")
|
| 1492 |
|
| 1493 |
selected_strategy = gr.Radio(
|
| 1494 |
+
choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Resolved/Unresolved", "Replace part of trajectory"],
|
| 1495 |
value="Random router",
|
| 1496 |
label="",
|
| 1497 |
interactive=True,
|
|
|
|
| 1523 |
grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
|
| 1524 |
grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
|
| 1525 |
|
| 1526 |
+
with gr.Column(visible=False) as resolved_block:
|
| 1527 |
+
resolved_hint = gr.Markdown("*Route all steps based on trajectory resolution status*")
|
| 1528 |
+
resolved_model = gr.Dropdown(
|
| 1529 |
+
label="Model for resolved trajectories",
|
| 1530 |
+
choices=["Base", "M1", "M2", "M3"],
|
| 1531 |
+
value="Base",
|
| 1532 |
+
interactive=True,
|
| 1533 |
+
)
|
| 1534 |
+
unresolved_model = gr.Dropdown(
|
| 1535 |
+
label="Model for unresolved trajectories",
|
| 1536 |
+
choices=["Base", "M1", "M2", "M3"],
|
| 1537 |
+
value="M1",
|
| 1538 |
+
interactive=True,
|
| 1539 |
+
)
|
| 1540 |
+
|
| 1541 |
with gr.Column(visible=False) as part_block:
|
| 1542 |
part_hint = gr.Markdown("*Ranges must not overlap*")
|
| 1543 |
part_mode = gr.Radio(
|
|
|
|
| 1571 |
show_every_k = strategy == "Every k-th step"
|
| 1572 |
show_slice = strategy == "Python list slices"
|
| 1573 |
show_grep = strategy == "Grep"
|
| 1574 |
+
show_resolved = strategy == "Resolved/Unresolved"
|
| 1575 |
show_part = strategy == "Replace part of trajectory"
|
| 1576 |
has_m2 = num_models >= 2
|
| 1577 |
has_m3 = num_models >= 3
|
|
|
|
| 1580 |
gr.update(visible=show_every_k), # every_k_block
|
| 1581 |
gr.update(visible=show_slice), # slice_block
|
| 1582 |
gr.update(visible=show_grep), # grep_block
|
| 1583 |
+
gr.update(visible=show_resolved), # resolved_block
|
| 1584 |
gr.update(visible=show_part), # part_block
|
| 1585 |
gr.update(visible=show_random), # random_hint
|
| 1586 |
gr.update(visible=show_random), # weight_base
|
|
|
|
| 1599 |
gr.update(visible=show_grep), # grep_model_1
|
| 1600 |
gr.update(visible=show_grep and has_m2), # grep_model_2
|
| 1601 |
gr.update(visible=show_grep and has_m3), # grep_model_3
|
| 1602 |
+
gr.update(visible=show_resolved), # resolved_hint
|
| 1603 |
gr.update(visible=show_part), # part_hint
|
| 1604 |
gr.update(visible=show_part), # part_mode
|
| 1605 |
gr.update(visible=show_part), # start_1
|
|
|
|
| 1614 |
fn=on_strategy_change,
|
| 1615 |
inputs=[selected_strategy, num_routing_models],
|
| 1616 |
outputs=[
|
| 1617 |
+
random_block, every_k_block, slice_block, grep_block, resolved_block, part_block,
|
| 1618 |
random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
|
| 1619 |
every_k_hint, k_model_1, k_model_2, k_model_3,
|
| 1620 |
slice_hint, slice_model_1, slice_model_2, slice_model_3,
|
| 1621 |
grep_hint, grep_model_1, grep_model_2, grep_model_3,
|
| 1622 |
+
resolved_hint,
|
| 1623 |
part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
|
| 1624 |
],
|
| 1625 |
)
|
|
|
|
| 1637 |
routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
|
| 1638 |
routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
|
| 1639 |
|
| 1640 |
+
def make_quick_select_fn_1(full_model_name):
|
| 1641 |
def fn():
|
| 1642 |
+
prices = get_routing_prices_with_labels(full_model_name)
|
| 1643 |
+
return (gr.update(value=full_model_name), *prices,
|
| 1644 |
+
gr.update(visible=True), gr.update(interactive=True))
|
| 1645 |
+
return fn
|
| 1646 |
+
|
| 1647 |
+
def make_quick_select_fn_2(full_model_name):
|
| 1648 |
+
def fn():
|
| 1649 |
+
prices = get_routing_prices_with_labels(full_model_name)
|
| 1650 |
+
return (gr.update(value=full_model_name), *prices,
|
| 1651 |
+
gr.update(visible=True))
|
| 1652 |
+
return fn
|
| 1653 |
+
|
| 1654 |
+
def make_quick_select_fn_3(full_model_name):
|
| 1655 |
+
def fn():
|
| 1656 |
+
prices = get_routing_prices_with_labels(full_model_name)
|
| 1657 |
+
return (gr.update(value=full_model_name), *prices)
|
| 1658 |
return fn
|
| 1659 |
|
| 1660 |
for btn, full_model in quick_btns_1:
|
| 1661 |
+
btn.click(
|
| 1662 |
+
fn=make_quick_select_fn_1(full_model),
|
| 1663 |
+
outputs=[routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn]
|
| 1664 |
+
)
|
| 1665 |
|
| 1666 |
for btn, full_model in quick_btns_2:
|
| 1667 |
+
btn.click(
|
| 1668 |
+
fn=make_quick_select_fn_2(full_model),
|
| 1669 |
+
outputs=[routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn]
|
| 1670 |
+
)
|
| 1671 |
|
| 1672 |
for btn, full_model in quick_btns_3:
|
| 1673 |
+
btn.click(
|
| 1674 |
+
fn=make_quick_select_fn_3(full_model),
|
| 1675 |
+
outputs=[routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion]
|
| 1676 |
+
)
|
| 1677 |
|
| 1678 |
def get_routing_prices_with_labels(model_name):
|
| 1679 |
"""Get all 4 prices for a routing model with found/estimated labels"""
|
|
|
|
| 1716 |
def on_routing_model_1_select(model_name):
|
| 1717 |
prices = get_routing_prices_with_labels(model_name)
|
| 1718 |
show_btn = bool(model_name)
|
| 1719 |
+
return (*prices, gr.update(visible=show_btn), gr.update(interactive=show_btn))
|
| 1720 |
|
| 1721 |
def on_routing_model_2_select(model_name):
|
| 1722 |
prices = get_routing_prices_with_labels(model_name)
|
| 1723 |
show_btn = bool(model_name)
|
| 1724 |
+
return (*prices, gr.update(visible=show_btn))
|
| 1725 |
|
| 1726 |
def on_routing_model_3_select(model_name):
|
| 1727 |
return get_routing_prices_with_labels(model_name)
|
|
|
|
| 1803 |
k_1_val, k_2_val, k_3_val,
|
| 1804 |
slice_1_val, slice_2_val, slice_3_val,
|
| 1805 |
grep_1_val, grep_2_val, grep_3_val,
|
| 1806 |
+
resolved_model_val, unresolved_model_val,
|
| 1807 |
part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
|
| 1808 |
overhead, with_cache
|
| 1809 |
):
|
|
|
|
| 1824 |
return
|
| 1825 |
|
| 1826 |
trajectory_steps = state_data.get("steps", {})
|
| 1827 |
+
resolved_instances = state_data.get("resolved", {})
|
| 1828 |
if not trajectory_steps:
|
| 1829 |
yield (
|
| 1830 |
gr.update(visible=True, value="❌ No trajectory steps data available."),
|
|
|
|
| 1999 |
if grep_matches(content, grep_val):
|
| 2000 |
step_to_model[i] = f"__routing_{j}__"
|
| 2001 |
|
| 2002 |
+
elif strategy_val == "Resolved/Unresolved":
|
| 2003 |
+
is_resolved = resolved_instances.get(instance_id, False)
|
| 2004 |
+
target_model = resolved_model_val if is_resolved else unresolved_model_val
|
| 2005 |
+
if target_model and target_model != "Base":
|
| 2006 |
+
model_idx = {"M1": 0, "M2": 1, "M3": 2}.get(target_model)
|
| 2007 |
+
if model_idx is not None and model_idx < len(routing_models):
|
| 2008 |
+
for i in range(total_steps):
|
| 2009 |
+
step_to_model[i] = f"__routing_{model_idx}__"
|
| 2010 |
+
|
| 2011 |
elif strategy_val == "Replace part of trajectory":
|
| 2012 |
for j, (start_val, end_val) in enumerate(part_ranges):
|
| 2013 |
if part_mode_val == "Percentages":
|
|
|
|
| 2135 |
k_model_1, k_model_2, k_model_3,
|
| 2136 |
slice_model_1, slice_model_2, slice_model_3,
|
| 2137 |
grep_model_1, grep_model_2, grep_model_3,
|
| 2138 |
+
resolved_model, unresolved_model,
|
| 2139 |
part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
|
| 2140 |
thinking_overhead, use_cache,
|
| 2141 |
],
|
|
|
|
| 2226 |
progress(0.8, desc="Reading steps")
|
| 2227 |
trajectory_steps = load_all_trajectory_steps(folder)
|
| 2228 |
|
| 2229 |
+
model_details, _ = get_model_details(folder)
|
| 2230 |
+
resolved_instances = {}
|
| 2231 |
+
if model_details:
|
| 2232 |
+
per_instance = model_details.get("per_instance_details", {})
|
| 2233 |
+
for inst_id, details in per_instance.items():
|
| 2234 |
+
resolved_instances[inst_id] = details.get("resolved", False)
|
| 2235 |
+
|
| 2236 |
+
state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "resolved": resolved_instances}
|
| 2237 |
|
| 2238 |
if df_meta.empty:
|
| 2239 |
progress(1, desc="No trajectories found")
|