Spaces:

JetBrains-Research
/

SWE-bench-Costs-Calculator

Sleeping

IgorSlinko committed 4 days ago

Commit

26e685e

1 Parent(s): bb3fde6

Add Resolved/Unresolved routing strategy (v0.3.18)

- Route all steps based on trajectory resolution status
- Select model for resolved and unresolved trajectories
- Choices: Base, M1, M2, M3 (static dropdown)
- Loads resolved status from leaderboard per_instance_details

Files changed (1) hide show

app.py +73 -11

app.py CHANGED Viewed

@@ -1323,7 +1323,7 @@ def build_app():
         """)
         trajectories_state = gr.State(None)
-        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.17`")
         gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
         with gr.Row():
@@ -1491,7 +1491,7 @@ def build_app():
                     gr.Markdown("### 🎯 Router Strategy")
                     selected_strategy = gr.Radio(
-                        choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Replace part of trajectory"],
                         value="Random router",
                         label="",
                         interactive=True,
@@ -1523,6 +1523,21 @@ def build_app():
                         grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
                         grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
                     with gr.Column(visible=False) as part_block:
                         part_hint = gr.Markdown("*Ranges must not overlap*")
                         part_mode = gr.Radio(
@@ -1556,6 +1571,7 @@ def build_app():
             show_every_k = strategy == "Every k-th step"
             show_slice = strategy == "Python list slices"
             show_grep = strategy == "Grep"
             show_part = strategy == "Replace part of trajectory"
             has_m2 = num_models >= 2
             has_m3 = num_models >= 3
@@ -1564,6 +1580,7 @@ def build_app():
                 gr.update(visible=show_every_k),      # every_k_block
                 gr.update(visible=show_slice),        # slice_block
                 gr.update(visible=show_grep),         # grep_block
                 gr.update(visible=show_part),         # part_block
                 gr.update(visible=show_random),       # random_hint
                 gr.update(visible=show_random),       # weight_base
@@ -1582,6 +1599,7 @@ def build_app():
                 gr.update(visible=show_grep),         # grep_model_1
                 gr.update(visible=show_grep and has_m2), # grep_model_2
                 gr.update(visible=show_grep and has_m3), # grep_model_3
                 gr.update(visible=show_part),         # part_hint
                 gr.update(visible=show_part),         # part_mode
                 gr.update(visible=show_part),         # start_1
@@ -1596,11 +1614,12 @@ def build_app():
             fn=on_strategy_change,
             inputs=[selected_strategy, num_routing_models],
             outputs=[
-                random_block, every_k_block, slice_block, grep_block, part_block,
                 random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
                 every_k_hint, k_model_1, k_model_2, k_model_3,
                 slice_hint, slice_model_1, slice_model_2, slice_model_3,
                 grep_hint, grep_model_1, grep_model_2, grep_model_3,
                 part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
             ],
         )
@@ -1618,19 +1637,43 @@ def build_app():
         routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
         routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
-        def make_quick_select_fn(full_model_name):
             def fn():
-                return gr.update(value=full_model_name)
             return fn
         for btn, full_model in quick_btns_1:
-            btn.click(fn=make_quick_select_fn(full_model), outputs=[routing_model_1])
         for btn, full_model in quick_btns_2:
-            btn.click(fn=make_quick_select_fn(full_model), outputs=[routing_model_2])
         for btn, full_model in quick_btns_3:
-            btn.click(fn=make_quick_select_fn(full_model), outputs=[routing_model_3])
         def get_routing_prices_with_labels(model_name):
             """Get all 4 prices for a routing model with found/estimated labels"""
@@ -1673,12 +1716,12 @@ def build_app():
         def on_routing_model_1_select(model_name):
             prices = get_routing_prices_with_labels(model_name)
             show_btn = bool(model_name)
-            return *prices, gr.update(visible=show_btn), gr.update(interactive=show_btn)
         def on_routing_model_2_select(model_name):
             prices = get_routing_prices_with_labels(model_name)
             show_btn = bool(model_name)
-            return *prices, gr.update(visible=show_btn)
         def on_routing_model_3_select(model_name):
             return get_routing_prices_with_labels(model_name)
@@ -1760,6 +1803,7 @@ def build_app():
             k_1_val, k_2_val, k_3_val,
             slice_1_val, slice_2_val, slice_3_val,
             grep_1_val, grep_2_val, grep_3_val,
             part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
             overhead, with_cache
         ):
@@ -1780,6 +1824,7 @@ def build_app():
                 return
             trajectory_steps = state_data.get("steps", {})
             if not trajectory_steps:
                 yield (
                     gr.update(visible=True, value="❌ No trajectory steps data available."),
@@ -1954,6 +1999,15 @@ def build_app():
                                 if grep_matches(content, grep_val):
                                     step_to_model[i] = f"__routing_{j}__"
                 elif strategy_val == "Replace part of trajectory":
                     for j, (start_val, end_val) in enumerate(part_ranges):
                         if part_mode_val == "Percentages":
@@ -2081,6 +2135,7 @@ def build_app():
                 k_model_1, k_model_2, k_model_3,
                 slice_model_1, slice_model_2, slice_model_3,
                 grep_model_1, grep_model_2, grep_model_3,
                 part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
                 thinking_overhead, use_cache,
             ],
@@ -2171,7 +2226,14 @@ def build_app():
             progress(0.8, desc="Reading steps")
             trajectory_steps = load_all_trajectory_steps(folder)
-            state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps}
             if df_meta.empty:
                 progress(1, desc="No trajectories found")

         """)
         trajectories_state = gr.State(None)
+        gr.Markdown("# 🧮 SWE-bench Bash-Only Leaderboard `v0.3.18`")
         gr.Markdown("## 🎯 Select a base model for cost analysis (click a row)")
         with gr.Row():
                     gr.Markdown("### 🎯 Router Strategy")
                     selected_strategy = gr.Radio(
+                        choices=["Random router", "Every k-th step", "Python list slices", "Grep", "Resolved/Unresolved", "Replace part of trajectory"],
                         value="Random router",
                         label="",
                         interactive=True,
                         grep_model_2 = gr.Textbox(label="M2 grep", value="cat|echo|printf|tee", interactive=True, visible=False)
                         grep_model_3 = gr.Textbox(label="M3 grep", value="python&.py", interactive=True, visible=False)
+                    with gr.Column(visible=False) as resolved_block:
+                        resolved_hint = gr.Markdown("*Route all steps based on trajectory resolution status*")
+                        resolved_model = gr.Dropdown(
+                            label="Model for resolved trajectories",
+                            choices=["Base", "M1", "M2", "M3"],
+                            value="Base",
+                            interactive=True,
+                        )
+                        unresolved_model = gr.Dropdown(
+                            label="Model for unresolved trajectories",
+                            choices=["Base", "M1", "M2", "M3"],
+                            value="M1",
+                            interactive=True,
+                        )
                     with gr.Column(visible=False) as part_block:
                         part_hint = gr.Markdown("*Ranges must not overlap*")
                         part_mode = gr.Radio(
             show_every_k = strategy == "Every k-th step"
             show_slice = strategy == "Python list slices"
             show_grep = strategy == "Grep"
+            show_resolved = strategy == "Resolved/Unresolved"
             show_part = strategy == "Replace part of trajectory"
             has_m2 = num_models >= 2
             has_m3 = num_models >= 3
                 gr.update(visible=show_every_k),      # every_k_block
                 gr.update(visible=show_slice),        # slice_block
                 gr.update(visible=show_grep),         # grep_block
+                gr.update(visible=show_resolved),     # resolved_block
                 gr.update(visible=show_part),         # part_block
                 gr.update(visible=show_random),       # random_hint
                 gr.update(visible=show_random),       # weight_base
                 gr.update(visible=show_grep),         # grep_model_1
                 gr.update(visible=show_grep and has_m2), # grep_model_2
                 gr.update(visible=show_grep and has_m3), # grep_model_3
+                gr.update(visible=show_resolved),     # resolved_hint
                 gr.update(visible=show_part),         # part_hint
                 gr.update(visible=show_part),         # part_mode
                 gr.update(visible=show_part),         # start_1
             fn=on_strategy_change,
             inputs=[selected_strategy, num_routing_models],
             outputs=[
+                random_block, every_k_block, slice_block, grep_block, resolved_block, part_block,
                 random_hint, weight_base, weight_model_1, weight_model_2, weight_model_3,
                 every_k_hint, k_model_1, k_model_2, k_model_3,
                 slice_hint, slice_model_1, slice_model_2, slice_model_3,
                 grep_hint, grep_model_1, grep_model_2, grep_model_3,
+                resolved_hint,
                 part_hint, part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
             ],
         )
         routing_model_2.input(fn=filter_models, inputs=[routing_model_2], outputs=[routing_model_2])
         routing_model_3.input(fn=filter_models, inputs=[routing_model_3], outputs=[routing_model_3])
+        def make_quick_select_fn_1(full_model_name):
             def fn():
+                prices = get_routing_prices_with_labels(full_model_name)
+                return (gr.update(value=full_model_name), *prices,
+                        gr.update(visible=True), gr.update(interactive=True))
+            return fn
+        def make_quick_select_fn_2(full_model_name):
+            def fn():
+                prices = get_routing_prices_with_labels(full_model_name)
+                return (gr.update(value=full_model_name), *prices,
+                        gr.update(visible=True))
+            return fn
+        def make_quick_select_fn_3(full_model_name):
+            def fn():
+                prices = get_routing_prices_with_labels(full_model_name)
+                return (gr.update(value=full_model_name), *prices)
             return fn
         for btn, full_model in quick_btns_1:
+            btn.click(
+                fn=make_quick_select_fn_1(full_model),
+                outputs=[routing_model_1, routing_price_1_input, routing_price_1_cache_read, routing_price_1_cache_creation, routing_price_1_completion, add_model_2_btn, route_btn]
+            )
         for btn, full_model in quick_btns_2:
+            btn.click(
+                fn=make_quick_select_fn_2(full_model),
+                outputs=[routing_model_2, routing_price_2_input, routing_price_2_cache_read, routing_price_2_cache_creation, routing_price_2_completion, add_model_3_btn]
+            )
         for btn, full_model in quick_btns_3:
+            btn.click(
+                fn=make_quick_select_fn_3(full_model),
+                outputs=[routing_model_3, routing_price_3_input, routing_price_3_cache_read, routing_price_3_cache_creation, routing_price_3_completion]
+            )
         def get_routing_prices_with_labels(model_name):
             """Get all 4 prices for a routing model with found/estimated labels"""
         def on_routing_model_1_select(model_name):
             prices = get_routing_prices_with_labels(model_name)
             show_btn = bool(model_name)
+            return (*prices, gr.update(visible=show_btn), gr.update(interactive=show_btn))
         def on_routing_model_2_select(model_name):
             prices = get_routing_prices_with_labels(model_name)
             show_btn = bool(model_name)
+            return (*prices, gr.update(visible=show_btn))
         def on_routing_model_3_select(model_name):
             return get_routing_prices_with_labels(model_name)
             k_1_val, k_2_val, k_3_val,
             slice_1_val, slice_2_val, slice_3_val,
             grep_1_val, grep_2_val, grep_3_val,
+            resolved_model_val, unresolved_model_val,
             part_mode_val, start_1_val, end_1_val, start_2_val, end_2_val, start_3_val, end_3_val,
             overhead, with_cache
         ):
                 return
             trajectory_steps = state_data.get("steps", {})
+            resolved_instances = state_data.get("resolved", {})
             if not trajectory_steps:
                 yield (
                     gr.update(visible=True, value="❌ No trajectory steps data available."),
                                 if grep_matches(content, grep_val):
                                     step_to_model[i] = f"__routing_{j}__"
+                elif strategy_val == "Resolved/Unresolved":
+                    is_resolved = resolved_instances.get(instance_id, False)
+                    target_model = resolved_model_val if is_resolved else unresolved_model_val
+                    if target_model and target_model != "Base":
+                        model_idx = {"M1": 0, "M2": 1, "M3": 2}.get(target_model)
+                        if model_idx is not None and model_idx < len(routing_models):
+                            for i in range(total_steps):
+                                step_to_model[i] = f"__routing_{model_idx}__"
                 elif strategy_val == "Replace part of trajectory":
                     for j, (start_val, end_val) in enumerate(part_ranges):
                         if part_mode_val == "Percentages":
                 k_model_1, k_model_2, k_model_3,
                 slice_model_1, slice_model_2, slice_model_3,
                 grep_model_1, grep_model_2, grep_model_3,
+                resolved_model, unresolved_model,
                 part_mode, start_1, end_1, start_2, end_2, start_3, end_3,
                 thinking_overhead, use_cache,
             ],
             progress(0.8, desc="Reading steps")
             trajectory_steps = load_all_trajectory_steps(folder)
+            model_details, _ = get_model_details(folder)
+            resolved_instances = {}
+            if model_details:
+                per_instance = model_details.get("per_instance_details", {})
+                for inst_id, details in per_instance.items():
+                    resolved_instances[inst_id] = details.get("resolved", False)
+            state_data = {"meta": df_meta, "calculated": df_calc, "folder": folder, "steps": trajectory_steps, "resolved": resolved_instances}
             if df_meta.empty:
                 progress(1, desc="No trajectories found")