| """Tests for the defensive misnested-arg extraction in benchmark + profile_run. |
| |
| Live AMD-GPU lesson: Qwen2.5-7B (and probably others) occasionally JSON-nests |
| ``steps`` / ``cache`` *inside* the ``config`` dict instead of at the top level |
| alongside it. WorkloadConfig strict-validates extras, so without this defense |
| the call errors out and a tool slot is wasted. The well-tuned scenario run |
| on 2026-05-07 burned two of the eight available slots on this exact mistake; |
| fixing it costs nothing and saves the audit. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import shutil |
|
|
| from agent.tools import call |
|
|
|
|
| def _baseline_config() -> dict: |
| return { |
| "model_name": "Qwen/Qwen2.5-7B-Instruct", |
| "batch_size": 4, |
| "precision": "fp16", |
| "attention_impl": "eager", |
| "dataloader_workers": 0, |
| } |
|
|
|
|
| class TestBenchmarkMisnestedArgs: |
| def setup_method(self) -> None: |
| |
| |
| shutil.rmtree("bench_cache", ignore_errors=True) |
|
|
| def test_steps_nested_in_config_is_extracted(self) -> None: |
| """Old behavior: ``WorkloadConfig`` validation explodes with |
| 'Extra inputs are not permitted [steps]'. New behavior: defensive |
| extraction pulls ``steps`` back to the top-level arg, call succeeds. |
| """ |
| cfg = {**_baseline_config(), "steps": 25} |
| result = call("benchmark", config=cfg) |
| assert result.ok, result.error |
| assert result.result["steps"] == 25 |
|
|
| def test_cache_nested_in_config_is_extracted(self) -> None: |
| cfg = {**_baseline_config(), "cache": False} |
| result = call("benchmark", config=cfg) |
| assert result.ok, result.error |
|
|
| def test_force_rerun_nested_in_config_is_extracted(self) -> None: |
| cfg = {**_baseline_config(), "force_rerun": True} |
| result = call("benchmark", config=cfg) |
| assert result.ok, result.error |
|
|
| def test_explicit_top_level_wins_over_nested(self) -> None: |
| """If caller passes BOTH (config has steps + top-level steps), the |
| explicit non-default top-level wins. Defensive code is for the |
| accident case, not for letting nesting silently override.""" |
| cfg = {**_baseline_config(), "steps": 25} |
| result = call("benchmark", config=cfg, steps=37) |
| assert result.ok, result.error |
| assert result.result["steps"] == 37 |
|
|
| def test_all_three_nested_at_once(self) -> None: |
| """The exact failure mode from the live run: model nested three |
| runtime args inside config. All three should get pulled out. |
| """ |
| cfg = { |
| **_baseline_config(), |
| "steps": 30, |
| "cache": False, |
| "force_rerun": True, |
| } |
| result = call("benchmark", config=cfg) |
| assert result.ok, result.error |
| assert result.result["steps"] == 30 |
|
|
|
|
| class TestProfileRunMisnestedArgs: |
| def test_steps_nested_in_config_is_extracted(self) -> None: |
| cfg = {**_baseline_config(), "steps": 7} |
| result = call("profile_run", config=cfg) |
| assert result.ok, result.error |
| assert result.result["steps"] == 7 |
|
|
| def test_explicit_top_level_wins(self) -> None: |
| cfg = {**_baseline_config(), "steps": 7} |
| result = call("profile_run", config=cfg, steps=15) |
| assert result.ok, result.error |
| assert result.result["steps"] == 15 |
|
|
| def test_clean_config_unaffected(self) -> None: |
| """Sanity: when nothing is misnested, behavior is unchanged.""" |
| result = call("profile_run", config=_baseline_config()) |
| assert result.ok, result.error |
| assert result.result["steps"] == 10 |
|
|