Spaces:

smolagents
/

ml-agent

Running

akseljoonas HF Staff committed on Nov 27, 2025

Commit

2c820a4

1 Parent(s): 7e21458

rename

Files changed (4) hide show

eval/README.md CHANGED Viewed

@@ -32,7 +32,7 @@ python eval/generate_rubrics.py \
 Files:
 - `eval/hf_agent_connector.py` contains a lightweight bridge that spins up
   the existing hf-agent stack in `agent/` (tools, MCP, LiteLLM loop) and returns the assistant reply.
-- `eval/solvers.py` keeps the solver implementations (e.g. `hf_agent_solver`,
   `claude_code`). If additional solvers are needed, register them there and pass
   `-T solver_name=<name>` to swap them in without touching the task.
 - `eval/task.py` registers `hf-benchmark-with-rubrics`, which wires
@@ -44,7 +44,7 @@ uv run inspect eval eval/task.py@hf-benchmark-with-rubrics \
   -T dataset_name=akseljoonas/hf-agent-rubrics \
   -T dataset_split=train \
   -T limit=25 \
-  -T solver_name=hf_agent_solver \
   -T solver_kwargs='{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
   --log-dir logs/inspect
 ```
@@ -73,7 +73,7 @@ and only appends results when the command succeeds):
 uv run python eval/run_eval_with_leaderboard.py \
   --hf-dataset akseljoonas/hf-agent-leaderboard \
   --hf-token $HF_TOKEN \
-  --solver-name hf_agent_solver \
   --solver-kwargs '{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
   --dataset akseljoonas/hf-agent-rubrics@train \
   --limit 25

 Files:
 - `eval/hf_agent_connector.py` contains a lightweight bridge that spins up
   the existing hf-agent stack in `agent/` (tools, MCP, LiteLLM loop) and returns the assistant reply.
+- `eval/solvers.py` keeps the solver implementations (e.g. `hf_agent`,
   `claude_code`). If additional solvers are needed, register them there and pass
   `-T solver_name=<name>` to swap them in without touching the task.
 - `eval/task.py` registers `hf-benchmark-with-rubrics`, which wires
   -T dataset_name=akseljoonas/hf-agent-rubrics \
   -T dataset_split=train \
   -T limit=25 \
+  -T solver_name=hf_agent \
   -T solver_kwargs='{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
   --log-dir logs/inspect
 ```
 uv run python eval/run_eval_with_leaderboard.py \
   --hf-dataset akseljoonas/hf-agent-leaderboard \
   --hf-token $HF_TOKEN \
+  --solver-name hf_agent \
   --solver-kwargs '{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
   --dataset akseljoonas/hf-agent-rubrics@train \
   --limit 25

eval/leaderboard.py CHANGED Viewed

@@ -148,7 +148,7 @@ def build_record(
         "command": command,
     }
-    if solver_name == "hf_agent_solver":
         record["solver_version"] = detect_agent_version(
             solver_kwargs.get("config_path", "agent/config_mcp_example.json")
         )

         "command": command,
     }
+    if solver_name == "hf_agent":
         record["solver_version"] = detect_agent_version(
             solver_kwargs.get("config_path", "agent/config_mcp_example.json")
         )

eval/run_eval_with_leaderboard.py CHANGED Viewed

@@ -119,7 +119,7 @@ def main() -> None:
     parser.add_argument(
         "--solver-name",
         required=True,
-        help="Solver name used in the Inspect task (e.g. hf_agent_solver).",
     )
     parser.add_argument(
         "--solver-kwargs",

     parser.add_argument(
         "--solver-name",
         required=True,
+        help="Solver name used in the Inspect task (e.g. hf_agent).",
     )
     parser.add_argument(
         "--solver-kwargs",

eval/task.py CHANGED Viewed

@@ -92,7 +92,7 @@ def rubric_scorer(judge_model: str = "gpt-4o-mini"):
 @task(name="hf-benchmark-with-rubrics")
 def hf_benchmark_with_rubrics(
-    solver_name: str = "hf_agent_solver",
     solver_kwargs: dict[str, Any] = {
         "max_iterations": 10,
         "config_path": "agent/config_mcp_example.json",

 @task(name="hf-benchmark-with-rubrics")
 def hf_benchmark_with_rubrics(
+    solver_name: str = "hf_agent",
     solver_kwargs: dict[str, Any] = {
         "max_iterations": 10,
         "config_path": "agent/config_mcp_example.json",