akseljoonas HF Staff committed on
Commit
2c820a4
·
1 Parent(s): 7e21458
eval/README.md CHANGED
@@ -32,7 +32,7 @@ python eval/generate_rubrics.py \
32
  Files:
33
  - `eval/hf_agent_connector.py` contains a lightweight bridge that spins up
34
  the existing hf-agent stack in `agent/` (tools, MCP, LiteLLM loop) and returns the assistant reply.
35
- - `eval/solvers.py` keeps the solver implementations (e.g. `hf_agent_solver`,
36
  `claude_code`). If additional solvers are needed, register them there and pass
37
  `-T solver_name=<name>` to swap them in without touching the task.
38
  - `eval/task.py` registers `hf-benchmark-with-rubrics`, which wires
@@ -44,7 +44,7 @@ uv run inspect eval eval/task.py@hf-benchmark-with-rubrics \
44
  -T dataset_name=akseljoonas/hf-agent-rubrics \
45
  -T dataset_split=train \
46
  -T limit=25 \
47
- -T solver_name=hf_agent_solver \
48
  -T solver_kwargs='{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
49
  --log-dir logs/inspect
50
  ```
@@ -73,7 +73,7 @@ and only appends results when the command succeeds):
73
  uv run python eval/run_eval_with_leaderboard.py \
74
  --hf-dataset akseljoonas/hf-agent-leaderboard \
75
  --hf-token $HF_TOKEN \
76
- --solver-name hf_agent_solver \
77
  --solver-kwargs '{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
78
  --dataset akseljoonas/hf-agent-rubrics@train \
79
  --limit 25
 
32
  Files:
33
  - `eval/hf_agent_connector.py` contains a lightweight bridge that spins up
34
  the existing hf-agent stack in `agent/` (tools, MCP, LiteLLM loop) and returns the assistant reply.
35
+ - `eval/solvers.py` keeps the solver implementations (e.g. `hf_agent`,
36
  `claude_code`). If additional solvers are needed, register them there and pass
37
  `-T solver_name=<name>` to swap them in without touching the task.
38
  - `eval/task.py` registers `hf-benchmark-with-rubrics`, which wires
 
44
  -T dataset_name=akseljoonas/hf-agent-rubrics \
45
  -T dataset_split=train \
46
  -T limit=25 \
47
+ -T solver_name=hf_agent \
48
  -T solver_kwargs='{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
49
  --log-dir logs/inspect
50
  ```
 
73
  uv run python eval/run_eval_with_leaderboard.py \
74
  --hf-dataset akseljoonas/hf-agent-leaderboard \
75
  --hf-token $HF_TOKEN \
76
+ --solver-name hf_agent \
77
  --solver-kwargs '{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
78
  --dataset akseljoonas/hf-agent-rubrics@train \
79
  --limit 25
eval/leaderboard.py CHANGED
@@ -148,7 +148,7 @@ def build_record(
148
  "command": command,
149
  }
150
 
151
- if solver_name == "hf_agent_solver":
152
  record["solver_version"] = detect_agent_version(
153
  solver_kwargs.get("config_path", "agent/config_mcp_example.json")
154
  )
 
148
  "command": command,
149
  }
150
 
151
+ if solver_name == "hf_agent":
152
  record["solver_version"] = detect_agent_version(
153
  solver_kwargs.get("config_path", "agent/config_mcp_example.json")
154
  )
eval/run_eval_with_leaderboard.py CHANGED
@@ -119,7 +119,7 @@ def main() -> None:
119
  parser.add_argument(
120
  "--solver-name",
121
  required=True,
122
- help="Solver name used in the Inspect task (e.g. hf_agent_solver).",
123
  )
124
  parser.add_argument(
125
  "--solver-kwargs",
 
119
  parser.add_argument(
120
  "--solver-name",
121
  required=True,
122
+ help="Solver name used in the Inspect task (e.g. hf_agent).",
123
  )
124
  parser.add_argument(
125
  "--solver-kwargs",
eval/task.py CHANGED
@@ -92,7 +92,7 @@ def rubric_scorer(judge_model: str = "gpt-4o-mini"):
92
 
93
  @task(name="hf-benchmark-with-rubrics")
94
  def hf_benchmark_with_rubrics(
95
- solver_name: str = "hf_agent_solver",
96
  solver_kwargs: dict[str, Any] = {
97
  "max_iterations": 10,
98
  "config_path": "agent/config_mcp_example.json",
 
92
 
93
  @task(name="hf-benchmark-with-rubrics")
94
  def hf_benchmark_with_rubrics(
95
+ solver_name: str = "hf_agent",
96
  solver_kwargs: dict[str, Any] = {
97
  "max_iterations": 10,
98
  "config_path": "agent/config_mcp_example.json",