Spaces:
Running
Running
Commit
·
2c820a4
1
Parent(s):
7e21458
rename
Browse files
- eval/README.md +3 -3
- eval/leaderboard.py +1 -1
- eval/run_eval_with_leaderboard.py +1 -1
- eval/task.py +1 -1
eval/README.md
CHANGED
|
@@ -32,7 +32,7 @@ python eval/generate_rubrics.py \
|
|
| 32 |
Files:
|
| 33 |
- `eval/hf_agent_connector.py` contains a lightweight bridge that spins up
|
| 34 |
the existing hf-agent stack in `agent/` (tools, MCP, LiteLLM loop) and returns the assistant reply.
|
| 35 |
-
- `eval/solvers.py` keeps the solver implementations (e.g. `
|
| 36 |
`claude_code`). If additional solvers are needed, register them there and pass
|
| 37 |
`-T solver_name=<name>` to swap them in without touching the task.
|
| 38 |
- `eval/task.py` registers `hf-benchmark-with-rubrics`, which wires
|
|
@@ -44,7 +44,7 @@ uv run inspect eval eval/task.py@hf-benchmark-with-rubrics \
|
|
| 44 |
-T dataset_name=akseljoonas/hf-agent-rubrics \
|
| 45 |
-T dataset_split=train \
|
| 46 |
-T limit=25 \
|
| 47 |
-
-T solver_name=
|
| 48 |
-T solver_kwargs='{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
|
| 49 |
--log-dir logs/inspect
|
| 50 |
```
|
|
@@ -73,7 +73,7 @@ and only appends results when the command succeeds):
|
|
| 73 |
uv run python eval/run_eval_with_leaderboard.py \
|
| 74 |
--hf-dataset akseljoonas/hf-agent-leaderboard \
|
| 75 |
--hf-token $HF_TOKEN \
|
| 76 |
-
--solver-name
|
| 77 |
--solver-kwargs '{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
|
| 78 |
--dataset akseljoonas/hf-agent-rubrics@train \
|
| 79 |
--limit 25
|
|
|
|
| 32 |
Files:
|
| 33 |
- `eval/hf_agent_connector.py` contains a lightweight bridge that spins up
|
| 34 |
the existing hf-agent stack in `agent/` (tools, MCP, LiteLLM loop) and returns the assistant reply.
|
| 35 |
+
- `eval/solvers.py` keeps the solver implementations (e.g. `hf_agent`,
|
| 36 |
`claude_code`). If additional solvers are needed, register them there and pass
|
| 37 |
`-T solver_name=<name>` to swap them in without touching the task.
|
| 38 |
- `eval/task.py` registers `hf-benchmark-with-rubrics`, which wires
|
|
|
|
| 44 |
-T dataset_name=akseljoonas/hf-agent-rubrics \
|
| 45 |
-T dataset_split=train \
|
| 46 |
-T limit=25 \
|
| 47 |
+
-T solver_name=hf_agent \
|
| 48 |
-T solver_kwargs='{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
|
| 49 |
--log-dir logs/inspect
|
| 50 |
```
|
|
|
|
| 73 |
uv run python eval/run_eval_with_leaderboard.py \
|
| 74 |
--hf-dataset akseljoonas/hf-agent-leaderboard \
|
| 75 |
--hf-token $HF_TOKEN \
|
| 76 |
+
--solver-name hf_agent \
|
| 77 |
--solver-kwargs '{"config_path":"agent/config_mcp_example.json","max_iterations":10}' \
|
| 78 |
--dataset akseljoonas/hf-agent-rubrics@train \
|
| 79 |
--limit 25
|
eval/leaderboard.py
CHANGED
|
@@ -148,7 +148,7 @@ def build_record(
|
|
| 148 |
"command": command,
|
| 149 |
}
|
| 150 |
|
| 151 |
-
if solver_name == "
|
| 152 |
record["solver_version"] = detect_agent_version(
|
| 153 |
solver_kwargs.get("config_path", "agent/config_mcp_example.json")
|
| 154 |
)
|
|
|
|
| 148 |
"command": command,
|
| 149 |
}
|
| 150 |
|
| 151 |
+
if solver_name == "hf_agent":
|
| 152 |
record["solver_version"] = detect_agent_version(
|
| 153 |
solver_kwargs.get("config_path", "agent/config_mcp_example.json")
|
| 154 |
)
|
eval/run_eval_with_leaderboard.py
CHANGED
|
@@ -119,7 +119,7 @@ def main() -> None:
|
|
| 119 |
parser.add_argument(
|
| 120 |
"--solver-name",
|
| 121 |
required=True,
|
| 122 |
-
help="Solver name used in the Inspect task (e.g.
|
| 123 |
)
|
| 124 |
parser.add_argument(
|
| 125 |
"--solver-kwargs",
|
|
|
|
| 119 |
parser.add_argument(
|
| 120 |
"--solver-name",
|
| 121 |
required=True,
|
| 122 |
+
help="Solver name used in the Inspect task (e.g. hf_agent).",
|
| 123 |
)
|
| 124 |
parser.add_argument(
|
| 125 |
"--solver-kwargs",
|
eval/task.py
CHANGED
|
@@ -92,7 +92,7 @@ def rubric_scorer(judge_model: str = "gpt-4o-mini"):
|
|
| 92 |
|
| 93 |
@task(name="hf-benchmark-with-rubrics")
|
| 94 |
def hf_benchmark_with_rubrics(
|
| 95 |
-
solver_name: str = "
|
| 96 |
solver_kwargs: dict[str, Any] = {
|
| 97 |
"max_iterations": 10,
|
| 98 |
"config_path": "agent/config_mcp_example.json",
|
|
|
|
| 92 |
|
| 93 |
@task(name="hf-benchmark-with-rubrics")
|
| 94 |
def hf_benchmark_with_rubrics(
|
| 95 |
+
solver_name: str = "hf_agent",
|
| 96 |
solver_kwargs: dict[str, Any] = {
|
| 97 |
"max_iterations": 10,
|
| 98 |
"config_path": "agent/config_mcp_example.json",
|