Spaces:

Nomearod
/

agentbench

Sleeping

Nomearod Claude Opus 4.6 (1M context) committed on Mar 24

Commit

36a9ab7

1 Parent(s): 9d976db

feat: Day 2 — tool system with registry, calculator, and search

- Tool ABC with definition() → ToolDefinition for provider format_tools()
- ToolRegistry: dict-based register/get/execute/get_definitions
- CalculatorTool: simpleeval with try/except safety wrapper
- SearchTool: numbered passage format [1] (filename.md): content
with source deduplication in metadata
- SearchTool uses Protocol for retriever dependency (decoupled from rag/)
- 15 new tests: registry CRUD, unknown tool handling, calculator
valid/invalid/dangerous expressions, search formatting/empty/dedup

38 total tests, lint + mypy clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

Files changed (6) hide show

agent_bench/tools/__init__.py +1 -0
agent_bench/tools/base.py +36 -0
agent_bench/tools/calculator.py +34 -0
agent_bench/tools/registry.py +32 -0
agent_bench/tools/search.py +83 -0
tests/test_tools.py +188 -0

agent_bench/tools/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Tool system: base interface, registry, and built-in tools."""

agent_bench/tools/base.py ADDED Viewed

	@@ -0,0 +1,36 @@

+"""Base tool interface and output model."""
+from __future__ import annotations
+from abc import ABC, abstractmethod
+from pydantic import BaseModel, Field
+from agent_bench.core.types import ToolDefinition
+class ToolOutput(BaseModel):
+    """Result object returned by a tool's ``execute`` call."""
+    # Whether the tool call succeeded.
+    success: bool
+    # Text payload: the tool's answer on success, or an error message on failure.
+    result: str
+    # Optional extra data; each instance gets its own empty dict by default.
+    metadata: dict = Field(default_factory=dict)
+class Tool(ABC):
+    """Common interface implemented by every tool the agent can call."""
+    name: str
+    description: str
+    parameters: dict  # JSON Schema describing the tool's arguments
+    @abstractmethod
+    async def execute(self, **kwargs: object) -> ToolOutput:
+        """Run the tool with the supplied keyword arguments."""
+    def definition(self) -> ToolDefinition:
+        """Build the ToolDefinition consumed by provider format_tools()."""
+        fields = {
+            "name": self.name,
+            "description": self.description,
+            "parameters": self.parameters,
+        }
+        return ToolDefinition(**fields)

agent_bench/tools/calculator.py ADDED Viewed

	@@ -0,0 +1,34 @@

+"""Calculator tool: safe math expression evaluation via simpleeval."""
+from __future__ import annotations
+from simpleeval import simple_eval
+from agent_bench.tools.base import Tool, ToolOutput
+class CalculatorTool(Tool):
+    """Evaluate mathematical expressions safely via ``simple_eval``."""
+    name = "calculator"
+    description = "Evaluate a mathematical expression. Supports +, -, *, /, **, %, and parentheses."
+    parameters = {
+        "type": "object",
+        "properties": {
+            "expression": {
+                "type": "string",
+                "description": "The mathematical expression to evaluate, e.g. '2 + 3 * 4'",
+            },
+        },
+        "required": ["expression"],
+    }
+    async def execute(self, **kwargs: object) -> ToolOutput:
+        """Evaluate the ``expression`` argument and return its value as text.
+        Never raises for bad input: a missing, null, or blank expression and
+        any evaluation error are reported as ``success=False``. On evaluation
+        failure the exception type and message are kept in
+        ``metadata["error"]`` so the caller can see why it failed.
+        """
+        raw = kwargs.get("expression")
+        # A null argument must not be stringified: "None" is a name that
+        # simple_eval resolves to the constant None and reports as success.
+        expression = "" if raw is None else str(raw).strip()
+        if not expression:
+            return ToolOutput(success=False, result="No expression provided")
+        try:
+            result = simple_eval(expression)
+        except Exception as exc:
+            # Deliberately broad: this is the tool boundary, and any failure
+            # (syntax error, disallowed name/function, ZeroDivisionError, ...)
+            # must come back as a failed output instead of crashing the agent.
+            return ToolOutput(
+                success=False,
+                result=f"Could not evaluate: {expression}",
+                metadata={"error": f"{type(exc).__name__}: {exc}"},
+            )
+        return ToolOutput(success=True, result=str(result))

agent_bench/tools/registry.py ADDED Viewed

	@@ -0,0 +1,32 @@

+"""Tool registry: register, retrieve, and dispatch tools by name."""
+from __future__ import annotations
+from agent_bench.core.types import ToolDefinition
+from agent_bench.tools.base import Tool, ToolOutput
+class ToolRegistry:
+    """Name-keyed collection of tools with lookup and dispatch helpers."""
+    def __init__(self) -> None:
+        self._tools: dict[str, Tool] = {}
+    def register(self, tool: Tool) -> None:
+        """Store *tool* under ``tool.name``, replacing any earlier entry."""
+        self._tools.update({tool.name: tool})
+    def get(self, name: str) -> Tool | None:
+        """Look up a tool by name; gives None when nothing is registered."""
+        try:
+            return self._tools[name]
+        except KeyError:
+            return None
+    def get_definitions(self) -> list[ToolDefinition]:
+        """Collect the ToolDefinition of every registered tool."""
+        definitions = []
+        for registered in self._tools.values():
+            definitions.append(registered.definition())
+        return definitions
+    async def execute(self, name: str, **kwargs: object) -> ToolOutput:
+        """Dispatch to the named tool; unknown names yield a failure output."""
+        handler = self._tools.get(name)
+        if handler is not None:
+            return await handler.execute(**kwargs)
+        return ToolOutput(success=False, result=f"Unknown tool: {name}")

agent_bench/tools/search.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""Search tool: RAG retrieval over the document corpus."""
+from __future__ import annotations
+from typing import Protocol
+from agent_bench.tools.base import Tool, ToolOutput
+class SearchResult(Protocol):
+    """Structural type for one retriever hit (full definition lives in rag.store)."""
+    @property
+    def chunk(self) -> object:
+        """The chunk object carried by this hit."""
+    @property
+    def score(self) -> float:
+        """The score reported for this hit."""
+class Retriever(Protocol):
+    """Structural type for the retriever dependency (full definition lives in rag.retriever)."""
+    async def search(self, query: str, top_k: int = 5) -> list:
+        """Return the hits for *query*; ``top_k`` defaults to 5."""
+class SearchTool(Tool):
+    """Search the document corpus and return relevant passages.
+    Hits are rendered as numbered passages, ``[n] (source): content``,
+    separated by blank lines; the distinct sources, in first-seen order,
+    are reported under ``metadata["sources"]``.
+    """
+    name = "search_documents"
+    description = (
+        "Search the technical documentation corpus for relevant passages. "
+        "Returns the most relevant document chunks with source attribution."
+    )
+    parameters = {
+        "type": "object",
+        "properties": {
+            "query": {
+                "type": "string",
+                "description": "The search query to find relevant documentation",
+            },
+            "top_k": {
+                "type": "integer",
+                "description": "Number of results to return (default 5)",
+            },
+        },
+        "required": ["query"],
+    }
+    # Result count used when the caller omits ``top_k`` or passes null.
+    _DEFAULT_TOP_K = 5
+    def __init__(self, retriever: Retriever) -> None:
+        """Keep a reference to the retriever that performs the actual search."""
+        self._retriever = retriever
+    @classmethod
+    def _coerce_top_k(cls, value: object) -> int | None:
+        """Turn a raw ``top_k`` argument into a positive int, or None if invalid."""
+        if value is None:
+            return cls._DEFAULT_TOP_K
+        if isinstance(value, int):
+            number = int(value)
+        else:
+            # Callers may send whole numbers as floats or strings
+            # (5.0, "5", "5.0"); accept those, reject anything fractional,
+            # non-finite, or non-numeric.
+            try:
+                as_float = float(str(value))
+            except ValueError:
+                return None
+            if not as_float.is_integer():
+                return None
+            number = int(as_float)
+        return number if number > 0 else None
+    async def execute(self, **kwargs: object) -> ToolOutput:
+        """Run a retrieval query and format the hits.
+        Accepts ``query`` (required) and ``top_k`` (optional, default 5).
+        Returns a failed ToolOutput, rather than raising, when the query is
+        missing or blank or when ``top_k`` is not a positive whole number.
+        An empty result set is still a success, with an empty source list.
+        """
+        raw_query = kwargs.get("query")
+        # Guard against None being stringified into the literal query "None".
+        query = "" if raw_query is None else str(raw_query).strip()
+        if not query:
+            return ToolOutput(success=False, result="No query provided")
+        raw_top_k = kwargs.get("top_k", self._DEFAULT_TOP_K)
+        top_k = self._coerce_top_k(raw_top_k)
+        if top_k is None:
+            return ToolOutput(success=False, result=f"Invalid top_k: {raw_top_k!r}")
+        results = await self._retriever.search(query, top_k=top_k)
+        if not results:
+            return ToolOutput(
+                success=True,
+                result="No relevant documents found.",
+                metadata={"sources": []},
+            )
+        # Format as numbered passages with filename attribution
+        lines = [
+            f"[{i}] ({r.chunk.source}): {r.chunk.content}"
+            for i, r in enumerate(results, 1)
+        ]
+        # dict.fromkeys de-duplicates while preserving first-seen order.
+        sources = list(dict.fromkeys(r.chunk.source for r in results))
+        return ToolOutput(
+            success=True,
+            result="\n\n".join(lines),
+            metadata={"sources": sources},
+        )

tests/test_tools.py ADDED Viewed

	@@ -0,0 +1,188 @@

+"""Tests for tool system: registry, calculator, search, and schema generation."""
+from __future__ import annotations
+from dataclasses import dataclass
+import pytest
+from agent_bench.tools.calculator import CalculatorTool
+from agent_bench.tools.registry import ToolRegistry
+from agent_bench.tools.search import SearchTool
+# --- Mock retriever for SearchTool tests ---
+@dataclass
+class MockChunk:
+    """Test stand-in for a retrieved chunk, exposing the two fields SearchTool reads."""
+    content: str
+    source: str
+@dataclass
+class MockSearchResult:
+    """Test stand-in for a retriever hit: a chunk paired with a score."""
+    chunk: MockChunk
+    score: float
+class MockRetriever:
+    """Retriever double that serves a fixed, pre-seeded list of hits."""
+    def __init__(self, results: list[MockSearchResult] | None = None) -> None:
+        self._results = results if results else []
+    async def search(self, query: str, top_k: int = 5) -> list[MockSearchResult]:
+        """Ignore *query* and hand back at most ``top_k`` canned hits."""
+        window = slice(None, top_k)
+        return self._results[window]
+# --- Registry tests ---
+class TestToolRegistry:
+    """Registry behaviour: registration, lookup, dispatch, and definitions."""
+    def test_register_and_retrieve(self):
+        calc = CalculatorTool()
+        reg = ToolRegistry()
+        reg.register(calc)
+        assert reg.get("calculator") is calc
+    def test_get_unknown_returns_none(self):
+        assert ToolRegistry().get("nonexistent") is None
+    @pytest.mark.asyncio
+    async def test_execute_unknown_tool(self):
+        outcome = await ToolRegistry().execute("nonexistent", query="test")
+        assert outcome.success is False
+        assert "Unknown tool: nonexistent" in outcome.result
+    def test_get_definitions(self):
+        reg = ToolRegistry()
+        for tool in (CalculatorTool(), SearchTool(retriever=MockRetriever())):
+            reg.register(tool)
+        definitions = reg.get_definitions()
+        assert len(definitions) == 2
+        assert {d.name for d in definitions} == {"calculator", "search_documents"}
+# --- Calculator tests ---
+class TestCalculatorTool:
+    """CalculatorTool: valid math, rejected or unsafe input, runtime errors, schema."""
+    @pytest.mark.asyncio
+    async def test_valid_expression(self):
+        calc = CalculatorTool()
+        result = await calc.execute(expression="2 + 3 * 4")
+        assert result.success is True
+        assert result.result == "14"
+    @pytest.mark.asyncio
+    async def test_float_expression(self):
+        calc = CalculatorTool()
+        result = await calc.execute(expression="10 / 3")
+        assert result.success is True
+        assert float(result.result) == pytest.approx(3.333333, rel=1e-4)
+    @pytest.mark.asyncio
+    async def test_rejects_import(self):
+        calc = CalculatorTool()
+        result = await calc.execute(expression="__import__('os').system('ls')")
+        assert result.success is False
+        assert "Could not evaluate" in result.result
+    @pytest.mark.asyncio
+    async def test_rejects_exec(self):
+        calc = CalculatorTool()
+        result = await calc.execute(expression="exec('print(1)')")
+        assert result.success is False
+        assert "Could not evaluate" in result.result
+    @pytest.mark.asyncio
+    async def test_division_by_zero(self):
+        # Runtime arithmetic errors must be reported, not raised.
+        calc = CalculatorTool()
+        result = await calc.execute(expression="1 / 0")
+        assert result.success is False
+        assert "Could not evaluate" in result.result
+    @pytest.mark.asyncio
+    async def test_empty_expression(self):
+        calc = CalculatorTool()
+        result = await calc.execute(expression="")
+        assert result.success is False
+    @pytest.mark.asyncio
+    async def test_missing_expression(self):
+        # Omitting the argument entirely behaves like an empty expression.
+        calc = CalculatorTool()
+        result = await calc.execute()
+        assert result.success is False
+        assert result.result == "No expression provided"
+    def test_definition_produces_valid_schema(self):
+        calc = CalculatorTool()
+        defn = calc.definition()
+        assert defn.name == "calculator"
+        assert defn.parameters["type"] == "object"
+        assert "expression" in defn.parameters["properties"]
+        assert "expression" in defn.parameters["required"]
+# --- Search tool tests ---
+class TestSearchTool:
+    """SearchTool: passage formatting, empty results, dedup, top_k, validation, schema."""
+    @pytest.mark.asyncio
+    async def test_returns_formatted_results(self):
+        retriever = MockRetriever(
+            results=[
+                MockSearchResult(
+                    chunk=MockChunk(
+                        content="Path parameters are defined using curly braces.",
+                        source="fastapi_path_params.md",
+                    ),
+                    score=0.95,
+                ),
+                MockSearchResult(
+                    chunk=MockChunk(
+                        content="Query parameters are automatically parsed.",
+                        source="fastapi_query_params.md",
+                    ),
+                    score=0.82,
+                ),
+            ]
+        )
+        tool = SearchTool(retriever=retriever)
+        result = await tool.execute(query="path parameters")
+        assert result.success is True
+        assert "[1] (fastapi_path_params.md):" in result.result
+        assert "[2] (fastapi_query_params.md):" in result.result
+        assert result.metadata["sources"] == [
+            "fastapi_path_params.md",
+            "fastapi_query_params.md",
+        ]
+    @pytest.mark.asyncio
+    async def test_empty_results(self):
+        tool = SearchTool(retriever=MockRetriever(results=[]))
+        result = await tool.execute(query="nonexistent topic")
+        assert result.success is True
+        assert "No relevant documents found" in result.result
+        assert result.metadata["sources"] == []
+    @pytest.mark.asyncio
+    async def test_deduplicates_sources(self):
+        retriever = MockRetriever(
+            results=[
+                MockSearchResult(
+                    chunk=MockChunk(content="Chunk 1", source="same_file.md"),
+                    score=0.9,
+                ),
+                MockSearchResult(
+                    chunk=MockChunk(content="Chunk 2", source="same_file.md"),
+                    score=0.8,
+                ),
+            ]
+        )
+        tool = SearchTool(retriever=retriever)
+        result = await tool.execute(query="test")
+        assert result.metadata["sources"] == ["same_file.md"]
+    @pytest.mark.asyncio
+    async def test_respects_top_k(self):
+        # top_k must be forwarded to the retriever and cap the passages returned.
+        retriever = MockRetriever(
+            results=[
+                MockSearchResult(
+                    chunk=MockChunk(content="Chunk 1", source="a.md"),
+                    score=0.9,
+                ),
+                MockSearchResult(
+                    chunk=MockChunk(content="Chunk 2", source="b.md"),
+                    score=0.8,
+                ),
+            ]
+        )
+        tool = SearchTool(retriever=retriever)
+        result = await tool.execute(query="test", top_k=1)
+        assert result.success is True
+        assert "[1] (a.md):" in result.result
+        assert "[2]" not in result.result
+        assert result.metadata["sources"] == ["a.md"]
+    @pytest.mark.asyncio
+    async def test_empty_query(self):
+        tool = SearchTool(retriever=MockRetriever())
+        result = await tool.execute(query="")
+        assert result.success is False
+    @pytest.mark.asyncio
+    async def test_missing_query(self):
+        # Omitting the argument entirely behaves like an empty query.
+        tool = SearchTool(retriever=MockRetriever())
+        result = await tool.execute()
+        assert result.success is False
+        assert result.result == "No query provided"
+    def test_definition_produces_valid_schema(self):
+        tool = SearchTool(retriever=MockRetriever())
+        defn = tool.definition()
+        assert defn.name == "search_documents"
+        assert defn.parameters["type"] == "object"
+        assert "query" in defn.parameters["properties"]
+        assert "query" in defn.parameters["required"]