ml-intern

Sleeping

Aksel Joonas Reedi committed on Apr 27

Commit

72f615f

unverified ·

1 Parent(s): bce8a45

Add web search so the agent can cite current sources (#159)

The agent had research tools for papers, docs, and repositories, but no direct current-web lookup. This ports Claw Code's WebSearch behavior into a Python tool that searches DuckDuckGo HTML, extracts citeable title/URL hits, applies domain filters, and returns the same JSON-shaped result payload for model consumption.

Constraint: The implementation was prepared in a writable checkout because the primary working tree was sandbox read-only during this session.

Rejected: Add a new search API dependency | the Claw implementation uses the DuckDuckGo HTML endpoint and the repo already has requests.

Confidence: medium

Scope-risk: narrow

Directive: Keep the output schema stable unless the agent prompt/tool consumers are updated together.

Tested: pytest tests/unit/test_web_search_tool.py tests/unit/test_malformed_args_recovery.py -q

Tested: python -m compileall agent/tools/web_search_tool.py agent/core/tools.py agent/tools/research_tool.py agent/tools/__init__.py tests/unit/test_web_search_tool.py

Not-tested: Live DuckDuckGo network response beyond mocked HTML parser coverage.

Files changed (5) hide show

agent/core/tools.py +7 -0
agent/tools/__init__.py +3 -0
agent/tools/research_tool.py +4 -1
agent/tools/web_search_tool.py +273 -0
tests/unit/test_web_search_tool.py +161 -0

agent/core/tools.py CHANGED Viewed

@@ -51,6 +51,7 @@ from agent.tools.papers_tool import HF_PAPERS_TOOL_SPEC, hf_papers_handler
 from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
 from agent.tools.research_tool import RESEARCH_TOOL_SPEC, research_handler
 from agent.tools.sandbox_tool import get_sandbox_tools
 # NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
 # from agent.tools.private_hf_repo_tools import (
@@ -311,6 +312,12 @@ def create_builtin_tools(local_mode: bool = False) -> list[ToolSpec]:
             parameters=HF_PAPERS_TOOL_SPEC["parameters"],
             handler=hf_papers_handler,
         ),
         # Dataset inspection tool (unified)
         ToolSpec(
             name=HF_INSPECT_DATASET_TOOL_SPEC["name"],

 from agent.tools.plan_tool import PLAN_TOOL_SPEC, plan_tool_handler
 from agent.tools.research_tool import RESEARCH_TOOL_SPEC, research_handler
 from agent.tools.sandbox_tool import get_sandbox_tools
+from agent.tools.web_search_tool import WEB_SEARCH_TOOL_SPEC, web_search_handler
 # NOTE: Private HF repo tool disabled - replaced by hf_repo_files and hf_repo_git
 # from agent.tools.private_hf_repo_tools import (
             parameters=HF_PAPERS_TOOL_SPEC["parameters"],
             handler=hf_papers_handler,
         ),
+        ToolSpec(
+            name=WEB_SEARCH_TOOL_SPEC["name"],
+            description=WEB_SEARCH_TOOL_SPEC["description"],
+            parameters=WEB_SEARCH_TOOL_SPEC["parameters"],
+            handler=web_search_handler,
+        ),
         # Dataset inspection tool (unified)
         ToolSpec(
             name=HF_INSPECT_DATASET_TOOL_SPEC["name"],

agent/tools/__init__.py CHANGED Viewed

@@ -20,6 +20,7 @@ from agent.tools.github_read_file import (
 )
 from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
 from agent.tools.types import ToolResult
 __all__ = [
     "ToolResult",
@@ -36,4 +37,6 @@ __all__ = [
     "github_search_code_handler",
     "HF_INSPECT_DATASET_TOOL_SPEC",
     "hf_inspect_dataset_handler",
 ]

 )
 from agent.tools.jobs_tool import HF_JOBS_TOOL_SPEC, HfJobsTool, hf_jobs_handler
 from agent.tools.types import ToolResult
+from agent.tools.web_search_tool import WEB_SEARCH_TOOL_SPEC, web_search_handler
 __all__ = [
     "ToolResult",
     "github_search_code_handler",
     "HF_INSPECT_DATASET_TOOL_SPEC",
     "hf_inspect_dataset_handler",
+    "WEB_SEARCH_TOOL_SPEC",
+    "web_search_handler",
 ]

agent/tools/research_tool.py CHANGED Viewed

@@ -37,6 +37,7 @@ RESEARCH_TOOL_NAMES = {
     "github_find_examples",
     "github_list_repos",
     "github_read_file",
     "hf_inspect_dataset",
     "hf_repo_files",
 }
@@ -102,6 +103,8 @@ tell you what actually works.
 - `explore_hf_docs(endpoint)`: Search docs for a library. Endpoints: trl, transformers, datasets, peft, accelerate, trackio, vllm, inference-endpoints, etc.
 - `fetch_hf_docs(url)`: Fetch full page content from explore results
 - `find_hf_api(query=..., tag=...)`: Find REST API endpoints
 ## Hub repo inspection
 - `hf_repo_files`: List/read files in any HF repo (model, dataset, space)
@@ -426,7 +429,7 @@ async def research_handler(
                 await _log(f"▸ {tool_name}  {args_str}")
                 output, _success = await session.tool_router.call_tool(
-                    tool_name, tool_args, session=session
                 )
                 _tool_uses += 1
                 await _log(f"tools:{_tool_uses}")

     "github_find_examples",
     "github_list_repos",
     "github_read_file",
+    "web_search",
     "hf_inspect_dataset",
     "hf_repo_files",
 }
 - `explore_hf_docs(endpoint)`: Search docs for a library. Endpoints: trl, transformers, datasets, peft, accelerate, trackio, vllm, inference-endpoints, etc.
 - `fetch_hf_docs(url)`: Fetch full page content from explore results
 - `find_hf_api(query=..., tag=...)`: Find REST API endpoints
+- `web_search(query=..., allowed_domains=[...], blocked_domains=[...])`:
+  Search the current web when papers/docs/GitHub are not enough.
 ## Hub repo inspection
 - `hf_repo_files`: List/read files in any HF repo (model, dataset, space)
                 await _log(f"▸ {tool_name}  {args_str}")
                 output, _success = await session.tool_router.call_tool(
+                    tool_name, tool_args, session=session, tool_call_id=tc.id
                 )
                 _tool_uses += 1
                 await _log(f"tools:{_tool_uses}")

agent/tools/web_search_tool.py ADDED Viewed

	@@ -0,0 +1,273 @@

+"""DuckDuckGo HTML web search tool.
+This mirrors Claw Code's Rust WebSearch behavior: fetch DuckDuckGo's HTML
+endpoint, extract result links, optionally filter domains, and return a
+JSON payload the model can cite.
+"""
+from __future__ import annotations
+import asyncio
+import html
+import json
+import os
+import time
+from dataclasses import dataclass
+from html.parser import HTMLParser
+from typing import Any
+from urllib.parse import parse_qsl, parse_qs, urlencode, urlparse, urlunparse
+import requests
# Endpoint queried when the override environment variable is not set.
DEFAULT_SEARCH_URL: str = "https://html.duckduckgo.com/html/"
# Name of the environment variable that can replace the search base URL.
WEB_SEARCH_BASE_URL_ENV: str = "CLAWD_WEB_SEARCH_BASE_URL"
# Value sent in the User-Agent header of the search request.
USER_AGENT: str = "clawd-rust-tools/0.1"
# Timeout handed to ``requests.get`` for the search request.
REQUEST_TIMEOUT_SECONDS: int = 20
# Upper bound on the number of hits kept after filtering and de-duplication.
MAX_RESULTS: int = 8
@dataclass(frozen=True)
class SearchHit:
    """One citeable search result: the link text and the URL it points to."""

    # Visible text of the result link.
    title: str
    # Target address of the result.
    url: str

    def as_json(self) -> dict[str, str]:
        """Return the hit as a plain mapping with ``title`` and ``url`` keys."""
        payload = {"title": self.title}
        payload["url"] = self.url
        return payload
+class _AnchorParser(HTMLParser):
+    def __init__(self, *, require_result_class: bool) -> None:
+        super().__init__(convert_charrefs=True)
+        self.require_result_class = require_result_class
+        self.hits: list[tuple[str, str]] = []
+        self._active_href: str | None = None
+        self._active_text: list[str] = []
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        if tag.lower() != "a":
+            return
+        attr_map = {key.lower(): value or "" for key, value in attrs}
+        href = attr_map.get("href")
+        if not href:
+            return
+        if self.require_result_class and "result__a" not in attr_map.get("class", ""):
+            return
+        self._active_href = href
+        self._active_text = []
+    def handle_data(self, data: str) -> None:
+        if self._active_href is not None:
+            self._active_text.append(data)
+    def handle_entityref(self, name: str) -> None:
+        if self._active_href is not None:
+            self._active_text.append(f"&{name};")
+    def handle_charref(self, name: str) -> None:
+        if self._active_href is not None:
+            self._active_text.append(f"&#{name};")
+    def handle_endtag(self, tag: str) -> None:
+        if tag.lower() != "a" or self._active_href is None:
+            return
+        title = collapse_whitespace(html.unescape("".join(self._active_text))).strip()
+        self.hits.append((self._active_href, title))
+        self._active_href = None
+        self._active_text = []
def build_search_url(query: str) -> str:
    """Return the search URL for *query*.

    The base URL is read from the environment variable named by
    ``WEB_SEARCH_BASE_URL_ENV`` and falls back to ``DEFAULT_SEARCH_URL``.
    Query parameters already present on the base URL are kept and ``q`` is
    appended after them.

    Raises:
        ValueError: If the base URL is not an http(s) URL with a host part.
    """
    base_url = os.environ.get(WEB_SEARCH_BASE_URL_ENV, DEFAULT_SEARCH_URL)
    parts = urlparse(base_url)
    is_http = parts.scheme in ("http", "https")
    if not (is_http and parts.netloc):
        raise ValueError(f"invalid search base URL: {base_url}")
    params = [*parse_qsl(parts.query, keep_blank_values=True), ("q", query)]
    encoded = urlencode(params)
    return urlunparse(parts._replace(query=encoded))
def collapse_whitespace(value: str) -> str:
    """Replace every whitespace run in *value* with one space and trim the ends."""
    words = value.split()
    return " ".join(words)
def decode_duckduckgo_redirect(url: str) -> str | None:
    """Resolve a result href to the URL it ultimately points at.

    *url* is expected to be an attribute value that has already been
    entity-decoded (as ``HTMLParser`` delivers it). No further HTML
    unescaping is applied: ``parse_qs`` already percent-decodes the redirect
    target, and unescaping again would corrupt query strings that contain
    legacy entity names, e.g. ``&copy=2`` would become ``©=2``.

    Args:
        url: The href of a result anchor.

    Returns:
        * the href unchanged when it is already an absolute http(s) URL;
        * the ``uddg`` target when the href is a ``/l/`` redirect link;
        * the href made absolute (``https:`` for protocol-relative links,
          ``https://duckduckgo.com`` for root-relative ones) otherwise;
        * ``None`` when the href has any other form.
    """
    if url.startswith(("http://", "https://")):
        return url
    if url.startswith("//"):
        joined = f"https:{url}"
    elif url.startswith("/"):
        joined = f"https://duckduckgo.com{url}"
    else:
        return None
    parsed = urlparse(joined)
    if parsed.path in {"/l", "/l/"}:
        # Redirect links carry the real destination in the "uddg" parameter.
        targets = parse_qs(parsed.query).get("uddg")
        if targets:
            return targets[0]
    return joined
def _extract_links(search_html: str, *, require_result_class: bool) -> list[SearchHit]:
    """Parse *search_html* and return its anchors as ``SearchHit`` objects.

    Anchors with an empty title are skipped, as are anchors whose href does
    not resolve to an http(s) URL.

    Args:
        search_html: Markup of the search response.
        require_result_class: Forwarded to ``_AnchorParser``; restricts the
            scan to anchors carrying the ``result__a`` class.
    """
    anchor_parser = _AnchorParser(require_result_class=require_result_class)
    anchor_parser.feed(search_html)
    found: list[SearchHit] = []
    for href, text in anchor_parser.hits:
        if not text:
            continue
        target = decode_duckduckgo_redirect(href)
        if not target:
            continue
        if not (target.startswith("http://") or target.startswith("https://")):
            continue
        found.append(SearchHit(title=text, url=target))
    return found
def extract_search_hits(search_html: str) -> list[SearchHit]:
    """Return hits taken only from anchors marked with the ``result__a`` class."""
    marked_hits = _extract_links(search_html, require_result_class=True)
    return marked_hits
def extract_search_hits_from_generic_links(search_html: str) -> list[SearchHit]:
    """Return hits taken from every anchor in the page, whatever its class."""
    any_anchor_hits = _extract_links(search_html, require_result_class=False)
    return any_anchor_hits
def normalize_domain_filter(domain: str) -> str:
    """Reduce a domain filter entry to a bare lowercase host name.

    An entry that parses as a URL with both a scheme and a host is replaced
    by that host; anything else is used as given. Surrounding whitespace,
    leading dots and trailing slashes are then removed.
    """
    text = domain.strip()
    parts = urlparse(text)
    host = text
    if parts.scheme and parts.hostname:
        host = parts.hostname
    cleaned = host.strip()
    cleaned = cleaned.lstrip(".")
    cleaned = cleaned.rstrip("/")
    return cleaned.lower()
def host_matches_list(url: str, domains: list[str]) -> bool:
    """Tell whether the host of *url* equals, or is a subdomain of, any entry.

    Each entry of *domains* is normalized with ``normalize_domain_filter``;
    entries that normalize to an empty string never match. A URL without a
    host never matches.
    """
    hostname = urlparse(url).hostname
    if not hostname:
        return False
    target = hostname.lower()
    candidates = (normalize_domain_filter(entry) for entry in domains)
    return any(
        bool(name) and (target == name or target.endswith(f".{name}"))
        for name in candidates
    )
def dedupe_hits(hits: list[SearchHit]) -> list[SearchHit]:
    """Return *hits* with later duplicates of the same URL removed.

    The first hit seen for each URL is kept and the original order is
    preserved.
    """
    first_by_url: dict[str, SearchHit] = {}
    for candidate in hits:
        # setdefault keeps the earliest hit; dicts preserve insertion order.
        first_by_url.setdefault(candidate.url, candidate)
    return list(first_by_url.values())
def execute_web_search(
    query: str,
    allowed_domains: list[str] | None = None,
    blocked_domains: list[str] | None = None,
    tool_use_id: str = "web_search_1",
) -> dict[str, Any]:
    """Run one web search and package the hits for the model.

    The search page is fetched with a blocking ``requests.get``. Anchors
    marked as results are preferred; when none are found, every anchor on the
    page is considered instead. Hits are then filtered by the allow/block
    lists, de-duplicated by URL and capped at ``MAX_RESULTS``.

    Args:
        query: Search terms.
        allowed_domains: When not ``None``, keep only hits on these domains
            (an empty list therefore keeps nothing).
        blocked_domains: When not ``None``, drop hits on these domains.
        tool_use_id: Identifier echoed back inside the result payload.

    Returns:
        A dict with ``query``, ``results`` (a summary string followed by a
        dict holding ``tool_use_id`` and the hit list under ``content``) and
        ``durationSeconds``.

    Raises:
        ValueError: If the configured search base URL is invalid.

    Exceptions raised by ``requests.get`` are not handled here. The HTTP
    status of the response is not inspected.
    """
    start_time = time.monotonic()
    request_url = build_search_url(query)
    response = requests.get(
        request_url,
        headers={"User-Agent": USER_AGENT},
        timeout=REQUEST_TIMEOUT_SECONDS,
        allow_redirects=True,
    )
    page = response.text

    hits = extract_search_hits(page)
    if not hits and urlparse(response.url or request_url).hostname:
        # No marked result anchors: fall back to scanning every link.
        hits = extract_search_hits_from_generic_links(page)

    def is_permitted(hit: SearchHit) -> bool:
        if allowed_domains is not None and not host_matches_list(hit.url, allowed_domains):
            return False
        if blocked_domains is not None and host_matches_list(hit.url, blocked_domains):
            return False
        return True

    hits = dedupe_hits([hit for hit in hits if is_permitted(hit)])[:MAX_RESULTS]

    if not hits:
        summary = f"No web search results matched the query {query!r}."
    else:
        summary = "\n".join(
            [
                f"Search results for {query!r}. Include a Sources section in the final answer.",
                *(f"- [{hit.title}]({hit.url})" for hit in hits),
            ]
        )

    citations = {
        "tool_use_id": tool_use_id,
        "content": [hit.as_json() for hit in hits],
    }
    return {
        "query": query,
        "results": [summary, citations],
        "durationSeconds": time.monotonic() - start_time,
    }
# Tool definition for ``web_search``: its name, description and the JSON
# schema of the arguments accepted by the handler.
WEB_SEARCH_TOOL_SPEC = {
    "name": "web_search",
    "description": "Search the web for current information and return cited results.",
    "parameters": {
        "type": "object",
        "properties": {
            # The only mandatory argument; at least two characters long.
            "query": {"type": "string", "minLength": 2},
            # Keep only hits hosted on one of these domains.
            "allowed_domains": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Optional allowlist of domains or URLs. Subdomains match.",
            },
            # Drop hits hosted on any of these domains.
            "blocked_domains": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Optional blocklist of domains or URLs. Subdomains match.",
            },
        },
        "required": ["query"],
        # Unknown arguments are rejected by the schema.
        "additionalProperties": False,
    },
}
+def _optional_string_list(arguments: dict[str, Any], key: str) -> list[str] | None:
+    value = arguments.get(key)
+    if value is None:
+        return None
+    if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
+        raise ValueError(f"{key} must be an array of strings")
+    return value
async def web_search_handler(
    arguments: dict[str, Any],
    session: Any = None,
    tool_call_id: str | None = None,
    **_kw: Any,
) -> tuple[str, bool]:
    """Validate tool arguments, run the search in a worker thread, return JSON.

    Args:
        arguments: Tool-call arguments; ``query`` is required, while
            ``allowed_domains`` and ``blocked_domains`` are optional.
        session: Accepted but not used here.
        tool_call_id: Forwarded as ``tool_use_id``; ``"web_search_1"`` is
            used when it is missing or empty.
        **_kw: Extra keyword arguments are ignored.

    Returns:
        ``(text, ok)``: on success, the indented JSON payload and ``True``;
        otherwise an error message and ``False``. Exceptions are reported
        through the message, never raised.
    """
    raw_query = arguments.get("query", "")
    if not isinstance(raw_query, str):
        return "Error: web_search requires a query string with at least 2 characters.", False

    query = raw_query.strip()
    if len(query) < 2:
        return "Error: web_search requires a query with at least 2 characters.", False

    try:
        allowed = _optional_string_list(arguments, "allowed_domains")
        blocked = _optional_string_list(arguments, "blocked_domains")
        # The search uses blocking HTTP, so keep it off the event loop.
        payload = await asyncio.to_thread(
            execute_web_search,
            query=query,
            allowed_domains=allowed,
            blocked_domains=blocked,
            tool_use_id=tool_call_id or "web_search_1",
        )
    except Exception as exc:
        return f"Error executing web search: {exc}", False

    rendered = json.dumps(payload, indent=2)
    return rendered, True

tests/unit/test_web_search_tool.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import json
+import pytest
+from agent.core.tools import create_builtin_tools
+from agent.tools import web_search_tool
+class _FakeResponse:
+    def __init__(self, text: str, url: str = "https://html.duckduckgo.com/html/?q=x"):
+        self.text = text
+        self.url = url
+def _content_block(output: dict):
+    return next(item for item in output["results"] if isinstance(item, dict))["content"]
def test_web_search_extracts_duckduckgo_results_and_filters_domains(monkeypatch):
    """Marked result links are extracted and the allow/block lists are applied."""
    captured = {}

    def fake_get(url, headers, timeout, allow_redirects):
        # Record exactly what the tool sent so it can be asserted below.
        captured["url"] = url
        captured["user_agent"] = headers["User-Agent"]
        captured["timeout"] = timeout
        captured["allow_redirects"] = allow_redirects
        return _FakeResponse(
            """
            <html><body>
              <a class="result__a" href="https://docs.rs/reqwest">Reqwest docs</a>
              <a class="result__a" href="https://example.com/blocked">Blocked result</a>
            </body></html>
            """,
            url,
        )

    monkeypatch.setenv(web_search_tool.WEB_SEARCH_BASE_URL_ENV, "http://search.test/search")
    monkeypatch.setattr(web_search_tool.requests, "get", fake_get)

    output = web_search_tool.execute_web_search(
        "rust web search",
        allowed_domains=["https://DOCS.rs/"],
        blocked_domains=["HTTPS://EXAMPLE.COM"],
    )

    expected_request = {
        "url": "http://search.test/search?q=rust+web+search",
        "user_agent": "clawd-rust-tools/0.1",
        "timeout": 20,
        "allow_redirects": True,
    }
    assert captured == expected_request
    assert output["query"] == "rust web search"
    expected_hits = [{"title": "Reqwest docs", "url": "https://docs.rs/reqwest"}]
    assert _content_block(output) == expected_hits
    summary = output["results"][0]
    assert "Include a Sources section" in summary
def test_web_search_decodes_duckduckgo_redirects():
    """A ``/l/?uddg=`` redirect href is resolved to its real destination."""
    expected = web_search_tool.SearchHit(
        title="Example Paper",
        url="https://example.org/paper?x=1",
    )
    found = web_search_tool.extract_search_hits(
        """
        <a class="result__a"
           href="/l/?uddg=https%3A%2F%2Fexample.org%2Fpaper%3Fx%3D1&amp;rut=abc">
          Example Paper
        </a>
        """
    )
    assert found == [expected]
def test_web_search_generic_fallback_dedupes_and_rejects_bad_base_url(monkeypatch):
    """Unmarked links are used as a fallback, de-duplicated; a bad base URL raises."""
    base_url_env = web_search_tool.WEB_SEARCH_BASE_URL_ENV

    def fake_get(url, headers, timeout, allow_redirects):
        return _FakeResponse(
            """
            <html><body>
              <a href="https://example.com/one">Example One</a>
              <a href="https://example.com/one">Duplicate Example One</a>
              <a href="https://docs.rs/tokio">Tokio Docs</a>
            </body></html>
            """,
            url,
        )

    monkeypatch.setenv(base_url_env, "http://search.test/fallback")
    monkeypatch.setattr(web_search_tool.requests, "get", fake_get)

    output = web_search_tool.execute_web_search("generic links")
    expected_hits = [
        {"title": "Example One", "url": "https://example.com/one"},
        {"title": "Tokio Docs", "url": "https://docs.rs/tokio"},
    ]
    assert _content_block(output) == expected_hits

    # A base URL without a usable scheme/host must be rejected up front.
    monkeypatch.setenv(base_url_env, "://bad-base-url")
    with pytest.raises(ValueError):
        web_search_tool.execute_web_search("generic links")
@pytest.mark.asyncio
async def test_web_search_handler_returns_pretty_json(monkeypatch):
    """The handler validates arguments and returns parseable JSON on success."""
    recorded_calls = []

    async def fake_to_thread(func, /, *args, **kwargs):
        recorded_calls.append((func, args, kwargs))
        return func(*args, **kwargs)

    def fake_execute(**kwargs):
        return {
            "query": kwargs["query"],
            "results": ["No web search results matched the query 'x'.", {"content": []}],
            "durationSeconds": 0.1,
        }

    monkeypatch.setattr(web_search_tool, "execute_web_search", fake_execute)
    monkeypatch.setattr(web_search_tool.asyncio, "to_thread", fake_to_thread)
    handler = web_search_tool.web_search_handler

    # Query shorter than two characters is rejected.
    text, ok = await handler({"query": "x"})
    assert ok is False
    assert "at least 2 characters" in text

    # Valid query: JSON payload, executed via to_thread with the call id.
    text, ok = await handler({"query": "valid query"}, tool_call_id="call_123")
    assert ok is True
    parsed = json.loads(text)
    assert parsed["query"] == "valid query"
    first_func, _first_args, first_kwargs = recorded_calls[0]
    assert first_func is web_search_tool.execute_web_search
    assert first_kwargs["tool_use_id"] == "call_123"

    # A bare string is not accepted where a list of strings is expected.
    text, ok = await handler({"query": "valid query", "allowed_domains": "docs.rs"})
    assert ok is False
    assert "allowed_domains must be an array of strings" in text

    # A non-string query is rejected.
    text, ok = await handler({"query": None})
    assert ok is False
    assert "query string" in text
def test_web_search_is_registered_for_llm():
    """``web_search`` is among the built-in tools and requires only ``query``."""
    registry = {spec.name: spec for spec in create_builtin_tools(local_mode=True)}
    assert "web_search" in registry
    web_search_spec = registry["web_search"]
    assert web_search_spec.parameters["required"] == ["query"]