# https://medium.com/@laurentkubaski/smolagents-duckduckgosearchtool-to-search-in-wikipedia-2578973bb131
from __future__ import annotations

import logging
from json import loads as json_loads
from typing import Any
from urllib.parse import quote

from ddgs import ddgs
from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult
from smolagents import DuckDuckGoSearchTool

logger = logging.getLogger(__name__)


class CustomWikipedia(BaseSearchEngine[TextResult]):
    """A customized ddgs Wikipedia search engine that returns multiple results."""

    name = "wikipedia"
    category = "text"
    provider = "wikipedia"
    priority = 2
    search_url = "https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={query}"
    search_method = "GET"

    def build_payload(
        self,
        query: str,
        region: str,
        safesearch: str,
        timelimit: str | None,
        page: int = 1,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Build the request payload for the opensearch call.

        Mostly a copy-paste of the upstream method with the "&limit=1"
        query parameter removed so the API returns multiple results.

        Returns:
            An empty payload dict; the query is encoded directly into
            ``self.search_url``.
        """
        _country, lang = region.lower().split("-")
        encoded_query = quote(query)
        self.search_url = (
            f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&profile=fuzzy&search={encoded_query}"
        )
        self.lang = lang  # used later in extract_results_with_body
        return {}

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Delegate to the multi-result extraction that also fetches bodies."""
        return self.extract_results_with_body(html_text)

    def extract_results_with_body(self, html_text: str) -> list[TextResult]:
        """Parse the opensearch JSON and fetch an extract ("body") per hit.

        Mostly a copy-paste of the upstream method, except that it loops
        over all results instead of returning only the first one.

        Args:
            html_text: Raw opensearch response; a JSON array whose
                element 1 is the list of titles and element 3 the URLs.

        Returns:
            The parsed results, excluding disambiguation pages.
        """
        json_data = json_loads(html_text)
        if not json_data[1]:
            return []
        results: list[TextResult] = []
        for title, href in zip(json_data[1], json_data[3]):
            result = TextResult()
            result.title = title
            result.href = href
            # Initialize body so the disambiguation check below cannot fail
            # when the extract fetch returns nothing or lacks an "extract" key.
            result.body = ""
            encoded_query = quote(result.title)
            resp_data = self.request(
                "GET",
                f"https://{self.lang}.wikipedia.org/w/api.php?action=query&format=json&prop=extracts&titles={encoded_query}&explaintext=0&exintro=0&redirects=1",
            )
            if resp_data:
                page_json = json_loads(resp_data)
                try:
                    result.body = list(page_json["query"]["pages"].values())[0]["extract"]
                except KeyError as ex:
                    logger.warning(
                        "Error getting body from Wikipedia for title=%s: %s", result.title, ex
                    )
            # Skip disambiguation pages, whose extract just lists meanings.
            if "may refer to:" not in result.body:
                results.append(result)
        return results


class CustomDuckDuckGoSearchTool(DuckDuckGoSearchTool):
    """A customized smolagents DuckDuckGoSearchTool that allows using a single search engine."""

    name = "web_search"
    description = "Performs a web search for a query and returns a list of the top search results formatted as markdown with page titles and urls."
    inputs = {"query": {"type": "string", "description": "The search query to perform."}}
    output_type = "string"

    def __init__(
        self,
        max_results: int = 10,
        rate_limit: float | None = 1.0,
        backend: str = "auto",
        **kwargs,
    ):
        """Create the tool.

        Args:
            max_results: Maximum number of search results to return.
            rate_limit: Minimum delay (seconds) between searches, or None.
            backend: ddgs backend name; "wikipedia" registers the custom
                engine defined above.
        """
        super().__init__(max_results=max_results, rate_limit=rate_limit, **kwargs)
        self.backend = backend
        if backend == "wikipedia":
            # Register the custom engine so ddgs can dispatch to it by name.
            ddgs.ENGINES["text"]["wikipedia"] = CustomWikipedia

    def forward(self, query: str) -> str:
        """Run the search and format the hits as markdown.

        Mostly a copy-paste of the upstream method, adding the
        ``self.backend`` attribute to the ``self.ddgs.text()`` call.

        Raises:
            Exception: When the search yields no results.
        """
        self._enforce_rate_limit()
        results = self.ddgs.text(query=query, max_results=self.max_results, backend=self.backend)
        if not results:
            raise Exception("No results found! Try a less restrictive/shorter query.")
        postprocessed_results = [
            f"[{result['title']}]({result['href']})\n{result['body']}" for result in results
        ]
        return "## Search Results\n\n" + "\n\n".join(postprocessed_results)


if __name__ == "__main__":
    tool = CustomDuckDuckGoSearchTool(max_results=3, rate_limit=1.0, backend="wikipedia")
    result = tool(query="Leopard")
    print(result)