# Unit_3_Agentic_RAG / websearch.py
# Uses base json.loads instead of the (missing) ddgs json_loads helper.
# Upstream commit: 267b8dd (verified)
#
# https://medium.com/@laurentkubaski/smolagents-duckduckgosearchtool-to-search-in-wikipedia-2578973bb131
#
from __future__ import annotations
import logging
from typing import Any
from urllib.parse import quote
from ddgs import ddgs
from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult
# from ddgs.utils import json_loads
from json import loads as json_loads
from smolagents import DuckDuckGoSearchTool
logger = logging.getLogger(__name__)
class CustomWikipedia(BaseSearchEngine[TextResult]):
    """
    A customized ddgs Wikipedia search engine that returns multiple results.

    The stock ddgs Wikipedia engine caps the opensearch API call at one result
    (``&limit=1``); this subclass drops that cap and fetches an intro extract
    ("body") for every returned title.
    """

    name = "wikipedia"
    category = "text"
    provider = "wikipedia"
    priority = 2
    search_url = "https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={query}"
    search_method = "GET"

    def build_payload(
        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
    ) -> dict[str, Any]:
        """
        Build the request payload for an opensearch query.

        This is mostly a copy-paste of the original method where the
        "&limit=1" query parameter has been removed, so the API returns
        its default number of matches instead of just one.
        """
        # Region is e.g. "us-en"; only the language part selects the wiki host.
        country, lang = region.lower().split("-")
        encoded_query = quote(query)
        self.search_url = (
            f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&profile=fuzzy&search={encoded_query}"
        )
        # Opensearch takes its parameters in the URL, so the payload is empty.
        payload: dict[str, Any] = {}
        self.lang = lang  # used in extract_results_with_body
        return payload

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Parse the opensearch response, enriching each hit with a body extract."""
        return self.extract_results_with_body(html_text)

    def extract_results_with_body(self, html_text: str) -> list[TextResult]:
        """
        This is mostly a copy-paste of the original method except that it now
        loops over ALL results instead of just returning the first one.

        Opensearch responses are positional JSON arrays:
        index 1 = titles, index 3 = urls (index 2, descriptions, is unused here).
        Disambiguation pages (body containing "may refer to:") are skipped.
        """
        json_data = json_loads(html_text)
        if not json_data[1]:
            return []
        results: list[TextResult] = []
        for title, href in zip(json_data[1], json_data[3]):
            result = TextResult()
            result.title = title
            result.href = href
            # Default the body so the disambiguation check below is safe even
            # when the extract request fails or returns an unexpected shape.
            result.body = ""
            # Fetch the page intro extract to use as the result body.
            encoded_query = quote(result.title)
            resp_data = self.request(
                "GET",
                f"https://{self.lang}.wikipedia.org/w/api.php?action=query&format=json&prop=extracts&titles={encoded_query}&explaintext=0&exintro=0&redirects=1",
            )
            if resp_data:
                page_json = json_loads(resp_data)
                try:
                    result.body = list(page_json["query"]["pages"].values())[0]["extract"]
                except (KeyError, IndexError) as ex:
                    # Missing "query"/"pages"/"extract" keys or an empty pages
                    # dict: log and fall back to the empty body.
                    logger.warning("Error getting body from Wikipedia for title=%s: %s", result.title, ex)
            if "may refer to:" not in result.body:
                results.append(result)
        return results
class CustomDuckDuckGoSearchTool(DuckDuckGoSearchTool):
    """
    A customized smolagents DuckDuckGoSearchTool that allows using a single
    search engine (backend), e.g. the CustomWikipedia engine registered below.
    """

    name = "web_search"
    description = "Performs a web search for a query and returns a list of the top search results formatted as markdown with page titles and urls."
    inputs = {"query": {"type": "string", "description": "The search query to perform."}}
    output_type = "string"

    def __init__(self, max_results: int = 10, rate_limit: float | None = 1.0, backend: str = "auto", **kwargs):
        """
        Args:
            max_results: Maximum number of results returned per search.
            rate_limit: Minimum interval between searches (passed to smolagents).
            backend: ddgs backend name; "wikipedia" activates the custom engine.
        """
        super().__init__(max_results=max_results, rate_limit=rate_limit, **kwargs)
        self.backend = backend
        if backend == "wikipedia":
            # Register the multi-result engine so ddgs can dispatch to it by name.
            ddgs.ENGINES["text"]["wikipedia"] = CustomWikipedia

    def forward(self, query: str) -> str:
        """
        This is mostly a copy-paste of the original method with the
        self.backend attribute added to the self.ddgs.text() call.

        Returns:
            Markdown-formatted search results ("## Search Results" section).

        Raises:
            Exception: if the search returns no results.
        """
        self._enforce_rate_limit()
        results = self.ddgs.text(
            query=query,
            max_results=self.max_results,
            backend=self.backend)
        if not results:
            raise Exception("No results found! Try a less restrictive/shorter query.")
        postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
        return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
if __name__ == "__main__":
    # Smoke test: run a Wikipedia-backed search and print the markdown output.
    search_tool = CustomDuckDuckGoSearchTool(max_results=3, rate_limit=1.0, backend="wikipedia")
    print(search_tool(query='Leopard'))