# Unit_3_Agentic_RAG / websearch.py
# Uses base json.loads instead of the (missing) ddgs json_loads helper.
# Upstream commit: 267b8dd (verified)
#
# https://medium.com/@laurentkubaski/smolagents-duckduckgosearchtool-to-search-in-wikipedia-2578973bb131
#
from __future__ import annotations
import logging
from typing import Any
from urllib.parse import quote
from ddgs import ddgs
from ddgs.base import BaseSearchEngine
from ddgs.results import TextResult
# from ddgs.utils import json_loads
from json import loads as json_loads
from smolagents import DuckDuckGoSearchTool
logger = logging.getLogger(__name__)
class CustomWikipedia(BaseSearchEngine[TextResult]):
    """
    A customized ddgs Wikipedia search engine that returns multiple results.

    The stock ddgs Wikipedia engine caps the opensearch API call at one result
    (``&limit=1``); this subclass drops that cap and fetches an intro extract
    ("body") for every returned title.
    """

    name = "wikipedia"
    category = "text"
    provider = "wikipedia"
    priority = 2
    search_url = "https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={query}"
    search_method = "GET"

    def build_payload(
        self, query: str, region: str, safesearch: str, timelimit: str | None, page: int = 1, **kwargs: Any
    ) -> dict[str, Any]:
        """
        Build the request payload for an opensearch query.

        This is mostly a copy-paste of the original method where the
        "&limit=1" query parameter has been removed, so the API returns
        its default number of matches instead of just one.
        """
        # Region is e.g. "us-en"; only the language part selects the wiki host.
        country, lang = region.lower().split("-")
        encoded_query = quote(query)
        self.search_url = (
            f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&profile=fuzzy&search={encoded_query}"
        )
        # Opensearch takes its parameters in the URL, so the payload is empty.
        payload: dict[str, Any] = {}
        self.lang = lang  # used in extract_results_with_body
        return payload

    def extract_results(self, html_text: str) -> list[TextResult]:
        """Parse the opensearch response, enriching each hit with a body extract."""
        return self.extract_results_with_body(html_text)

    def extract_results_with_body(self, html_text: str) -> list[TextResult]:
        """
        This is mostly a copy-paste of the original method except that it now
        loops over ALL results instead of just returning the first one.

        Opensearch responses are positional JSON arrays:
        index 1 = titles, index 3 = urls (index 2, descriptions, is unused here).
        Disambiguation pages (body containing "may refer to:") are skipped.
        """
        json_data = json_loads(html_text)
        if not json_data[1]:
            return []
        results: list[TextResult] = []
        for title, href in zip(json_data[1], json_data[3]):
            result = TextResult()
            result.title = title
            result.href = href
            # Default the body so the disambiguation check below is safe even
            # when the extract request fails or returns an unexpected shape.
            result.body = ""
            # Fetch the page intro extract to use as the result body.
            encoded_query = quote(result.title)
            resp_data = self.request(
                "GET",
                f"https://{self.lang}.wikipedia.org/w/api.php?action=query&format=json&prop=extracts&titles={encoded_query}&explaintext=0&exintro=0&redirects=1",
            )
            if resp_data:
                page_json = json_loads(resp_data)
                try:
                    result.body = list(page_json["query"]["pages"].values())[0]["extract"]
                except (KeyError, IndexError) as ex:
                    # Missing "query"/"pages"/"extract" keys or an empty pages
                    # dict: log and fall back to the empty body.
                    logger.warning("Error getting body from Wikipedia for title=%s: %s", result.title, ex)
            if "may refer to:" not in result.body:
                results.append(result)
        return results
class CustomDuckDuckGoSearchTool(DuckDuckGoSearchTool):
    """
    A customized smolagents DuckDuckGoSearchTool that allows using a single
    search engine (backend), e.g. the CustomWikipedia engine registered below.
    """

    name = "web_search"
    description = "Performs a web search for a query and returns a list of the top search results formatted as markdown with page titles and urls."
    inputs = {"query": {"type": "string", "description": "The search query to perform."}}
    output_type = "string"

    def __init__(self, max_results: int = 10, rate_limit: float | None = 1.0, backend: str = "auto", **kwargs):
        """
        Args:
            max_results: Maximum number of results returned per search.
            rate_limit: Minimum interval between searches (passed to smolagents).
            backend: ddgs backend name; "wikipedia" activates the custom engine.
        """
        super().__init__(max_results=max_results, rate_limit=rate_limit, **kwargs)
        self.backend = backend
        if backend == "wikipedia":
            # Register the multi-result engine so ddgs can dispatch to it by name.
            ddgs.ENGINES["text"]["wikipedia"] = CustomWikipedia

    def forward(self, query: str) -> str:
        """
        This is mostly a copy-paste of the original method with the
        self.backend attribute added to the self.ddgs.text() call.

        Returns:
            Markdown-formatted search results ("## Search Results" section).

        Raises:
            Exception: if the search returns no results.
        """
        self._enforce_rate_limit()
        results = self.ddgs.text(
            query=query,
            max_results=self.max_results,
            backend=self.backend)
        if not results:
            raise Exception("No results found! Try a less restrictive/shorter query.")
        postprocessed_results = [f"[{result['title']}]({result['href']})\n{result['body']}" for result in results]
        return "## Search Results\n\n" + "\n\n".join(postprocessed_results)
if __name__ == "__main__":
    # Smoke test: run a Wikipedia-backed search and print the markdown output.
    search_tool = CustomDuckDuckGoSearchTool(max_results=3, rate_limit=1.0, backend="wikipedia")
    print(search_tool(query='Leopard'))