Spaces:

CalebMaresca
/

matrix-game-rag

Sleeping

App Files Files Community

matrix-game-rag / wikipedia_tool.py

CalebMaresca

remove unnecessary imports, etc

7d9df5e 10 months ago

raw

history blame contribute delete

8.77 kB

	from langchain_core.tools import BaseTool
	from langchain_core.callbacks import CallbackManagerForToolRun
	from pydantic import BaseModel, Field
	from typing import Optional, Type, Any, Dict, Iterator, List

	from langchain_core.documents import Document
	from pydantic import BaseModel, model_validator

	# Search queries and page titles are sliced to at most this many characters
	# before being passed to the wikipedia client's ``search`` call.
	WIKIPEDIA_MAX_QUERY_LENGTH = 300

	class WikipediaAPIWrapper(BaseModel):
	    """Wrapper around the ``wikipedia`` python package.

	    To use, you should have the ``wikipedia`` python package installed.
	    The wrapper uses the package to run searches, fetch page summaries,
	    fetch full article text, and build ``Document`` objects. Output size
	    is bounded by the ``doc_content_chars_max`` argument of each method.
	    """

	    wiki_client: Any  #: :meta private:

	    @model_validator(mode="before")
	    @classmethod
	    def validate_environment(cls, values: Dict) -> Any:
	        """Import ``wikipedia`` and store the module as ``wiki_client``.

	        The optional ``lang`` entry of ``values`` (default ``"en"``) is
	        passed to ``wikipedia.set_lang``.

	        Raises:
	            ImportError: if the ``wikipedia`` package is not installed.
	        """
	        # Only the import itself is guarded, so an ImportError reported here
	        # always means the package is missing.
	        try:
	            import wikipedia
	        except ImportError as err:
	            raise ImportError(
	                "Could not import wikipedia python package. "
	                "Please install it with `pip install wikipedia`."
	            ) from err

	        lang = values.get("lang", "en")
	        wikipedia.set_lang(lang)
	        values["wiki_client"] = wikipedia
	        return values

	    def search(self, query: str, top_k_results: int = 3, doc_content_chars_max: int = 4000) -> str:
	        """Search Wikipedia and get page summaries.

	        Args:
	            query: Search text; only the first WIKIPEDIA_MAX_QUERY_LENGTH
	                characters are sent to the search call.
	            top_k_results: Maximum number of search hits to summarise.
	            doc_content_chars_max: Cap on the length of the combined
	                result string (applied after all summaries are joined).

	        Returns:
	            The summaries separated by blank lines, or a fixed message when
	            no hit could be loaded.
	        """
	        page_titles = self.wiki_client.search(
	            query[:WIKIPEDIA_MAX_QUERY_LENGTH], results=top_k_results
	        )
	        summaries = []
	        for page_title in page_titles[:top_k_results]:
	            # Hits that fail to load (see _fetch_page) are skipped.
	            if wiki_page := self._fetch_page(page_title):
	                if summary := self._formatted_page_summary(page_title, wiki_page):
	                    summaries.append(summary)
	        if not summaries:
	            return "No good Wikipedia Search Result was found"
	        return "\n\n".join(summaries)[:doc_content_chars_max]

	    def fetch(self, page_title: str, doc_content_chars_max: int = 20000) -> str:
	        """Fetch the article text of the top search hit for ``page_title``.

	        The title is first run through the search call and the first hit
	        is loaded. If the search returns nothing, or the hit cannot be
	        loaded (missing page or disambiguation page), a not-found message
	        is returned instead of raising.

	        Args:
	            page_title: Title to look up; only the first
	                WIKIPEDIA_MAX_QUERY_LENGTH characters are searched.
	            doc_content_chars_max: Cap on the length of the article text.

	        Returns:
	            ``"Page: <title>"`` followed by the truncated article content,
	            or a not-found message.
	        """
	        page_titles = self.wiki_client.search(
	            page_title[:WIKIPEDIA_MAX_QUERY_LENGTH], results=1
	        )
	        # An empty result list previously caused an IndexError on [0].
	        if not page_titles:
	            return f"No Wikipedia page found for '{page_title}'. Try using the search tool."

	        best_title = page_titles[0]
	        if wiki_page := self._fetch_page(best_title):
	            return f"Page: {best_title}\n\n{wiki_page.content[:doc_content_chars_max]}"
	        return f"No Wikipedia page found for '{best_title}'. Try using the search tool."

	    @staticmethod
	    def _formatted_page_summary(page_title: str, wiki_page: Any) -> Optional[str]:
	        """Format a page title and its summary as a two-line text block."""
	        return f"Page: {page_title}\nSummary: {wiki_page.summary}"

	    def _page_to_document(self, page_title: str, wiki_page: Any,
	                          load_all_available_meta: bool = False,
	                          doc_content_chars_max: int = 4000) -> Document:
	        """Build a ``Document`` from a loaded page.

	        The page content is truncated to ``doc_content_chars_max``. The
	        metadata always holds title, summary and source URL; the extended
	        page attributes are added when ``load_all_available_meta`` is true.
	        """
	        main_meta = {
	            "title": page_title,
	            "summary": wiki_page.summary,
	            "source": wiki_page.url,
	        }
	        add_meta = (
	            {
	                "categories": wiki_page.categories,
	                "page_url": wiki_page.url,
	                "image_urls": wiki_page.images,
	                "related_titles": wiki_page.links,
	                "parent_id": wiki_page.parent_id,
	                "references": wiki_page.references,
	                "revision_id": wiki_page.revision_id,
	                "sections": wiki_page.sections,
	            }
	            if load_all_available_meta
	            else {}
	        )
	        doc = Document(
	            page_content=wiki_page.content[:doc_content_chars_max],
	            metadata={
	                **main_meta,
	                **add_meta,
	            },
	        )
	        return doc

	    def _fetch_page(self, page: str) -> Optional[Any]:
	        """Load a page by exact title; return ``None`` if it cannot be loaded.

	        ``PageError`` and ``DisambiguationError`` from the client are
	        swallowed so callers can simply skip the title.
	        """
	        try:
	            return self.wiki_client.page(title=page, auto_suggest=False)
	        except (
	            self.wiki_client.exceptions.PageError,
	            self.wiki_client.exceptions.DisambiguationError,
	        ):
	            return None

	    def load(self, query: str, top_k_results: int = 3,
	             load_all_available_meta: bool = False,
	             doc_content_chars_max: int = 4000) -> List[Document]:
	        """
	        Run Wikipedia search and get the article text plus the meta information.

	        Returns: a list of documents (the materialised output of ``lazy_load``).
	        """
	        return list(self.lazy_load(
	            query,
	            top_k_results=top_k_results,
	            load_all_available_meta=load_all_available_meta,
	            doc_content_chars_max=doc_content_chars_max
	        ))

	    def lazy_load(self, query: str, top_k_results: int = 3,
	                  load_all_available_meta: bool = False,
	                  doc_content_chars_max: int = 4000) -> Iterator[Document]:
	        """
	        Run Wikipedia search and get the article text plus the meta information.

	        Yields: one document per search hit that could be loaded.
	        """
	        page_titles = self.wiki_client.search(
	            query[:WIKIPEDIA_MAX_QUERY_LENGTH], results=top_k_results
	        )
	        for page_title in page_titles[:top_k_results]:
	            if wiki_page := self._fetch_page(page_title):
	                if doc := self._page_to_document(
	                    page_title,
	                    wiki_page,
	                    load_all_available_meta=load_all_available_meta,
	                    doc_content_chars_max=doc_content_chars_max
	                ):
	                    yield doc


	class WikipediaSearchInput(BaseModel):
	    """Input for the Wikipedia search tool."""

	    # Required free-text query.
	    query: str = Field(..., description="The search query to search Wikipedia for.")
	    # Upper bound on how many search hits are summarised.
	    top_k_results: int = Field(3, description="The number of search results to return.")
	    # The search implementation truncates the joined summaries as a whole,
	    # so the description states a total cap rather than a per-document one.
	    doc_content_chars_max: int = Field(4000, description="The maximum total number of characters to return across all summaries combined.")


	class WikipediaFetchInput(BaseModel):
	    """Argument schema for the Wikipedia fetch tool."""

	    # Required: title of the page to retrieve.
	    page_title: str = Field(
	        default=...,
	        description="The title of the Wikipedia page to fetch.",
	    )
	    # Optional: cap on the length of the returned article text.
	    doc_content_chars_max: int = Field(
	        default=20000,
	        description="The maximum number of characters to return for the article.",
	    )


	class WikipediaSearchTool(BaseTool):
	    """Tool that delegates a query to the API wrapper's ``search`` method."""

	    name: str = "wikipedia_search"
	    description: str = (
	        "A wrapper around Wikipedia search. Useful for when you need to "
	        "answer general questions about people, places, companies, facts, "
	        "historical events, or other subjects. Input should be a search "
	        "query. Returns summaries of the top search results."
	    )
	    api_wrapper: WikipediaAPIWrapper

	    args_schema: Type[BaseModel] = WikipediaSearchInput

	    def _run(
	        self,
	        query: str,
	        top_k_results: int = 3,
	        doc_content_chars_max: int = 4000,
	        run_manager: Optional[CallbackManagerForToolRun] = None,
	    ) -> str:
	        """Run the search and return the wrapper's result string.

	        ``run_manager`` is accepted but not used.
	        """
	        search_options = {
	            "top_k_results": top_k_results,
	            "doc_content_chars_max": doc_content_chars_max,
	        }
	        return self.api_wrapper.search(query, **search_options)


	class WikipediaFetchTool(BaseTool):
	    """Tool that delegates a page title to the API wrapper's ``fetch`` method."""

	    name: str = "wikipedia_fetch"
	    description: str = (
	        "Retrieve a specific Wikipedia page by its title. Useful for when you "
	        "need comprehensive information about people, places, companies, "
	        "facts, historical events, or other subjects. This returns the "
	        "complete article text rather than just summary. Returns the closest "
	        "match if the page is ambiguous."
	    )
	    api_wrapper: WikipediaAPIWrapper

	    args_schema: Type[BaseModel] = WikipediaFetchInput

	    def _run(
	        self,
	        page_title: str,
	        doc_content_chars_max: int = 20000,
	        run_manager: Optional[CallbackManagerForToolRun] = None,
	    ) -> str:
	        """Fetch the page and return the wrapper's result string.

	        ``run_manager`` is accepted but not used.
	        """
	        fetch_options = {"doc_content_chars_max": doc_content_chars_max}
	        return self.api_wrapper.fetch(page_title, **fetch_options)


	# Wikipedia Toolkit
	class WikipediaToolkit:
	    """Bundles the Wikipedia search and fetch tools around one shared API wrapper."""

	    def __init__(self):
	        # A single wrapper instance is created here and handed to every tool.
	        self.api_wrapper = WikipediaAPIWrapper()

	    def get_tools(self) -> List[BaseTool]:
	        """Return new search and fetch tool instances, in that order."""
	        tool_classes = (WikipediaSearchTool, WikipediaFetchTool)
	        return [tool_cls(api_wrapper=self.api_wrapper) for tool_cls in tool_classes]


	# Example usage (doc_content_chars_max is an argument of fetch(), not of the constructor):
	# wikipedia = WikipediaAPIWrapper()
	# print(wikipedia.fetch("India Pakistan conflict 2025", doc_content_chars_max=400000))