import os
import sys
import json

import wikipedia
from pydantic import BaseModel

from agentflow.tools.base import BaseTool
from agentflow.engine.factory import create_llm_engine
from agentflow.tools.web_search.tool import Web_Search_Tool

# Tool name mapping - this defines the external name for this tool
TOOL_NAME = "Wikipedia_RAG_Search_Tool"

LIMITATION = f"""
{TOOL_NAME} has the following limitations:
1. It is designed specifically for retrieving grounded information from Wikipedia pages only.
2. Filtering of relevant pages depends on LLM performance and may not always select the optimal pages.
3. The accuracy of the returned information depends on the quality of the underlying Wikipedia content.
"""

BEST_PRACTICE = f"""
For optimal results with {TOOL_NAME}:
1. Use specific, targeted queries rather than broad or ambiguous questions.
2. The tool automatically filters for relevant pages using LLM-based selection - trust the "relevant_pages" results.
3. If the initial results are insufficient, examine the "other_pages" section for additional potentially relevant content.
4. Use this tool as part of a multi-step research process rather than as a single source of truth.
5. Follow the returned URLs (e.g., with a web search / RAG tool) to retrieve more information from the pages.
"""


class Select_Relevant_Queries(BaseModel):
    matched_queries: list[str]
    matched_query_ids: list[int]


def select_relevant_queries(original_query: str, query_candidates: list[str], llm_engine):
    # Format the candidates as a numbered list so the LLM can reference them by id.
    candidates_text = "\n".join([f"{i}. {query}" for i, query in enumerate(query_candidates)])
    prompt = f"""
You are an expert AI assistant. Your task is to identify and select the most relevant queries from a list of Wikipedia search results that are most likely to address the user's original question.

## Input

Original Query: `{original_query}`

Query Candidates from Wikipedia Search: `{candidates_text}`

## Instructions

1. Carefully read the original query and the list of query candidates.
2. Select the query candidates that are most relevant to the original query, i.e., those most likely to contain the information needed to answer the question.
3. Return the most relevant queries. If you think multiple queries are helpful, you can return up to 3 queries.
4. Return your output in the following format:
```
- Matched Queries: <list of matched queries>
- Matched Query IDs: <list of matched query ids>. Please make sure the ids are integers, and do not return an empty list.
```

## Examples

Original Query: What is the capital of France?
Query Candidates from Wikipedia Search:
0. Closed-ended question
1. France
2. What Is a Nation?
3. Capital city
4. London
5. WhatsApp
6. French Revolution
7. Communes of France
8. Capital punishment
9. Louis XIV
Output:
- Matched Queries: France
- Matched Query IDs: [1]

Original Query: What is the mass of the moon?
Query Candidates from Wikipedia Search:
0. Moon
1. Planetary-mass moon
2. What If the Moon Didn't Exist
3. Earth mass
4. Moon landing
5. Mass
6. Colonization of the Moon
7. Planetary mass
8. Hollow Moon
9. Gravitation of the Moon
Output:
- Matched Queries: Moon, Planetary-mass moon
- Matched Query IDs: [0, 1]
"""
    try:
        # Ask the engine for structured output matching the pydantic schema.
        response = llm_engine.generate(prompt, response_format=Select_Relevant_Queries)
        matched_queries = response.matched_queries
        matched_query_ids = [int(i) for i in response.matched_query_ids]
        return matched_queries, matched_query_ids
    except Exception as e:
        print(f"Error selecting relevant queries: {e}")
        return [], []
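
# Illustrative behavior of select_relevant_queries (a sketch; the actual
# selection depends on the LLM engine, which must support pydantic
# response_format structured output):
#
#   queries, ids = select_relevant_queries(
#       "What is the capital of France?",
#       ["Closed-ended question", "France", "Capital city"],
#       llm_engine,
#   )
#   # expected roughly: queries == ["France"], ids == [1]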


class Wikipedia_Search_Tool(BaseTool):
    def __init__(self, model_string="gpt-4o-mini"):
        super().__init__(
            tool_name=TOOL_NAME,
            tool_description="A tool that searches Wikipedia and returns relevant pages with their page titles, URLs, abstracts, and retrieved information based on a given query.",
            tool_version="1.0.0",
            input_types={
                "query": "str - The search query for Wikipedia."
            },
            output_type="dict - A dictionary containing the search results: all matching pages with their content, URLs, and metadata.",
            demo_commands=[
                {
                    "command": 'execution = tool.execute(query="What is the exact mass in kg of the moon")',
                    "description": "Search Wikipedia and get information about the mass of the moon."
                },
                {
                    "command": 'execution = tool.execute(query="Function of human kidney")',
                    "description": "Search Wikipedia and get information about the function of the human kidney."
                },
                {
                    "command": 'execution = tool.execute(query="When was the first moon landing?")',
                    "description": "Search Wikipedia and get information about the first moon landing."
                }
            ],
            user_metadata={
                "limitation": LIMITATION,
                "best_practice": BEST_PRACTICE
            }
        )
        self.model_string = model_string
        # Deterministic decoding settings so page selection is reproducible.
        self.llm_engine = create_llm_engine(model_string=model_string, temperature=0.0, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0)

    def _get_wikipedia_url(self, query):
        """
        Build the English Wikipedia URL for a given page title.
        """
        query = query.replace(" ", "_")  # Wikipedia URLs use underscores instead of spaces
        return f"https://en.wikipedia.org/wiki/{query}"

    def search_wikipedia(self, query, max_length=100, max_pages=10):
        """
        Searches Wikipedia for the given query and returns multiple pages with their text and URLs.

        Parameters:
            query (str): The search query for Wikipedia.
            max_length (int): Maximum number of characters of page content to keep per page
                (-1 keeps the full text).
            max_pages (int): Maximum number of search results to process.

        Returns:
            list: A list of dictionaries, one per page, with keys "title", "url",
                and "abstract" (plus "error" when the search itself fails).
        """
        try:
            search_results = wikipedia.search(query)
            if not search_results:
                return [{"title": None, "url": None, "abstract": None, "error": f"No results found for query: {query}"}]

            pages_data = []
            pages_to_process = search_results[:max_pages] if max_pages else search_results

            # Fetch the content and URL for each candidate page.
            for title in pages_to_process:
                try:
                    page = wikipedia.page(title)
                    text = page.content
                    url = page.url
                    if max_length != -1:
                        text = (text[:max_length] + "... [truncated]") if len(text) > max_length else text
                    pages_data.append({
                        "title": title,
                        "url": url,
                        "abstract": text
                    })
                except Exception:
                    # Page lookup can fail (e.g., on disambiguation pages); fall back to a constructed URL.
                    pages_data.append({
                        "title": title,
                        "url": self._get_wikipedia_url(title),
                        "abstract": "Please use the URL to get the full text further if needed.",
                    })
            return pages_data
        except Exception as e:
            return [{"title": None, "url": None, "abstract": None, "error": f"Error searching Wikipedia: {str(e)}"}]

    def execute(self, query):
        """
        Searches Wikipedia based on the provided query and returns all matching pages.

        Parameters:
            query (str): The search query for Wikipedia.

        Returns:
            dict: A dictionary containing the search results and all matching pages with their content.
        """
        # Check if the OpenAI API key is set
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            sys.exit("[Wikipedia RAG Search] Error: OPENAI_API_KEY environment variable is not set.")

        # First, get candidate pages from the Wikipedia search results
        search_results = self.search_wikipedia(query)

        # Get the titles of the pages
        titles = [page["title"] for page in search_results if page["title"] is not None]
        if not titles:
            return {"query": query, "relevant_pages": [], "other_pages (may be irrelevant to the query)": search_results}

        # Select the most relevant pages
        matched_queries, matched_query_ids = select_relevant_queries(query, titles, self.llm_engine)
        # Guard against out-of-range ids coming back from the LLM
        matched_query_ids = [i for i in matched_query_ids if 0 <= i < len(search_results)]

        # Only process the most relevant pages
        pages_data = [search_results[i] for i in matched_query_ids]
        other_pages = [search_results[i] for i in range(len(search_results)) if i not in matched_query_ids]

        # For each relevant page, retrieve detailed information using the web search (RAG) tool
        try:
            web_rag_tool = Web_Search_Tool(model_string=self.model_string)
        except Exception as e:
            print(f"Error creating Web RAG tool: {e}")
            return {"query": query, "relevant_pages": [], "other_pages (may be irrelevant to the query)": search_results}

        for page in pages_data:
            url = page["url"]
            if url is None:
                continue
            try:
                execution = web_rag_tool.execute(query=query, url=url)
                page["retrieved_information"] = execution
            except Exception:
                page["retrieved_information"] = None

        return {
            "query": query,
            "relevant_pages (to the query)": pages_data,
            "other_pages (may be irrelevant to the query)": other_pages
        }
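
    # Shape of a typical execute() return value (illustrative;
    # "retrieved_information" holds whatever Web_Search_Tool.execute returns
    # for the page URL):
    #   {
    #       "query": "When was the first moon landing?",
    #       "relevant_pages (to the query)": [
    #           {"title": "Apollo 11", "url": "...", "abstract": "...",
    #            "retrieved_information": {...}}
    #       ],
    #       "other_pages (may be irrelevant to the query)": [...]
    #   }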

    def get_metadata(self):
        """
        Returns the metadata for the Wikipedia_Search_Tool.

        Returns:
            dict: A dictionary containing the tool's metadata.
        """
        metadata = super().get_metadata()
        return metadata
| if __name__ == "__main__": | |
| # Test command: | |
| """ | |
| Run the following commands in the terminal to test the script: | |
| cd agentflow/tools/wikipedia_search | |
| python tool.py | |
| """ | |
| # Example usage of the Wikipedia_Search_Tool | |
| tool = Wikipedia_Search_Tool(model_string="gpt-4o-mini") | |
| # tool = Wikipedia_Search_Tool(model_string="gemini-1.5-flash") | |
| # tool = Wikipedia_Search_Tool(model_string="dashscope") # | |
| # Get tool metadata | |
| metadata = tool.get_metadata() | |
| # print(metadata) | |
| # Sample query for searching Wikipedia | |
| # query = "Python programming language" | |
| # query = "what is the main function of the human kidney" | |
| # query = "What is the mass of the moon" | |
| # query = "mass of the moon" | |
| # query = "mass of the moon in kg" | |
| # query = "What is the mass of the moon (in kg)?" | |
| # query = "What is the capital of France" | |
| # query = "Who is Yann LeCun" | |
| # query = "What is the exact mass in kg of the moon?" | |
| query = "When was the first moon landing?" | |
| import json | |
| # Execute the tool with the sample query | |
| try: | |
| # Test with default parameters (all pages) | |
| execution = tool.execute(query=query) | |
| print("Execution Result (all pages):") | |
| print(json.dumps(execution, indent=4)) | |
| # Save the execution result to a JSON file | |
| os.makedirs("logs", exist_ok=True) | |
| with open(f"logs/{query}.json", "w") as f: | |
| json.dump(execution, f, indent=4) | |
| except ValueError as e: | |
| print(f"Execution failed: {e}") | |
| print("Done!") | |