import os
import sys
import json

import wikipedia
from pydantic import BaseModel

from agentflow.tools.base import BaseTool
from agentflow.engine.factory import create_llm_engine
from agentflow.tools.web_search.tool import Web_Search_Tool

# Tool name mapping - this defines the external name for this tool
TOOL_NAME = "Wikipedia_RAG_Search_Tool"

LIMITATION = f"""
{TOOL_NAME} has the following limitations:
1. It is designed specifically for retrieving grounded information from Wikipedia pages only.
2. Filtering of relevant pages depends on LLM performance and may not always select the optimal pages.
3. The accuracy of the returned information depends on the quality of the underlying Wikipedia content.
"""

BEST_PRACTICE = f"""
For optimal results with {TOOL_NAME}:
1. Use specific, targeted queries rather than broad or ambiguous questions.
2. The tool automatically filters for relevant pages using LLM-based selection - trust the "relevant_pages" results.
3. If the initial results are insufficient, examine the "other_pages" section for additional potentially relevant content.
4. Use this tool as part of a multi-step research process rather than as a single source of truth.
5. Follow the returned URLs (e.g., with a web search / RAG tool) to retrieve more information from the pages.
"""


class Select_Relevant_Queries(BaseModel):
    matched_queries: list[str]
    matched_query_ids: list[int]


def select_relevant_queries(original_query: str, query_candidates: list[str], llm_engine):
    # Format the candidates as a numbered list so the LLM can reference them by id.
    candidates_text = "\n".join([f"{i}. {query}" for i, query in enumerate(query_candidates)])
    prompt = f"""
You are an expert AI assistant. Your task is to identify and select the most relevant queries from a list of Wikipedia search results that are most likely to address the user's original question.

## Input

Original Query: `{original_query}`

Query Candidates from Wikipedia Search: `{candidates_text}`

## Instructions

1. Carefully read the original query and the list of query candidates.
2. Select the query candidates that are most relevant to the original query, i.e., those most likely to contain the information needed to answer the question.
3. Return the most relevant queries. If you think multiple queries are helpful, you can return up to 3 queries.
4. Return your output in the following format:
```
- Matched Queries: <list of matched queries>
- Matched Query IDs: <list of matched query ids>. Please make sure the ids are integers, and do not return an empty list.
```

## Examples

Original Query: What is the capital of France?
Query Candidates from Wikipedia Search:
0. Closed-ended question
1. France
2. What Is a Nation?
3. Capital city
4. London
5. WhatsApp
6. French Revolution
7. Communes of France
8. Capital punishment
9. Louis XIV
Output:
- Matched Queries: France
- Matched Query IDs: [1]

Original Query: What is the mass of the moon?
Query Candidates from Wikipedia Search:
0. Moon
1. Planetary-mass moon
2. What If the Moon Didn't Exist
3. Earth mass
4. Moon landing
5. Mass
6. Colonization of the Moon
7. Planetary mass
8. Hollow Moon
9. Gravitation of the Moon
Output:
- Matched Queries: Moon, Planetary-mass moon
- Matched Query IDs: [0, 1]
"""
    try:
        # Ask the engine for structured output matching the pydantic schema.
        response = llm_engine.generate(prompt, response_format=Select_Relevant_Queries)
        matched_queries = response.matched_queries
        matched_query_ids = [int(i) for i in response.matched_query_ids]
        return matched_queries, matched_query_ids
    except Exception as e:
        print(f"Error selecting relevant queries: {e}")
        return [], []
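
# Illustrative behavior of select_relevant_queries (a sketch; the actual
# selection depends on the LLM engine, which must support pydantic
# response_format structured output):
#
#   queries, ids = select_relevant_queries(
#       "What is the capital of France?",
#       ["Closed-ended question", "France", "Capital city"],
#       llm_engine,
#   )
#   # expected roughly: queries == ["France"], ids == [1]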


class Wikipedia_Search_Tool(BaseTool):
    def __init__(self, model_string="gpt-4o-mini"):
        super().__init__(
            tool_name=TOOL_NAME,
            tool_description="A tool that searches Wikipedia and returns relevant pages with their page titles, URLs, abstracts, and retrieved information based on a given query.",
            tool_version="1.0.0",
            input_types={
                "query": "str - The search query for Wikipedia."
            },
            output_type="dict - A dictionary containing the search results: all matching pages with their content, URLs, and metadata.",
            demo_commands=[
                {
                    "command": 'execution = tool.execute(query="What is the exact mass in kg of the moon")',
                    "description": "Search Wikipedia and get information about the mass of the moon."
                },
                {
                    "command": 'execution = tool.execute(query="Function of human kidney")',
                    "description": "Search Wikipedia and get information about the function of the human kidney."
                },
                {
                    "command": 'execution = tool.execute(query="When was the first moon landing?")',
                    "description": "Search Wikipedia and get information about the first moon landing."
                }
            ],
            user_metadata={
                "limitation": LIMITATION,
                "best_practice": BEST_PRACTICE
            }
        )
        self.model_string = model_string
        # Deterministic decoding settings so page selection is reproducible.
        self.llm_engine = create_llm_engine(model_string=model_string, temperature=0.0, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0)

    def _get_wikipedia_url(self, query):
        """
        Build the English Wikipedia URL for a given page title.
        """
        query = query.replace(" ", "_")  # Wikipedia URLs use underscores instead of spaces
        return f"https://en.wikipedia.org/wiki/{query}"

    def search_wikipedia(self, query, max_length=100, max_pages=10):
        """
        Searches Wikipedia for the given query and returns multiple pages with their text and URLs.

        Parameters:
            query (str): The search query for Wikipedia.
            max_length (int): Maximum number of characters of page content to keep per page
                (-1 keeps the full text).
            max_pages (int): Maximum number of search results to process.

        Returns:
            list: A list of dictionaries, one per page, with keys "title", "url",
                and "abstract" (plus "error" when the search itself fails).
        """
        try:
            search_results = wikipedia.search(query)
            if not search_results:
                return [{"title": None, "url": None, "abstract": None, "error": f"No results found for query: {query}"}]

            pages_data = []
            pages_to_process = search_results[:max_pages] if max_pages else search_results

            # Fetch the content and URL for each candidate page.
            for title in pages_to_process:
                try:
                    page = wikipedia.page(title)
                    text = page.content
                    url = page.url
                    if max_length != -1:
                        text = (text[:max_length] + "... [truncated]") if len(text) > max_length else text
                    pages_data.append({
                        "title": title,
                        "url": url,
                        "abstract": text
                    })
                except Exception:
                    # Page lookup can fail (e.g., on disambiguation pages); fall back to a constructed URL.
                    pages_data.append({
                        "title": title,
                        "url": self._get_wikipedia_url(title),
                        "abstract": "Please use the URL to get the full text further if needed.",
                    })
            return pages_data
        except Exception as e:
            return [{"title": None, "url": None, "abstract": None, "error": f"Error searching Wikipedia: {str(e)}"}]

    def execute(self, query):
        """
        Searches Wikipedia based on the provided query and returns all matching pages.

        Parameters:
            query (str): The search query for Wikipedia.

        Returns:
            dict: A dictionary containing the search results and all matching pages with their content.
        """
        # Check if the OpenAI API key is set
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            sys.exit("[Wikipedia RAG Search] Error: OPENAI_API_KEY environment variable is not set.")

        # First, get candidate pages from the Wikipedia search results
        search_results = self.search_wikipedia(query)

        # Get the titles of the pages
        titles = [page["title"] for page in search_results if page["title"] is not None]
        if not titles:
            return {"query": query, "relevant_pages": [], "other_pages (may be irrelevant to the query)": search_results}

        # Select the most relevant pages
        matched_queries, matched_query_ids = select_relevant_queries(query, titles, self.llm_engine)
        # Guard against out-of-range ids coming back from the LLM
        matched_query_ids = [i for i in matched_query_ids if 0 <= i < len(search_results)]

        # Only process the most relevant pages
        pages_data = [search_results[i] for i in matched_query_ids]
        other_pages = [search_results[i] for i in range(len(search_results)) if i not in matched_query_ids]

        # For each relevant page, retrieve detailed information using the web search (RAG) tool
        try:
            web_rag_tool = Web_Search_Tool(model_string=self.model_string)
        except Exception as e:
            print(f"Error creating Web RAG tool: {e}")
            return {"query": query, "relevant_pages": [], "other_pages (may be irrelevant to the query)": search_results}

        for page in pages_data:
            url = page["url"]
            if url is None:
                continue
            try:
                execution = web_rag_tool.execute(query=query, url=url)
                page["retrieved_information"] = execution
            except Exception:
                page["retrieved_information"] = None

        return {
            "query": query,
            "relevant_pages (to the query)": pages_data,
            "other_pages (may be irrelevant to the query)": other_pages
        }
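
    # Shape of a typical execute() return value (illustrative;
    # "retrieved_information" holds whatever Web_Search_Tool.execute returns
    # for the page URL):
    #   {
    #       "query": "When was the first moon landing?",
    #       "relevant_pages (to the query)": [
    #           {"title": "Apollo 11", "url": "...", "abstract": "...",
    #            "retrieved_information": {...}}
    #       ],
    #       "other_pages (may be irrelevant to the query)": [...]
    #   }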

    def get_metadata(self):
        """
        Returns the metadata for the Wikipedia_Search_Tool.

        Returns:
            dict: A dictionary containing the tool's metadata.
        """
        metadata = super().get_metadata()
        return metadata
| if __name__ == "__main__": | |
| # Test command: | |
| """ | |
| Run the following commands in the terminal to test the script: | |
| cd agentflow/tools/wikipedia_search | |
| python tool.py | |
| """ | |
| # Example usage of the Wikipedia_Search_Tool | |
| tool = Wikipedia_Search_Tool(model_string="gpt-4o-mini") | |
| # tool = Wikipedia_Search_Tool(model_string="gemini-1.5-flash") | |
| # tool = Wikipedia_Search_Tool(model_string="dashscope") # | |
| # Get tool metadata | |
| metadata = tool.get_metadata() | |
| # print(metadata) | |
| # Sample query for searching Wikipedia | |
| # query = "Python programming language" | |
| # query = "what is the main function of the human kidney" | |
| # query = "What is the mass of the moon" | |
| # query = "mass of the moon" | |
| # query = "mass of the moon in kg" | |
| # query = "What is the mass of the moon (in kg)?" | |
| # query = "What is the capital of France" | |
| # query = "Who is Yann LeCun" | |
| # query = "What is the exact mass in kg of the moon?" | |
| query = "When was the first moon landing?" | |
| import json | |
| # Execute the tool with the sample query | |
| try: | |
| # Test with default parameters (all pages) | |
| execution = tool.execute(query=query) | |
| print("Execution Result (all pages):") | |
| print(json.dumps(execution, indent=4)) | |
| # Save the execution result to a JSON file | |
| os.makedirs("logs", exist_ok=True) | |
| with open(f"logs/{query}.json", "w") as f: | |
| json.dump(execution, f, indent=4) | |
| except ValueError as e: | |
| print(f"Execution failed: {e}") | |
| print("Done!") | |