"""Wikipedia search tool: loads pages, chunks them into a FAISS store, and
answers a query via the locally configured LLM."""

import sys
import os

from langchain_core.tools import tool

# Make the project root importable so `utils` and `configs` resolve when this
# file is run directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from utils.call_llm import llm  # noqa: E402,F401 — kept for module side effects / back-compat
from configs.config import Config  # noqa: E402

env = Config()


def generate_search_string(query: str) -> str:
    """
    Generate an optimal Wikipedia search string from the given query.

    Args:
        query (str): The input query for generating the search string.

    Returns:
        str: A single continuous search string optimized for Wikipedia search.

    Raises:
        ValueError: If the query is empty/non-string, or the LLM returns
            an empty response.
    """
    if not query or not isinstance(query, str):
        raise ValueError("Query must be a non-empty string.")

    prompt = f"""
    Generate an optimal Wikipedia search string from the query '{query}'. \n
    Just return a single continuous search string without any additional text or formatting or quotation marks. \n
    Do not include any other text or explanation."""

    response = env.LOCAL_LLM.invoke(prompt)
    if not response or not response.content.strip():
        raise ValueError("Failed to generate a valid search string.")
    return response.content.strip()


def document_store(query, chunk_size, chunk_overlap):
    """
    Load Wikipedia pages for the query and build a FAISS vector store.

    Args:
        query: The user query; rewritten into a Wikipedia search string first.
        chunk_size: Character length of each text chunk.
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        A FAISS vector store over the chunked page content.

    Raises:
        ValueError: If no search string, text, or chunks could be produced.
    """
    # Heavy third-party imports are kept function-local so importing this
    # module stays cheap.
    from langchain_community.document_loaders import WikipediaLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.schema.document import Document
    from langchain_community.vectorstores.faiss import FAISS

    embedding_model = env.EMBED_MODEL
    language = "en"

    search_query = generate_search_string(query)
    if not search_query:
        raise ValueError("Search query is empty or invalid.")

    loader = WikipediaLoader(query=search_query, lang=language)
    documents = loader.load()

    combined_text = "".join(
        [doc.page_content for doc in documents if doc.page_content]
    )
    if not combined_text:
        raise ValueError("No text found in the loaded documents.")

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_text(combined_text)
    if not chunks:
        raise ValueError("No chunks generated from the combined text.")

    docs = [
        Document(page_content=chunk, metadata={"source": query})
        for chunk in chunks
    ]
    if not docs:
        raise ValueError("No documents created from the chunks.")

    # NOTE: a previous version called embedding_model.embed_documents(...)
    # here and discarded the result; FAISS.from_documents embeds the docs
    # itself, so that extra pass doubled embedding cost for nothing.
    store = FAISS.from_documents(docs, embedding=embedding_model)
    return store


def search(query, chunk_size, chunk_overlap):
    """
    Build the store for `query` and return relevant (Document, score) pairs.

    FAISS returns L2 distances, so LOWER scores are MORE relevant; results
    with distance above the threshold are discarded.
    """
    store = document_store(query, chunk_size, chunk_overlap)
    results = store.similarity_search_with_score(query, k=5)

    # Filter results based on a relevance threshold (distance <= 0.5).
    filtered_results = []
    for doc, score in results:
        if score <= 0.5:
            filtered_results.append((doc, score))
    return filtered_results


@tool("wikipedia_search_tool")
def wikipedia_search_tool(query: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    """
    Run the Wikipedia search tool with the given query and parameters.
    """
    print("----- Wiki Run ---")
    default_prompts = env.WIKI_DEFAULT_PROMPTS

    results = search(query, chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    # Fix: the best hit is a Document object — forward its page_content,
    # not the Document itself, as the chat message content.
    if results:
        context = results[0][0].page_content
    else:
        context = "No relevant documents found."

    llm_input = [
        {"role": "system", "content": default_prompts["system"]},
        {"role": "user", "content": default_prompts["user"].format(query=query)},
        {"role": "user", "content": context},
    ]
    call_llm = env.LOCAL_LLM.invoke(llm_input)
    return call_llm