Spaces:
Sleeping
Sleeping
import sys
import os

from langchain_core.tools import tool

# Make the project root importable so sibling packages (utils, configs)
# resolve when this module is executed directly as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from utils.call_llm import llm
from configs.config import Config

# Shared configuration: LLM handle, embedding model, default prompt templates.
env = Config()
def generate_search_string(query: str) -> str:
    """
    Generate an optimal Wikipedia search string from the given query.

    Args:
        query (str): The input query for generating the search string.

    Returns:
        str: A single continuous search string optimized for Wikipedia search.

    Raises:
        ValueError: If the query is empty or not a string, or if the LLM
            produces an empty response.
    """
    if not isinstance(query, str) or not query:
        raise ValueError("Query must be a non-empty string.")

    prompt = f"""
    Generate an optimal Wikipedia search string from the query '{query}'. \n
    Just return a single continuous search string without any additional text or formatting or quotation marks. \n
    Do not include any other text or explanation."""

    response = env.LOCAL_LLM.invoke(prompt)
    # Treat a missing response object and a whitespace-only payload the same way.
    text = response.content.strip() if response else ""
    if not text:
        raise ValueError("Failed to generate a valid search string.")
    return text
def document_store(query, chunk_size, chunk_overlap):
    """Build a FAISS vector store from the Wikipedia page(s) matching the query.

    Args:
        query: User query; an LLM-optimized search string is derived from it.
        chunk_size: Character length of each text chunk.
        chunk_overlap: Character overlap between consecutive chunks.

    Returns:
        A FAISS vector store over the chunked Wikipedia text.

    Raises:
        ValueError: If the generated search string, the loaded page text,
            or the resulting chunks are empty.
    """
    # Local imports keep the heavy langchain dependencies off module import time.
    from langchain_community.document_loaders import WikipediaLoader
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain.schema.document import Document
    from langchain_community.vectorstores.faiss import FAISS

    embedding_model = env.EMBED_MODEL
    language = "en"
    search_query = generate_search_string(query)
    if not search_query:
        raise ValueError("Search query is empty or invalid.")
    loader = WikipediaLoader(query=search_query, lang=language)
    documents = loader.load()
    combined_text = "".join(doc.page_content for doc in documents if doc.page_content)
    if not combined_text:
        raise ValueError("No text found in the loaded documents.")
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = splitter.split_text(combined_text)
    if not chunks:
        raise ValueError("No chunks generated from the combined text.")
    docs = [
        Document(page_content=chunk, metadata={"source": query})
        for chunk in chunks
    ]
    # FIX: FAISS.from_documents embeds the documents itself. The previous
    # explicit embed_documents() call embedded every chunk a second time and
    # discarded the result — pure wasted work — so it has been removed.
    # (The old `if not docs` check was also unreachable: docs is empty only
    # when chunks is empty, which is already rejected above.)
    return FAISS.from_documents(docs, embedding=embedding_model)
def search(query, chunk_size, chunk_overlap):
    """Return (Document, score) pairs for the query that pass the relevance cut.

    Builds (or rebuilds) the vector store, runs a top-5 similarity search,
    and keeps only hits with score <= 0.5 (lower scores = closer matches).
    """
    store = document_store(query, chunk_size, chunk_overlap)
    scored_hits = store.similarity_search_with_score(query, k=5)
    # Relevance threshold: discard anything scoring above 0.5.
    return [(doc, score) for doc, score in scored_hits if score <= 0.5]
def wikipedia_search_tool(query: str, chunk_size: int = 1000, chunk_overlap: int = 200):
    """
    Run the Wikipedia search tool with the given query and parameters.

    Args:
        query: The user's question.
        chunk_size: Chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        The LLM response generated from the retrieved Wikipedia context.
    """
    print("----- Wiki Run ---")
    default_prompts = env.WIKI_DEFAULT_PROMPTS
    results = search(query, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    if results:
        # FIX: the original passed the raw Document object (results[0][0]) as
        # message content instead of its text, and ignored every hit but the
        # first. Use the page text of all retrieved chunks as context.
        context = "\n\n".join(doc.page_content for doc, _score in results)
    else:
        context = "No relevant documents found."
    llm_input = [
        {"role": "system", "content": default_prompts["system"]},
        {"role": "user", "content": default_prompts["user"].format(query=query)},
        {"role": "user", "content": context},
    ]
    return env.LOCAL_LLM.invoke(llm_input)