from langchain_core.tools import tool
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.tools import WikipediaQueryRun, ArxivQueryRun
from langchain_community.utilities import WikipediaAPIWrapper, ArxivAPIWrapper
from langchain_community.tools.tavily_search import TavilySearchResults
from dotenv import load_dotenv
import os

# Load API keys (OPENAI_API_KEY, TAVILY_API_KEY, ...) from a local .env file.
load_dotenv()

# ==============================
# CONFIG
# ==============================
VECTORSTORE_DIR = "data/vectorstore"

# Single source of truth for the embedding model. The SAME model must be used
# at index-build time and at query time, otherwise retrieval silently degrades
# (the original hard-coded this string in two places).
EMBEDDING_MODEL = "text-embedding-3-small"

os.makedirs(VECTORSTORE_DIR, exist_ok=True)


# ==============================
# VECTOR STORE CREATION
# ==============================
def build_vectorstore(file_path: str):
    """Index a PDF into a FAISS vector store persisted at VECTORSTORE_DIR.

    Args:
        file_path: Path to the PDF file to ingest.

    Returns:
        The in-memory FAISS vector store (also saved to disk so it can be
        re-loaded by the RAG tool).

    Raises:
        ValueError: If the PDF yields no text chunks (e.g. a scanned,
            image-only PDF) — clearer than the low-level error FAISS would
            otherwise raise on empty input.
    """
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
    )
    chunks = splitter.split_documents(documents)
    if not chunks:
        # Fail early with an actionable message instead of letting
        # FAISS.from_documents crash on an empty embedding batch.
        raise ValueError(f"No extractable text found in {file_path!r}.")

    embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
    vectorstore = FAISS.from_documents(chunks, embeddings)
    vectorstore.save_local(VECTORSTORE_DIR)
    return vectorstore


def update_retriever(file_path: str):
    """Rebuild the vector store when a new document is uploaded.

    Returns the freshly built FAISS store (previously the result was
    discarded; returning it is backward-compatible and lets callers reuse
    the in-memory store without a disk round-trip).
    """
    return build_vectorstore(file_path)


# ==============================
# RAG TOOL (HF SAFE)
# ==============================
def create_rag_tool():
    """Build the document-retrieval tool.

    The vector store is re-loaded from disk on every invocation rather than
    cached, so the tool always sees the latest uploaded document — safe for
    stateless / restartable hosting (e.g. Hugging Face Spaces).
    """

    @tool
    def rag_search(query: str) -> str:
        """
        Retrieve relevant information from uploaded documents.
        """
        # No index file on disk means no document has been ingested yet.
        if not os.path.exists(os.path.join(VECTORSTORE_DIR, "index.faiss")):
            return "No document has been uploaded yet."

        embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)
        # allow_dangerous_deserialization is required to unpickle FAISS's
        # docstore; acceptable here because we only load indexes this app
        # wrote itself, never user-supplied index files.
        vectorstore = FAISS.load_local(
            VECTORSTORE_DIR,
            embeddings,
            allow_dangerous_deserialization=True,
        )

        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
        docs = retriever.invoke(query)
        if not docs:
            return "No relevant information found in the document."
        return "\n\n".join(d.page_content for d in docs)

    return rag_search


# ==============================
# EXTRA TOOLS
# ==============================
# These tools deliberately catch broad Exception and return an {"error": ...}
# dict: a failing external API should be reported back to the agent as data,
# not crash the whole run.
@tool
def wikipedia_search(query: str) -> dict:
    """Search Wikipedia."""
    try:
        return {"results": WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper()).run(query)}
    except Exception as e:
        return {"error": str(e)}


@tool
def arxiv_search(query: str) -> dict:
    """Search academic papers on arXiv."""
    try:
        return {"results": ArxivQueryRun(api_wrapper=ArxivAPIWrapper()).run(query)}
    except Exception as e:
        return {"error": str(e)}


@tool
def tavily_search(query: str) -> dict:
    """Search the web using Tavily."""
    try:
        return {"results": TavilySearchResults(max_results=5).run(query)}
    except Exception as e:
        return {"error": str(e)}