# NOTE: "Spaces: Sleeping" below was a Hugging Face Spaces page-status header
# captured during extraction; it is not part of the Python source.
| from langchain_core.tools import tool | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_openai import OpenAIEmbeddings | |
| from langchain_community.tools import WikipediaQueryRun, ArxivQueryRun | |
| from langchain_community.utilities import WikipediaAPIWrapper, ArxivAPIWrapper | |
| from langchain_community.tools.tavily_search import TavilySearchResults | |
| from dotenv import load_dotenv | |
| import os | |
# Load API credentials (e.g. OPENAI_API_KEY, and presumably TAVILY_API_KEY for
# the web-search tool — confirm against deployment config) from a local .env file.
load_dotenv()
# ==============================
# CONFIG
# ==============================
# Directory where the FAISS index files are persisted between requests.
VECTORSTORE_DIR = "data/vectorstore"
os.makedirs(VECTORSTORE_DIR, exist_ok=True)
| # ============================== | |
| # VECTOR STORE CREATION | |
| # ============================== | |
def build_vectorstore(file_path: str):
    """Ingest a PDF into a FAISS index persisted under ``VECTORSTORE_DIR``.

    The document is loaded page by page, split into overlapping chunks,
    embedded with OpenAI's ``text-embedding-3-small`` model, and saved to
    disk so later queries can reload the index.

    Args:
        file_path: Path to the PDF document to ingest.

    Returns:
        The freshly built FAISS vectorstore.
    """
    pages = PyPDFLoader(file_path).load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
    )
    pieces = text_splitter.split_documents(pages)
    store = FAISS.from_documents(
        pieces,
        OpenAIEmbeddings(model="text-embedding-3-small"),
    )
    store.save_local(VECTORSTORE_DIR)
    return store
def update_retriever(file_path: str):
    """Refresh the persisted index after a new document upload.

    Simply delegates to :func:`build_vectorstore`, which overwrites the
    on-disk FAISS index; the RAG tool reloads it on the next query.
    """
    build_vectorstore(file_path)
| # ============================== | |
| # RAG TOOL (HF SAFE) | |
| # ============================== | |
def create_rag_tool():
    """Build and return a document-retrieval callable.

    The returned closure reloads the persisted FAISS index on every call,
    which keeps it in sync with indexes rebuilt by ``update_retriever``.
    """

    def rag_search(query: str) -> str:
        """
        Retrieve relevant information from uploaded documents.
        """
        index_file = os.path.join(VECTORSTORE_DIR, "index.faiss")
        # Guard: without a saved index there is nothing to search.
        if not os.path.exists(index_file):
            return "No document has been uploaded yet."
        store = FAISS.load_local(
            VECTORSTORE_DIR,
            OpenAIEmbeddings(model="text-embedding-3-small"),
            allow_dangerous_deserialization=True,
        )
        hits = store.as_retriever(search_kwargs={"k": 4}).invoke(query)
        if not hits:
            return "No relevant information found in the document."
        return "\n\n".join(doc.page_content for doc in hits)

    return rag_search
| # ============================== | |
| # EXTRA TOOLS | |
| # ============================== | |
def wikipedia_search(query: str) -> dict:
    """Look up *query* on Wikipedia.

    Returns:
        ``{"results": <text>}`` on success, ``{"error": <message>}`` when the
        lookup raises for any reason (network, API, missing page).
    """
    try:
        runner = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
        return {"results": runner.run(query)}
    except Exception as exc:  # surface failures as data rather than crashing
        return {"error": str(exc)}
def arxiv_search(query: str) -> dict:
    """Search academic papers on arXiv.

    Returns:
        ``{"results": <text>}`` on success, ``{"error": <message>}`` when the
        query raises for any reason.
    """
    try:
        runner = ArxivQueryRun(api_wrapper=ArxivAPIWrapper())
        return {"results": runner.run(query)}
    except Exception as exc:  # surface failures as data rather than crashing
        return {"error": str(exc)}
def tavily_search(query: str) -> dict:
    """Search the web using Tavily.

    Returns:
        ``{"results": <payload>}`` (up to 5 hits) on success,
        ``{"error": <message>}`` when the search raises for any reason.
    """
    try:
        searcher = TavilySearchResults(max_results=5)
        return {"results": searcher.run(query)}
    except Exception as exc:  # surface failures as data rather than crashing
        return {"error": str(exc)}