File size: 3,057 Bytes
02073ee
 
 
 
 
 
 
 
 
 
 
 
 
ef1f54c
 
 
 
 
02073ee
 
7afac3f
 
 
 
 
 
02073ee
 
 
 
 
 
7afac3f
ef1f54c
7afac3f
367fd43
3e10e7d
7afac3f
ef1f54c
02073ee
 
7afac3f
 
 
02073ee
 
ef1f54c
7afac3f
ef1f54c
02073ee
aa4ce92
2e019bc
02073ee
7afac3f
 
 
ef1f54c
7afac3f
 
ef1f54c
7afac3f
 
 
 
 
 
 
02073ee
7afac3f
02073ee
aa4ce92
02073ee
367fd43
02073ee
aa4ce92
02073ee
 
 
ef1f54c
7afac3f
 
 
ef1f54c
02073ee
367fd43
 
02073ee
367fd43
02073ee
 
 
 
 
367fd43
 
02073ee
367fd43
02073ee
 
 
 
 
ef1f54c
3e10e7d
02073ee
367fd43
aa4ce92
d1f2f58
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from langchain_core.tools import tool
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.tools import WikipediaQueryRun, ArxivQueryRun
from langchain_community.utilities import WikipediaAPIWrapper, ArxivAPIWrapper
from langchain_community.tools.tavily_search import TavilySearchResults
from dotenv import load_dotenv
import os

# Pull API keys (OPENAI_API_KEY, TAVILY_API_KEY, ...) from a local .env file
# into os.environ before any of the tools below are constructed.
load_dotenv()

# ==============================
# CONFIG
# ==============================
# Directory where the FAISS index is persisted between uploads/restarts.
VECTORSTORE_DIR = "data/vectorstore"
os.makedirs(VECTORSTORE_DIR, exist_ok=True)  # no-op if it already exists


# ==============================
# VECTOR STORE CREATION
# ==============================
def build_vectorstore(file_path: str):
    """Build and persist a FAISS vectorstore from a PDF file.

    Loads the PDF at ``file_path``, splits it into overlapping chunks,
    embeds the chunks with OpenAI embeddings, and saves the resulting
    FAISS index under ``VECTORSTORE_DIR`` (overwriting any previous index).

    Args:
        file_path: Path to the PDF document to index.

    Returns:
        The in-memory FAISS vectorstore that was just saved.

    Raises:
        ValueError: If no text could be extracted from the PDF (e.g. a
            scanned, image-only document), so there is nothing to index.
    """
    loader = PyPDFLoader(file_path)
    documents = loader.load()

    # Overlap keeps sentences that straddle a chunk boundary retrievable.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100
    )

    chunks = splitter.split_documents(documents)

    # Fail early with a clear message instead of an opaque FAISS/OpenAI error
    # on an empty chunk list.
    if not chunks:
        raise ValueError(
            f"No extractable text found in {file_path!r}; cannot build vectorstore."
        )

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vectorstore = FAISS.from_documents(chunks, embeddings)

    vectorstore.save_local(VECTORSTORE_DIR)
    return vectorstore


def update_retriever(file_path: str):
    """Rebuild the persisted vectorstore when a new document is uploaded.

    Thin wrapper around ``build_vectorstore``. It now also returns the
    freshly built vectorstore so callers can use it directly instead of
    reloading it from disk; existing callers that ignore the return value
    are unaffected.

    Args:
        file_path: Path to the newly uploaded PDF.

    Returns:
        The rebuilt FAISS vectorstore.
    """
    return build_vectorstore(file_path)


# ==============================
# RAG TOOL (HF SAFE)
# ==============================
def create_rag_tool():
    """Build the document-retrieval tool.

    The returned tool reloads the FAISS index from disk on every call, so
    it always reflects the most recently uploaded document (safe for
    environments where the index is rebuilt out-of-process).
    """

    @tool
    def rag_search(query: str) -> str:
        """
        Retrieve relevant information from uploaded documents.
        """

        index_file = os.path.join(VECTORSTORE_DIR, "index.faiss")
        if not os.path.exists(index_file):
            return "No document has been uploaded yet."

        # Reload from disk each call so a freshly rebuilt index is picked up.
        store = FAISS.load_local(
            VECTORSTORE_DIR,
            OpenAIEmbeddings(model="text-embedding-3-small"),
            allow_dangerous_deserialization=True
        )

        matches = store.as_retriever(search_kwargs={"k": 4}).invoke(query)

        if not matches:
            return "No relevant information found in the document."

        return "\n\n".join(doc.page_content for doc in matches)

    return rag_search


# ==============================
# EXTRA TOOLS
# ==============================

@tool
def wikipedia_search(query: str) -> dict:
    """Search Wikipedia."""
    try:
        runner = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
        hits = runner.run(query)
    except Exception as exc:
        # Surface the failure to the agent as data instead of raising.
        return {"error": str(exc)}
    return {"results": hits}


@tool
def arxiv_search(query: str) -> dict:
    """Search academic papers on arXiv."""
    try:
        runner = ArxivQueryRun(api_wrapper=ArxivAPIWrapper())
        hits = runner.run(query)
    except Exception as exc:
        # Surface the failure to the agent as data instead of raising.
        return {"error": str(exc)}
    return {"results": hits}


@tool
def tavily_search(query: str) -> dict:
    """Search the web using Tavily."""
    try:
        searcher = TavilySearchResults(max_results=5)
        hits = searcher.run(query)
    except Exception as exc:
        # Surface the failure to the agent as data instead of raising.
        return {"error": str(exc)}
    return {"results": hits}