File size: 2,434 Bytes
bae14fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.tools import tool
from dotenv import load_dotenv

load_dotenv()  # load variables from a local .env file into the process environment


class Docs:
    """Document manager that indexes a PDF into an in-memory vector store
    for RAG-based retrieval.

    Parameters
    ----------
    file_path : str
        Path to the PDF file to load and index.
    chunk_size : int, optional
        Maximum characters per chunk (default 1000, the previous
        hard-coded value).
    chunk_overlap : int, optional
        Characters of overlap between consecutive chunks (default 200,
        the previous hard-coded value).
    """

    def __init__(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        self.file_path = file_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # General-purpose sentence-embedding model from sentence-transformers.
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-mpnet-base-v2"
        )
        self.vector_store = self._upload_file(file_path)

    def _upload_file(self, file_path: str) -> InMemoryVectorStore:
        """Load the PDF, split it into overlapping chunks, and embed the
        chunks into a fresh in-memory vector store.

        Returns the populated vector store. Propagates whatever
        ``PyPDFLoader`` raises for a missing or unreadable file.
        """
        loader = PyPDFLoader(file_path)
        docs = loader.load()

        # add_start_index records each chunk's character offset in its
        # metadata, so retrieved chunks can be traced back to the source.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            add_start_index=True,
        )
        all_splits = text_splitter.split_documents(docs)

        vector_store = InMemoryVectorStore(self.embeddings)
        vector_store.add_documents(documents=all_splits)

        return vector_store

    def as_search_tool(self, k: int = 2):
        """Return a LangChain tool for searching the document.

        Parameters
        ----------
        k : int, optional
            Number of chunks to retrieve per query (default 2, matching
            the previous hard-coded value).
        """
        # Bind to locals so the closure does not keep a reference to self.
        vector_store = self.vector_store

        @tool
        def search_in_docs(query: str) -> str:
            """Retrieve information from the uploaded document to answer a query."""
            retrieved_docs = vector_store.similarity_search(query, k=k)
            serialized = "\n\n".join(
                f"Source: {doc.metadata}\nContent: {doc.page_content}"
                for doc in retrieved_docs
            )
            return serialized

        return search_in_docs

    def get_diverse_chunks_mmr(self, query: str, k: int = 30):
        """Get diverse chunks using MMR (Maximal Marginal Relevance).

        lambda_mult=0.5 balances relevance against diversity; fetch_k
        over-fetches candidates (at least 50) before MMR re-ranking.
        """
        retriever = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={
                "k": k,
                "lambda_mult": 0.5,
                "fetch_k": max(k * 3, 50),
            },
        )
        return retriever.invoke(query)

    def similarity_search(self, query: str, k: int = 4):
        """Return the k chunks most similar to *query* (plain similarity search)."""
        return self.vector_store.similarity_search(query, k=k)