Spaces:
Sleeping
Sleeping
File size: 2,434 Bytes
bae14fb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.tools import tool
from dotenv import load_dotenv
load_dotenv()
class Docs:
    """Document manager with vector store for RAG-based retrieval.

    On construction the PDF at ``file_path`` is loaded, split into
    overlapping chunks, embedded, and stored in an ``InMemoryVectorStore``.
    The remaining methods are thin query helpers over that store.

    Attributes:
        file_path: Path of the PDF given to the constructor.
        embeddings: Embeddings object used to build the vector store.
        chunk_size: Value passed as ``chunk_size`` to the text splitter.
        chunk_overlap: Value passed as ``chunk_overlap`` to the text splitter.
        vector_store: The ``InMemoryVectorStore`` holding the embedded chunks.
    """

    # Defaults kept as class constants so callers and subclasses can refer to
    # them instead of repeating the literals.
    DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
    DEFAULT_CHUNK_SIZE = 1000
    DEFAULT_CHUNK_OVERLAP = 200

    def __init__(
        self,
        file_path: str,
        *,
        embeddings=None,
        model_name: str = DEFAULT_MODEL_NAME,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    ):
        """Index the PDF at ``file_path``.

        Args:
            file_path: Path of the PDF to load.
            embeddings: Optional ready-made embeddings object. Supplying one
                lets several ``Docs`` instances share a single loaded model;
                when omitted, a ``HuggingFaceEmbeddings`` is created from
                ``model_name``.
            model_name: Model used when ``embeddings`` is not supplied.
            chunk_size: ``chunk_size`` handed to the text splitter.
            chunk_overlap: ``chunk_overlap`` handed to the text splitter.
        """
        self.file_path = file_path
        if embeddings is None:
            embeddings = HuggingFaceEmbeddings(model_name=model_name)
        self.embeddings = embeddings
        # Must be set before _upload_file runs, which reads both values.
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.vector_store = self._upload_file(file_path)

    def _upload_file(self, file_path: str) -> InMemoryVectorStore:
        """Load PDF, chunk it, and create vector store.

        Args:
            file_path: Path of the PDF to load.

        Returns:
            A new ``InMemoryVectorStore`` built on ``self.embeddings`` and
            populated with the chunks of the document.
        """
        loaded_docs = PyPDFLoader(file_path).load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            add_start_index=True,
        )
        chunks = splitter.split_documents(loaded_docs)

        store = InMemoryVectorStore(self.embeddings)
        # Nothing to embed when the splitter produced no chunks; the store is
        # simply returned empty instead of calling the model on an empty batch.
        if chunks:
            store.add_documents(documents=chunks)
        return store

    @staticmethod
    def _serialize_docs(docs) -> str:
        """Render retrieved documents as one text block for a tool response.

        Args:
            docs: Iterable of objects exposing ``metadata`` and
                ``page_content`` attributes.

        Returns:
            One ``Source: ...`` / ``Content: ...`` entry per document,
            separated by blank lines; an empty string when ``docs`` is empty.
        """
        return "\n\n".join(
            f"Source: {doc.metadata}\nContent: {doc.page_content}"
            for doc in docs
        )

    def as_search_tool(self, k: int = 2):
        """Return a LangChain tool for searching the document.

        Args:
            k: Number of chunks the tool retrieves per query.

        Returns:
            The ``search_in_docs`` function wrapped by the ``tool`` decorator.
        """
        # Bind what the closure needs up front so the tool keeps using the
        # store that existed when it was created and does not hold ``self``.
        vector_store = self.vector_store
        serialize = self._serialize_docs

        @tool
        def search_in_docs(query: str) -> str:
            """Retrieve information from the uploaded document to answer a query."""
            retrieved_docs = vector_store.similarity_search(query, k=k)
            return serialize(retrieved_docs)

        return search_in_docs

    def get_diverse_chunks_mmr(
        self, query: str, k: int = 30, lambda_mult: float = 0.5
    ):
        """Get diverse chunks using MMR (Maximal Marginal Relevance).

        Args:
            query: Text to search for.
            k: Number of chunks requested from the retriever.
            lambda_mult: Value forwarded as the retriever's ``lambda_mult``
                search argument (MMR relevance/diversity trade-off).

        Returns:
            Whatever the retriever's ``invoke`` returns for ``query``.
        """
        search_kwargs = {
            "k": k,
            "lambda_mult": lambda_mult,
            # Candidate pool MMR selects from: three times k, never below 50.
            "fetch_k": max(k * 3, 50),
        }
        retriever = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs=search_kwargs,
        )
        return retriever.invoke(query)

    def similarity_search(self, query: str, k: int = 4):
        """Simple similarity search.

        Args:
            query: Text to search for.
            k: Number of results requested from the vector store.

        Returns:
            The result of the vector store's ``similarity_search``.
        """
        return self.vector_store.similarity_search(query, k=k)
|