# TutorAgent/agents/tools.py
# Provenance (Hugging Face upload residue): uploaded by Maga222006,
# commit "Upload 27 files", bae14fb (verified).
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.tools import tool
from dotenv import load_dotenv

# Load environment variables (e.g. HF/API tokens) from a local .env file
# at import time so downstream langchain components can read them.
load_dotenv()
class Docs:
    """Document manager with an in-memory vector store for RAG-based retrieval.

    Loads a PDF, splits it into overlapping chunks, embeds the chunks with a
    HuggingFace sentence-transformer model, and exposes search helpers plus a
    LangChain tool for agent use.
    """

    def __init__(
        self,
        file_path: str,
        model_name: str = "sentence-transformers/all-mpnet-base-v2",
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
    ):
        """Build the vector store from a PDF.

        Args:
            file_path: Path to the PDF document to index.
            model_name: HuggingFace embedding model (default matches the
                original hard-coded value).
            chunk_size: Maximum characters per chunk.
            chunk_overlap: Overlapping characters between adjacent chunks.
        """
        self.file_path = file_path
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
        self.vector_store = self._upload_file(file_path)

    def _upload_file(self, file_path: str) -> InMemoryVectorStore:
        """Load the PDF at *file_path*, chunk it, and return a populated vector store."""
        loader = PyPDFLoader(file_path)
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            # Record each chunk's character offset in its metadata so sources
            # can be traced back to a position in the original document.
            add_start_index=True,
        )
        all_splits = text_splitter.split_documents(docs)
        vector_store = InMemoryVectorStore(self.embeddings)
        vector_store.add_documents(documents=all_splits)
        return vector_store

    def as_search_tool(self, k: int = 2):
        """Return a LangChain tool that searches this document.

        Args:
            k: Number of chunks to retrieve per query (default 2, matching
                the original behavior).
        """
        # Bind the store (and k) locally so the closure does not hold `self`.
        vector_store = self.vector_store

        @tool
        def search_in_docs(query: str) -> str:
            """Retrieve information from the uploaded document to answer a query."""
            retrieved_docs = vector_store.similarity_search(query, k=k)
            # An empty string reads as a malformed tool result to most LLMs;
            # return an explicit message when nothing matches.
            if not retrieved_docs:
                return "No relevant content found in the document."
            return "\n\n".join(
                f"Source: {doc.metadata}\nContent: {doc.page_content}"
                for doc in retrieved_docs
            )

        return search_in_docs

    def get_diverse_chunks_mmr(self, query: str, k: int = 30):
        """Get diverse chunks using MMR (Maximal Marginal Relevance).

        MMR trades off relevance against diversity (lambda_mult=0.5 weights
        them equally); fetch_k over-fetches candidates before re-ranking.
        """
        retriever = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={
                "k": k,
                "lambda_mult": 0.5,
                "fetch_k": max(k * 3, 50),
            },
        )
        return retriever.invoke(query)

    def similarity_search(self, query: str, k: int = 4):
        """Return the *k* chunks most similar to *query* (plain cosine search)."""
        return self.vector_store.similarity_search(query, k=k)