# RAG pipeline helpers: PDF ingestion, chunking, embedding, and retrieval
# over a persisted Chroma vector store (Hugging Face Spaces app source;
# original page carried non-code residue here).
# Standard-library imports
import os
import tempfile

# Third-party: LangChain loaders, splitter, embeddings, and vector store
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_ollama import OllamaEmbeddings
# Load, split, and embed uploaded PDF documents into a Chroma vector store.
def process_documents(pdfs, chunk_size=1200, chunk_overlap=150,
                      persist_directory="./chroma_db"):
    """
    Process PDF documents through loading, splitting, and embedding.

    Args:
        pdfs: Iterable of uploaded file objects exposing ``.name`` and
            ``.getbuffer()`` (presumably Streamlit ``UploadedFile`` —
            TODO confirm against the caller).
        chunk_size: Maximum characters per text chunk (default 1200,
            matching the original hard-coded value).
        chunk_overlap: Characters of overlap between adjacent chunks
            (default 150, matching the original hard-coded value).
        persist_directory: On-disk location for the Chroma store; must
            match the directory used when retrieving.

    Returns:
        The Chroma vector store instance containing the embedded chunks.
    """
    # PDFPlumberLoader needs real file paths, so write the uploads into a
    # temporary directory that is removed automatically when we are done.
    with tempfile.TemporaryDirectory() as temp_dir:
        pdf_paths = []
        for pdf in pdfs:
            path = os.path.join(temp_dir, pdf.name)
            with open(path, "wb") as f:
                f.write(pdf.getbuffer())
            pdf_paths.append(path)

        # Load every PDF into LangChain documents (must happen inside the
        # `with` block, while the temp files still exist).
        documents = []
        for path in pdf_paths:
            loader = PDFPlumberLoader(path)
            documents.extend(loader.load())

        # Split into overlapping chunks so retrieval granularity is useful.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        splits = text_splitter.split_documents(documents)

        # Embed the chunks with Ollama and persist them in Chroma.
        embeddings = OllamaEmbeddings(model="nomic-embed-text")
        vector_store = Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory=persist_directory,
        )
        return vector_store
# Build a retriever over the persisted vector store, used to fetch relevant
# chunks from the stored embeddings based on user queries.
def get_retriever(persist_directory="./chroma_db", k=3):
    """
    Initialize and return the vector store retriever.

    Args:
        persist_directory: Location of the persisted Chroma store
            (default ``./chroma_db``, matching ``process_documents``).
        k: Number of chunks to retrieve per query (default 3).

    Returns:
        A retriever using MMR (Maximum Marginal Relevance) search, or
        ``None`` if the vector store could not be initialized (the error
        is printed, preserving the original best-effort contract).
    """
    # Must be the same embedding model used when the documents were indexed,
    # otherwise query vectors are incompatible with the stored ones.
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    try:
        vector_store = Chroma(
            embedding_function=embeddings,
            persist_directory=persist_directory,
        )
        # MMR balances relevance and diversity among the k returned chunks.
        return vector_store.as_retriever(
            search_type="mmr", search_kwargs={"k": k}
        )
    except Exception as e:
        # NOTE(review): broad catch kept deliberately — callers expect a
        # printed error and a None return rather than an exception.
        print(f"Error initializing vector store: {e}")
        return None