Spaces:
Build error
Build error
| import gradio as gr | |
| import os | |
| from PyPDF2 import PdfReader | |
| from llama_index.core.schema import TextNode | |
| from langchain_google_genai import GoogleGenerativeAIEmbeddings | |
| import chromadb | |
# Configure credentials: read the Google API key from the environment instead
# of hard-coding it. SECURITY: the original embedded a live API key in source
# control — that key is compromised and must be revoked/rotated.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before launching.")

# Initialize the ChromaDB client and collection.
# get_or_create_collection avoids a crash when the collection already exists
# (create_collection raises on a duplicate name, e.g. after a hot reload).
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: a path or binary file object readable by PyPDF2's PdfReader.

    Returns:
        All page text joined into one string; pages with no extractable text
        contribute the empty string.
    """
    reader = PdfReader(pdf_file)
    # extract_text() can return None for image-only/scanned pages; the original
    # "text += page.extract_text()" raised TypeError there. "or ''" guards that,
    # and join avoids quadratic string concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Chunk text into smaller pieces
def chunk_text(text, max_length=2500):
    """Split *text* into consecutive pieces of at most *max_length* characters.

    The final piece may be shorter; an empty string yields an empty list.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + max_length])
        start += max_length
    return pieces
# Initialize the Google Generative AI embedding model used for both
# document indexing and query embedding.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Function to handle the embedding process and store in ChromaDB
def process_documents(pdf_files):
    """Extract, chunk, embed, and index each uploaded PDF into ChromaDB.

    Args:
        pdf_files: iterable of uploaded file objects (each exposing ``.name``).

    Returns:
        A status string shown in the Gradio output box.
    """
    for pdf_file in pdf_files:
        # Extract and chunk the PDF's text.
        pdf_text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(pdf_text)
        # Hoisted: the original recomputed basename once per chunk.
        filename = os.path.basename(pdf_file.name)

        # Build one TextNode per chunk. (The original also kept a separate
        # "chunk_embeddings" list that merely duplicated `chunks` — removed.)
        nodes = [
            TextNode(
                text=chunk,
                metadata={
                    "filename": filename,
                    "chunk_index": i,
                    "length": len(chunk),
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        # Embed all chunk texts in a single batch call (one API round-trip
        # per file instead of one per chunk).
        embeddings_batch = embeddings.embed_documents(chunks)

        # Store each chunk with its embedding in ChromaDB.
        # NOTE(review): ids are "<filename>_<index>", so re-uploading the same
        # file collides with the existing entries — consider deleting old ids
        # for that filename first.
        for i, (node, vector) in enumerate(zip(nodes, embeddings_batch)):
            node.embedding = vector
            chroma_collection.add(
                documents=[node.text],
                embeddings=[node.embedding],
                metadatas=[node.metadata],
                ids=[f"{filename}_{i}"],
            )
    return "Files have been successfully processed and embedded!"
# Function to query ChromaDB and retrieve relevant documents
def query_documents(user_query):
    """Embed *user_query* and return the top-3 matching chunks from ChromaDB.

    Returns a human-readable string with one section per retrieved chunk
    (empty if nothing matched).
    """
    query_embedding = embeddings.embed_query(user_query)
    # Perform the query on ChromaDB; results are nested one level per query.
    results = chroma_collection.query(
        query_embeddings=[query_embedding],
        n_results=3,  # Return the top 3 most relevant documents
    )
    sections = [
        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
        for doc, metadata in zip(results['documents'][0], results['metadatas'][0])
    ]
    return "".join(sections)
# Gradio interface combining document upload and query features
with gr.Blocks() as demo:
    # Upload + indexing controls.
    file_picker = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    index_button = gr.Button("Process PDFs")
    index_status = gr.Textbox(label="wait before success message for the document process")

    # Retrieval controls.
    question_box = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    search_button = gr.Button("Query Documents")
    results_box = gr.Textbox(label="retrieved documents")

    # Wire buttons to the processing and retrieval functions.
    index_button.click(process_documents, inputs=[file_picker], outputs=[index_status])
    search_button.click(query_documents, inputs=[question_box], outputs=[results_box])

demo.launch()