# Multi-RAG / app.py
# Uploaded by NigeethaR ("Update app.py", commit b5a0d75, verified)
import gradio as gr
import os
from PyPDF2 import PdfReader
from llama_index.core.schema import TextNode
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import chromadb
# SECURITY: never commit a real API key to source control -- the key that was
# hard-coded here was publicly exposed and must be revoked.  Read it from the
# runtime environment instead (on Hugging Face Spaces: add a repository
# secret named GOOGLE_API_KEY).  Fail fast with a clear message if missing.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; configure it as an environment "
        "variable or Space secret before launching the app."
    )
# Initialize the ChromaDB client and collection.  get_or_create_collection is
# idempotent, so reloading the script does not fail with a
# "collection already exists" error the way create_collection would.
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Parameters
    ----------
    pdf_file : path or binary file-like object accepted by ``PyPDF2.PdfReader``.

    Returns
    -------
    str
        All extracted page text joined together.  Pages for which PyPDF2
        cannot extract text (``extract_text()`` returns ``None``, e.g.
        scanned/image-only pages) contribute nothing instead of raising
        ``TypeError`` on string concatenation as the previous ``+=`` did.
    """
    reader = PdfReader(pdf_file)
    # join() is linear; repeated += on str is quadratic for large documents.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Chunk text into smaller pieces
def chunk_text(text, max_length=2500):
    """Split *text* into consecutive substrings of at most *max_length* chars.

    The final chunk may be shorter than *max_length*; an empty input yields
    an empty list.
    """
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + max_length])
        start += max_length
    return chunks
# Initialize the embedding model
# Google "embedding-001" text-embedding model via the LangChain wrapper;
# authenticates with the GOOGLE_API_KEY environment variable set above.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Function to handle the embedding process and store in ChromaDB
def process_documents(pdf_files):
    """Extract, chunk, embed, and index the uploaded PDFs into ChromaDB.

    Parameters
    ----------
    pdf_files : list
        Uploaded file objects from ``gr.File(file_count="multiple")``; each
        exposes a ``name`` attribute and is readable by PyPDF2.

    Returns
    -------
    str
        A status message displayed in the Gradio UI.
    """
    for pdf_file in pdf_files:
        filename = os.path.basename(pdf_file.name)
        # Extract and chunk the document text.
        chunks = chunk_text(extract_text_from_pdf(pdf_file))
        if not chunks:
            # Nothing extractable (e.g. a scanned/image-only PDF): skip it
            # rather than sending an empty batch to the embedding API.
            continue
        # One batched embedding request per file instead of per chunk.
        chunk_vectors = embeddings.embed_documents(chunks)
        # Single ChromaDB call per file; previously this issued one
        # round trip per chunk and built a duplicate list of the chunks.
        chroma_collection.add(
            documents=chunks,
            embeddings=chunk_vectors,
            metadatas=[
                {"filename": filename, "chunk_index": i, "length": len(chunk)}
                for i, chunk in enumerate(chunks)
            ],
            ids=[f"{filename}_{i}" for i in range(len(chunks))],
        )
    return "Files have been successfully processed and embedded!"
# Function to query ChromaDB and retrieve relevant documents
def query_documents(user_query):
    """Embed *user_query* and return the top-3 matching chunks as text.

    Parameters
    ----------
    user_query : str
        The user's natural-language question.

    Returns
    -------
    str
        A human-readable listing of each retrieved chunk with its source
        filename and chunk index, or a fallback message when nothing is
        indexed / nothing matches (previously an empty string, which left
        the UI silently blank).
    """
    query_embedding = embeddings.embed_query(user_query)
    # Perform the query on ChromaDB
    results = chroma_collection.query(
        query_embeddings=[query_embedding],
        n_results=3,  # return the top 3 most relevant chunks
    )
    docs = (results.get("documents") or [[]])[0]
    metas = (results.get("metadatas") or [[]])[0]
    if not docs:
        return "No matching documents found. Upload and process PDFs first."
    # join() avoids quadratic += string building.
    return "".join(
        f"Document: {meta['filename']}, Chunk {meta['chunk_index']}:\n{doc}\n\n"
        for doc, meta in zip(docs, metas)
    )
# Build the Gradio page: a PDF-ingestion section and a query section that
# share one Blocks layout and are wired to the functions defined above.
with gr.Blocks() as demo:
    # --- ingestion controls ---
    uploaded_pdfs = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    ingest_button = gr.Button("Process PDFs")
    ingest_status = gr.Textbox(label="wait before success message for the document process")

    # --- retrieval controls ---
    question_box = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    search_button = gr.Button("Query Documents")
    search_results = gr.Textbox(label="retrieved documents")

    # Event wiring: button clicks call into the processing/query functions.
    ingest_button.click(process_documents, inputs=[uploaded_pdfs], outputs=[ingest_status])
    search_button.click(query_documents, inputs=[question_box], outputs=[search_results])

demo.launch()