Spaces:

NigeethaR
/

Multi-RAG

Sleeping

App Files Files Community

NigeethaR committed on Oct 12, 2024

Commit

b5a0d75

verified ·

1 Parent(s): 015f04f

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -0

app.py CHANGED Viewed

	@@ -0,0 +1,101 @@

+import gradio as gr
+import os
+from PyPDF2 import PdfReader
+from llama_index.core.schema import TextNode
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+import chromadb
+# The Google API key is read from the environment (for example a Space secret)
+# instead of being hard-coded: a key committed to a public repository is
+# compromised and has to be revoked and replaced.
+if not os.environ.get("GOOGLE_API_KEY"):
+    raise RuntimeError(
+        "GOOGLE_API_KEY is not set; configure it as an environment variable "
+        "or Space secret before starting the app."
+    )
+# Initialize the ChromaDB client and the collection holding the uploaded chunks.
+# get_or_create_collection keeps start-up idempotent when the collection already
+# exists, where create_collection would raise.
+chroma_client = chromadb.Client()
+chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")
+def extract_text_from_pdf(pdf_file):
+    """Return the text of all pages of a PDF, separated by newlines.
+
+    Args:
+        pdf_file: Anything ``PdfReader`` accepts (a path or a binary file object).
+
+    Returns:
+        The page texts joined with ``"\\n"``; pages that yield no text
+        contribute an empty string.
+    """
+    reader = PdfReader(pdf_file)
+    # "or ''" keeps the join safe if the extractor yields None for a page.
+    page_texts = [page.extract_text() or "" for page in reader.pages]
+    # A newline between pages stops the last word of one page from being fused
+    # with the first word of the next.
+    return "\n".join(page_texts)
+def chunk_text(text, max_length=2500, overlap=0):
+    """Split ``text`` into consecutive chunks of at most ``max_length`` characters.
+
+    Args:
+        text: The string to split.
+        max_length: Maximum number of characters per chunk; must be positive.
+        overlap: Number of characters each chunk shares with the previous one;
+            must satisfy ``0 <= overlap < max_length``. The default of 0
+            produces disjoint chunks.
+
+    Returns:
+        A list of chunks in document order; an empty list for empty ``text``.
+
+    Raises:
+        ValueError: If ``max_length`` or ``overlap`` is out of range.
+    """
+    if max_length <= 0:
+        raise ValueError("max_length must be a positive integer")
+    if not 0 <= overlap < max_length:
+        raise ValueError("overlap must satisfy 0 <= overlap < max_length")
+    step = max_length - overlap
+    chunks = []
+    for start in range(0, len(text), step):
+        chunks.append(text[start:start + max_length])
+        # Stop once a window reaches the end of the text; otherwise an
+        # overlapping split would emit a tail already contained in this chunk.
+        if start + max_length >= len(text):
+            break
+    return chunks
+# Embedding model shared by document ingestion and querying
+_EMBEDDING_MODEL_NAME = "models/embedding-001"
+embeddings = GoogleGenerativeAIEmbeddings(model=_EMBEDDING_MODEL_NAME)
+def process_documents(pdf_files):
+    """Extract, chunk, embed and store the uploaded PDFs in ChromaDB.
+
+    Args:
+        pdf_files: The uploads from the file component: a list of file paths
+            or of file objects exposing ``.name``; ``None`` or empty when
+            nothing was uploaded.
+
+    Returns:
+        A status message to display in the UI.
+    """
+    if not pdf_files:
+        return "No files were uploaded. Please upload at least one PDF."
+    skipped = []
+    for pdf_file in pdf_files:
+        # Uploads arrive either as path strings or as objects with a .name
+        # attribute; fall back to the object itself when .name is absent.
+        filename = os.path.basename(getattr(pdf_file, "name", pdf_file))
+        pdf_text = extract_text_from_pdf(pdf_file)
+        # Whitespace-only chunks carry nothing worth embedding.
+        chunks = [chunk for chunk in chunk_text(pdf_text) if chunk.strip()]
+        if not chunks:
+            skipped.append(filename)
+            continue
+        # Perform batch embedding of all chunks of this file.
+        vectors = embeddings.embed_documents(chunks)
+        metadatas = [
+            {"filename": filename, "chunk_index": i, "length": len(chunk)}
+            for i, chunk in enumerate(chunks)
+        ]
+        ids = [f"{filename}_{i}" for i in range(len(chunks))]
+        # One batched write per file. upsert rather than add, so uploading a
+        # file with the same name again replaces its chunks instead of
+        # colliding on existing ids.
+        chroma_collection.upsert(
+            documents=chunks,
+            embeddings=vectors,
+            metadatas=metadatas,
+            ids=ids,
+        )
+    if skipped:
+        names = ", ".join(skipped)
+        if len(skipped) == len(pdf_files):
+            return f"No extractable text was found in: {names}"
+        return (
+            "Files have been successfully processed and embedded! "
+            f"Skipped (no extractable text): {names}"
+        )
+    return "Files have been successfully processed and embedded!"
+def query_documents(user_query):
+    """Return the stored chunks closest to ``user_query`` as display text.
+
+    Args:
+        user_query: The question typed by the user.
+
+    Returns:
+        Up to three results formatted as
+        ``"Document: <filename>, Chunk <index>:\\n<text>\\n\\n"``, or a short
+        hint when the query is blank or nothing has been stored yet.
+    """
+    if not user_query or not user_query.strip():
+        return "Please enter a query."
+    stored = chroma_collection.count()
+    if stored == 0:
+        return "No documents have been processed yet. Upload and process PDFs first."
+    query_embedding = embeddings.embed_query(user_query)
+    # Never request more results than the collection holds.
+    results = chroma_collection.query(
+        query_embeddings=[query_embedding],
+        n_results=min(3, stored),
+    )
+    # A single query embedding was sent, so the hits are in the first result row.
+    return "".join(
+        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
+        for doc, metadata in zip(results['documents'][0], results['metadatas'][0])
+    )
+# Gradio interface combining document upload and query features
+with gr.Blocks() as demo:
+    # Restrict the picker to PDFs: process_documents parses every upload as a PDF.
+    pdf_input = gr.File(
+        file_count="multiple",
+        file_types=[".pdf"],
+        label="Upload up to 10 PDF files",
+    )
+    process_btn = gr.Button("Process PDFs")
+    process_output = gr.Textbox(label="wait before success message for the document process")
+    query_input = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
+    query_btn = gr.Button("Query Documents")
+    query_output = gr.Textbox(label="retrieved documents")
+    # Wire each button to its handler and the textbox that shows the result.
+    process_btn.click(process_documents, inputs=[pdf_input], outputs=[process_output])
+    query_btn.click(query_documents, inputs=[query_input], outputs=[query_output])
+# Launch only when executed as a script, so importing this module does not
+# start a web server as a side effect.
+if __name__ == "__main__":
+    demo.launch()