senthil3226w commited on
Commit
df76a85
·
verified ·
1 Parent(s): d06a1db

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import os
from PyPDF2 import PdfReader
from llama_index.core.schema import TextNode
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import chromadb

# SECURITY: the original code hard-coded a Google API key here. Never commit
# secrets to source control — the leaked key must be revoked. Read the key
# from the environment (e.g. a Space/deployment secret) and fail fast with a
# clear message if it is missing, instead of a confusing downstream auth error.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set; "
        "configure it as a deployment secret before starting the app."
    )

# Initialize the ChromaDB client and collection.
chroma_client = chromadb.Client()
# get_or_create_collection avoids a ValueError when the module is re-imported
# or the script is re-run in the same process (create_collection raises if
# the collection already exists).
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")
13
+
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of ``pdf_file``.

    Args:
        pdf_file: a path or binary file object accepted by ``PdfReader``.

    Returns:
        All extractable page text joined into one string (may be empty).

    ``page.extract_text()`` returns None for pages with no extractable text
    (e.g. scanned/image-only pages); the original ``text += ...`` would raise
    TypeError there, so coerce None to "". ``join`` also avoids the quadratic
    string-concatenation loop.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)
21
+
# Chunk text into smaller pieces
def chunk_text(text, max_length=2500):
    """Split *text* into consecutive slices of at most *max_length* chars.

    Returns an empty list for empty input; the final chunk may be shorter
    than *max_length*.
    """
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + max_length])
        start += max_length
    return chunks
25
+
# Initialize the embedding model (Google Generative AI embeddings; relies on
# the GOOGLE_API_KEY environment variable set above). Created once at module
# level so both indexing and querying share the same model instance.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
28
+
# Function to handle the embedding process and store in ChromaDB
def process_documents(pdf_files):
    """Extract, chunk, embed and index each uploaded PDF in ChromaDB.

    Args:
        pdf_files: iterable of uploaded file objects (each exposes ``.name``,
            the original file path — assumes Gradio's file-object upload
            format; TODO confirm against the UI wiring).

    Returns:
        A status message string for the UI.
    """
    for pdf_file in pdf_files:
        # Extract text from the PDF.
        pdf_text = extract_text_from_pdf(pdf_file)

        # Chunk the extracted text; skip PDFs with nothing extractable
        # (e.g. scanned/image-only documents) instead of embedding nothing.
        chunks = chunk_text(pdf_text)
        if not chunks:
            continue

        # Hoist the filename lookup out of the per-chunk loop.
        filename = os.path.basename(pdf_file.name)

        # Build one TextNode per chunk. NOTE: the original named this list
        # `chunk_embeddings` even though it holds the raw chunk *texts*
        # that are sent to the embedder — renamed for clarity.
        nodes = []
        chunk_texts = []
        for i, chunk in enumerate(chunks):
            node = TextNode(
                text=chunk,
                metadata={
                    "filename": filename,
                    "chunk_index": i,
                    "length": len(chunk),
                },
            )
            nodes.append(node)
            chunk_texts.append(chunk)

        # Perform batch embedding: one API call per file, not per chunk.
        embeddings_batch = embeddings.embed_documents(chunk_texts)
        for node, vector in zip(nodes, embeddings_batch):
            node.embedding = vector

        # Store all chunks in a single batched add (Chroma accepts parallel
        # lists) instead of one round-trip per chunk; ids keep the original
        # "<filename>_<chunk_index>" scheme.
        chroma_collection.add(
            documents=[n.text for n in nodes],
            embeddings=embeddings_batch,
            metadatas=[n.metadata for n in nodes],
            ids=[f"{filename}_{n.metadata['chunk_index']}" for n in nodes],
        )

    return "Files have been successfully processed and embedded!"
68
+
# Function to query ChromaDB and retrieve relevant documents
def query_documents(user_query):
    """Embed *user_query* and return the top-3 matching chunks as one string."""
    query_embedding = embeddings.embed_query(user_query)

    # Retrieve the three nearest chunks from the vector store.
    results = chroma_collection.query(
        query_embeddings=[query_embedding],
        n_results=3,
    )

    # Format each hit; join once at the end rather than concatenating
    # incrementally.
    docs = results['documents'][0]
    metas = results['metadatas'][0]
    sections = [
        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
        for doc, metadata in zip(docs, metas)
    ]
    return "".join(sections)
84
+
# Gradio UI. The original used gr.inputs.* / gr.outputs.* (removed in
# Gradio 3.x+) and passed a *list* of callables plus a duplicated output
# component to gr.Interface, which gr.Interface does not support — the
# script could not run. A Blocks layout wires each function to its own
# button with the current component API, preserving both features.
with gr.Blocks(title="PDF Document Embedding and Query") as demo:
    gr.Markdown(
        "Upload PDF files to embed them and then query to retrieve relevant documents."
    )

    # Upload-and-index section.
    pdf_input = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    process_btn = gr.Button("Process documents")
    status_output = gr.Textbox(label="Status")
    process_btn.click(process_documents, inputs=pdf_input, outputs=status_output)

    # Query section.
    query_input = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    query_btn = gr.Button("Search")
    query_output = gr.Textbox(label="Results")
    query_btn.click(query_documents, inputs=query_input, outputs=query_output)

demo.launch()