File size: 3,492 Bytes
b5a0d75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import gradio as gr
import os
from PyPDF2 import PdfReader
from llama_index.core.schema import TextNode
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import chromadb

# --- Configuration & vector-store setup -------------------------------------
# SECURITY(review): a live API key must never be committed to source control —
# rotate this key and supply it via the environment or a secrets manager.
# setdefault lets a key already present in the environment take precedence
# while keeping the script runnable as before.
os.environ.setdefault("GOOGLE_API_KEY", "AIzaSyBlEd_7R6jzUVx40Bt-W6J8ilP4zoiOKu0")

# In-memory ChromaDB client plus the collection that holds user uploads.
# get_or_create_collection avoids the "collection already exists" error that
# plain create_collection raises when the module is re-imported or re-run.
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Parameters
    ----------
    pdf_file : path or file-like object accepted by ``PyPDF2.PdfReader``.

    Returns
    -------
    str
        All extractable page text joined together. Pages with no
        extractable text (e.g. scanned/image-only pages) contribute
        nothing instead of crashing.
    """
    reader = PdfReader(pdf_file)
    # extract_text() may return None for image-only pages; `or ""` prevents
    # a TypeError during concatenation. join() avoids quadratic str +=.
    return "".join((page.extract_text() or "") for page in reader.pages)

# Chunk text into smaller pieces
def chunk_text(text, max_length=2500):
    """Split *text* into consecutive slices of at most *max_length* characters.

    Returns a list of substrings in original order; an empty input yields
    an empty list. The final slice may be shorter than *max_length*.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + max_length])
        start += max_length
    return pieces

# Initialize the embedding model
# Google Generative AI embedding backend used for both document chunks
# (embed_documents) and queries (embed_query) below. Reads the
# GOOGLE_API_KEY environment variable set above.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Function to handle the embedding process and store in ChromaDB
def process_documents(pdf_files):
    """Extract, chunk, embed, and index every uploaded PDF into ChromaDB.

    Parameters
    ----------
    pdf_files : iterable of uploaded file objects (each has a ``.name``
        attribute, as provided by the Gradio File component).

    Returns
    -------
    str
        A status message for display in the Gradio UI.
    """
    for pdf_file in pdf_files:
        # Extract and chunk the raw text of this PDF.
        pdf_text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(pdf_text)
        filename = os.path.basename(pdf_file.name)

        # Build one metadata-carrying node per chunk.
        nodes = [
            TextNode(
                text=chunk,
                metadata={
                    "filename": filename,
                    "chunk_index": i,
                    "length": len(chunk),
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        # Embed all chunks in a single batched call (one API round-trip).
        # The original buffered the texts into a second, misnamed list
        # (`chunk_embeddings`); embedding `chunks` directly is equivalent.
        embeddings_batch = embeddings.embed_documents(chunks)

        # Store each chunk with its embedding in ChromaDB.
        # NOTE(review): ids are "<filename>_<index>", so re-uploading a file
        # with the same name reuses the same ids — confirm that overwrite/
        # duplicate behavior is intended.
        for i, (node, vector) in enumerate(zip(nodes, embeddings_batch)):
            node.embedding = vector
            chroma_collection.add(
                documents=[node.text],
                embeddings=[node.embedding],
                metadatas=[node.metadata],
                ids=[f"{node.metadata['filename']}_{i}"],
            )

    return "Files have been successfully processed and embedded!"

# Function to query ChromaDB and retrieve relevant documents
def query_documents(user_query):
    """Embed *user_query*, fetch the 3 nearest chunks from ChromaDB, and
    return them formatted as a single display string."""
    query_embedding = embeddings.embed_query(user_query)

    # Perform the query on ChromaDB
    results = chroma_collection.query(
        query_embeddings=[query_embedding],
        n_results=3  # Return the top 3 most relevant documents
    )

    # Chroma returns parallel per-query lists; index 0 is our single query.
    sections = [
        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
        for doc, metadata in zip(results['documents'][0], results['metadatas'][0])
    ]
    return "".join(sections)


# Gradio interface combining document upload and query features
with gr.Blocks() as demo:
    pdf_input = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    process_btn = gr.Button("Process PDFs")
    process_output = gr.Textbox(label="wait before success message for the document process")
    query_input = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    
   
    query_btn = gr.Button("Query Documents")
    
    query_output = gr.Textbox(label="retrieved documents")
    
    process_btn.click(process_documents, inputs=[pdf_input], outputs=[process_output])
    query_btn.click(query_documents, inputs=[query_input], outputs=[query_output])

demo.launch()