NigeethaR commited on
Commit
b5a0d75
·
verified ·
1 Parent(s): 015f04f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -0
app.py CHANGED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from PyPDF2 import PdfReader
4
+ from llama_index.core.schema import TextNode
5
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
6
+ import chromadb
7
+
8
# SECURITY: a real Google API key was previously hard-coded on this line and
# committed to source control — it is public and must be revoked. Read the key
# from the environment instead (e.g. a Hugging Face Spaces secret).
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before launching the app."
    )

# Initialize the ChromaDB client and collection.
chroma_client = chromadb.Client()
# get_or_create_collection avoids a crash on re-run in the same process:
# create_collection raises if "user_uploaded_docs" already exists.
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")
13
+
14
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: A path or file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        str: All extractable page text joined together. Pages with no
        extractable text (e.g. scanned images) are skipped.
    """
    reader = PdfReader(pdf_file)
    parts = []
    for page in reader.pages:
        # extract_text() returns None for pages without a text layer;
        # the original `text += page.extract_text()` crashed on those.
        page_text = page.extract_text()
        if page_text:
            parts.append(page_text)
    # join() avoids quadratic string concatenation on large PDFs.
    return "".join(parts)
21
+
22
# Chunk text into smaller pieces
def chunk_text(text, max_length=2500):
    """Split *text* into consecutive chunks of at most *max_length* chars.

    Returns a list of substrings covering *text* in order; the final chunk
    may be shorter. An empty string yields an empty list.
    """
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        chunks.append(text[start:start + max_length])
        start += max_length
    return chunks
25
+
26
# Initialize the embedding model (Google Generative AI embeddings; the
# client reads GOOGLE_API_KEY from the environment).
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
28
+
29
# Function to handle the embedding process and store in ChromaDB
def process_documents(pdf_files):
    """Extract, chunk, embed, and index each uploaded PDF in ChromaDB.

    Args:
        pdf_files: Iterable of uploaded file objects (each exposing ``.name``,
            as Gradio's File component provides).

    Returns:
        str: A success message for display in the UI.
    """
    for pdf_file in pdf_files:
        # Extract text from the PDF and split it into embedding-sized chunks.
        pdf_text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(pdf_text)
        if not chunks:
            # Skip PDFs with no extractable text (e.g. scanned images) —
            # embed_documents([]) would otherwise be called with no input.
            continue

        filename = os.path.basename(pdf_file.name)

        # Batch-embed all chunks for this file in a single API call.
        embeddings_batch = embeddings.embed_documents(chunks)

        metadatas = [
            {"filename": filename, "chunk_index": i, "length": len(chunk)}
            for i, chunk in enumerate(chunks)
        ]
        # NOTE(review): ids collide if two uploads share a filename — the
        # second add() will fail; consider a per-upload unique prefix.
        ids = [f"{filename}_{i}" for i in range(len(chunks))]

        # One batched insert instead of one add() call per chunk.
        chroma_collection.add(
            documents=chunks,
            embeddings=embeddings_batch,
            metadatas=metadatas,
            ids=ids,
        )

    return "Files have been successfully processed and embedded!"
68
+
69
# Function to query ChromaDB and retrieve relevant documents
def query_documents(user_query):
    """Embed *user_query* and return the top matching chunks from ChromaDB.

    Returns a display string with one "Document: <file>, Chunk <i>:" section
    per retrieved chunk; empty string when nothing is indexed.
    """
    query_embedding = embeddings.embed_query(user_query)

    # Perform the query on ChromaDB
    results = chroma_collection.query(
        query_embeddings=[query_embedding],
        n_results=3  # Return the top 3 most relevant documents
    )

    docs = results['documents'][0]
    metas = results['metadatas'][0]
    sections = [
        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
        for doc, metadata in zip(docs, metas)
    ]
    return "".join(sections)
84
+
85
+
86
# Gradio interface combining document upload and query features
with gr.Blocks() as demo:
    # Upload + processing controls.
    pdf_input = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    process_btn = gr.Button("Process PDFs")
    process_output = gr.Textbox(label="wait before success message for the document process")

    # Query controls.
    query_input = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    query_btn = gr.Button("Query Documents")
    query_output = gr.Textbox(label="retrieved documents")

    # Wire the buttons to the indexing / retrieval callbacks.
    process_btn.click(process_documents, inputs=[pdf_input], outputs=[process_output])
    query_btn.click(query_documents, inputs=[query_input], outputs=[query_output])

demo.launch()