senthil3226w commited on
Commit
df76a85
·
verified ·
1 Parent(s): d06a1db

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import os
from PyPDF2 import PdfReader
from llama_index.core.schema import TextNode
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import chromadb

# SECURITY: the original code hard-coded a Google API key here. Never commit
# secrets to source control — the leaked key must be revoked. Read the key
# from the environment (e.g. a Space/deployment secret) and fail fast with a
# clear message if it is missing, instead of a confusing downstream auth error.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set; "
        "configure it as a deployment secret before starting the app."
    )

# Initialize the ChromaDB client and collection.
chroma_client = chromadb.Client()
# get_or_create_collection avoids a ValueError when the module is re-imported
# or the script is re-run in the same process (create_collection raises if
# the collection already exists).
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")
13
+
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of ``pdf_file``.

    Args:
        pdf_file: a path or binary file object accepted by ``PdfReader``.

    Returns:
        All extractable page text joined into one string (may be empty).

    ``page.extract_text()`` returns None for pages with no extractable text
    (e.g. scanned/image-only pages); the original ``text += ...`` would raise
    TypeError there, so coerce None to "". ``join`` also avoids the quadratic
    string-concatenation loop.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)
21
+
# Chunk text into smaller pieces
def chunk_text(text, max_length=2500):
    """Split *text* into consecutive slices of at most *max_length* chars.

    Returns an empty list for empty input; the final chunk may be shorter
    than *max_length*.
    """
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + max_length])
        start += max_length
    return chunks
25
+
# Initialize the embedding model (Google Generative AI embeddings; relies on
# the GOOGLE_API_KEY environment variable set above). Created once at module
# level so both indexing and querying share the same model instance.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
28
+
# Function to handle the embedding process and store in ChromaDB
def process_documents(pdf_files):
    """Extract, chunk, embed and index each uploaded PDF in ChromaDB.

    Args:
        pdf_files: iterable of uploaded file objects (each exposes ``.name``,
            the original file path — assumes Gradio's file-object upload
            format; TODO confirm against the UI wiring).

    Returns:
        A status message string for the UI.
    """
    for pdf_file in pdf_files:
        # Extract text from the PDF.
        pdf_text = extract_text_from_pdf(pdf_file)

        # Chunk the extracted text; skip PDFs with nothing extractable
        # (e.g. scanned/image-only documents) instead of embedding nothing.
        chunks = chunk_text(pdf_text)
        if not chunks:
            continue

        # Hoist the filename lookup out of the per-chunk loop.
        filename = os.path.basename(pdf_file.name)

        # Build one TextNode per chunk. NOTE: the original named this list
        # `chunk_embeddings` even though it holds the raw chunk *texts*
        # that are sent to the embedder — renamed for clarity.
        nodes = []
        chunk_texts = []
        for i, chunk in enumerate(chunks):
            node = TextNode(
                text=chunk,
                metadata={
                    "filename": filename,
                    "chunk_index": i,
                    "length": len(chunk),
                },
            )
            nodes.append(node)
            chunk_texts.append(chunk)

        # Perform batch embedding: one API call per file, not per chunk.
        embeddings_batch = embeddings.embed_documents(chunk_texts)
        for node, vector in zip(nodes, embeddings_batch):
            node.embedding = vector

        # Store all chunks in a single batched add (Chroma accepts parallel
        # lists) instead of one round-trip per chunk; ids keep the original
        # "<filename>_<chunk_index>" scheme.
        chroma_collection.add(
            documents=[n.text for n in nodes],
            embeddings=embeddings_batch,
            metadatas=[n.metadata for n in nodes],
            ids=[f"{filename}_{n.metadata['chunk_index']}" for n in nodes],
        )

    return "Files have been successfully processed and embedded!"
68
+
# Function to query ChromaDB and retrieve relevant documents
def query_documents(user_query):
    """Embed *user_query* and return the top-3 matching chunks as one string."""
    query_embedding = embeddings.embed_query(user_query)

    # Retrieve the three nearest chunks from the vector store.
    results = chroma_collection.query(
        query_embeddings=[query_embedding],
        n_results=3,
    )

    # Format each hit; join once at the end rather than concatenating
    # incrementally.
    docs = results['documents'][0]
    metas = results['metadatas'][0]
    sections = [
        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
        for doc, metadata in zip(docs, metas)
    ]
    return "".join(sections)
84
+
# Gradio UI. The original used gr.inputs.* / gr.outputs.* (removed in
# Gradio 3.x+) and passed a *list* of callables plus a duplicated output
# component to gr.Interface, which gr.Interface does not support — the
# script could not run. A Blocks layout wires each function to its own
# button with the current component API, preserving both features.
with gr.Blocks(title="PDF Document Embedding and Query") as demo:
    gr.Markdown(
        "Upload PDF files to embed them and then query to retrieve relevant documents."
    )

    # Upload-and-index section.
    pdf_input = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    process_btn = gr.Button("Process documents")
    status_output = gr.Textbox(label="Status")
    process_btn.click(process_documents, inputs=pdf_input, outputs=status_output)

    # Query section.
    query_input = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    query_btn = gr.Button("Search")
    query_output = gr.Textbox(label="Results")
    query_btn.click(query_documents, inputs=query_input, outputs=query_output)

demo.launch()