|
|
import gradio as gr |
|
|
import os |
|
|
from PyPDF2 import PdfReader |
|
|
from llama_index.core.schema import TextNode |
|
|
from langchain_google_genai import GoogleGenerativeAIEmbeddings |
|
|
import chromadb |
|
|
|
|
|
# SECURITY: never commit API keys to source control — the previous version
# hardcoded a Google API key here. The key must now be supplied via the
# environment (e.g. `export GOOGLE_API_KEY=...`) before starting the app.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it before starting the app."
    )

# In-memory Chroma instance; indexed data is lost when the process exits.
chroma_client = chromadb.Client()

# get_or_create avoids the "collection already exists" error that
# create_collection raises when this module is re-imported or re-run.
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Parameters
    ----------
    pdf_file : str | file-like
        A path or binary file object accepted by ``PyPDF2.PdfReader``.

    Returns
    -------
    str
        All extracted page text joined together; empty string for PDFs
        with no extractable text (e.g. scanned image-only documents).
    """
    reader = PdfReader(pdf_file)
    # extract_text() can return None for pages with no extractable text;
    # `or ""` prevents a TypeError during concatenation. join() also
    # avoids the quadratic cost of repeated `+=` on large documents.
    return "".join(page.extract_text() or "" for page in reader.pages)
|
|
|
|
|
|
|
|
def chunk_text(text, max_length=2500):
    """Split *text* into consecutive slices of at most *max_length* chars.

    Returns the slices as a list in original order; an empty input
    yields an empty list.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + max_length])
        start += max_length
    return pieces
|
|
|
|
|
|
|
|
# Shared embedding model used for both document indexing and query vectors;
# reads GOOGLE_API_KEY from the environment when invoked.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
|
|
|
|
|
|
|
|
def process_documents(pdf_files):
    """Extract, chunk, embed and index each uploaded PDF into Chroma.

    Parameters
    ----------
    pdf_files : list
        Uploaded file objects (each exposing a ``.name`` path attribute,
        as Gradio's File component provides).

    Returns
    -------
    str
        A status message shown in the UI.
    """
    for pdf_file in pdf_files:
        pdf_text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(pdf_text)

        # Skip files with no extractable text: embedding an empty batch
        # would be a wasted API call, and Chroma rejects empty adds.
        if not chunks:
            continue

        filename = os.path.basename(pdf_file.name)
        metadatas = [
            {"filename": filename, "chunk_index": i, "length": len(chunk)}
            for i, chunk in enumerate(chunks)
        ]

        # One batched embedding request per file (the old code collected
        # the raw texts in a list confusingly named `chunk_embeddings`).
        chunk_vectors = embeddings.embed_documents(chunks)

        # Single batched insert instead of one add() call per chunk.
        # Ids are filename-scoped, so re-uploading the same file upserts
        # rather than duplicates.
        # NOTE(review): two different files sharing a basename will still
        # collide on ids — confirm whether that is acceptable upstream.
        chroma_collection.add(
            documents=chunks,
            embeddings=chunk_vectors,
            metadatas=metadatas,
            ids=[f"{filename}_{i}" for i in range(len(chunks))],
        )

    return "Files have been successfully processed and embedded!"
|
|
|
|
|
|
|
|
def query_documents(user_query):
    """Retrieve the 3 stored chunks most similar to *user_query*.

    Parameters
    ----------
    user_query : str
        Free-text question typed by the user.

    Returns
    -------
    str
        A human-readable listing of the matching chunks (empty string
        when the collection returns no matches).
    """
    query_vector = embeddings.embed_query(user_query)

    results = chroma_collection.query(
        query_embeddings=[query_vector],
        n_results=3,
    )

    # results['documents'] / ['metadatas'] are lists-of-lists, one inner
    # list per query embedding; we sent exactly one query, hence [0].
    hits = zip(results['documents'][0], results['metadatas'][0])
    sections = [
        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
        for doc, metadata in hits
    ]
    return "".join(sections)
|
|
|
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    pdf_input = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    process_btn = gr.Button("Process PDFs")
    # Reworded: the original label was garbled English
    # ("wait before success message for the document process").
    process_output = gr.Textbox(label="Processing status")
    query_input = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    query_btn = gr.Button("Query Documents")
    query_output = gr.Textbox(label="retrieved documents")

    # Wire buttons to the backend functions.
    process_btn.click(process_documents, inputs=[pdf_input], outputs=[process_output])
    query_btn.click(query_documents, inputs=[query_input], outputs=[query_output])

# Launch only when executed as a script, not when imported (e.g. by tests).
if __name__ == "__main__":
    demo.launch()