Spaces:
Build error
Build error
| import gradio as gr | |
| import os | |
| from PyPDF2 import PdfReader | |
| from llama_index.core.schema import TextNode | |
| from langchain_google_genai import GoogleGenerativeAIEmbeddings | |
| import chromadb | |
# Configure credentials: read the Google API key from the environment instead
# of hard-coding it. SECURITY: the original embedded a live API key in source
# control — that key is compromised and must be revoked/rotated.
if "GOOGLE_API_KEY" not in os.environ:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before launching.")

# Initialize the ChromaDB client and collection.
# get_or_create_collection avoids a crash when the collection already exists
# (create_collection raises on a duplicate name, e.g. after a hot reload).
chroma_client = chromadb.Client()
chroma_collection = chroma_client.get_or_create_collection("user_uploaded_docs")
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: a path or binary file object readable by PyPDF2's PdfReader.

    Returns:
        All page text joined into one string; pages with no extractable text
        contribute the empty string.
    """
    reader = PdfReader(pdf_file)
    # extract_text() can return None for image-only/scanned pages; the original
    # "text += page.extract_text()" raised TypeError there. "or ''" guards that,
    # and join avoids quadratic string concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Chunk text into smaller pieces
def chunk_text(text, max_length=2500):
    """Split *text* into consecutive pieces of at most *max_length* characters.

    The final piece may be shorter; an empty string yields an empty list.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + max_length])
        start += max_length
    return pieces
# Initialize the Google Generative AI embedding model used for both
# document indexing and query embedding.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Function to handle the embedding process and store in ChromaDB
def process_documents(pdf_files):
    """Extract, chunk, embed, and index each uploaded PDF into ChromaDB.

    Args:
        pdf_files: iterable of uploaded file objects (each exposing ``.name``).

    Returns:
        A status string shown in the Gradio output box.
    """
    for pdf_file in pdf_files:
        # Extract and chunk the PDF's text.
        pdf_text = extract_text_from_pdf(pdf_file)
        chunks = chunk_text(pdf_text)
        # Hoisted: the original recomputed basename once per chunk.
        filename = os.path.basename(pdf_file.name)

        # Build one TextNode per chunk. (The original also kept a separate
        # "chunk_embeddings" list that merely duplicated `chunks` — removed.)
        nodes = [
            TextNode(
                text=chunk,
                metadata={
                    "filename": filename,
                    "chunk_index": i,
                    "length": len(chunk),
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        # Embed all chunk texts in a single batch call (one API round-trip
        # per file instead of one per chunk).
        embeddings_batch = embeddings.embed_documents(chunks)

        # Store each chunk with its embedding in ChromaDB.
        # NOTE(review): ids are "<filename>_<index>", so re-uploading the same
        # file collides with the existing entries — consider deleting old ids
        # for that filename first.
        for i, (node, vector) in enumerate(zip(nodes, embeddings_batch)):
            node.embedding = vector
            chroma_collection.add(
                documents=[node.text],
                embeddings=[node.embedding],
                metadatas=[node.metadata],
                ids=[f"{filename}_{i}"],
            )
    return "Files have been successfully processed and embedded!"
# Function to query ChromaDB and retrieve relevant documents
def query_documents(user_query):
    """Embed *user_query* and return the top-3 matching chunks from ChromaDB.

    Returns a human-readable string with one section per retrieved chunk
    (empty if nothing matched).
    """
    query_embedding = embeddings.embed_query(user_query)
    # Perform the query on ChromaDB; results are nested one level per query.
    results = chroma_collection.query(
        query_embeddings=[query_embedding],
        n_results=3,  # Return the top 3 most relevant documents
    )
    sections = [
        f"Document: {metadata['filename']}, Chunk {metadata['chunk_index']}:\n{doc}\n\n"
        for doc, metadata in zip(results['documents'][0], results['metadatas'][0])
    ]
    return "".join(sections)
# Gradio interface combining document upload and query features
with gr.Blocks() as demo:
    # Upload + indexing controls.
    file_picker = gr.File(file_count="multiple", label="Upload up to 10 PDF files")
    index_button = gr.Button("Process PDFs")
    index_status = gr.Textbox(label="wait before success message for the document process")

    # Retrieval controls.
    question_box = gr.Textbox(label="Enter your query", placeholder="Type a question here...")
    search_button = gr.Button("Query Documents")
    results_box = gr.Textbox(label="retrieved documents")

    # Wire buttons to the processing and retrieval functions.
    index_button.click(process_documents, inputs=[file_picker], outputs=[index_status])
    search_button.click(query_documents, inputs=[question_box], outputs=[results_box])

demo.launch()