# pdfs / app.py
# iamomtiwari's picture
# Update app.py
# ebfca53 verified
import os
import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
# Load the sentence-embedding model once at import time; process_pdf and
# query_pdf both call model.encode() on this shared instance.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Directory passed to the persistent Chroma client as its storage path
CHROMA_PATH = "./chroma_store"
client = chromadb.PersistentClient(path=CHROMA_PATH)
# Module-level handle to the current Chroma collection: None until
# process_pdf creates one; query_pdf reads it to answer questions.
collection = None
def _resolve_pdf_path(pdf_file):
    """Return the on-disk path of an uploaded file, or None if it has none."""
    # The upload may arrive as a plain path string or as an object exposing
    # the path through a ``name`` attribute; accept both shapes.
    if isinstance(pdf_file, str):
        return pdf_file
    return getattr(pdf_file, "name", None)


def process_pdf(pdf_file):
    """Index an uploaded PDF into the "pdf-documents" Chroma collection.

    The PDF is loaded, split into overlapping text chunks, embedded with the
    module-level ``model`` and stored in a freshly created collection that
    replaces whatever was indexed before. Only after the chunks have been
    added successfully is the module-level ``collection`` pointed at the new
    collection.

    Args:
        pdf_file: The value delivered by the ``gr.File`` input: a path
            string, an object whose ``name`` attribute holds the path, or
            ``None`` when nothing was uploaded.

    Returns:
        A human-readable status string for the UI. Failures are reported in
        the returned text instead of being raised.
    """
    global collection
    try:
        pdf_path = _resolve_pdf_path(pdf_file)
        if not pdf_path:
            return "❌ Please upload a valid PDF file."

        # Load PDF
        documents = PyPDFLoader(pdf_path).load()
        if not documents:
            return "❌ PDF loaded but no content found."

        # Split into chunks and drop the ones that are only whitespace
        splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80)
        chunks = splitter.split_documents(documents)
        texts = [doc.page_content for doc in chunks if doc.page_content.strip()]
        if not texts:
            return "❌ Failed to extract valid text from PDF."

        # Get embeddings
        embeddings = model.encode(texts)

        # Reset collection. Clear the shared handle before deleting the
        # backing collection so query_pdf never sees a handle to a deleted
        # or half-populated collection if anything below fails.
        collection_name = "pdf-documents"
        collection = None
        try:
            client.delete_collection(collection_name)
        except Exception:
            # Best effort: there is nothing to delete on the first run.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            pass

        new_collection = client.create_collection(collection_name)
        new_collection.add(
            documents=texts,
            embeddings=embeddings.tolist(),
            metadatas=[{"source": "pdf"} for _ in texts],
            ids=[f"chunk-{i}" for i in range(len(texts))],
        )
        # Publish the collection only once it is fully populated.
        collection = new_collection
        return f"✅ Successfully processed {len(texts)} chunks."
    except Exception as e:
        return f"❌ Error: {e}"
def query_pdf(query):
    """Return the indexed PDF chunks most similar to *query*.

    The question is embedded with the module-level ``model`` and used to
    fetch up to three nearest chunks from the module-level ``collection``.

    Args:
        query: The question text typed by the user; may be ``None`` or blank.

    Returns:
        The matching chunks joined by a ``---`` separator, or a status
        message when the question is blank, no PDF has been indexed, nothing
        matches, or the lookup fails. Errors are reported in the returned
        text instead of being raised.
    """
    # Validate the input first: embedding an empty question is meaningless.
    if not query or not query.strip():
        return "❌ Please enter a question."
    # Compare against None explicitly rather than relying on the truthiness
    # of the collection object.
    if collection is None:
        return "❌ No PDF has been processed yet."
    try:
        query_embedding = model.encode([query])
        results = collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=3,
        )
        # A single embedding was sent, so only the first result list is
        # relevant; tolerate a missing, None or empty "documents" entry.
        hits_per_query = results.get("documents") or [[]]
        docs = hits_per_query[0]
        if not docs:
            return "❌ No matching results found."
        return "\n\n---\n\n".join(docs)
    except Exception as e:
        return f"❌ Query error: {e}"
# Gradio UI: upload and index a PDF, then ask questions against its chunks.
with gr.Blocks() as demo:
    gr.Markdown("# 📘 Ask Questions About Your PDF")

    # Upload row: pick a PDF and trigger indexing; the result of
    # process_pdf is shown in the status box.
    with gr.Row():
        pdf_input = gr.File(label="📄 Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("📀 Process PDF")
    status_output = gr.Textbox(label="Status")
    upload_button.click(fn=process_pdf, inputs=pdf_input, outputs=status_output)

    # Question box: submitting it runs query_pdf and shows the retrieved
    # chunks in the answer box.
    question_input = gr.Textbox(label="❓ Ask a question")
    answer_output = gr.Textbox(label="💬 Answer")
    question_input.submit(fn=query_pdf, inputs=question_input, outputs=answer_output)

# Start the server only when run as a script, so importing this module
# (e.g. from tests or tooling) does not launch the app.
if __name__ == "__main__":
    demo.launch()