# Hugging Face Space: Gradio app for asking questions about an uploaded PDF.
import os

import gradio as gr
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb

# Sentence-embedding model, shared by indexing (process_pdf) and search (query_pdf).
model = SentenceTransformer("all-MiniLM-L6-v2")

# On-disk Chroma store so embeddings persist between app restarts.
CHROMA_PATH = "./chroma_store"
client = chromadb.PersistentClient(path=CHROMA_PATH)

# Set by process_pdf() once a PDF has been indexed; None until then.
collection = None
def process_pdf(pdf_file):
    """Load an uploaded PDF, chunk it, embed the chunks, and index them in Chroma.

    Replaces any previously indexed PDF (the old collection is dropped).

    Args:
        pdf_file: Gradio file object for the uploaded PDF, or None/falsy if
            nothing was uploaded.

    Returns:
        A human-readable status string describing success or the failure reason.
    """
    global collection
    try:
        if not pdf_file:
            return "β Please upload a valid PDF file."
        pdf_path = pdf_file.name  # Gradio exposes the temp-file path via .name
        # Load the PDF into page-level documents.
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        if not documents:
            return "β PDF loaded but no content found."
        # Split into overlapping chunks sized for retrieval.
        splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=80)
        chunks = splitter.split_documents(documents)
        texts = [doc.page_content for doc in chunks if doc.page_content.strip()]
        if not texts:
            return "β Failed to extract valid text from PDF."
        # Embed every chunk in one batch.
        embeddings = model.encode(texts)
        # Reset the collection so each upload starts from a clean index.
        collection_name = "pdf-documents"
        try:
            client.delete_collection(collection_name)
        except Exception:
            # Collection may simply not exist yet; deletion failure is non-fatal.
            # (Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.)
            pass
        collection = client.create_collection(collection_name)
        collection.add(
            documents=texts,
            embeddings=embeddings.tolist(),
            metadatas=[{"source": "pdf"} for _ in texts],
            ids=[f"chunk-{i}" for i in range(len(texts))],
        )
        return f"β Successfully processed {len(texts)} chunks."
    except Exception as e:
        # Surface the error message in the UI rather than crashing the app.
        return f"β Error: {str(e)}"
def query_pdf(query):
    """Return the 3 indexed PDF chunks most similar to *query*.

    Args:
        query: Free-text question from the user.

    Returns:
        The matching chunk texts joined by a separator, or a status string if
        no PDF has been indexed, nothing matched, or the query failed.
    """
    # Fix: compare against the None sentinel explicitly instead of relying on
    # the truthiness of a third-party collection object.
    if collection is None:
        return "β No PDF has been processed yet."
    try:
        query_embedding = model.encode([query])
        results = collection.query(
            query_embeddings=query_embedding.tolist(),
            n_results=3,
        )
        # Chroma returns one result list per query embedding; we sent one query.
        docs = results.get("documents", [[]])[0]
        if not docs:
            return "β No matching results found."
        return "\n\n---\n\n".join(docs)
    except Exception as e:
        # Surface the error message in the UI rather than crashing the app.
        return f"β Query error: {str(e)}"
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# π Ask Questions About Your PDF")

    # Upload row: file picker plus a button that triggers indexing.
    with gr.Row():
        pdf_input = gr.File(label="π Upload PDF", file_types=[".pdf"])
        upload_button = gr.Button("π€ Process PDF")
    status_output = gr.Textbox(label="Status")
    upload_button.click(fn=process_pdf, inputs=pdf_input, outputs=status_output)

    # Question/answer pair: submitting the textbox runs a similarity search.
    question_input = gr.Textbox(label="β Ask a question")
    answer_output = gr.Textbox(label="π¬ Answer")
    question_input.submit(fn=query_pdf, inputs=question_input, outputs=answer_output)

demo.launch()