Spaces:

pkraman06
/

PDF_Question_Answering_Bot

Sleeping

App Files Files Community

PDF_Question_Answering_Bot / app.py

pkraman06

Update app.py

deac4cd verified about 1 month ago

Raw

History Blame Contribute Delete

4.43 kB

	import os
	import gradio as gr

	from langchain_community.document_loaders import PyPDFLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	# Imported Chroma from the dedicated langchain_chroma package
	from langchain_chroma import Chroma

	from langchain_huggingface import (
	HuggingFaceEmbeddings,
	HuggingFaceEndpoint
	)

	# =====================================================
	# HUGGING FACE TOKEN
	# =====================================================

	# Hugging Face access token, taken from the environment (None when unset).
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# =====================================================
	# EMBEDDING MODEL
	# =====================================================

	# Sentence-transformers model pinned to CPU; the encoder is asked to
	# normalize the embeddings it produces.
	embedding_model = HuggingFaceEmbeddings(
	    model_name="sentence-transformers/all-MiniLM-L6-v2",
	    model_kwargs={"device": "cpu"},
	    encode_kwargs={"normalize_embeddings": True},
	)

	# =====================================================
	# LLM MODEL
	# =====================================================

	# Text-generation endpoint that ask_question() sends its prompt to.
	llm = HuggingFaceEndpoint(
	    repo_id="NousResearch/Llama-2-7b-chat-hf",
	    task="text-generation",
	    huggingfacehub_api_token=HF_TOKEN,
	    temperature=0.5,
	    max_new_tokens=512,
	)

	# =====================================================
	# VECTOR DATABASE
	# =====================================================

	# Module-level vector store handle; stays None until process_pdf() builds one.
	db = None

	# =====================================================
	# PDF PROCESSING FUNCTION
	# =====================================================
	def process_pdf(pdf_file):
	    """Load an uploaded PDF, split it into chunks and index them in Chroma.

	    On success the module-level ``db`` is replaced by a store that contains
	    only the chunks of this PDF.

	    Args:
	        pdf_file: The value supplied by the upload component: either an object
	            exposing the file path as ``.name`` or a plain path string.
	            ``None`` means nothing was uploaded.

	    Returns:
	        A status message for the UI. Failures are reported as text rather
	        than raised, and leave the previously indexed store (if any) in place.
	    """
	    # Local import: only this function needs it.
	    import uuid

	    global db

	    if pdf_file is None:
	        return "Please upload a PDF file."

	    try:
	        # Accept file-like uploads (path in ``.name``) and plain path strings.
	        pdf_path = getattr(pdf_file, "name", pdf_file)

	        documents = PyPDFLoader(pdf_path).load()

	        splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1000,
	            chunk_overlap=200,
	        )
	        docs = splitter.split_documents(documents)

	        # Nothing to index (e.g. a PDF with no extractable text): say so
	        # explicitly instead of handing an empty list to the vector store.
	        if not docs:
	            return "Error processing PDF: no extractable text was found."

	        # Index under a fresh collection name so chunks from a previously
	        # uploaded PDF cannot end up in the same collection as this one.
	        new_db = Chroma.from_documents(
	            documents=docs,
	            embedding=embedding_model,
	            collection_name=f"pdf_{uuid.uuid4().hex}",
	        )
	    except Exception as e:
	        return f"Error processing PDF: {str(e)}"

	    # Swap in the new store only after indexing succeeded, then drop the old
	    # collection so repeated uploads do not accumulate in memory.
	    previous_db, db = db, new_db
	    if previous_db is not None:
	        try:
	            previous_db.delete_collection()
	        except Exception as e:
	            # Best-effort cleanup: a stale collection only costs memory and
	            # must not turn a successful upload into a failure.
	            print(f"Warning: could not delete previous collection: {e}")

	    return "PDF processed and indexed in ChromaDB successfully!"

	# =====================================================
	# QUESTION ANSWERING FUNCTION
	# =====================================================
	def ask_question(question):
	    """Answer a question from the chunks retrieved out of the indexed PDF.

	    Args:
	        question: The user's question text. ``None`` or blank input is
	            rejected with a message instead of being sent to the model.

	    Returns:
	        The value returned by ``llm.invoke`` for the assembled prompt, or a
	        message explaining why no answer was produced. Errors raised during
	        retrieval or generation are reported as text rather than raised.
	    """
	    if db is None:
	        return "Please upload and process a PDF first."

	    # Check for None as well as blank text so .strip() cannot raise here.
	    if question is None or not question.strip():
	        return "Please enter a question."

	    try:
	        # Retrieve the three most similar chunks from the vector store.
	        docs = db.similarity_search(question, k=3)

	        # Nothing retrieved means there is no context to answer from, so
	        # skip the model call and return the same fallback the prompt asks for.
	        if not docs:
	            return "I could not find the answer in the PDF."

	        context = "\n\n".join(doc.page_content for doc in docs)

	        # Assemble the prompt from explicit lines so its text does not depend
	        # on how this source file happens to be indented.
	        prompt = "\n".join([
	            "You are a helpful PDF question answering assistant.",
	            "",
	            "Answer the question ONLY from the provided context.",
	            "",
	            "If the answer is not in the context, say:",
	            '"I could not find the answer in the PDF."',
	            "",
	            "Context:",
	            context,
	            "",
	            "Question:",
	            question,
	            "",
	            "Answer:",
	        ])

	        return llm.invoke(prompt)
	    except Exception as e:
	        return f"Error generating answer: {str(e)}"

	# =====================================================
	# GRADIO UI
	# =====================================================

	with gr.Blocks() as demo:
	    gr.Markdown("# PDF Question Answering Bot (Powered by ChromaDB)")

	    # Components are created in display order: upload section first,
	    # question section second. Event wiring follows once they all exist.
	    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
	    process_btn = gr.Button("Process PDF")
	    process_output = gr.Textbox(label="PDF Status")

	    question_input = gr.Textbox(label="Ask a Question")
	    ask_btn = gr.Button("Get Answer")
	    answer_output = gr.Textbox(label="Answer", lines=10)

	    # "Process PDF" indexes the upload and shows the status message.
	    process_btn.click(fn=process_pdf, inputs=pdf_input, outputs=process_output)

	    # "Get Answer" sends the question to ask_question() and shows its reply.
	    ask_btn.click(fn=ask_question, inputs=question_input, outputs=answer_output)

	# =====================================================
	# LAUNCH APP
	# =====================================================
	if __name__ == "__main__":
	    demo.launch()