# pdfrag / app.py
# Author: AnnaMathews (Hugging Face Space) — commit aeb2a10 (verified)
import gradio as gr
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceEmbeddings
from groq import Groq # Using Groq's direct client instead of LangChain wrapper
from dotenv import load_dotenv
# Load variables from a local .env file (if present) so the API key can be
# supplied without hard-coding it in source.
load_dotenv()

# SECURITY: never commit a real API key to source control. Read it from the
# environment (or .env / Space secrets) instead, and fail fast with a clear
# message when it is missing.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY environment variable is not set.")
groq_client = Groq(api_key=GROQ_API_KEY)
def process_pdf(file):
    """Load an uploaded PDF, split it into chunks, embed them, and persist
    a FAISS index to ./faiss_index for later querying.

    Args:
        file: Gradio file object; only its ``.name`` (path on disk) is used.

    Returns:
        tuple[str, str]: (first chunk's text for preview, summary string),
        or (error message, "") on failure.
    """
    try:
        # Gradio has already saved the upload to a temp path.
        filepath = file.name

        loader = PyPDFLoader(filepath)
        documents = loader.load()

        # 1000-char chunks with 200-char overlap preserve context across
        # split boundaries; separators fall back from paragraphs to chars.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", " ", ""],
        )
        docs = text_splitter.split_documents(documents)
        if not docs:
            # Guard: a scanned/empty PDF yields no text chunks; without this
            # check docs[0] below would raise IndexError.
            return "No extractable text found in the PDF.", ""

        # Embed chunks and persist the index so answer_question() can
        # reload it from disk.
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        vectorstore = FAISS.from_documents(docs, embeddings)
        vectorstore.save_local("faiss_index")

        doc_info = f"Processed {len(docs)} chunks from {len(documents)} pages."
        return docs[0].page_content, doc_info
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"Error: {str(e)}", ""
def answer_question(question):
    """Answer *question* using the persisted FAISS index and Groq's LLaMA 3.

    Retrieves the top-3 most similar chunks from ./faiss_index (written by
    process_pdf) and asks the model to answer from that context only.

    Args:
        question: The user's natural-language question.

    Returns:
        str: The model's answer, or an error message on failure.
    """
    try:
        # The embedding model must match the one used to build the index.
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        # allow_dangerous_deserialization is acceptable here: the index was
        # written by this same app, not loaded from untrusted input.
        vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

        retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
        docs = retriever.invoke(question)
        context = "\n\n".join(doc.page_content for doc in docs)

        # BUG FIX: the chat completions API applies the model's chat template
        # itself. Embedding raw <|begin_of_text|>/<|eot_id|> control tokens
        # inside a single user message double-templates the prompt and leaks
        # special tokens into the content. Use proper system/user messages.
        system_prompt = (
            "You are a helpful AI assistant that answers questions based on the provided context.\n"
            "Use only the information from the context to answer the question. "
            "If you don't know the answer, say you don't know.\n"
            "Be concise and accurate in your responses."
        )
        response = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"},
            ],
            model="llama3-70b-8192",
            temperature=0,  # deterministic, extractive answers
        )
        return response.choices[0].message.content
    except Exception as e:
        # Covers a missing index (no PDF processed yet) and API errors.
        return f"Error: {str(e)}"
# ----------------------------------------------------------------- Gradio UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## πŸ“„ PDF RAG System with Groq & LLaMA 3", elem_id="title")
    gr.Markdown("Upload a PDF file, process it, and ask questions using LLaMA 3. ✨")

    # Tab 1: upload a PDF and build the vector index.
    with gr.Tab("πŸ“‚ Upload & Process"):
        with gr.Row():
            with gr.Column():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                process_button = gr.Button("πŸš€ Process PDF", variant="primary")
                doc_info = gr.Textbox(label="Document Info")
                output_text = gr.Textbox(
                    label="First Chunk Preview",
                    lines=12,
                    placeholder="Processed PDF content will appear here...",
                    show_copy_button=True,
                )
        process_button.click(
            fn=process_pdf,
            inputs=pdf_input,
            outputs=[output_text, doc_info],
        )

    # Tab 2: query the indexed document.
    with gr.Tab("❓ Ask Questions"):
        with gr.Row():
            with gr.Column():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask something about the PDF content...",
                )
                ask_button = gr.Button("πŸ” Get Answer", variant="primary")
                answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)
        ask_button.click(
            fn=answer_question,
            inputs=question_input,
            outputs=answer_output,
        )

    gr.Markdown("<p style='text-align: center; color: gray;'>Made with ❀ using Gradio + Groq + LLaMA 3</p>")

if __name__ == "__main__":
    demo.launch()