# chatbot_rag / app.py
# yashalhussain's picture
# Create app.py
# 31fc4f4 verified
import os
import gradio as gr
import PyPDF2
import requests
# ================= CONFIG =================
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")  # supplied via environment (e.g. HF Space secret); empty string disables API calls
GROQ_MODEL = "llama-3.1-8b-instant"  # Groq-hosted model id used for every completion
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"  # Groq's OpenAI-compatible chat endpoint
# In-memory document cache: {base filename: [text chunks]}.
# Filled by preview_documents(), read by retrieve_context()/process_documents().
processed_texts = {}
# ================= PDF HANDLING =================
def extract_pdf_text(file):
    """Return the text of every page of a PDF, one page per newline-terminated run.

    Pages where PyPDF2 cannot extract any text contribute just the trailing
    newline, matching ``extract_text() or ""``.
    """
    reader = PyPDF2.PdfReader(file)
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
def chunk_text(text, chunk_size=400, overlap=50):
    """Split *text* into word-window chunks of up to *chunk_size* words.

    Consecutive chunks share *overlap* words so retrieval does not lose
    context at chunk boundaries.

    Args:
        text: Raw document text; split on whitespace.
        chunk_size: Maximum words per chunk.
        overlap: Words repeated between consecutive chunks. Must be smaller
            than chunk_size, otherwise the window would never advance.

    Returns:
        List of space-joined word chunks; empty list for empty/blank text.

    Raises:
        ValueError: If overlap >= chunk_size (the original loop would spin
            forever in that case).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    step = chunk_size - overlap  # guaranteed positive by the guard above
    return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), step)]
def preview_documents(files):
    """Extract, chunk, and cache the text of each uploaded PDF.

    Repopulates the module-level ``processed_texts`` cache (keyed by base
    filename) and returns one table row per file for the preview DataFrame:
    [name, word count, first 300 chars, "<n> chunks"].

    Args:
        files: Uploaded PDFs from gr.File — may be None when the user clicks
            Preview before uploading anything.

    Returns:
        List of rows for the UI table; empty list when nothing was uploaded.
    """
    processed_texts.clear()
    if not files:
        # Button clicked with no upload — previously crashed iterating None.
        return []
    rows = []
    for f in files:
        # Gradio returns plain path strings in recent versions but file-like
        # objects (with a .name path) in older ones; support both.
        path = getattr(f, "name", f)
        text = extract_pdf_text(f)
        name = os.path.basename(path)
        chunks = chunk_text(text)
        processed_texts[name] = chunks
        rows.append([
            name,
            len(text.split()),
            text[:300],
            f"{len(chunks)} chunks",
        ])
    return rows
def process_documents(files):
    """Report whether documents have already been previewed into the cache.

    The *files* argument is unused; processing actually happens in
    preview_documents(), so this only checks the ``processed_texts`` cache.
    """
    count = len(processed_texts)
    if count == 0:
        return "❌ Preview documents first."
    return f"βœ… {count} document(s) processed."
# ================= GROQ CALL =================
def query_groq(prompt):
    """Send *prompt* to the Groq chat-completions API and return the reply text.

    All failure modes (missing key, network error, non-200 status, malformed
    response body) are returned as human-readable "❌ ..." strings instead of
    raising, so errors surface directly in the chat UI.

    Args:
        prompt: Full user-turn content (document context + question).

    Returns:
        The model's reply text, or an error string on failure.
    """
    if not GROQ_API_KEY:
        return "❌ GROQ_API_KEY not set."
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_MODEL,
        "messages": [
            {"role": "system", "content": "Answer ONLY from the provided documents."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.1,  # low temperature: stick closely to the documents
        "max_tokens": 400,
    }
    try:
        r = requests.post(GROQ_URL, headers=headers, json=payload, timeout=30)
    except requests.RequestException as e:
        # Timeouts / DNS / connection errors previously crashed the handler.
        return f"❌ Groq request failed: {e}"
    if r.status_code == 200:
        try:
            return r.json()["choices"][0]["message"]["content"]
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # 200 with an unexpected body shouldn't take the UI down either.
            return f"❌ Groq response parse error: {e}"
    return f"❌ Groq Error {r.status_code}: {r.text}"
# ================= RAG =================
def retrieve_context(question):
    """Return the cached chunk sharing the most distinct words with *question*.

    Naive bag-of-words retrieval over every chunk in ``processed_texts``:
    score = number of distinct lowercase words in common. Ties keep the
    earliest chunk seen. The winner is truncated to 1000 characters; ""
    is returned when no chunk overlaps at all.
    """
    query_terms = set(question.lower().split())
    top_chunk, top_score = "", 0
    for doc_chunks in processed_texts.values():
        for candidate in doc_chunks:
            overlap = len(query_terms.intersection(candidate.lower().split()))
            if overlap > top_score:
                top_chunk, top_score = candidate, overlap
    return top_chunk[:1000] if top_chunk else ""
def answer_question(question, history):
    """Gradio chat handler: retrieve document context and ask the Groq model.

    Args:
        question: The user's question from the textbox.
        history: Current Chatbot history as (user, bot) tuples, or None.

    Returns:
        (updated history, "") — the empty string clears the input textbox.
    """
    if history is None:
        history = []
    # Ignore blank/whitespace-only submissions instead of appending an empty
    # turn and querying the model with it.
    if not question or not question.strip():
        return history, ""
    # Show the user message immediately; the bot reply is filled in below.
    history.append((question, ""))
    if not processed_texts:
        history[-1] = (question, "⚠️ Upload and process PDFs first.")
        return history, ""
    context = retrieve_context(question)
    if not context:
        history[-1] = (question, "❌ No relevant information found in documents.")
        return history, ""
    prompt = f"""
DOCUMENT CONTEXT:
{context}
QUESTION:
{question}
Answer clearly using the document context only.
"""
    # Replace the placeholder empty reply with the model's answer.
    answer = query_groq(prompt)
    history[-1] = (question, answer)
    return history, ""
# ================= UI =================
# ================= UI =================
with gr.Blocks(title="RAG PDF Chatbot") as demo:
    gr.Markdown("# πŸ“š RAG PDF Chatbot (Groq)")
    gr.Markdown("*Upload PDFs β†’ Preview β†’ Ask questions*")

    # Upload and document-processing controls.
    with gr.Row():
        pdf_input = gr.File(file_types=[".pdf"], file_count="multiple")
        btn_preview = gr.Button("πŸ“„ Preview")
        btn_process = gr.Button("πŸš€ Process")

    status_box = gr.Textbox(label="Status")
    preview_table = gr.DataFrame(
        headers=["File", "Words", "Preview", "Chunks"],
        interactive=False,
    )

    # Chat area.
    chat_window = gr.Chatbot(height=420)
    question_box = gr.Textbox(
        placeholder="Ask a question from the documents...",
        lines=2,
    )
    btn_send = gr.Button("Send")

    # Event wiring — Send button and Enter-key submit share one handler.
    btn_preview.click(preview_documents, pdf_input, preview_table)
    btn_process.click(process_documents, pdf_input, status_box)
    btn_send.click(answer_question, [question_box, chat_window], [chat_window, question_box])
    question_box.submit(answer_question, [question_box, chat_window], [chat_window, question_box])

demo.launch()