# SmartDoc RAG Chatbot — HF Space app.py (author: sakanat, commit 75576a8, verified)
import gradio as gr
import os
import requests
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# ---------------- CONFIG ----------------
# NOTE(review): the secret name "smartdoc_rag_chatbot" is unusual for an API key —
# it must be added under that exact name in the HF Spaces Secrets panel.
GROQ_API_KEY = os.environ.get("smartdoc_rag_chatbot")  # must be set in HF Spaces Secrets
GROQ_URL = "https://api.groq.com/openai/v1/chat/completions"  # OpenAI-compatible Groq endpoint
MODEL_NAME = "llama-3.1-8b-instant"
# Sentence embedding model used for both document chunks and queries.
embedder = SentenceTransformer("all-MiniLM-L6-v2")
# Module-level state populated by load_pdfs() and read by retrieve_context()/ask_question().
chunks = []
chunk_embeddings = []
# ---------------- PDF LOADING ----------------
def load_pdfs(pdf_files):
    """Extract text from the uploaded PDFs, split it into fixed-size chunks,
    and embed the chunks into module-level state.

    Args:
        pdf_files: list of uploaded PDF file paths from the gr.File component.

    Returns:
        A status string describing success or the reason for failure.

    Side effects:
        Rebinds the module-level ``chunks`` and ``chunk_embeddings``.
    """
    global chunks, chunk_embeddings
    if not pdf_files:
        return "❌ Please upload at least one PDF."
    # Collect per-page text, remembering page number and which upload it came from
    # so answers can cite their source.
    documents = []
    for doc_id, pdf in enumerate(pdf_files):
        reader = PdfReader(pdf)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:
                documents.append({
                    "text": text,
                    "page": page_num + 1,          # 1-based page numbers for display
                    "doc": f"Document {doc_id + 1}"
                })
    # Fixed-size chunking (500 characters, no overlap) preserving source metadata.
    chunks = []
    for doc in documents:
        text = doc["text"]
        for i in range(0, len(text), 500):
            chunks.append({
                "content": text[i:i+500],
                "page": doc["page"],
                "doc": doc["doc"]
            })
    # Guard: scanned/image-only PDFs yield no text; embedding an empty list would
    # leave inconsistent state and crash cosine_similarity later.
    if not chunks:
        chunk_embeddings = []
        return "❌ No extractable text found in the uploaded PDF(s)."
    texts = [c["content"] for c in chunks]
    chunk_embeddings = embedder.encode(texts)
    return f"✅ Loaded {len(pdf_files)} PDF(s) with {len(chunks)} chunks."
# ---------------- RETRIEVAL ----------------
def retrieve_context(query, k=3):
    """Return the ``k`` most similar chunks to ``query`` as a single context
    string, plus the best-matching chunk for source citation.

    Args:
        query: the user's question.
        k: number of chunks to retrieve (clamped implicitly by slicing).

    Returns:
        (context, source): ``context`` is the selected chunk texts joined by
        newlines, most relevant first; ``source`` is the single most similar
        chunk dict (keys: content/page/doc).
    """
    query_embedding = embedder.encode([query])
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    # Explicitly descending similarity: the original ascending argsort buried
    # the most relevant chunk at the end and cited the source only via that
    # implicit ordering.
    top_k = np.argsort(similarities)[::-1][:k]
    selected = [chunks[i] for i in top_k]
    context = "\n".join(c["content"] for c in selected)
    source = selected[0]  # most similar chunk is the citation
    return context, source
# ---------------- GROQ CALL ----------------
def ask_question(question):
    """Answer ``question`` using only the retrieved PDF context via the Groq API.

    Args:
        question: free-text user question.

    Returns:
        The model's answer followed by a source citation, or a user-facing
        error string (no exceptions propagate to the UI).
    """
    if not chunks:
        return "⚠️ Please load PDFs first."
    context, source = retrieve_context(question)
    # Prompt body kept at column 0 so the triple-quoted literal is unchanged.
    prompt = f"""
You are SmartDoc RAG Chatbot.
Answer the question using ONLY the context below.
Context:
{context}
Question:
{question}
"""
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json"
    }
    try:
        response = requests.post(
            GROQ_URL,
            headers=headers,
            json={
                "model": MODEL_NAME,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0.2
            },
            timeout=60  # never hang the UI on a stalled request
        )
        response.raise_for_status()  # surface HTTP errors instead of KeyError below
        answer = response.json()["choices"][0]["message"]["content"]
    except (requests.RequestException, KeyError, IndexError, ValueError) as e:
        # Covers network failures, non-2xx responses, and unexpected payloads.
        return f"❌ Groq API request failed: {e}"
    return f"""{answer}
📄 Source: {source['doc']} — Page {source['page']}"""
# ---------------- UI ----------------
# Custom CSS layered on top of the Soft theme: page gradient, centered
# headings, and minor textbox/button tweaks.
css = """
body {
background: linear-gradient(120deg, #e0f2ff, #f8fbff);
}
h1, h3 {
text-align: center;
}
.gr-textbox textarea {
font-size: 15px;
}
.gr-button-primary {
font-weight: bold;
}
"""
# Theme configured once up front so the Blocks call stays readable.
soft_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="cyan",
    neutral_hue="slate",
    font=["Inter", "sans-serif"]
)


def _clear_question():
    """Reset the question box after a question is sent."""
    return ""


with gr.Blocks(theme=soft_theme, css=css) as demo:
    # Page header.
    gr.Markdown("""
# 📄 SmartDoc RAG Chatbot
### Retrieval‑Augmented AI for Document Question Answering
Upload PDFs and ask questions based **only** on their content.
""")

    with gr.Row():
        # Left panel: document upload and status.
        with gr.Column(scale=1):
            pdf_input = gr.File(
                file_types=[".pdf"],
                file_count="multiple",
                label="📂 Upload PDF Documents"
            )
            load_button = gr.Button("📥 Load Documents", variant="primary")
            status_box = gr.Textbox(label="Status", interactive=False)

        # Right panel: question entry and answer display.
        with gr.Column(scale=2):
            with gr.Row():
                question_box = gr.Textbox(
                    placeholder="Type your question here…",
                    lines=1,
                    scale=8
                )
                send_button = gr.Button("➤", scale=1)
            answer_box = gr.Textbox(label="Answer", lines=8)

    # Event wiring: load PDFs; answer on button click or Enter, then clear input.
    load_button.click(load_pdfs, inputs=pdf_input, outputs=status_box)
    send_button.click(
        ask_question, inputs=question_box, outputs=answer_box
    ).then(_clear_question, None, question_box)
    question_box.submit(
        ask_question, inputs=question_box, outputs=answer_box
    ).then(_clear_question, None, question_box)

demo.launch()