# SK_RAG_Chatbot / app.py — Hugging Face Space by sehaj13 (commit 9a9feeb)
# RAG chatbot: answers questions over all PDFs found in ./data
import os
import fitz # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr
# --- SETTINGS ---
# Directory scanned at startup for the PDFs that form the knowledge base.
PDF_DIR = "data"
# Approximate per-chunk word budget used by chunk_text (counted via str.split,
# so these are whitespace-separated words, not model tokens).
MAX_TOKENS = 500
# Number of nearest-neighbour chunks retrieved from FAISS per question.
TOP_K = 4
# --- MODELS ---
# Sentence embedder, used for both the document chunks and incoming questions.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): loading Mistral-7B with default pipeline settings downloads the
# full checkpoint and needs many GB of RAM/VRAM (plus HF gated-model auth) —
# confirm the host can run it, or consider device_map/dtype options or a
# smaller instruct model.
llm_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
# --- UTILS ---
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page of the PDF at *pdf_path*.

    Uses PyMuPDF. The context manager guarantees the underlying file handle is
    released even if extraction raises (the original leaked the open Document),
    and ``"".join`` avoids quadratic string concatenation on large PDFs.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
def chunk_text(text, max_tokens=MAX_TOKENS):
sentences = text.split(". ")
chunks, chunk = [], ""
for sentence in sentences:
if len((chunk + sentence).split()) > max_tokens:
chunks.append(chunk)
chunk = sentence + ". "
else:
chunk += sentence + ". "
chunks.append(chunk)
return chunks
# --- LOAD & INDEX ALL PDFs ---
# Runs once at module import: extract, chunk and embed every PDF found in
# PDF_DIR, then build an in-memory exact-L2 FAISS index over the embeddings.
print("📄 Loading and indexing all PDFs in /data ...")
all_chunks = []    # flat list of every chunk's text, across all PDFs
chunk_to_doc = []  # parallel list: source filename for each entry of all_chunks
for filename in os.listdir(PDF_DIR):
    if filename.endswith(".pdf"):
        path = os.path.join(PDF_DIR, filename)
        text = extract_text_from_pdf(path)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        # Record the originating file once per chunk so answers can cite sources.
        chunk_to_doc.extend([filename] * len(chunks))
# Embed and index
# NOTE(review): if PDF_DIR holds no PDFs, all_chunks is empty and
# embeddings.shape[1] below raises — confirm the data directory is populated
# at deploy time (consider a guard with a clear error message).
embeddings = embed_model.encode(all_chunks)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))
print(f"✅ Loaded {len(all_chunks)} chunks from {len(set(chunk_to_doc))} PDFs.")
# --- QA FUNCTION ---
def answer_question(question):
    """Answer *question* using the indexed PDF chunks (module globals).

    Embeds the question, retrieves the TOP_K nearest chunks from the FAISS
    index, builds a context-grounded prompt, and returns the LLM completion
    that follows the final "Answer:" marker.
    """
    question_embedding = embed_model.encode([question])
    _, top_indices = index.search(np.array(question_embedding), k=TOP_K)
    # FAISS pads results with -1 when fewer than k vectors are indexed;
    # drop those ids so we never index the chunk lists with -1 (which would
    # silently return the *last* chunk instead of failing).
    hits = [i for i in top_indices[0] if i >= 0]
    # Prefix each retrieved chunk with its source filename so answers can cite it.
    context = "\n".join(f"[{chunk_to_doc[i]}]\n{all_chunks[i]}" for i in hits)
    prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    # text-generation echoes the prompt in generated_text; keep only what
    # follows the last "Answer:" marker.
    output = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"]
    return output.split("Answer:")[-1].strip()
# --- UI ---
# Minimal Gradio front-end: one input box, one trigger button, one output box.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 PDF Question Answering Bot (Multi-PDF)\nAsk a question based on all loaded documents.")
    with gr.Row():
        question_box = gr.Textbox(label="Your Question")
        submit_btn = gr.Button("Get Answer")
    answer_box = gr.Textbox(label="Answer")
    # Wire the button to the retrieval + generation pipeline.
    submit_btn.click(fn=answer_question, inputs=question_box, outputs=answer_box)
demo.launch()