# SK_RAG_Chatbot / app.py — Hugging Face Space by sehaj13 (commit 9a9feeb)
# RAG chatbot: answers questions over all PDFs found in ./data
import os
import fitz # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr
# --- SETTINGS ---
# Directory scanned at startup for the PDFs that form the knowledge base.
PDF_DIR = "data"
# Approximate per-chunk word budget used by chunk_text (counted via str.split,
# so these are whitespace-separated words, not model tokens).
MAX_TOKENS = 500
# Number of nearest-neighbour chunks retrieved from FAISS per question.
TOP_K = 4
# --- MODELS ---
# Sentence embedder, used for both the document chunks and incoming questions.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): loading Mistral-7B with default pipeline settings downloads the
# full checkpoint and needs many GB of RAM/VRAM (plus HF gated-model auth) —
# confirm the host can run it, or consider device_map/dtype options or a
# smaller instruct model.
llm_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
# --- UTILS ---
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page of the PDF at *pdf_path*.

    Uses PyMuPDF. The context manager guarantees the underlying file handle is
    released even if extraction raises (the original leaked the open Document),
    and ``"".join`` avoids quadratic string concatenation on large PDFs.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
def chunk_text(text, max_tokens=MAX_TOKENS):
sentences = text.split(". ")
chunks, chunk = [], ""
for sentence in sentences:
if len((chunk + sentence).split()) > max_tokens:
chunks.append(chunk)
chunk = sentence + ". "
else:
chunk += sentence + ". "
chunks.append(chunk)
return chunks
# --- LOAD & INDEX ALL PDFs ---
# Runs once at module import: extract, chunk and embed every PDF found in
# PDF_DIR, then build an in-memory exact-L2 FAISS index over the embeddings.
print("📄 Loading and indexing all PDFs in /data ...")
all_chunks = []    # flat list of every chunk's text, across all PDFs
chunk_to_doc = []  # parallel list: source filename for each entry of all_chunks
for filename in os.listdir(PDF_DIR):
    if filename.endswith(".pdf"):
        path = os.path.join(PDF_DIR, filename)
        text = extract_text_from_pdf(path)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        # Record the originating file once per chunk so answers can cite sources.
        chunk_to_doc.extend([filename] * len(chunks))
# Embed and index
# NOTE(review): if PDF_DIR holds no PDFs, all_chunks is empty and
# embeddings.shape[1] below raises — confirm the data directory is populated
# at deploy time (consider a guard with a clear error message).
embeddings = embed_model.encode(all_chunks)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))
print(f"✅ Loaded {len(all_chunks)} chunks from {len(set(chunk_to_doc))} PDFs.")
# --- QA FUNCTION ---
def answer_question(question):
    """Answer *question* using the indexed PDF chunks (module globals).

    Embeds the question, retrieves the TOP_K nearest chunks from the FAISS
    index, builds a context-grounded prompt, and returns the LLM completion
    that follows the final "Answer:" marker.
    """
    question_embedding = embed_model.encode([question])
    _, top_indices = index.search(np.array(question_embedding), k=TOP_K)
    # FAISS pads results with -1 when fewer than k vectors are indexed;
    # drop those ids so we never index the chunk lists with -1 (which would
    # silently return the *last* chunk instead of failing).
    hits = [i for i in top_indices[0] if i >= 0]
    # Prefix each retrieved chunk with its source filename so answers can cite it.
    context = "\n".join(f"[{chunk_to_doc[i]}]\n{all_chunks[i]}" for i in hits)
    prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    # text-generation echoes the prompt in generated_text; keep only what
    # follows the last "Answer:" marker.
    output = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"]
    return output.split("Answer:")[-1].strip()
# --- UI ---
# Minimal Gradio front-end: one input box, one trigger button, one output box.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 PDF Question Answering Bot (Multi-PDF)\nAsk a question based on all loaded documents.")
    with gr.Row():
        question_box = gr.Textbox(label="Your Question")
        submit_btn = gr.Button("Get Answer")
    answer_box = gr.Textbox(label="Answer")
    # Wire the button to the retrieval + generation pipeline.
    submit_btn.click(fn=answer_question, inputs=question_box, outputs=answer_box)
demo.launch()