Spaces:

sehaj13
/

SK_RAG_Chatbot

Runtime error

App Files Files Community

sehaj13 committed on Jun 6, 2025

Commit

9a9feeb

verified ·

1 Parent(s): 5217fcd

Create app.py

Browse files

Files changed (1) hide show

app.py +82 -0

app.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import os
+import fitz  # PyMuPDF
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from transformers import pipeline
+import gradio as gr
+# --- SETTINGS ---
+# Directory that is scanned for PDF files when the app starts.
+PDF_DIR = "data"
+# Default chunk-size limit used by chunk_text(); it is compared against a count
+# of whitespace-separated words, not model tokens.
+MAX_TOKENS = 500
+# Number of nearest chunks requested from the index for each question.
+TOP_K = 4
+# --- MODELS ---
+# Sentence-embedding model used for both the document chunks and the questions.
+embed_model = SentenceTransformer("all-MiniLM-L6-v2")
+# Text-generation pipeline that produces the answer from the assembled prompt.
+# NOTE(review): the model name suggests a 7B-parameter model loaded at import
+# time — confirm the host has enough memory for it.
+llm_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
+# --- UTILS ---
+def extract_text_from_pdf(pdf_path):
+    doc = fitz.open(pdf_path)
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return text
+def chunk_text(text, max_tokens=MAX_TOKENS):
+    sentences = text.split(". ")
+    chunks, chunk = [], ""
+    for sentence in sentences:
+        if len((chunk + sentence).split()) > max_tokens:
+            chunks.append(chunk)
+            chunk = sentence + ". "
+        else:
+            chunk += sentence + ". "
+    chunks.append(chunk)
+    return chunks
+# --- LOAD & INDEX ALL PDFs ---
+print("📄 Loading and indexing all PDFs in /data ...")
+all_chunks = []
+chunk_to_doc = []
+for filename in os.listdir(PDF_DIR):
+    if filename.endswith(".pdf"):
+        path = os.path.join(PDF_DIR, filename)
+        text = extract_text_from_pdf(path)
+        chunks = chunk_text(text)
+        all_chunks.extend(chunks)
+        chunk_to_doc.extend([filename] * len(chunks))
+# Embed and index
+embeddings = embed_model.encode(all_chunks)
+index = faiss.IndexFlatL2(embeddings.shape[1])
+index.add(np.array(embeddings))
+print(f"✅ Loaded {len(all_chunks)} chunks from {len(set(chunk_to_doc))} PDFs.")
+# --- QA FUNCTION ---
+def answer_question(question):
+    question_embedding = embed_model.encode([question])
+    _, top_indices = index.search(np.array(question_embedding), k=TOP_K)
+    context_chunks = [all_chunks[i] for i in top_indices[0]]
+    source_docs = [chunk_to_doc[i] for i in top_indices[0]]
+    context = "\n".join([f"[{source_docs[i]}]\n{context_chunks[i]}" for i in range(len(context_chunks))])
+    prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
+    output = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"]
+    return output.split("Answer:")[-1].strip()
+# --- UI ---
+# Build the Gradio page: a heading, then one row holding the question box, the
+# submit button and the answer box.
+with gr.Blocks() as demo:
+    gr.Markdown("# 🤖 PDF Question Answering Bot (Multi-PDF)\nAsk a question based on all loaded documents.")
+    with gr.Row():
+        question = gr.Textbox(label="Your Question")
+        button = gr.Button("Get Answer")
+        answer = gr.Textbox(label="Answer")
+    # Clicking the button passes the question text to answer_question and puts
+    # the returned string into the answer box.
+    button.click(fn=answer_question, inputs=question, outputs=answer)
+# Called at module level, so the app is launched as soon as this file runs.
+demo.launch()