"""Multi-PDF question-answering bot.

Indexes every PDF in ``data/`` with sentence-transformer embeddings in a
FAISS index, then answers questions by retrieving the TOP_K most similar
chunks and prompting a local Mistral text-generation pipeline, all behind
a small Gradio UI.
"""

import os

import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr

# --- SETTINGS ---
PDF_DIR = "data"   # directory scanned for *.pdf files
MAX_TOKENS = 500   # approximate chunk size in whitespace-split words
TOP_K = 4          # number of chunks retrieved per question

# --- MODELS ---
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
llm_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")


# --- UTILS ---
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in *pdf_path*.

    Uses a context manager so the document handle is always released
    (the original never closed it).
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)


def chunk_text(text, max_tokens=MAX_TOKENS):
    """Split *text* into chunks of at most ~*max_tokens* words.

    Splits on ". " sentence boundaries; a sentence that would push the
    current chunk over the limit starts a new chunk. Never emits an
    empty chunk (the original could append "" when the very first
    sentence exceeded the limit, and always appended the final chunk
    even when empty).
    """
    chunks, chunk = [], ""
    for sentence in text.split(". "):
        # Only flush a non-empty chunk; otherwise keep accumulating.
        if chunk and len((chunk + sentence).split()) > max_tokens:
            chunks.append(chunk)
            chunk = sentence + ". "
        else:
            chunk += sentence + ". "
    if chunk.strip():
        chunks.append(chunk)
    return chunks


# --- LOAD & INDEX ALL PDFs ---
print("📄 Loading and indexing all PDFs in /data ...")
all_chunks = []    # flat list of text chunks across all documents
chunk_to_doc = []  # parallel list: chunk i came from file chunk_to_doc[i]

# sorted() makes chunk ordering (and therefore index ids) deterministic;
# lower() also matches ".PDF" files.
for filename in sorted(os.listdir(PDF_DIR)):
    if not filename.lower().endswith(".pdf"):
        continue
    path = os.path.join(PDF_DIR, filename)
    chunks = chunk_text(extract_text_from_pdf(path))
    all_chunks.extend(chunks)
    chunk_to_doc.extend([filename] * len(chunks))

if not all_chunks:
    # Fail early with a clear message instead of an opaque shape/index
    # error from embeddings.shape below.
    raise RuntimeError(f"No PDF text found in '{PDF_DIR}/' — nothing to index.")

# Embed and index (FAISS requires contiguous float32 input).
embeddings = np.asarray(embed_model.encode(all_chunks), dtype="float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
print(f"✅ Loaded {len(all_chunks)} chunks from {len(set(chunk_to_doc))} PDFs.")


# --- QA FUNCTION ---
def answer_question(question):
    """Answer *question* from the TOP_K most similar indexed chunks.

    Embeds the question, retrieves the nearest chunks from the FAISS
    index, builds a context-grounded prompt, and returns the generated
    text after the final "Answer:" marker.
    """
    question_embedding = np.asarray(embed_model.encode([question]), dtype="float32")
    _, top_indices = index.search(question_embedding, TOP_K)
    hits = [(all_chunks[i], chunk_to_doc[i]) for i in top_indices[0]]
    context = "\n".join(f"[{doc}]\n{chunk}" for chunk, doc in hits)
    prompt = (
        "Answer the question based on the following context:\n\n"
        f"{context}\n\nQuestion: {question}\nAnswer:"
    )
    output = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"]
    # The pipeline echoes the prompt; keep only the text after "Answer:".
    return output.split("Answer:")[-1].strip()


# --- UI ---
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🤖 PDF Question Answering Bot (Multi-PDF)\n"
        "Ask a question based on all loaded documents."
    )
    with gr.Row():
        question = gr.Textbox(label="Your Question")
        button = gr.Button("Get Answer")
    answer = gr.Textbox(label="Answer")
    button.click(fn=answer_question, inputs=question, outputs=answer)

demo.launch()