# NOTE: the original file was scraped from a Hugging Face Spaces file viewer;
# the viewer chrome ("Spaces:", "Runtime error", file size, line-number gutter)
# has been converted to this comment so the module is valid Python.
import os
import fitz # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr
# --- SETTINGS ---
PDF_DIR = "data"       # directory scanned at startup for *.pdf files
MAX_TOKENS = 500       # approximate word budget per chunk (whitespace-split words, not model tokens)
TOP_K = 4              # number of nearest chunks retrieved per question
# --- MODELS ---
# NOTE(review): both models load at import time; the 7B Mistral pipeline will
# download/load multiple GB of weights here — confirm the host has the
# memory/GPU for this before deployment.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
llm_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
# --- UTILS ---
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        One string containing the text of all pages.
    """
    parts = []
    # Context manager guarantees the document handle is closed even if a page
    # raises — the original leaked the open fitz.Document on every call.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            parts.append(page.get_text())
    # join once instead of repeated string += (quadratic on large PDFs)
    return "".join(parts)
def chunk_text(text, max_tokens=MAX_TOKENS):
    """Greedily split *text* into chunks of at most ~max_tokens words.

    Splitting is sentence-based (naive ". " delimiter), so a single sentence
    longer than the budget still becomes one oversized chunk.

    Args:
        text: The document text to split.
        max_tokens: Word budget per chunk (whitespace-split words).

    Returns:
        List of chunk strings, each ending with ". "; empty list for
        empty/whitespace-only input.
    """
    sentences = text.split(". ")
    chunks, chunk = [], ""
    for sentence in sentences:
        # Flush only a non-empty chunk: the original appended "" when the very
        # first sentence exceeded the budget, producing an empty chunk.
        if chunk and len((chunk + sentence).split()) > max_tokens:
            chunks.append(chunk)
            chunk = sentence + ". "
        else:
            chunk += sentence + ". "
    # Skip a separator-only trailer (the original returned [". "] for "").
    if chunk.strip(". \t\n"):
        chunks.append(chunk)
    return chunks
# --- LOAD & INDEX ALL PDFs ---
# Runs once at import time: extract + chunk every PDF under PDF_DIR, then build
# an exact (brute-force) L2 FAISS index over the chunk embeddings.
print("📄 Loading and indexing all PDFs in /data ...")
all_chunks = []      # chunk text, position i aligns with index row i
chunk_to_doc = []    # source filename for chunk i (parallel to all_chunks)
for filename in os.listdir(PDF_DIR):
    if filename.endswith(".pdf"):  # NOTE(review): case-sensitive — ".PDF" files are skipped
        path = os.path.join(PDF_DIR, filename)
        text = extract_text_from_pdf(path)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_to_doc.extend([filename] * len(chunks))
# Embed and index
# NOTE(review): if PDF_DIR contains no PDFs, encode([]) / shape[1] will fail here.
embeddings = embed_model.encode(all_chunks)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))
print(f"✅ Loaded {len(all_chunks)} chunks from {len(set(chunk_to_doc))} PDFs.")
# --- QA FUNCTION ---
def answer_question(question):
    """Answer *question* from the indexed PDFs via retrieve-then-generate.

    Embeds the question, pulls the TOP_K nearest chunks from the FAISS index,
    builds a context-stuffed prompt, and returns the LLM's answer text.
    """
    query_vec = embed_model.encode([question])
    _, hits = index.search(np.array(query_vec), k=TOP_K)
    # Label each retrieved chunk with the PDF it came from, in rank order.
    labeled_chunks = [f"[{chunk_to_doc[idx]}]\n{all_chunks[idx]}" for idx in hits[0]]
    context = "\n".join(labeled_chunks)
    prompt = (
        f"Answer the question based on the following context:\n\n"
        f"{context}\n\nQuestion: {question}\nAnswer:"
    )
    generated = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"]
    # The pipeline echoes the prompt; keep only the text after the final "Answer:".
    return generated.split("Answer:")[-1].strip()
# --- UI ---
# Minimal Gradio front end: one question box, one button, one answer box.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 PDF Question Answering Bot (Multi-PDF)\nAsk a question based on all loaded documents.")
    with gr.Row():
        question = gr.Textbox(label="Your Question")
        button = gr.Button("Get Answer")
    answer = gr.Textbox(label="Answer")
    # Wire the button: runs answer_question on the question text, shows the result.
    button.click(fn=answer_question, inputs=question, outputs=answer)
# Blocking call — starts the web server; nothing after this line runs until shutdown.
demo.launch()