File size: 2,609 Bytes
9a9feeb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import gradio as gr

# --- SETTINGS ---
PDF_DIR = "data"       # directory scanned for *.pdf files at startup
MAX_TOKENS = 500       # max whitespace-separated words per text chunk (see chunk_text)
TOP_K = 4              # number of nearest chunks retrieved per question

# --- MODELS ---
# Sentence embedder used for both chunk indexing and query encoding (must match,
# since FAISS compares the two in the same vector space).
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): loads a 7B model at import time — presumably requires a GPU
# and significant RAM; confirm deployment target.
llm_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")

# --- UTILS ---
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page of the PDF at *pdf_path*.

    Fixes vs. original: the document handle is now closed deterministically
    (the original never closed it), and the page texts are assembled with
    ``str.join`` instead of repeated ``+=`` concatenation.
    """
    # PyMuPDF's Document supports the context-manager protocol; this closes
    # the file even if get_text() raises mid-way.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def chunk_text(text, max_tokens=MAX_TOKENS):
    """Split *text* into chunks of at most *max_tokens* whitespace-separated words.

    Sentences are approximated by splitting on ". "; each emitted chunk is a
    run of consecutive sentences re-joined with ". ".

    Fixes vs. original: no longer emits an empty leading chunk when the very
    first sentence already exceeds *max_tokens*, and no longer emits an empty
    trailing chunk for empty input.
    """
    sentences = text.split(". ")
    chunks, chunk = [], ""
    for sentence in sentences:
        if len((chunk + sentence).split()) > max_tokens:
            # Flush the current chunk (if any) and start a new one with the
            # sentence that would have overflowed it.
            if chunk:
                chunks.append(chunk)
            chunk = sentence + ". "
        else:
            chunk += sentence + ". "
    if chunk:
        chunks.append(chunk)
    return chunks

# --- LOAD & INDEX ALL PDFs ---
# Walk PDF_DIR, chunk every PDF's text, and remember which file each chunk
# came from so answers can cite their source document.
print("📄 Loading and indexing all PDFs in /data ...")
all_chunks = []
chunk_to_doc = []

for filename in os.listdir(PDF_DIR):
    if not filename.endswith(".pdf"):
        continue
    pdf_path = os.path.join(PDF_DIR, filename)
    doc_chunks = chunk_text(extract_text_from_pdf(pdf_path))
    all_chunks.extend(doc_chunks)
    chunk_to_doc += [filename] * len(doc_chunks)

# Embed every chunk once, then build a flat exact L2 index over the vectors.
embeddings = embed_model.encode(all_chunks)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))
print(f"✅ Loaded {len(all_chunks)} chunks from {len(set(chunk_to_doc))} PDFs.")

# --- QA FUNCTION ---
def answer_question(question):
    question_embedding = embed_model.encode([question])
    _, top_indices = index.search(np.array(question_embedding), k=TOP_K)
    
    context_chunks = [all_chunks[i] for i in top_indices[0]]
    source_docs = [chunk_to_doc[i] for i in top_indices[0]]
    
    context = "\n".join([f"[{source_docs[i]}]\n{context_chunks[i]}" for i in range(len(context_chunks))])
    prompt = f"Answer the question based on the following context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    
    output = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"]
    return output.split("Answer:")[-1].strip()

# --- UI ---
# Minimal Gradio front-end: one input box, one button, one output box.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 PDF Question Answering Bot (Multi-PDF)\nAsk a question based on all loaded documents.")

    with gr.Row():
        question_box = gr.Textbox(label="Your Question")
        submit_btn = gr.Button("Get Answer")
        answer_box = gr.Textbox(label="Answer")

    # Wire the button to the QA function defined above.
    submit_btn.click(fn=answer_question, inputs=question_box, outputs=answer_box)

demo.launch()