sehaj13 commited on
Commit
9a9feeb
·
verified ·
1 Parent(s): 5217fcd

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import fitz # PyMuPDF
3
+ import faiss
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from transformers import pipeline
7
+ import gradio as gr
8
+
9
# --- SETTINGS ---
PDF_DIR = "data"   # directory scanned once at startup for source PDFs
MAX_TOKENS = 500   # per-chunk budget in whitespace-split words (not model tokens)
TOP_K = 4          # number of chunks retrieved from the index per question
13
+
14
# --- MODELS ---
# Sentence embedder used for both the document chunks and incoming queries.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Generative model that answers from the retrieved context.
# NOTE(review): Mistral-7B-Instruct is gated and very large — presumably this
# needs a GPU and an HF auth token; confirm against the deployment target.
llm_pipeline = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
17
+
18
+ # --- UTILS ---
19
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Filesystem path to a PDF readable by PyMuPDF.

    Returns:
        A single string with each page's text appended in page order.
    """
    # Use the document as a context manager so the file handle is closed
    # deterministically (the original leaked it until garbage collection).
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
25
+
26
def chunk_text(text, max_tokens=None):
    """Greedily pack sentences of *text* into chunks of at most ~max_tokens words.

    Sentences are delimited by ". "; "tokens" are whitespace-split words.
    *max_tokens* defaults to the module-level MAX_TOKENS (late-bound, so the
    function is usable standalone and callers are unaffected).

    Returns:
        A list of chunk strings. Empty or whitespace-only input yields [].
    """
    if max_tokens is None:
        max_tokens = MAX_TOKENS
    sentences = text.split(". ")
    chunks, chunk = [], ""
    for sentence in sentences:
        # Skip empty fragments (empty input, consecutive delimiters) — the
        # original turned "" into a bogus ". " chunk.
        if not sentence.strip():
            continue
        if len((chunk + sentence).split()) > max_tokens:
            # BUG FIX: when the very first sentence alone overflows the budget,
            # the original appended an empty string chunk; guard against that.
            if chunk:
                chunks.append(chunk)
            chunk = sentence + ". "
        else:
            chunk += sentence + ". "
    if chunk:
        chunks.append(chunk)
    return chunks
37
+
38
# --- LOAD & INDEX ALL PDFs ---
# Read every PDF in PDF_DIR, chunk its text, and build a FAISS index over the
# chunk embeddings. Runs once at import/startup time.
print("📄 Loading and indexing all PDFs in /data ...")
all_chunks = []    # flat list of text chunks across every PDF
chunk_to_doc = []  # parallel list: source filename for each chunk

for filename in os.listdir(PDF_DIR):
    # Case-insensitive extension check so ".PDF" files are not silently skipped.
    if filename.lower().endswith(".pdf"):
        path = os.path.join(PDF_DIR, filename)
        text = extract_text_from_pdf(path)
        chunks = chunk_text(text)
        all_chunks.extend(chunks)
        chunk_to_doc.extend([filename] * len(chunks))

# Fail fast with a clear message instead of an opaque shape/attribute error
# from encode()/faiss when the data directory is empty or has no PDFs.
if not all_chunks:
    raise RuntimeError(
        f"No PDF text found in '{PDF_DIR}' — add PDF files before starting the app."
    )

# Embed every chunk and build an exact (brute-force) L2 index over them.
embeddings = embed_model.encode(all_chunks)
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))
print(f"✅ Loaded {len(all_chunks)} chunks from {len(set(chunk_to_doc))} PDFs.")
56
+
57
# --- QA FUNCTION ---
def answer_question(question):
    """Retrieve the TOP_K most similar chunks and ask the LLM to answer from them."""
    # Embed the query and look up its nearest chunks in the FAISS index.
    query_vec = np.array(embed_model.encode([question]))
    _, hits = index.search(query_vec, k=TOP_K)

    # Build a source-labelled context block from the retrieved chunks.
    labelled = [f"[{chunk_to_doc[idx]}]\n{all_chunks[idx]}" for idx in hits[0]]
    context = "\n".join(labelled)

    prompt = (
        f"Answer the question based on the following context:\n\n"
        f"{context}\n\nQuestion: {question}\nAnswer:"
    )

    generated = llm_pipeline(prompt, max_new_tokens=200)[0]["generated_text"]
    # The pipeline echoes the prompt; keep only the text after the final "Answer:".
    return generated.split("Answer:")[-1].strip()
70
+
71
# --- UI ---
# Minimal Gradio front-end: a question box, a trigger button, and an answer box.
# NOTE(review): original indentation was lost in the paste — it is assumed all
# three widgets sit inside the Row; confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 PDF Question Answering Bot (Multi-PDF)\nAsk a question based on all loaded documents.")

    with gr.Row():
        question = gr.Textbox(label="Your Question")
        button = gr.Button("Get Answer")
        answer = gr.Textbox(label="Answer")

    # Wire the button to the retrieval + generation pipeline defined above.
    button.click(fn=answer_question, inputs=question, outputs=answer)

demo.launch()