aaporosh committed on
Commit
d9893e1
·
verified ·
1 Parent(s): 56d0815

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -411
app.py CHANGED
@@ -1,425 +1,60 @@
1
  import streamlit as st
2
- import logging
3
- import os
4
- from io import BytesIO
5
- import re
6
- import time
7
- from typing import List, Tuple, Optional
8
-
9
  import pdfplumber
10
-
11
- # Optional OCR (guarded)
12
- try:
13
- import pytesseract
14
- OCR_AVAILABLE = True
15
- except Exception:
16
- OCR_AVAILABLE = False
17
-
18
- from rank_bm25 import BM25Okapi
19
-
20
- # Embeddings + Vector store
21
- from sentence_transformers import SentenceTransformer
22
- import numpy as np
23
-
24
- try:
25
- import faiss # direct FAISS for speed and control
26
- FAISS_OK = True
27
- except Exception:
28
- FAISS_OK = False
29
-
30
- # Lightweight HF pipelines
31
  from transformers import pipeline
 
32
 
33
- # ----------------------------
34
- # App & Logging Setup
35
- # ----------------------------
36
- st.set_page_config(page_title="Smart PDF Chat & Summarizer", page_icon="πŸ“„", layout="wide")
37
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
38
- logger = logging.getLogger("smart_pdf")
39
-
40
- # ----------------------------
41
- # Caching: models & utilities
42
- # ----------------------------
43
- @st.cache_resource(show_spinner=False)
44
- def get_embedder(name: str = "sentence-transformers/all-MiniLM-L6-v2"):
45
- return SentenceTransformer(name)
46
-
47
- @st.cache_resource(show_spinner=False)
48
- def get_qa_pipeline():
49
- # Small, fast instruction model
50
- return pipeline(
51
- "text2text-generation",
52
- model="google/flan-t5-small",
53
- device=-1,
54
- max_length=220
55
- )
56
-
57
- @st.cache_resource(show_spinner=False)
58
- def get_summarizer():
59
- # DistilBART is much faster than bart-large-cnn
60
- return pipeline(
61
- "summarization",
62
- model="sshleifer/distilbart-cnn-12-6",
63
- device=-1,
64
- max_length=220,
65
- min_length=80,
66
- do_sample=False,
67
- )
68
 
69
- # ----------------------------
70
- # PDF processing
71
- # ----------------------------
72
 
73
- def _looks_like_code(line: str) -> bool:
74
- if len(line.strip()) == 0:
75
- return False
76
- # Heuristics for code-y lines
77
- code_tokens = [
78
- r"\b(def|class|import|from|return|if|elif|else|for|while|try|except|finally|with)\b",
79
- r"[{}`;<>]|::|=>|#|//|/\*|\*/",
80
- r"\(|\)|\[|\]|\{|\}",
81
- ]
82
- matches = sum(bool(re.search(p, line)) for p in code_tokens)
83
- indent = len(line) - len(line.lstrip())
84
- return matches >= 1 or indent >= 4
85
 
 
 
86
 
87
- def extract_text_and_code_from_pdf(file_bytes: bytes, ocr_fallback: bool = True, max_pages: int = 50) -> Tuple[str, List[str]]:
88
- """Return (plain_text, code_blocks[]) from a PDF with simple OCR fallback."""
89
- text_parts: List[str] = []
90
- code_lines: List[str] = []
91
 
92
- with pdfplumber.open(BytesIO(file_bytes)) as pdf:
93
- pages = pdf.pages[:max_pages]
94
- for page in pages:
95
- # 1) Try text extraction
96
- extracted = page.extract_text(x_tolerance=1.5, y_tolerance=1.0) or ""
97
 
98
- # 2) OCR fallback if page empty and OCR available
99
- if not extracted.strip() and ocr_fallback and OCR_AVAILABLE:
 
 
 
100
  try:
101
- img = page.to_image(resolution=180).original
102
- extracted = pytesseract.image_to_string(img, config='--psm 6') or ""
103
  except Exception as e:
104
- logger.warning(f"OCR failed on a page: {e}")
105
-
106
- # 3) Clean and collect
107
- if extracted:
108
- # Remove common headers/footers by simple rules
109
- lines = [ln for ln in extracted.splitlines() if not re.match(r"^(Page\s*\d+|Copyright.*)$", ln, flags=re.I)]
110
- text_parts.append("\n".join(lines))
111
-
112
- # Code detection: fenced blocks first
113
- fenced = re.findall(r"```[\w-]*\n([\s\S]*?)```", extracted, flags=re.M)
114
- for blk in fenced:
115
- blk = blk.strip()
116
- if blk:
117
- code_lines.append(blk)
118
-
119
- # Otherwise, line-wise heuristic
120
- for ln in lines:
121
- if _looks_like_code(ln):
122
- code_lines.append(ln)
123
-
124
- # 4) Tables -> pipe-separated rows
125
- try:
126
- tables = page.extract_tables() or []
127
- for tb in tables:
128
- for row in tb:
129
- if row and any(str(c).strip() for c in row):
130
- text_parts.append(" | ".join(str(c).strip() for c in row))
131
- except Exception:
132
- pass
133
-
134
- full_text = "\n\n".join(tp for tp in text_parts if tp.strip())
135
-
136
- # Merge adjacent code lines into blocks
137
- code_blocks: List[str] = []
138
- if code_lines:
139
- current: List[str] = []
140
- for ln in code_lines:
141
- if ln.strip():
142
- current.append(ln)
143
- else:
144
- if current:
145
- code_blocks.append("\n".join(current))
146
- current = []
147
- if current:
148
- code_blocks.append("\n".join(current))
149
-
150
- # Deduplicate & trim giant blocks
151
- seen = set()
152
- unique_blocks = []
153
- for blk in code_blocks:
154
- key = blk.strip()
155
- if key and key not in seen:
156
- seen.add(key)
157
- # cap extreme long blocks for UI; still allow download of full
158
- unique_blocks.append(blk[:8000])
159
-
160
- return full_text, unique_blocks
161
-
162
- # ----------------------------
163
- # Chunking & Indexing
164
- # ----------------------------
165
-
166
- def chunk_text(text: str, chunk_size: int = 700, chunk_overlap: int = 120) -> List[str]:
167
- text = re.sub(r"\n{3,}", "\n\n", text).strip()
168
- paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
169
- chunks: List[str] = []
170
- buf: str = ""
171
- for para in paras:
172
- if not buf:
173
- buf = para
174
- elif len(buf) + len(para) + 1 <= chunk_size:
175
- buf += "\n" + para
176
- else:
177
- chunks.append(buf)
178
- # overlap
179
- overlap = buf[-chunk_overlap:] if chunk_overlap > 0 else ""
180
- buf = (overlap + "\n" + para).strip()
181
- if buf:
182
- chunks.append(buf)
183
- return chunks
184
-
185
- @st.cache_resource(show_spinner=False)
186
- def build_indexes(chunks: List[str]):
187
- embedder = get_embedder()
188
- matrix = embedder.encode(chunks, show_progress_bar=False, batch_size=64, normalize_embeddings=True)
189
- matrix = np.asarray(matrix).astype('float32')
190
-
191
- bm25 = BM25Okapi([c.split() for c in chunks])
192
-
193
- if FAISS_OK:
194
- index = faiss.IndexFlatIP(matrix.shape[1])
195
- index.add(matrix)
196
- return {
197
- "chunks": chunks,
198
- "embeddings": matrix,
199
- "faiss": index,
200
- "bm25": bm25,
201
- }
202
- else:
203
- # Fallback: cosine via numpy (slower but OK for small docs)
204
- return {
205
- "chunks": chunks,
206
- "embeddings": matrix,
207
- "faiss": None,
208
- "bm25": bm25,
209
- }
210
-
211
- # ----------------------------
212
- # Retrieval + QA
213
- # ----------------------------
214
-
215
- def retrieve(topk: int, query: str, idx):
216
- chunks = idx["chunks"]
217
- embeddings = idx["embeddings"]
218
- bm25 = idx["bm25"]
219
-
220
- # BM25
221
- bm25_docs = bm25.get_top_n(query.split(), chunks, n=min(topk, len(chunks)))
222
-
223
- # FAISS / cosine
224
- embedder = get_embedder()
225
- qv = embedder.encode([query], normalize_embeddings=True)[0].astype('float32')
226
 
227
- if idx["faiss"] is not None:
228
- D, I = idx["faiss"].search(np.array([qv]), min(topk, len(chunks)))
229
- faiss_docs = [chunks[i] for i in I[0]]
230
- else:
231
- # cosine with numpy
232
- sims = embeddings @ qv
233
- order = np.argsort(-sims)[:topk]
234
- faiss_docs = [chunks[i] for i in order]
235
-
236
- # Merge uniques with preference to BM25 then FAISS
237
- merged: List[str] = []
238
- seen = set()
239
- for c in bm25_docs + faiss_docs:
240
- if c not in seen:
241
- merged.append(c)
242
- seen.add(c)
243
- if len(merged) >= topk:
244
- break
245
- return merged
246
-
247
-
248
- def rag_answer(query: str, idx, max_ctx_chars: int = 3000) -> str:
249
- ctx_chunks = retrieve(6, query, idx)
250
- # Concatenate up to a char budget
251
- ctx = "\n\n".join(ctx_chunks)
252
- if len(ctx) > max_ctx_chars:
253
- ctx = ctx[:max_ctx_chars]
254
- qa = get_qa_pipeline()
255
- prompt = (
256
- "Answer the question using ONLY the provided context. "
257
- "If the answer is not in the context, say 'I couldn't find that in the PDF.'\n\n"
258
- f"Context:\n{ctx}\n\nQuestion: {query}\nAnswer:"
259
- )
260
- out = qa(prompt)[0]["generated_text"].strip()
261
- return out
262
-
263
-
264
- def summarize_text(full_text: str) -> str:
265
- summarizer = get_summarizer()
266
- # Summarize in parts for long docs
267
- chunks = chunk_text(full_text, chunk_size=1200, chunk_overlap=150)
268
- partials = []
269
- for ch in chunks[:8]: # cap to keep it snappy on CPU
270
- partials.append(summarizer(ch)[0]["summary_text"].strip())
271
- # Final stitch summary
272
- stitched = " ".join(partials)
273
- if len(stitched) > 2000:
274
- stitched = summarizer(stitched[:3000])[0]["summary_text"].strip()
275
- return stitched
276
-
277
- # ----------------------------
278
- # UI
279
- # ----------------------------
280
-
281
- st.markdown(
282
- """
283
- <style>
284
- .app-header {background: linear-gradient(90deg,#10b981,#22c55e); color: white; padding: 16px; border-radius: 14px; text-align:center; box-shadow: 0 6px 20px rgba(16,185,129,.25)}
285
- .card {border:1px solid #e5e7eb; border-radius: 14px; padding: 16px; background: #fff}
286
- .muted {color:#6b7280}
287
- .kbd {background:#f3f4f6; border:1px solid #e5e7eb; border-radius:6px; padding:2px 6px; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco}
288
- </style>
289
- """,
290
- unsafe_allow_html=True,
291
- )
292
-
293
- st.markdown('<div class="app-header"><h1>πŸ“„ Smart PDF Chat & Summarizer</h1><p class="muted">Fast answers, focused summaries, and automatic code extraction</p></div>', unsafe_allow_html=True)
294
-
295
- # Session state
296
- if "idx" not in st.session_state:
297
- st.session_state.idx = None
298
- if "pdf_text" not in st.session_state:
299
- st.session_state.pdf_text = ""
300
- if "code_blocks" not in st.session_state:
301
- st.session_state.code_blocks = []
302
-
303
- # Sidebar
304
- with st.sidebar:
305
- st.subheader("Upload & Options")
306
- file = st.file_uploader("Upload a PDF", type=["pdf"], help="Max ~50 pages for speed. Uses OCR fallback if needed.")
307
- max_pages = st.slider("Max pages to parse", 5, 100, 50, help="Lower = faster")
308
- do_ocr = st.toggle("Enable OCR fallback (slower)", value=False)
309
- chunk_size = st.slider("Chunk size", 300, 1400, 700, step=50)
310
- overlap = st.slider("Chunk overlap", 0, 300, 120, step=10)
311
-
312
- colA, colB = st.columns(2)
313
- with colA:
314
- if st.button("βš™οΈ Build Index", use_container_width=True, type="primary"):
315
- if not file:
316
- st.warning("Please upload a PDF first.")
317
- else:
318
- with st.spinner("Reading & indexing PDF…"):
319
- data = file.read()
320
- text, code_blocks = extract_text_and_code_from_pdf(data, ocr_fallback=do_ocr, max_pages=max_pages)
321
- st.session_state.pdf_text = text
322
- st.session_state.code_blocks = code_blocks
323
-
324
- if not text.strip():
325
- st.error("Couldn't extract any text from the PDF.")
326
- else:
327
- chunks = chunk_text(text, chunk_size=chunk_size, chunk_overlap=overlap)
328
- st.session_state.idx = build_indexes(chunks)
329
- st.success(f"Indexed {len(chunks)} chunks. Ready!")
330
- with colB:
331
- if st.button("🧹 Clear", use_container_width=True):
332
- st.session_state.idx = None
333
- st.session_state.pdf_text = ""
334
- st.session_state.code_blocks = []
335
- st.experimental_rerun()
336
-
337
- if st.session_state.code_blocks:
338
- st.caption("Detected code blocks. You can copy or download from the Summary tab.")
339
-
340
- # Main area β€” two sections exactly: Chat & Summary
341
- chat_tab, summary_tab = st.tabs(["πŸ’¬ Chat", "πŸ“ Summary (with Code)"])
342
-
343
- with chat_tab:
344
- st.markdown("<div class='card'>Ask questions about your PDF. Retrieval-augmented answers use only the document context.</div>", unsafe_allow_html=True)
345
-
346
- if st.session_state.idx is None:
347
- st.info("Upload a PDF and click **Build Index** in the sidebar.")
348
- else:
349
- user_q = st.chat_input("Ask anything about the PDF…")
350
- if "chat" not in st.session_state:
351
- st.session_state.chat = []
352
-
353
- # Render history
354
- for role, content in st.session_state.get("chat", []):
355
- with st.chat_message(role):
356
- st.markdown(content)
357
-
358
- if user_q:
359
- st.session_state.chat.append(("user", user_q))
360
- with st.chat_message("user"):
361
- st.markdown(user_q)
362
-
363
- with st.chat_message("assistant"):
364
- with st.spinner("Thinking…"):
365
- try:
366
- ans = rag_answer(user_q, st.session_state.idx)
367
- except Exception as e:
368
- ans = f"Sorry, I hit an error while answering: {e}"
369
- st.markdown(ans)
370
- st.session_state.chat.append(("assistant", ans))
371
-
372
- with summary_tab:
373
- st.markdown("<div class='card'>One-click concise summary of the entire document, plus extracted programming code if detected.</div>", unsafe_allow_html=True)
374
-
375
- col1, col2 = st.columns([1,1])
376
- with col1:
377
- if st.button("πŸ”Ž Summarize PDF", type="primary", use_container_width=True):
378
- if not st.session_state.pdf_text.strip():
379
- st.warning("No parsed text yet. Upload & Build Index first.")
380
  else:
381
- with st.spinner("Summarizing…"):
382
- try:
383
- sm = summarize_text(st.session_state.pdf_text)
384
- st.session_state.summary = sm
385
- st.success("Summary generated.")
386
- except Exception as e:
387
- st.error(f"Summarization failed: {e}")
388
- with col2:
389
- if st.session_state.pdf_text:
390
- st.download_button(
391
- "⬇️ Download raw extracted text",
392
- st.session_state.pdf_text,
393
- file_name="extracted_text.txt",
394
- use_container_width=True,
395
- )
396
-
397
- if st.session_state.get("summary"):
398
- st.subheader("Summary")
399
- st.write(st.session_state.summary)
400
-
401
- st.divider()
402
-
403
- st.subheader("Extracted Code")
404
- if st.session_state.code_blocks:
405
- for i, blk in enumerate(st.session_state.code_blocks, start=1):
406
- with st.expander(f"Code block #{i}"):
407
- st.code(blk, language=None)
408
- st.download_button(
409
- f"Download code #{i}",
410
- blk,
411
- file_name=f"code_block_{i}.txt",
412
- key=f"dl_{i}",
413
- )
414
- all_code = "\n\n\n".join(st.session_state.code_blocks)
415
- st.download_button("⬇️ Download all code", all_code, file_name="all_code.txt")
416
- else:
417
- st.caption("No code-like content detected yet.")
418
-
419
- # Footer tips
420
- st.markdown(
421
- """
422
- <div class="muted" style="margin-top:24px">⚑ Tips for faster responses: use smaller PDFs, lower the "Max pages" and "Chunk size" in the sidebar, and keep OCR off unless needed.</div>
423
- """,
424
- unsafe_allow_html=True,
425
- )
 
1
  import streamlit as st
 
 
 
 
 
 
 
2
  import pdfplumber
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from transformers import pipeline
4
+ import re
5
 
6
+ # Load models once for speed
7
+ qa_model = pipeline("question-answering", model="google/flan-t5-large", tokenizer="google/flan-t5-large")
8
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ st.set_page_config(page_title="Smart PDF Chatbot & Summarizer", layout="wide")
11
+ st.title("πŸ“„ Smart PDF Chatbot & Summarizer")
 
12
 
13
+ # Sidebar settings
14
+ st.sidebar.header("βš™οΈ Settings")
15
+ max_length = st.sidebar.slider("Summary Length", 50, 500, 250)
 
 
 
 
 
 
 
 
 
16
 
17
+ # Upload PDF
18
+ uploaded_file = st.file_uploader("Upload your PDF", type=["pdf"])
19
 
20
+ if uploaded_file:
21
+ with pdfplumber.open(uploaded_file) as pdf:
22
+ text = "\n".join([page.extract_text() for page in pdf.pages if page.extract_text()])
 
23
 
24
+ if not text.strip():
25
+ st.error("Couldn't extract text from this PDF.")
26
+ else:
27
+ tabs = st.tabs(["πŸ’¬ Chat with PDF", "πŸ“ Summarize PDF", "πŸ’» Extract Code"])
 
28
 
29
+ # Chat tab
30
+ with tabs[0]:
31
+ st.subheader("Ask Questions About Your PDF")
32
+ question = st.text_input("Enter your question:")
33
+ if st.button("Ask", key="qa") and question:
34
  try:
35
+ result = qa_model(question=question, context=text)
36
+ st.success(result['answer'])
37
  except Exception as e:
38
+ st.error(f"Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+ # Summarization tab
41
+ with tabs[1]:
42
+ st.subheader("PDF Summary")
43
+ if st.button("Generate Summary", key="sum"):
44
+ try:
45
+ summary = summarizer(text, max_length=max_length, min_length=30, do_sample=False)
46
+ st.info(summary[0]['summary_text'])
47
+ except Exception as e:
48
+ st.error(f"Error: {e}")
49
+
50
+ # Code extraction tab
51
+ with tabs[2]:
52
+ st.subheader("Extracted Programming Code")
53
+ code_blocks = re.findall(r'```[a-zA-Z]*([\s\S]*?)```', text)
54
+ if code_blocks:
55
+ for idx, code in enumerate(code_blocks, 1):
56
+ st.code(code, language="python")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  else:
58
+ st.warning("No code blocks found in this PDF.")
59
+ else:
60
+ st.info("πŸ‘† Please upload a PDF to start.")