EngrMuhammadBilal committed on
Commit
b623c81
·
verified ·
1 Parent(s): 8565c3f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +232 -67
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os, io, json, math, pickle, textwrap, shutil, re
2
  from typing import List, Dict, Any, Tuple
3
  import numpy as np, faiss, fitz # pymupdf
4
  from tqdm import tqdm
@@ -6,17 +6,24 @@ import torch
6
  from sentence_transformers import SentenceTransformer
7
  import gradio as gr
8
  from groq import Groq
 
 
9
 
10
- # ---------- Config ----------
 
 
 
 
11
  EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
12
  CHUNK_SIZE = 1200
13
  CHUNK_OVERLAP = 200
14
- TOP_K_DEFAULT = 5
15
- MAX_CONTEXT_CHARS = 12000
16
 
17
  INDEX_PATH = "rag_index.faiss"
18
  STORE_PATH = "rag_store.pkl"
19
 
 
20
  MODEL_CHOICES = [
21
  "llama-3.3-70b-versatile",
22
  "llama-3.1-8b-instant",
@@ -96,6 +103,28 @@ def load_index() -> bool:
96
  return False
97
 
98
  # ---------- Ingest ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  def ingest_pdfs(paths: List[str]) -> Tuple[Any, List[Dict[str, Any]]]:
100
  entries: List[Dict[str, Any]] = []
101
  for pdf in tqdm(paths, total=len(paths), desc="Parsing PDFs"):
@@ -122,7 +151,7 @@ def ingest_pdfs(paths: List[str]) -> Tuple[Any, List[Dict[str, Any]]]:
122
  index = build_faiss(embs)
123
  return index, entries
124
 
125
- # ---------- Retrieval (supports required keywords) ----------
126
  def retrieve(query: str, top_k=5, must_contain: str = ""):
127
  global faiss_index, docstore
128
  if faiss_index is None or not docstore:
@@ -153,10 +182,10 @@ def retrieve(query: str, top_k=5, must_contain: str = ""):
153
  return hits
154
 
155
  # ---------- Groq LLM ----------
156
- def groq_answer(query: str, contexts, model_name="llama-3.1-70b-versatile", temperature=0.2, max_tokens=1000):
157
  try:
158
  if not os.environ.get("GROQ_API_KEY"):
159
- return "GROQ_API_KEY is not set. Add it in your host's environment/secrets."
160
  client = Groq(api_key=os.environ["GROQ_API_KEY"])
161
 
162
  packed, used = [], 0
@@ -189,99 +218,235 @@ def groq_answer(query: str, contexts, model_name="llama-3.1-70b-versatile", temp
189
  import traceback
190
  return f"Groq API error: {e}\n```\n{traceback.format_exc()}\n```"
191
 
192
- # ---------- Helpers for UI ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  def build_index_from_uploads(paths: List[str]) -> str:
194
  global faiss_index, docstore
195
- if not paths: return "Please upload at least one PDF."
196
- if len(paths) > 120: return "Please limit to ~100 PDFs per build."
197
-
198
- faiss_index, entries = ingest_pdfs(paths)
199
  save_index(faiss_index, entries)
200
  docstore = entries
201
- return f"Index built with {len(entries)} chunks from {len(paths)} PDFs. Saved to disk."
202
 
203
  def reload_index() -> str:
204
  ok = load_index()
205
- return f"Index reloaded. Chunks: {len(docstore)}" if ok else "No saved index found."
206
 
207
- def ask_rag(query: str, top_k, model_name: str, temperature: float, must_contain: str):
208
  try:
209
- if not query.strip():
210
- return "Please enter a question.", []
211
- ctx = retrieve(query, top_k=int(top_k) if top_k else TOP_K_DEFAULT, must_contain=must_contain)
212
- ans = groq_answer(query, ctx, model_name=model_name, temperature=temperature)
 
 
213
  rows = []
214
  for c in ctx:
215
  preview = c["text"][:200].replace("\n"," ") + ("..." if len(c["text"])>200 else "")
216
  rows.append([c["source"], str(c["page_start"]), f"{c['score']:.3f}", preview])
217
- return ans, rows
 
 
 
 
 
 
 
 
218
  except Exception as e:
219
  import traceback
220
- return f"**Error:** {e}\n```\n{traceback.format_exc()}\n```", []
 
221
 
222
  def set_api_key(k: str):
223
  if k and k.strip():
224
  os.environ["GROQ_API_KEY"] = k.strip()
225
- return "API key set in runtime."
226
  return "No key provided."
227
 
228
  def download_index_zip():
229
  if not (os.path.exists(INDEX_PATH) and os.path.exists(STORE_PATH)):
230
  return None
231
- base = "rag_index_bundle"
232
- zip_path = shutil.make_archive(base, "zip", ".", ".")
233
- # workaround for shutil: package explicit files
234
- with shutil.make_archive("rag_index", "zip"):
235
- pass
236
- # build our own zip containing only index files
237
- import zipfile
238
  zp = "rag_index_bundle.zip"
239
  with zipfile.ZipFile(zp, "w", zipfile.ZIP_DEFLATED) as z:
240
  z.write(INDEX_PATH)
241
  z.write(STORE_PATH)
242
  return zp
243
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  # ---------- Gradio UI ----------
245
- with gr.Blocks(title="RAG over PDFs (Groq)") as demo:
246
- gr.Markdown("## RAG over your PDFs using Groq\nUpload PDFs, build an index, then ask questions with cited answers.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  with gr.Row():
248
- api_box = gr.Textbox(label="(Optional) Set GROQ_API_KEY for this session", type="password", placeholder="sk_...")
249
  set_btn = gr.Button("Set Key")
250
  set_out = gr.Markdown()
251
- set_btn.click(set_api_key, inputs=[api_box], outputs=[set_out])
252
-
253
- with gr.Tab("1) Build or Load Index"):
254
- file_u = gr.Files(label="Upload PDFs", file_types=[".pdf"], type="filepath")
255
- with gr.Row():
256
- build_btn = gr.Button("Build Index")
257
- reload_btn = gr.Button("Reload Saved Index")
258
- download_btn = gr.Button("Download Index (.zip)")
259
- build_out = gr.Markdown()
260
-
261
- def on_build(paths, progress=gr.Progress(track_tqdm=True)):
262
- try:
263
- return build_index_from_uploads(paths)
264
- except Exception as e:
265
- import traceback
266
- return f"**Error while building index:** {e}\n\n```\n{traceback.format_exc()}\n```"
267
-
268
- build_btn.click(on_build, inputs=[file_u], outputs=[build_out])
269
- reload_btn.click(fn=reload_index, outputs=[build_out])
270
- zpath = gr.File(label="Index zip", interactive=False)
271
- download_btn.click(fn=download_index_zip, outputs=[zpath])
272
-
273
- with gr.Tab("2) Ask Questions"):
274
- q = gr.Textbox(label="Your question", lines=2, placeholder="Ask something present in the uploaded papersโ€ฆ")
275
- with gr.Row():
276
- topk = gr.Slider(1, 15, value=TOP_K_DEFAULT, step=1, label="Top-K passages")
277
- model_dd = gr.Dropdown(MODEL_CHOICES, value=MODEL_CHOICES[0], label="Groq model")
278
- temp = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
279
- must = gr.Textbox(label="Must contain (comma-separated keywords)", placeholder="camera, CMOS, frame rate")
280
- ask_btn = gr.Button("Answer")
281
- ans = gr.Markdown()
282
- src = gr.Dataframe(headers=["Source","Page","Score","Snippet"], wrap=True)
283
- ask_btn.click(ask_rag, inputs=[q, topk, model_dd, temp, must], outputs=[ans, src])
284
-
285
- demo.queue() # keep it simple for broad Gradio versions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  if __name__ == "__main__":
287
  demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
 
1
+ import os, io, json, math, pickle, textwrap, shutil, re, zipfile, tempfile
2
  from typing import List, Dict, Any, Tuple
3
  import numpy as np, faiss, fitz # pymupdf
4
  from tqdm import tqdm
 
6
  from sentence_transformers import SentenceTransformer
7
  import gradio as gr
8
  from groq import Groq
9
+ from docx import Document
10
+ from docx.shared import Pt
11
 
12
+ # ---------- Branding ----------
13
+ APP_NAME = "ScholarLens"
14
+ TAGLINE = "Query your literature, get page-level proof"
15
+
16
+ # ---------- Config (same engine, nicer UI) ----------
17
  EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
18
  CHUNK_SIZE = 1200
19
  CHUNK_OVERLAP = 200
20
+ TOP_K_DEFAULT = 7
21
+ MAX_CONTEXT_CHARS = 16000
22
 
23
  INDEX_PATH = "rag_index.faiss"
24
  STORE_PATH = "rag_store.pkl"
25
 
26
+ # You can edit the default model here. All are selectable in the UI.
27
  MODEL_CHOICES = [
28
  "llama-3.3-70b-versatile",
29
  "llama-3.1-8b-instant",
 
103
  return False
104
 
105
  # ---------- Ingest ----------
def _collect_pdf_paths(upload_paths: List[str]) -> List[str]:
    """Expand uploaded files into a flat list of PDF paths.

    Paths ending in ``.pdf`` are passed through unchanged. Paths ending in
    ``.zip`` are opened and every PDF member is extracted into a fresh
    temporary directory; the extracted file paths are returned. Anything
    else is ignored. Extension checks are case-insensitive.

    Args:
        upload_paths: Paths of the uploaded files; may be empty or None.

    Returns:
        PDF paths in upload order; members of an archive appear in archive
        order at the position of that archive.

    Raises:
        zipfile.BadZipFile: If a ``.zip`` upload is not a valid archive.
    """
    if not upload_paths:
        return []

    out: List[str] = []
    for p in upload_paths:
        p = str(p)
        lowered = p.lower()
        if lowered.endswith(".pdf"):
            out.append(p)
        elif lowered.endswith(".zip"):
            # The directory is intentionally left in place: the returned
            # paths point into it and are read after this function returns.
            tmpdir = tempfile.mkdtemp(prefix="pdfs_")
            seen = set()
            with zipfile.ZipFile(p, "r") as z:
                for info in z.infolist():
                    name = info.filename
                    if info.is_dir() or not name.lower().endswith(".pdf"):
                        continue
                    parts = name.replace("\\", "/").split("/")
                    # Archives created on macOS carry "__MACOSX/._name.pdf"
                    # metadata stubs that end in .pdf but are not PDFs.
                    if "__MACOSX" in parts or parts[-1].startswith("._"):
                        continue
                    # extract() returns the sanitised path it actually wrote.
                    extracted = z.extract(info, tmpdir)
                    if extracted not in seen:
                        seen.add(extracted)
                        out.append(extracted)
    return out
127
+
128
  def ingest_pdfs(paths: List[str]) -> Tuple[Any, List[Dict[str, Any]]]:
129
  entries: List[Dict[str, Any]] = []
130
  for pdf in tqdm(paths, total=len(paths), desc="Parsing PDFs"):
 
151
  index = build_faiss(embs)
152
  return index, entries
153
 
154
+ # ---------- Retrieval with optional keyword filter ----------
155
  def retrieve(query: str, top_k=5, must_contain: str = ""):
156
  global faiss_index, docstore
157
  if faiss_index is None or not docstore:
 
182
  return hits
183
 
184
  # ---------- Groq LLM ----------
185
+ def groq_answer(query: str, contexts, model_name="llama-3.3-70b-versatile", temperature=0.2, max_tokens=1000):
186
  try:
187
  if not os.environ.get("GROQ_API_KEY"):
188
+ return "GROQ_API_KEY is not set. Add it in your Space secrets or the key box."
189
  client = Groq(api_key=os.environ["GROQ_API_KEY"])
190
 
191
  packed, used = [], 0
 
218
  import traceback
219
  return f"Groq API error: {e}\n```\n{traceback.format_exc()}\n```"
220
 
221
+ # ---------- Export helpers ----------
222
+ def export_answer_to_docx(question: str, answer_md: str, rows: List[List[str]]) -> str:
223
+ """
224
+ Save Q&A with sources table to a .docx and return path.
225
+ rows: [Source, Page, Score, Snippet]
226
+ """
227
+ doc = Document()
228
+ styles = doc.styles
229
+ try:
230
+ styles['Normal'].font.name = 'Calibri'
231
+ styles['Normal'].font.size = Pt(11)
232
+ except Exception:
233
+ pass
234
+
235
+ doc.add_heading(f"{APP_NAME} - Answer", level=1)
236
+ doc.add_paragraph(f"Question: {question}")
237
+
238
+ doc.add_heading("Answer", level=2)
239
+ # Write as plain text to keep it simple in Word
240
+ for line in answer_md.splitlines():
241
+ doc.add_paragraph(line)
242
+
243
+ doc.add_heading("References (Top Passages)", level=2)
244
+ table = doc.add_table(rows=1, cols=4)
245
+ hdr = table.rows[0].cells
246
+ hdr[0].text = "Source"
247
+ hdr[1].text = "Page"
248
+ hdr[2].text = "Score"
249
+ hdr[3].text = "Snippet"
250
+ for r in rows:
251
+ row = table.add_row().cells
252
+ for i, val in enumerate(r):
253
+ row[i].text = str(val)
254
+
255
+ path = "scholarlens_answer.docx"
256
+ doc.save(path)
257
+ return path
258
+
259
+ # ---------- UI helpers ----------
def build_index_from_uploads(paths: List[str]) -> str:
    """Build the search index from uploaded files and persist it.

    Uploads are expanded to PDF paths, ingested, and the resulting index and
    entries are saved. The module-level ``faiss_index`` and ``docstore`` are
    replaced only after saving succeeds, and always together, so a failure
    part-way through cannot leave a new index paired with old entries.

    Args:
        paths: Uploaded file paths (PDFs and/or ZIPs of PDFs).

    Returns:
        A status message for display in the UI.
    """
    global faiss_index, docstore
    pdfs = _collect_pdf_paths(paths)
    if not pdfs:
        return "Please upload at least one PDF or ZIP of PDFs."
    new_index, entries = ingest_pdfs(pdfs)
    save_index(new_index, entries)
    faiss_index, docstore = new_index, entries
    return f"✅ Index built with {len(entries)} chunks from {len(pdfs)} files. You can start asking questions."
269
 
def reload_index() -> str:
    """Reload the saved index and report the outcome.

    Calls ``load_index()`` and, when it reports success, returns a message
    that includes the current length of the module-level ``docstore``.

    Returns:
        A status message for display in the UI.
    """
    if not load_index():
        return "No saved index found yet."
    # presumably load_index() repopulates docstore; verify against its body
    return f"🔁 Index reloaded. Chunks ready: {len(docstore)}"
273
 
def ask_rag(question: str, top_k, model_name: str, temperature: float, must_contain: str):
    """Answer a question from the indexed passages and prepare the UI outputs.

    Args:
        question: The user's question.
        top_k: Number of passages to retrieve; a falsy value falls back to
            ``TOP_K_DEFAULT``.
        model_name: Model name forwarded to ``groq_answer``.
        temperature: Sampling temperature forwarded to ``groq_answer``.
        must_contain: Keyword filter forwarded to ``retrieve``.

    Returns:
        A 4-tuple: answer text, table rows ``[source, page, score, preview]``,
        Markdown listing the retrieved passages, and a ``gr.update`` that
        shows or hides the export button. Any exception is turned into an
        error message in the first slot rather than propagated.
    """
    try:
        if not question.strip():
            return "Please enter a question.", [], "", gr.update(visible=False)

        k = int(top_k) if top_k else TOP_K_DEFAULT
        passages = retrieve(question, top_k=k, must_contain=must_contain)
        answer = groq_answer(question, passages, model_name=model_name, temperature=temperature)

        # One table row per passage; the preview is the first 200 characters
        # on a single line, with an ellipsis when the text was cut.
        table_rows = [
            [
                hit["source"],
                str(hit["page_start"]),
                f"{hit['score']:.3f}",
                hit["text"][:200].replace("\n", " ") + ("..." if len(hit["text"]) > 200 else ""),
            ]
            for hit in passages
        ]

        # Longer excerpts (up to 1000 characters) for the collapsible panel.
        excerpts = [
            f"**{hit['source']} p.{hit['page_start']}**\n> {hit['text'].strip()[:1000]}"
            for hit in passages
        ]
        snippets_md = "\n\n---\n\n".join(excerpts)

        return answer, table_rows, snippets_md, gr.update(visible=True)
    except Exception as e:
        import traceback
        message = f"**Error:** {e}\n```\n{traceback.format_exc()}\n```"
        return message, [], "", gr.update(visible=False)
299
 
def set_api_key(k: str):
    """Store a Groq API key in this process's environment.

    The key is stripped of surrounding whitespace and written to
    ``os.environ["GROQ_API_KEY"]``, which is process-wide.

    Args:
        k: The key as typed by the user; None, empty or whitespace-only
            values are rejected.

    Returns:
        A status message for display in the UI.
    """
    key = (k or "").strip()
    if not key:
        return "No key provided."
    os.environ["GROQ_API_KEY"] = key
    return "🔑 API key set for this session."
305
 
def download_index_zip():
    """Bundle the saved index files into a zip archive.

    Returns:
        The archive path ``"rag_index_bundle.zip"``, or None when either
        ``INDEX_PATH`` or ``STORE_PATH`` does not exist on disk.
    """
    members = (INDEX_PATH, STORE_PATH)
    if not all(os.path.exists(member) for member in members):
        return None

    zp = "rag_index_bundle.zip"
    with zipfile.ZipFile(zp, "w", zipfile.ZIP_DEFLATED) as archive:
        for member in members:
            archive.write(member)
    return zp
314
 
def _rows_as_lists(rows):
    """Convert a table value (DataFrame, ndarray or sequence of rows) to a list of lists."""
    if rows is None:
        return []
    # A DataFrame exposes its cells as ``.values`` (an ndarray); an ndarray
    # has ``tolist`` itself. A dict's ``.values`` is a method with no
    # ``tolist``, so it falls through to the generic branch.
    for candidate in (getattr(rows, "values", None), rows):
        to_list = getattr(candidate, "tolist", None)
        if callable(to_list):
            return to_list()
    return [list(r) for r in rows]


def do_export_docx(question, answer_md, sources_rows):
    """Export the current answer and its sources table to a .docx file.

    ``gr.Dataframe`` hands its value to callbacks as a pandas DataFrame by
    default. Testing a DataFrame for truthiness raises ValueError and
    iterating it yields column labels, so the table is normalised to a list
    of row lists before use.

    Args:
        question: The question text.
        answer_md: The answer text; nothing is exported when it is empty.
        sources_rows: The sources table as a DataFrame, array or list of rows.

    Returns:
        Path of the written file, or None when there is nothing to export or
        the export fails.
    """
    if not answer_md:
        return None
    try:
        rows = _rows_as_lists(sources_rows)
        if not rows:
            return None
        return export_answer_to_docx(question, answer_md, rows)
    except Exception:
        # Best-effort by design: a failed export leaves the download slot
        # empty instead of surfacing an error in the UI.
        return None
323
+
324
+ # ---------- Theme ----------
# Soft base theme recoloured with a dark navy page and block background.
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
).set(
    body_background_fill="#0B1220",  # dark-friendly hero
    block_background_fill="#0F172A",
    # Shadow references use the theme's own variable names.
    block_shadow="*shadow_drop_lg",
    # `radius_size` is a theme-constructor argument, not a .set() variable;
    # the corner radius of blocks is controlled by `block_radius`.
    block_radius="8px",
)
335
+
336
  # ---------- Gradio UI ----------
# Top-level layout: hero header, KPI strip, API-key row, then three tabs
# (build/load the index, ask questions, about).
with gr.Blocks(title=f"{APP_NAME} | RAG over PDFs", theme=theme, css="""
#hero {
  background: radial-gradient(1200px 600px at 20% -10%, rgba(99,102,241,.25), transparent),
              radial-gradient(1000px 500px at 120% 10%, rgba(14,165,233,.20), transparent);
  border: 1px solid rgba(99,102,241,.20);
}
.kpi {text-align:center;padding:12px;border-radius:10px;border:1px solid rgba(255,255,255,.08);}
.footer {opacity:.8;}
""") as demo:
    # --- Header / Hero ---
    with gr.Group(elem_id="hero"):
        gr.Markdown(
            f"""
            <div style="display:flex;align-items:center;gap:16px;">
              <div style="font-size:36px">📚🔎 <b>{APP_NAME}</b></div>
              <div style="opacity:.9;">{TAGLINE}</div>
            </div>
            <p style="opacity:.85;margin-top:6px;">
              Upload your papers, build an index, and ask research questions with verifiable, page-level citations.
            </p>
            """)

    # --- KPI row ---
    with gr.Row():
        gr.Markdown("**Meaning-aware retrieval**<br><span class='kpi'>E5 + FAISS</span>", elem_classes=["kpi"])
        gr.Markdown("**Cited answers**<br><span class='kpi'>Page-level proof</span>", elem_classes=["kpi"])
        gr.Markdown("**Runs anywhere**<br><span class='kpi'>HF Spaces or Colab</span>", elem_classes=["kpi"])

    # --- Key / Settings ---
    with gr.Row():
        api_box = gr.Textbox(label="(Optional) Set GROQ_API_KEY", type="password", placeholder="sk_...")
        set_btn = gr.Button("Set Key")
        set_out = gr.Markdown()
    set_btn.click(set_api_key, inputs=[api_box], outputs=[set_out])

    with gr.Tabs():
        # ---------------- Tab 1: Build / Load ----------------
        with gr.Tab("1) Build or Load Index"):
            gr.Markdown("Upload PDFs or a ZIP of PDFs, then click **Build Index**.")
            file_u = gr.Files(label="Upload PDFs or ZIP", file_types=[".pdf", ".zip"], type="filepath")
            with gr.Row():
                build_btn = gr.Button("Build Index", variant="primary")
                reload_btn = gr.Button("Reload Saved Index")
                download_btn = gr.Button("Download Index (.zip)")
            build_out = gr.Markdown()

            def on_build(paths, progress=gr.Progress(track_tqdm=True)):
                """Build the index, reporting any failure as Markdown instead of raising."""
                try:
                    return build_index_from_uploads(paths)
                except Exception as e:
                    import traceback
                    return f"**Error while building index:** {e}\n\n```\n{traceback.format_exc()}\n```"

            build_btn.click(on_build, inputs=[file_u], outputs=[build_out])
            reload_btn.click(fn=reload_index, outputs=[build_out])
            zpath = gr.File(label="Index bundle", interactive=False)
            download_btn.click(fn=download_index_zip, outputs=[zpath])

        # ---------------- Tab 2: Ask ----------------
        with gr.Tab("2) Ask Questions"):
            with gr.Row():
                # Column scales are integers; 5:7 keeps the intended 1:1.4 split.
                with gr.Column(scale=5):
                    q = gr.Textbox(label="Your question", lines=3, placeholder="e.g., Compare GTAW experimental parameters with citations")
                    must = gr.Textbox(label="Must contain (comma-separated keywords)", placeholder="camera, CMOS, frame rate")
                    with gr.Accordion("Advanced settings", open=False):
                        topk = gr.Slider(1, 20, value=TOP_K_DEFAULT, step=1, label="Top-K passages")
                        model_dd = gr.Dropdown(MODEL_CHOICES, value=MODEL_CHOICES[0], label="Groq model")
                        temp = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
                    with gr.Row():
                        ask_btn = gr.Button("Answer", variant="primary")
                        clear_btn = gr.Button("Clear")

                    gr.Examples(
                        examples=[
                            ["List camera model, sensor type, resolution, and FPS across studies. Cite pages.", "camera, fps, resolution"],
                            ["Extract limitations and future work across the corpus, with page references.", ""],
                            ["Compare GTAW setups: current range, travel speed, torch standoff, sensors.", "GTAW, current, speed, torch"],
                            ["Summarize the main results tables with metrics and page citations.", "table, accuracy, mAP, F1"],
                        ],
                        inputs=[q, must],
                        label="Quick examples",
                    )
                with gr.Column(scale=7):
                    ans = gr.Markdown(label="Answer", show_label=False)
                    src = gr.Dataframe(headers=["Source", "Page", "Score", "Snippet"], wrap=True, label="Top passages")
                    with gr.Accordion("Show retrieved snippets", open=False):
                        snippets_md = gr.Markdown("")
                    with gr.Row():
                        # Hidden until ask_rag returns an answer.
                        export_btn = gr.Button("Export Answer to DOCX", visible=False)
                        exported = gr.File(label="Download answer", visible=True)

            # wire buttons
            ask_btn.click(fn=ask_rag, inputs=[q, topk, model_dd, temp, must], outputs=[ans, src, snippets_md, export_btn])
            export_btn.click(fn=do_export_docx, inputs=[q, ans, src], outputs=[exported])
            clear_btn.click(lambda: ("", [], "", gr.update(visible=False)), outputs=[ans, src, snippets_md, export_btn])

        # ---------------- Tab 3: About ----------------
        with gr.Tab("About"):
            gr.Markdown(
                """
                **ScholarLens** helps researchers move from reading to results with answers grounded in the papers you upload.

                - Meaning-aware retrieval (E5 + FAISS)
                - Answers limited to your corpus, with page-level citations
                - Optional keyword filter to stay on topic
                - Runs on Hugging Face Spaces or Google Colab
                - Powered by Groq models

                *Privacy note:* your files stay on this Space. Only the Groq call is external.
                """
            )
448
+
# Turn on request queuing for the app (broad compatibility for Spaces).
demo.queue()

if __name__ == "__main__":
    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", 7860))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)