Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import os, io, json, math, pickle, textwrap, shutil, re
|
| 2 |
from typing import List, Dict, Any, Tuple
|
| 3 |
import numpy as np, faiss, fitz # pymupdf
|
| 4 |
from tqdm import tqdm
|
|
@@ -6,17 +6,24 @@ import torch
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
import gradio as gr
|
| 8 |
from groq import Groq
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
# ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
|
| 12 |
CHUNK_SIZE = 1200
|
| 13 |
CHUNK_OVERLAP = 200
|
| 14 |
-
TOP_K_DEFAULT =
|
| 15 |
-
MAX_CONTEXT_CHARS =
|
| 16 |
|
| 17 |
INDEX_PATH = "rag_index.faiss"
|
| 18 |
STORE_PATH = "rag_store.pkl"
|
| 19 |
|
|
|
|
| 20 |
MODEL_CHOICES = [
|
| 21 |
"llama-3.3-70b-versatile",
|
| 22 |
"llama-3.1-8b-instant",
|
|
@@ -96,6 +103,28 @@ def load_index() -> bool:
|
|
| 96 |
return False
|
| 97 |
|
| 98 |
# ---------- Ingest ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
def ingest_pdfs(paths: List[str]) -> Tuple[Any, List[Dict[str, Any]]]:
|
| 100 |
entries: List[Dict[str, Any]] = []
|
| 101 |
for pdf in tqdm(paths, total=len(paths), desc="Parsing PDFs"):
|
|
@@ -122,7 +151,7 @@ def ingest_pdfs(paths: List[str]) -> Tuple[Any, List[Dict[str, Any]]]:
|
|
| 122 |
index = build_faiss(embs)
|
| 123 |
return index, entries
|
| 124 |
|
| 125 |
-
# ---------- Retrieval
|
| 126 |
def retrieve(query: str, top_k=5, must_contain: str = ""):
|
| 127 |
global faiss_index, docstore
|
| 128 |
if faiss_index is None or not docstore:
|
|
@@ -153,10 +182,10 @@ def retrieve(query: str, top_k=5, must_contain: str = ""):
|
|
| 153 |
return hits
|
| 154 |
|
| 155 |
# ---------- Groq LLM ----------
|
| 156 |
-
def groq_answer(query: str, contexts, model_name="llama-3.
|
| 157 |
try:
|
| 158 |
if not os.environ.get("GROQ_API_KEY"):
|
| 159 |
-
return "GROQ_API_KEY is not set. Add it in your
|
| 160 |
client = Groq(api_key=os.environ["GROQ_API_KEY"])
|
| 161 |
|
| 162 |
packed, used = [], 0
|
|
@@ -189,99 +218,235 @@ def groq_answer(query: str, contexts, model_name="llama-3.1-70b-versatile", temp
|
|
| 189 |
import traceback
|
| 190 |
return f"Groq API error: {e}\n```\n{traceback.format_exc()}\n```"
|
| 191 |
|
| 192 |
-
# ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
def build_index_from_uploads(paths: List[str]) -> str:
|
| 194 |
global faiss_index, docstore
|
| 195 |
-
|
| 196 |
-
if
|
| 197 |
-
|
| 198 |
-
faiss_index, entries = ingest_pdfs(
|
| 199 |
save_index(faiss_index, entries)
|
| 200 |
docstore = entries
|
| 201 |
-
return f"Index built with {len(entries)} chunks from {len(
|
| 202 |
|
| 203 |
def reload_index() -> str:
    """Reload the persisted index through ``load_index`` and report the result.

    Returns:
        A status line with the number of entries in the module-level
        ``docstore`` when ``load_index()`` returns a truthy value, otherwise
        a message saying that no saved index exists.
    """
    if load_index():
        # presumably load_index() refreshes the global docstore - confirm there
        return f"Index reloaded. Chunks: {len(docstore)}"
    return "No saved index found."
|
| 206 |
|
| 207 |
-
def ask_rag(
|
| 208 |
try:
|
| 209 |
-
if not
|
| 210 |
-
return "Please enter a question.", []
|
| 211 |
-
ctx = retrieve(
|
| 212 |
-
ans = groq_answer(
|
|
|
|
|
|
|
| 213 |
rows = []
|
| 214 |
for c in ctx:
|
| 215 |
preview = c["text"][:200].replace("\n"," ") + ("..." if len(c["text"])>200 else "")
|
| 216 |
rows.append([c["source"], str(c["page_start"]), f"{c['score']:.3f}", preview])
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
except Exception as e:
|
| 219 |
import traceback
|
| 220 |
-
|
|
|
|
| 221 |
|
| 222 |
def set_api_key(k: str):
|
| 223 |
if k and k.strip():
|
| 224 |
os.environ["GROQ_API_KEY"] = k.strip()
|
| 225 |
-
return "API key set
|
| 226 |
return "No key provided."
|
| 227 |
|
| 228 |
def download_index_zip():
    """Bundle the saved index files into a zip archive and return its path.

    Returns:
        The path of ``rag_index_bundle.zip`` (written to the current working
        directory), or ``None`` when either ``INDEX_PATH`` or ``STORE_PATH``
        does not exist.
    """
    if not (os.path.exists(INDEX_PATH) and os.path.exists(STORE_PATH)):
        return None
    # The earlier shutil.make_archive() calls were removed: the first one
    # archived the entire working directory and discarded the result, and the
    # second used the returned path string as a context manager, which raises.
    # Imported locally because this revision's file-level imports do not
    # include zipfile.
    import zipfile
    zp = "rag_index_bundle.zip"
    with zipfile.ZipFile(zp, "w", zipfile.ZIP_DEFLATED) as z:
        # Only the two index artefacts go into the bundle.
        z.write(INDEX_PATH)
        z.write(STORE_PATH)
    return zp
|
| 243 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
# ---------- Gradio UI ----------
|
| 245 |
-
with gr.Blocks(title="RAG over PDFs
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
with gr.Row():
|
| 248 |
-
api_box = gr.Textbox(label="(Optional) Set GROQ_API_KEY
|
| 249 |
set_btn = gr.Button("Set Key")
|
| 250 |
set_out = gr.Markdown()
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
with gr.
|
| 254 |
-
|
| 255 |
-
with gr.
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
if __name__ == "__main__":
|
| 287 |
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
|
|
|
|
| 1 |
+
import os, io, json, math, pickle, textwrap, shutil, re, zipfile, tempfile
|
| 2 |
from typing import List, Dict, Any, Tuple
|
| 3 |
import numpy as np, faiss, fitz # pymupdf
|
| 4 |
from tqdm import tqdm
|
|
|
|
| 6 |
from sentence_transformers import SentenceTransformer
|
| 7 |
import gradio as gr
|
| 8 |
from groq import Groq
|
| 9 |
+
from docx import Document
|
| 10 |
+
from docx.shared import Pt
|
| 11 |
|
| 12 |
+
# ---------- Branding ----------
|
| 13 |
+
APP_NAME = "ScholarLens"
|
| 14 |
+
TAGLINE = "Query your literature, get page-level proof"
|
| 15 |
+
|
| 16 |
+
# ---------- Config (same engine, nicer UI) ----------
|
| 17 |
EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
|
| 18 |
CHUNK_SIZE = 1200
|
| 19 |
CHUNK_OVERLAP = 200
|
| 20 |
+
TOP_K_DEFAULT = 7
|
| 21 |
+
MAX_CONTEXT_CHARS = 16000
|
| 22 |
|
| 23 |
INDEX_PATH = "rag_index.faiss"
|
| 24 |
STORE_PATH = "rag_store.pkl"
|
| 25 |
|
| 26 |
+
# You can edit the default model here. All are selectable in the UI.
|
| 27 |
MODEL_CHOICES = [
|
| 28 |
"llama-3.3-70b-versatile",
|
| 29 |
"llama-3.1-8b-instant",
|
|
|
|
| 103 |
return False
|
| 104 |
|
| 105 |
# ---------- Ingest ----------
|
| 106 |
+
def _collect_pdf_paths(upload_paths: List[str]) -> List[str]:
    """Expand uploaded paths into a flat list of PDF file paths.

    Args:
        upload_paths: Paths of uploaded files. ``.pdf`` files are passed
            through unchanged; ``.zip`` files are unpacked and every PDF
            inside them is returned. Anything else is ignored.

    Returns:
        PDF paths in upload order. PDFs coming from one archive are listed
        in sorted directory-walk order so the result is deterministic.
        An empty list is returned for empty or ``None`` input.

    Raises:
        zipfile.BadZipFile: if an uploaded ``.zip`` is not a valid archive.
    """
    if not upload_paths:
        return []
    collected: List[str] = []
    for raw in upload_paths:
        path = str(raw)
        lowered = path.lower()
        if lowered.endswith(".pdf"):
            collected.append(path)
        elif lowered.endswith(".zip"):
            # The extraction directory is deliberately not removed here: the
            # returned paths must stay readable after this function returns.
            tmpdir = tempfile.mkdtemp(prefix="pdfs_")
            with zipfile.ZipFile(path, "r") as archive:
                for name in archive.namelist():
                    if not name.lower().endswith(".pdf"):
                        continue
                    if _is_archive_metadata(name):
                        # e.g. "__MACOSX/._paper.pdf": ends in .pdf but is a
                        # resource-fork stub, not a PDF document.
                        continue
                    archive.extract(name, tmpdir)
            # Collect what was extracted; sorting makes the order stable
            # regardless of the filesystem's directory listing order.
            for root, dirs, files in os.walk(tmpdir):
                dirs.sort()
                for fname in sorted(files):
                    if fname.lower().endswith(".pdf"):
                        collected.append(os.path.join(root, fname))
    return collected


def _is_archive_metadata(member_name: str) -> bool:
    """Return True for macOS metadata entries (``__MACOSX/`` or ``._*``)."""
    parts = member_name.replace("\\", "/").split("/")
    return "__MACOSX" in parts or parts[-1].startswith("._")
|
| 127 |
+
|
| 128 |
def ingest_pdfs(paths: List[str]) -> Tuple[Any, List[Dict[str, Any]]]:
|
| 129 |
entries: List[Dict[str, Any]] = []
|
| 130 |
for pdf in tqdm(paths, total=len(paths), desc="Parsing PDFs"):
|
|
|
|
| 151 |
index = build_faiss(embs)
|
| 152 |
return index, entries
|
| 153 |
|
| 154 |
+
# ---------- Retrieval with optional keyword filter ----------
|
| 155 |
def retrieve(query: str, top_k=5, must_contain: str = ""):
|
| 156 |
global faiss_index, docstore
|
| 157 |
if faiss_index is None or not docstore:
|
|
|
|
| 182 |
return hits
|
| 183 |
|
| 184 |
# ---------- Groq LLM ----------
|
| 185 |
+
def groq_answer(query: str, contexts, model_name="llama-3.3-70b-versatile", temperature=0.2, max_tokens=1000):
|
| 186 |
try:
|
| 187 |
if not os.environ.get("GROQ_API_KEY"):
|
| 188 |
+
return "GROQ_API_KEY is not set. Add it in your Space secrets or the key box."
|
| 189 |
client = Groq(api_key=os.environ["GROQ_API_KEY"])
|
| 190 |
|
| 191 |
packed, used = [], 0
|
|
|
|
| 218 |
import traceback
|
| 219 |
return f"Groq API error: {e}\n```\n{traceback.format_exc()}\n```"
|
| 220 |
|
| 221 |
+
# ---------- Export helpers ----------
|
| 222 |
+
def export_answer_to_docx(question: str, answer_md: str, rows: List[List[str]]) -> str:
    """Write a question, its answer and a sources table to a Word document.

    Args:
        question: The question text, written under the title.
        answer_md: The answer text. It is written line by line as plain
            paragraphs (Markdown is not rendered). ``None`` is treated as
            an empty answer.
        rows: Source rows in the order [Source, Page, Score, Snippet].
            Values beyond the fourth column are dropped; short rows leave
            the remaining cells empty.

    Returns:
        The path of the saved file, ``scholarlens_answer.docx`` in the
        current working directory. The name is fixed, so each export
        overwrites the previous one.
    """
    doc = Document()
    try:
        normal_font = doc.styles['Normal'].font
        normal_font.name = 'Calibri'
        normal_font.size = Pt(11)
    except Exception:
        # Font styling is cosmetic; a template without a usable 'Normal'
        # style must not prevent the export.
        pass

    doc.add_heading(f"{APP_NAME} - Answer", level=1)
    doc.add_paragraph(f"Question: {question}")

    doc.add_heading("Answer", level=2)
    # Write as plain text to keep it simple in Word
    for line in (answer_md or "").splitlines():
        doc.add_paragraph(line)

    doc.add_heading("References (Top Passages)", level=2)
    headers = ("Source", "Page", "Score", "Snippet")
    table = doc.add_table(rows=1, cols=len(headers))
    for cell, title in zip(table.rows[0].cells, headers):
        cell.text = title
    for r in rows or []:
        cells = table.add_row().cells
        # zip() stops at the shorter side, so a row with extra values can no
        # longer index past the last table cell.
        for cell, val in zip(cells, r):
            cell.text = str(val)

    path = "scholarlens_answer.docx"
    doc.save(path)
    return path
|
| 258 |
+
|
| 259 |
+
# ---------- UI helpers ----------
|
| 260 |
def build_index_from_uploads(paths: List[str]) -> str:
    """Build, persist and activate an index from uploaded PDF/ZIP files.

    Args:
        paths: Uploaded file paths; expanded by ``_collect_pdf_paths``.

    Returns:
        A status message: a prompt to upload files when no PDF was found,
        otherwise a summary with the number of chunks and files indexed.

    Side effects:
        Rebinds the module-level ``faiss_index`` and ``docstore`` and calls
        ``save_index`` with the new index and entries.
    """
    global faiss_index, docstore
    pdfs = _collect_pdf_paths(paths)
    if not pdfs:
        return "Please upload at least one PDF or ZIP of PDFs."
    faiss_index, entries = ingest_pdfs(pdfs)
    save_index(faiss_index, entries)
    docstore = entries
    # The check-mark is written as an escape so the literal stays on one line
    # and survives being re-saved in a non-UTF-8 encoding.
    return f"\u2705 Index built with {len(entries)} chunks from {len(pdfs)} files. You can start asking questions."
|
| 269 |
|
| 270 |
def reload_index() -> str:
    """Reload the persisted index through ``load_index`` and report the result.

    Returns:
        A status line with the number of entries in the module-level
        ``docstore`` when ``load_index()`` returns a truthy value, otherwise
        a message saying that no saved index exists yet.
    """
    if load_index():
        # presumably load_index() refreshes the global docstore - confirm there
        return f"๐ Index reloaded. Chunks ready: {len(docstore)}"
    return "No saved index found yet."
|
| 273 |
|
| 274 |
+
def ask_rag(question: str, top_k, model_name: str, temperature: float, must_contain: str):
    """Answer a question from the indexed passages and prepare the UI outputs.

    Args:
        question: The user's question; blank input short-circuits.
        top_k: Number of passages to retrieve; a falsy value falls back to
            ``TOP_K_DEFAULT``.
        model_name: Model name forwarded to ``groq_answer``.
        temperature: Sampling temperature forwarded to ``groq_answer``.
        must_contain: Keyword filter forwarded to ``retrieve``.

    Returns:
        A 4-tuple ``(answer, rows, snippets_md, button_update)`` where
        ``rows`` holds [source, page, score, preview] per passage,
        ``snippets_md`` is a Markdown rendering of the passages, and
        ``button_update`` toggles the export button's visibility. Any
        exception is reported in the answer slot instead of being raised.
    """
    try:
        if not question.strip():
            return "Please enter a question.", [], "", gr.update(visible=False)

        k = int(top_k) if top_k else TOP_K_DEFAULT
        hits = retrieve(question, top_k=k, must_contain=must_contain)
        answer = groq_answer(question, hits, model_name=model_name, temperature=temperature)

        # Sources table: one row per passage with a 200-character preview.
        table_rows = []
        for hit in hits:
            text = hit["text"]
            suffix = "..." if len(text) > 200 else ""
            table_rows.append([
                hit["source"],
                str(hit["page_start"]),
                f"{hit['score']:.3f}",
                text[:200].replace("\n", " ") + suffix,
            ])

        # Longer excerpts (up to 1000 characters) for the snippets panel.
        excerpts = [
            f"**{hit['source']} p.{hit['page_start']}**\n> {hit['text'].strip()[:1000]}"
            for hit in hits
        ]
        snippets_md = "\n\n---\n\n".join(excerpts)

        return answer, table_rows, snippets_md, gr.update(visible=True)
    except Exception as e:
        import traceback
        err = f"**Error:** {e}\n```\n{traceback.format_exc()}\n```"
        return err, [], "", gr.update(visible=False)
|
| 299 |
|
| 300 |
def set_api_key(k: str):
    """Store a user-supplied key in ``os.environ["GROQ_API_KEY"]``.

    Args:
        k: The key as typed by the user; surrounding whitespace is removed.

    Returns:
        A confirmation message when a non-blank key was stored, otherwise
        ``"No key provided."``.
    """
    cleaned = k.strip() if k else ""
    if not cleaned:
        return "No key provided."
    # NOTE: os.environ is process-wide, so the key applies to every request
    # served by this process, not just the caller's browser session.
    os.environ["GROQ_API_KEY"] = cleaned
    return "๐ API key set for this session."
|
| 305 |
|
| 306 |
def download_index_zip():
    """Bundle the saved index files into a zip archive and return its path.

    Returns:
        The path of ``rag_index_bundle.zip`` (written to the current working
        directory), or ``None`` when either ``INDEX_PATH`` or ``STORE_PATH``
        does not exist.
    """
    members = (INDEX_PATH, STORE_PATH)
    if not all(os.path.exists(member) for member in members):
        return None
    zp = "rag_index_bundle.zip"
    with zipfile.ZipFile(zp, "w", zipfile.ZIP_DEFLATED) as bundle:
        # Only the two index artefacts go into the bundle.
        for member in members:
            bundle.write(member)
    return zp
|
| 314 |
|
| 315 |
+
def do_export_docx(question, answer_md, sources_rows):
    """Export the current answer and its sources table to a .docx file.

    Args:
        question: The question text shown in the document.
        answer_md: The answer text; nothing is exported when it is empty.
        sources_rows: The sources table as delivered by the UI. A list of
            rows, a pandas DataFrame, a numpy array, or a dict with a
            ``"data"`` key are all accepted.

    Returns:
        The path returned by ``export_answer_to_docx``, or ``None`` when
        there is nothing to export or the export fails.
    """
    # Normalise first: a DataFrame has no unambiguous truth value, so testing
    # it with ``not`` raises ValueError, and iterating it yields column
    # labels rather than rows.
    rows = _rows_as_lists(sources_rows)
    if not answer_md or not rows:
        return None
    try:
        return export_answer_to_docx(question, answer_md, rows)
    except Exception:
        # Still best-effort for the UI, but leave a trace in the server log
        # instead of failing silently.
        import traceback
        traceback.print_exc()
        return None


def _rows_as_lists(table):
    """Convert a table-like value into a plain list of row lists."""
    if table is None:
        return []
    if isinstance(table, dict):
        table = table.get("data") or []
    values = getattr(table, "values", None)
    if values is not None and hasattr(values, "tolist"):
        # pandas DataFrame: .values is an ndarray of the cell values
        return values.tolist()
    return [list(row) for row in table]
|
| 323 |
+
|
| 324 |
+
# ---------- Theme ----------
|
| 325 |
+
# Soft base theme with a dark hero-style background.
# NOTE: ``radius_size`` is a theme *constructor* argument, not one of the CSS
# variables accepted by ``.set()``; the block corner radius is therefore set
# through the ``block_radius`` variable instead.
theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="slate",
).set(
    body_background_fill="#0B1220",  # dark-friendly hero
    block_background_fill="#0F172A",
    block_shadow="*shadow-lg",
    block_radius="8px",
)
|
| 335 |
+
|
| 336 |
# ---------- Gradio UI ----------
|
| 337 |
+
# Layout: hero header, KPI row, API-key row, then three tabs
# (build/load index, ask questions, about). Event handlers are wired next to
# the components they use.
with gr.Blocks(title=f"{APP_NAME} | RAG over PDFs", theme=theme, css="""
#hero {
  background: radial-gradient(1200px 600px at 20% -10%, rgba(99,102,241,.25), transparent),
              radial-gradient(1000px 500px at 120% 10%, rgba(14,165,233,.20), transparent);
  border: 1px solid rgba(99,102,241,.20);
}
.kpi {text-align:center;padding:12px;border-radius:10px;border:1px solid rgba(255,255,255,.08);}
.footer {opacity:.8;}
""") as demo:
    # --- Header / Hero ---
    with gr.Group(elem_id="hero"):
        gr.Markdown(
            f"""
<div style="display:flex;align-items:center;gap:16px;">
<div style="font-size:36px">๐๐ <b>{APP_NAME}</b></div>
<div style="opacity:.9;">{TAGLINE}</div>
</div>
<p style="opacity:.85;margin-top:6px;">
Upload your papers, build an index, and ask research questions with verifiable, page-level citations.
</p>
""")

    # --- KPI row ---
    with gr.Row():
        gr.Markdown("**Meaning-aware retrieval**<br><span class='kpi'>E5 + FAISS</span>", elem_classes=["kpi"])
        gr.Markdown("**Cited answers**<br><span class='kpi'>Page-level proof</span>", elem_classes=["kpi"])
        gr.Markdown("**Runs anywhere**<br><span class='kpi'>HF Spaces or Colab</span>", elem_classes=["kpi"])

    # --- Key / Settings ---
    with gr.Row():
        api_box = gr.Textbox(label="(Optional) Set GROQ_API_KEY", type="password", placeholder="sk_...")
        set_btn = gr.Button("Set Key")
        set_out = gr.Markdown()
    set_btn.click(set_api_key, inputs=[api_box], outputs=[set_out])

    with gr.Tabs():
        # ---------------- Tab 1: Build / Load ----------------
        with gr.Tab("1) Build or Load Index"):
            gr.Markdown("Upload PDFs or a ZIP of PDFs, then click **Build Index**.")
            file_u = gr.Files(label="Upload PDFs or ZIP", file_types=[".pdf", ".zip"], type="filepath")
            with gr.Row():
                build_btn = gr.Button("Build Index", variant="primary")
                reload_btn = gr.Button("Reload Saved Index")
                download_btn = gr.Button("Download Index (.zip)")
            build_out = gr.Markdown()

            def on_build(paths, progress=gr.Progress(track_tqdm=True)):
                """Build the index and turn any failure into a Markdown message.

                ``progress`` is only declared so Gradio mirrors tqdm progress
                bars into the UI; it is not used directly.
                """
                try:
                    return build_index_from_uploads(paths)
                except Exception as e:
                    import traceback
                    return f"**Error while building index:** {e}\n\n```\n{traceback.format_exc()}\n```"

            build_btn.click(on_build, inputs=[file_u], outputs=[build_out])
            reload_btn.click(fn=reload_index, outputs=[build_out])
            zpath = gr.File(label="Index bundle", interactive=False)
            download_btn.click(fn=download_index_zip, outputs=[zpath])

        # ---------------- Tab 2: Ask ----------------
        with gr.Tab("2) Ask Questions"):
            with gr.Row():
                # Column scales must be integers; 5:7 keeps the intended
                # 1:1.4 width ratio between the two columns.
                with gr.Column(scale=5):
                    q = gr.Textbox(label="Your question", lines=3, placeholder="e.g., Compare GTAW experimental parameters with citations")
                    must = gr.Textbox(label="Must contain (comma-separated keywords)", placeholder="camera, CMOS, frame rate")
                    with gr.Accordion("Advanced settings", open=False):
                        topk = gr.Slider(1, 20, value=TOP_K_DEFAULT, step=1, label="Top-K passages")
                        model_dd = gr.Dropdown(MODEL_CHOICES, value=MODEL_CHOICES[0], label="Groq model")
                        temp = gr.Slider(0.0, 1.0, value=0.2, step=0.05, label="Temperature")
                    with gr.Row():
                        ask_btn = gr.Button("Answer", variant="primary")
                        clear_btn = gr.Button("Clear")

                    gr.Examples(
                        examples=[
                            ["List camera model, sensor type, resolution, and FPS across studies. Cite pages.", "camera, fps, resolution"],
                            ["Extract limitations and future work across the corpus, with page references.", ""],
                            ["Compare GTAW setups: current range, travel speed, torch standoff, sensors.", "GTAW, current, speed, torch"],
                            ["Summarize the main results tables with metrics and page citations.", "table, accuracy, mAP, F1"],
                        ],
                        inputs=[q, must],
                        label="Quick examples",
                    )
                with gr.Column(scale=7):
                    ans = gr.Markdown(label="Answer", show_label=False)
                    # type="array" makes the table reach handlers as a list of
                    # rows, which is what the DOCX export iterates over; the
                    # default would hand it a pandas DataFrame.
                    src = gr.Dataframe(
                        headers=["Source", "Page", "Score", "Snippet"],
                        type="array",
                        wrap=True,
                        label="Top passages",
                    )
                    with gr.Accordion("Show retrieved snippets", open=False):
                        snippets_md = gr.Markdown("")
                    with gr.Row():
                        export_btn = gr.Button("Export Answer to DOCX", visible=False)
                        exported = gr.File(label="Download answer", visible=True)

            # wire buttons
            ask_btn.click(fn=ask_rag, inputs=[q, topk, model_dd, temp, must], outputs=[ans, src, snippets_md, export_btn])
            export_btn.click(fn=do_export_docx, inputs=[q, ans, src], outputs=[exported])
            clear_btn.click(lambda: ("", [], "", gr.update(visible=False)), outputs=[ans, src, snippets_md, export_btn])

        # ---------------- Tab 3: About ----------------
        with gr.Tab("About"):
            gr.Markdown(
                """
**ScholarLens** helps researchers move from reading to results with answers grounded in the papers you upload.

- Meaning-aware retrieval (E5 + FAISS)
- Answers limited to your corpus, with page-level citations
- Optional keyword filter to stay on topic
- Runs on Hugging Face Spaces or Google Colab
- Powered by Groq models

*Privacy note:* your files stay on this Space. Only the Groq call is external.
"""
            )

# broad compatibility for Spaces
demo.queue()
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
|