Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,132 +10,125 @@ from docx import Document
|
|
| 10 |
from docx.shared import Pt
|
| 11 |
from string import Template
|
| 12 |
|
| 13 |
-
#
|
|
|
|
|
|
|
| 14 |
APP_NAME = "ScholarLens"
|
| 15 |
TAGLINE = "Query your literature, get page-level proof"
|
| 16 |
|
| 17 |
-
# ---------- Palette (guarantees light text on dark, dark text on light) ----------
|
| 18 |
PALETTE = {
|
| 19 |
"navy": "#083D77", # dark background
|
| 20 |
"gold": "#F2B400", # primary buttons / accents
|
| 21 |
"ice": "#FBF8F9", # off-white
|
| 22 |
"maroon": "#8B1E1E", # emphasis chips / separators
|
| 23 |
"amber": "#F5C26B", # secondary accent
|
| 24 |
-
"text_on_dark": "#EAF2FF", # light text
|
| 25 |
-
"text_on_light": "#0B1220" # dark text
|
| 26 |
}
|
| 27 |
|
| 28 |
def build_custom_css():
|
| 29 |
-
"""
|
| 30 |
tmpl = Template(r"""
|
| 31 |
-
:root
|
| 32 |
-
--navy: $navy;
|
| 33 |
-
--
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
--
|
| 37 |
-
--text-
|
| 38 |
-
--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
}
|
| 40 |
|
| 41 |
-
/* Global surfaces */
|
| 42 |
-
body, .gradio-container
|
| 43 |
-
background: var(--
|
| 44 |
-
color: var(--text-
|
| 45 |
}
|
| 46 |
|
| 47 |
-
/* Blocks
|
| 48 |
-
.gradio-container .block,
|
| 49 |
-
|
| 50 |
-
.gradio-container .tabs > .tabitem {
|
| 51 |
-
background: color-mix(in srgb, var(--navy) 80%, black 20%) !important;
|
| 52 |
color: var(--text-light) !important;
|
| 53 |
border-radius: 12px;
|
| 54 |
-
border: 1px solid
|
| 55 |
}
|
| 56 |
|
| 57 |
-
/* Hero stripe
|
| 58 |
-
#hero
|
| 59 |
-
background:
|
| 60 |
-
|
| 61 |
-
border:
|
| 62 |
-
border-radius: 14px;
|
| 63 |
-
padding: 14px 16px;
|
| 64 |
-
color: var(--text-light);
|
| 65 |
}
|
| 66 |
|
| 67 |
-
/* Links */
|
| 68 |
-
a, .prose a { color: var(--amber) !important; text-decoration: none; }
|
| 69 |
-
a:hover { text-decoration: underline; }
|
| 70 |
-
|
| 71 |
/* KPI chips */
|
| 72 |
-
.kpi
|
| 73 |
-
|
| 74 |
-
border:1px solid color-mix(in srgb, var(--ice) 35%, var(--navy) 65%);
|
| 75 |
-
background: color-mix(in srgb, var(--navy) 75%, black 25%);
|
| 76 |
-
color: var(--text-light);
|
| 77 |
-
}
|
| 78 |
|
| 79 |
/* Buttons */
|
| 80 |
-
button, .
|
| 81 |
-
.gr-button, button
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
border: 1px solid color-mix(in srgb, var(--gold) 70%, black 10%) !important;
|
| 86 |
}
|
| 87 |
-
.gr-button-secondary {
|
| 88 |
-
background:
|
| 89 |
-
color: var(--text-
|
|
|
|
| 90 |
}
|
| 91 |
|
| 92 |
/* Inputs */
|
| 93 |
-
input, textarea, .gr-textbox, .gr-text-area, .gr-
|
| 94 |
-
background:
|
| 95 |
color: var(--text-light) !important;
|
| 96 |
-
border: 1px solid
|
| 97 |
border-radius: 10px !important;
|
| 98 |
}
|
| 99 |
-
input::placeholder, textarea::placeholder
|
| 100 |
-
color: color-mix(in srgb, var(--text-light) 60%, transparent) !important;
|
| 101 |
-
}
|
| 102 |
|
| 103 |
-
/*
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
}
|
| 107 |
|
| 108 |
-
/* Dataframe
|
| 109 |
-
.dataframe, table, .table, .gr-dataframe *
|
| 110 |
-
.dataframe th
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
.
|
| 115 |
-
border
|
| 116 |
}
|
| 117 |
|
| 118 |
-
/*
|
| 119 |
-
.
|
| 120 |
-
|
| 121 |
-
border: 1px solid color-mix(in srgb, var(--ice) 20%, var(--navy) 80%) !important;
|
| 122 |
-
border-radius: 10px !important;
|
| 123 |
}
|
| 124 |
|
| 125 |
-
/*
|
| 126 |
-
|
| 127 |
""")
|
| 128 |
return tmpl.substitute(
|
| 129 |
-
navy=PALETTE["navy"],
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
maroon=PALETTE["maroon"],
|
| 133 |
-
amber=PALETTE["amber"],
|
| 134 |
-
text_dark=PALETTE["text_on_light"],
|
| 135 |
-
text_light=PALETTE["text_on_dark"],
|
| 136 |
)
|
| 137 |
|
| 138 |
-
#
|
|
|
|
|
|
|
| 139 |
EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
|
| 140 |
CHUNK_SIZE = 1200
|
| 141 |
CHUNK_OVERLAP = 200
|
|
@@ -156,7 +149,9 @@ embedder = None
|
|
| 156 |
faiss_index = None
|
| 157 |
docstore: List[Dict[str, Any]] = []
|
| 158 |
|
| 159 |
-
#
|
|
|
|
|
|
|
| 160 |
def extract_text_from_pdf(pdf_path: str) -> List[Tuple[int, str]]:
|
| 161 |
pages = []
|
| 162 |
with fitz.open(pdf_path) as doc:
|
|
@@ -165,10 +160,7 @@ def extract_text_from_pdf(pdf_path: str) -> List[Tuple[int, str]]:
|
|
| 165 |
if not txt.strip():
|
| 166 |
blocks = page.get_text("blocks")
|
| 167 |
if isinstance(blocks, list):
|
| 168 |
-
txt = "\n".join(
|
| 169 |
-
b[4] for b in blocks
|
| 170 |
-
if isinstance(b, (list, tuple)) and len(b) > 4
|
| 171 |
-
)
|
| 172 |
pages.append((i, txt or ""))
|
| 173 |
return pages
|
| 174 |
|
|
@@ -183,7 +175,9 @@ def chunk_text(text: str, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP) -> List[
|
|
| 183 |
start = max(end - overlap, start + 1)
|
| 184 |
return out
|
| 185 |
|
| 186 |
-
#
|
|
|
|
|
|
|
| 187 |
def load_embedder():
|
| 188 |
global embedder
|
| 189 |
if embedder is None:
|
|
@@ -226,7 +220,9 @@ def load_index() -> bool:
|
|
| 226 |
return True
|
| 227 |
return False
|
| 228 |
|
| 229 |
-
#
|
|
|
|
|
|
|
| 230 |
def _collect_pdf_paths(upload_paths: List[str]) -> List[str]:
|
| 231 |
"""Accept PDFs and ZIPs of PDFs."""
|
| 232 |
if not upload_paths:
|
|
@@ -259,10 +255,8 @@ def ingest_pdfs(paths: List[str]) -> Tuple[Any, List[Dict[str, Any]]]:
|
|
| 259 |
continue
|
| 260 |
for ci, ch in enumerate(chunk_text(ptxt)):
|
| 261 |
entries.append({
|
| 262 |
-
"text": ch,
|
| 263 |
-
"
|
| 264 |
-
"page_start": pno,
|
| 265 |
-
"page_end": pno,
|
| 266 |
"chunk_id": f"{base}::p{pno}::c{ci}",
|
| 267 |
})
|
| 268 |
except Exception as e:
|
|
@@ -274,7 +268,9 @@ def ingest_pdfs(paths: List[str]) -> Tuple[Any, List[Dict[str, Any]]]:
|
|
| 274 |
index = build_faiss(embs)
|
| 275 |
return index, entries
|
| 276 |
|
| 277 |
-
#
|
|
|
|
|
|
|
| 278 |
def retrieve(query: str, top_k=5, must_contain: str = ""):
|
| 279 |
global faiss_index, docstore
|
| 280 |
if faiss_index is None or not docstore:
|
|
@@ -304,7 +300,9 @@ def retrieve(query: str, top_k=5, must_contain: str = ""):
|
|
| 304 |
hits.append(item)
|
| 305 |
return hits
|
| 306 |
|
| 307 |
-
#
|
|
|
|
|
|
|
| 308 |
def groq_answer(query: str, contexts, model_name="llama-3.3-70b-versatile", temperature=0.2, max_tokens=1000):
|
| 309 |
try:
|
| 310 |
if not os.environ.get("GROQ_API_KEY"):
|
|
@@ -331,9 +329,7 @@ def groq_answer(query: str, contexts, model_name="llama-3.3-70b-versatile", temp
|
|
| 331 |
)
|
| 332 |
|
| 333 |
resp = client.chat.completions.create(
|
| 334 |
-
model=model_name,
|
| 335 |
-
temperature=float(temperature),
|
| 336 |
-
max_tokens=int(max_tokens),
|
| 337 |
messages=[{"role":"system","content":system_prompt},{"role":"user","content":user_prompt}],
|
| 338 |
)
|
| 339 |
return resp.choices[0].message.content.strip()
|
|
@@ -341,12 +337,13 @@ def groq_answer(query: str, contexts, model_name="llama-3.3-70b-versatile", temp
|
|
| 341 |
import traceback
|
| 342 |
return f"Groq API error: {e}\n```\n{traceback.format_exc()}\n```"
|
| 343 |
|
| 344 |
-
#
|
|
|
|
|
|
|
| 345 |
def export_answer_to_docx(question: str, answer_md: str, rows: List[List[str]]) -> str:
|
| 346 |
-
"""Save Q&A with sources table to a .docx and return path (rows = [Source, Page, Score, Snippet])."""
|
| 347 |
doc = Document()
|
| 348 |
-
styles = doc.styles
|
| 349 |
try:
|
|
|
|
| 350 |
styles['Normal'].font.name = 'Calibri'
|
| 351 |
styles['Normal'].font.size = Pt(11)
|
| 352 |
except Exception:
|
|
@@ -362,10 +359,7 @@ def export_answer_to_docx(question: str, answer_md: str, rows: List[List[str]])
|
|
| 362 |
doc.add_heading("References (Top Passages)", level=2)
|
| 363 |
table = doc.add_table(rows=1, cols=4)
|
| 364 |
hdr = table.rows[0].cells
|
| 365 |
-
hdr[0].text = "Source"
|
| 366 |
-
hdr[1].text = "Page"
|
| 367 |
-
hdr[2].text = "Score"
|
| 368 |
-
hdr[3].text = "Snippet"
|
| 369 |
for r in rows:
|
| 370 |
row = table.add_row().cells
|
| 371 |
for i, val in enumerate(r):
|
|
@@ -375,7 +369,9 @@ def export_answer_to_docx(question: str, answer_md: str, rows: List[List[str]])
|
|
| 375 |
doc.save(path)
|
| 376 |
return path
|
| 377 |
|
| 378 |
-
#
|
|
|
|
|
|
|
| 379 |
def build_index_from_uploads(paths: List[str]) -> str:
|
| 380 |
global faiss_index, docstore
|
| 381 |
pdfs = _collect_pdf_paths(paths)
|
|
@@ -433,40 +429,34 @@ def do_export_docx(question, answer_md, sources_rows):
|
|
| 433 |
if not answer_md or not sources_rows:
|
| 434 |
return None
|
| 435 |
try:
|
| 436 |
-
|
| 437 |
-
return path
|
| 438 |
except Exception:
|
| 439 |
return None
|
| 440 |
|
| 441 |
-
#
|
|
|
|
|
|
|
| 442 |
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue", neutral_hue="slate")
|
| 443 |
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
title=f"{APP_NAME} | RAG over PDFs",
|
| 447 |
-
theme=theme,
|
| 448 |
-
css=build_custom_css()
|
| 449 |
-
) as demo:
|
| 450 |
-
# --- Header / Hero ---
|
| 451 |
with gr.Group(elem_id="hero"):
|
| 452 |
-
gr.Markdown(
|
| 453 |
-
f"""
|
| 454 |
<div style="display:flex;align-items:center;gap:16px;">
|
| 455 |
<div style="font-size:36px">📚🔎 <b>{APP_NAME}</b></div>
|
| 456 |
<div style="opacity:.9;">{TAGLINE}</div>
|
| 457 |
</div>
|
| 458 |
<p style="opacity:.85;margin-top:6px;">
|
| 459 |
Upload your papers, build an index, and ask research questions with verifiable, page-level citations.
|
| 460 |
-
</p>
|
| 461 |
-
""")
|
| 462 |
|
| 463 |
-
#
|
| 464 |
with gr.Row():
|
| 465 |
gr.Markdown("**Meaning-aware retrieval**<br><span class='kpi'>E5 + FAISS</span>", elem_classes=["kpi"])
|
| 466 |
gr.Markdown("**Cited answers**<br><span class='kpi'>Page-level proof</span>", elem_classes=["kpi"])
|
| 467 |
gr.Markdown("**Runs anywhere**<br><span class='kpi'>HF Spaces or Colab</span>", elem_classes=["kpi"])
|
| 468 |
|
| 469 |
-
#
|
| 470 |
with gr.Row():
|
| 471 |
api_box = gr.Textbox(label="(Optional) Set GROQ_API_KEY", type="password", placeholder="sk_...")
|
| 472 |
set_btn = gr.Button("Set Key")
|
|
@@ -474,7 +464,7 @@ Upload your papers, build an index, and ask research questions with verifiable,
|
|
| 474 |
set_btn.click(set_api_key, inputs=[api_box], outputs=[set_out])
|
| 475 |
|
| 476 |
with gr.Tabs():
|
| 477 |
-
#
|
| 478 |
with gr.Tab("1) Build or Load Index"):
|
| 479 |
gr.Markdown("Upload PDFs or a ZIP of PDFs, then click **Build Index**.")
|
| 480 |
file_u = gr.Files(label="Upload PDFs or ZIP", file_types=[".pdf", ".zip"], type="filepath")
|
|
@@ -496,7 +486,7 @@ Upload your papers, build an index, and ask research questions with verifiable,
|
|
| 496 |
zpath = gr.File(label="Index bundle", interactive=False)
|
| 497 |
download_btn.click(fn=download_index_zip, outputs=[zpath])
|
| 498 |
|
| 499 |
-
#
|
| 500 |
with gr.Tab("2) Ask Questions"):
|
| 501 |
with gr.Row():
|
| 502 |
with gr.Column(scale=1):
|
|
@@ -515,7 +505,7 @@ Upload your papers, build an index, and ask research questions with verifiable,
|
|
| 515 |
["List camera model, sensor type, resolution, and FPS across studies. Cite pages.", "camera, fps, resolution"],
|
| 516 |
["Extract limitations and future work across the corpus, with page references.", ""],
|
| 517 |
["Compare GTAW setups: current range, travel speed, torch standoff, sensors.", "GTAW, current, speed, torch"],
|
| 518 |
-
["Summarize
|
| 519 |
],
|
| 520 |
inputs=[q, must],
|
| 521 |
label="Quick examples",
|
|
@@ -529,15 +519,13 @@ Upload your papers, build an index, and ask research questions with verifiable,
|
|
| 529 |
export_btn = gr.Button("Export Answer to DOCX", visible=False)
|
| 530 |
exported = gr.File(label="Download answer", visible=True)
|
| 531 |
|
| 532 |
-
# wire buttons
|
| 533 |
ask_btn.click(fn=ask_rag, inputs=[q, topk, model_dd, temp, must], outputs=[ans, src, snippets_md, export_btn])
|
| 534 |
export_btn.click(fn=do_export_docx, inputs=[q, ans, src], outputs=[exported])
|
| 535 |
clear_btn.click(lambda: ("", [], "", gr.update(visible=False)), outputs=[ans, src, snippets_md, export_btn])
|
| 536 |
|
| 537 |
-
#
|
| 538 |
with gr.Tab("About"):
|
| 539 |
-
gr.Markdown(
|
| 540 |
-
"""
|
| 541 |
**ScholarLens** helps researchers move from reading to results with answers grounded in the papers you upload.
|
| 542 |
|
| 543 |
- Meaning-aware retrieval (E5 + FAISS)
|
|
@@ -547,10 +535,9 @@ Upload your papers, build an index, and ask research questions with verifiable,
|
|
| 547 |
- Powered by Groq models
|
| 548 |
|
| 549 |
*Privacy note:* your files stay on this Space. Only the Groq call is external.
|
| 550 |
-
|
| 551 |
-
)
|
| 552 |
|
| 553 |
-
#
|
| 554 |
demo.queue()
|
| 555 |
if __name__ == "__main__":
|
| 556 |
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
|
|
|
|
| 10 |
from docx.shared import Pt
|
| 11 |
from string import Template
|
| 12 |
|
| 13 |
+
# =========================
|
| 14 |
+
# Branding & Palette
|
| 15 |
+
# =========================
|
| 16 |
APP_NAME = "ScholarLens"
|
| 17 |
TAGLINE = "Query your literature, get page-level proof"
|
| 18 |
|
|
|
|
| 19 |
PALETTE = {
|
| 20 |
"navy": "#083D77", # dark background
|
| 21 |
"gold": "#F2B400", # primary buttons / accents
|
| 22 |
"ice": "#FBF8F9", # off-white
|
| 23 |
"maroon": "#8B1E1E", # emphasis chips / separators
|
| 24 |
"amber": "#F5C26B", # secondary accent
|
| 25 |
+
"text_on_dark": "#EAF2FF", # light text on dark
|
| 26 |
+
"text_on_light": "#0B1220" # dark text on light (gold/amber)
|
| 27 |
}
|
| 28 |
|
| 29 |
def build_custom_css(palette=None):
    """Return the custom CSS for the dark UI (light text, gold primary button).

    The stylesheet is rendered with ``string.Template`` (``$name``
    placeholders), so the literal CSS braces need no escaping.

    Translucent surfaces are emitted as ``rgba(r,g,b,alpha)`` values derived
    from the palette, so the palette is the single source of truth for every
    colour that originates from it.

    Args:
        palette: Optional mapping with the same keys as ``PALETTE``
            ("navy", "gold", "ice", "maroon", "amber", "text_on_dark",
            "text_on_light"), each a "#RRGGBB" hex colour. When ``None``
            (the default) the module-level ``PALETTE`` is used.

    Returns:
        The CSS text with every placeholder substituted.

    Raises:
        KeyError: if ``palette`` lacks one of the keys listed above.
        ValueError: if navy, gold or text_on_dark is not a "#RRGGBB" value
            (these three are converted to RGB triplets).
    """
    colors = PALETTE if palette is None else palette

    def _rgb(hex_color):
        """Convert "#RRGGBB" into the "r,g,b" triplet used inside rgba()."""
        digits = hex_color.lstrip("#")
        if len(digits) != 6:
            raise ValueError(f"expected a #RRGGBB colour, got {hex_color!r}")
        return ",".join(str(int(digits[i:i + 2], 16)) for i in (0, 2, 4))

    tmpl = Template(r"""
    :root{
      --navy: $navy; --gold: $gold; --ice: $ice; --maroon: $maroon; --amber: $amber;
      --text-dark: $text_dark; --text-light: $text_light;

      /* Gradio tokens (force our palette) */
      --body-background-fill: var(--navy);
      --body-text-color: var(--text-light);
      --block-background-fill: rgba($navy_rgb,0.82);
      --block-title-text-color: var(--text-light);
      --border-color-primary: rgba(255,255,255,0.12);

      --button-primary-background-fill: var(--gold);
      --button-primary-text-color: var(--text-dark);
      --button-primary-border-color: #c89200;

      --button-secondary-background-fill: var(--amber);
      --button-secondary-text-color: var(--text-dark);
      --button-secondary-border-color: #caa157;

      --link-text-color: var(--amber);
    }

    /* Global surfaces & text */
    body, .gradio-container{
      background: var(--body-background-fill) !important;
      color: var(--body-text-color) !important;
    }

    /* Blocks / tabs */
    .gradio-container .block, .gradio-container .tabs, .gradio-container .tabs>.tabitem{
      background: var(--block-background-fill) !important;
      color: var(--text-light) !important;
      border-radius: 12px;
      border: 1px solid var(--border-color-primary);
    }

    /* Hero stripe using your palette */
    #hero{
      background: linear-gradient(90deg, var(--navy) 0%, var(--gold) 25%, var(--ice) 45%, var(--maroon) 65%, var(--amber) 85%, transparent 100%);
      border: 1px solid rgba($gold_rgb,0.6);
      border-radius: 14px; padding: 14px 16px; color: var(--text-light);
    }

    /* KPI chips */
    .kpi{ text-align:center; padding:12px; border-radius:10px;
      border:1px solid rgba(255,255,255,.14); background: rgba($navy_rgb,0.65); color: var(--text-light);}

    /* Buttons */
    .gradio-container .gr-button, .gradio-container button{ border-radius:10px !important; font-weight:600 !important; }
    .gradio-container .gr-button-primary, .gradio-container button.primary{
      background: var(--button-primary-background-fill) !important;
      color: var(--button-primary-text-color) !important;
      border: 1px solid var(--button-primary-border-color) !important;
    }
    .gradio-container .gr-button-secondary, .gradio-container button.secondary{
      background: var(--button-secondary-background-fill) !important;
      color: var(--button-secondary-text-color) !important;
      border: 1px solid var(--button-secondary-border-color) !important;
    }

    /* Inputs */
    input, textarea, select, .gr-textbox, .gr-text-area, .gr-dropdown, .gr-file, .gr-slider{
      background: rgba($navy_rgb,0.55) !important;
      color: var(--text-light) !important;
      border: 1px solid rgba(255,255,255,0.18) !important;
      border-radius: 10px !important;
    }
    input::placeholder, textarea::placeholder{ color: rgba($text_light_rgb,0.65) !important; }

    /* Links & text */
    a, .prose a{ color: var(--amber) !important; text-decoration:none; } a:hover{ text-decoration:underline; }
    label, .label, .prose h1, .prose h2, .prose h3, .prose p, .markdown-body{ color: var(--text-light) !important; }

    /* Dataframe */
    .dataframe, table, .table, .gr-dataframe *{ color: var(--text-light) !important; background: transparent !important; }
    .dataframe th{ background: rgba($navy_rgb,0.72) !important; border-bottom: 1px solid rgba(255,255,255,0.18) !important; }
    .dataframe td{ border-bottom: 1px solid rgba(255,255,255,0.12) !important; }

    /* Accordion */
    .accordion, .gr-accordion{
      background: rgba($navy_rgb,0.65) !important; border: 1px solid rgba(255,255,255,0.14) !important; border-radius: 10px !important;
    }

    /* Tabs active underline color */
    .gradio-container .tabs .tab-nav button.selected{
      box-shadow: inset 0 -3px 0 0 var(--gold) !important; color: var(--text-light) !important;
    }

    /* Optional: center content */
    .gradio-container{ max-width: 1120px; margin: 0 auto; }
    """)
    return tmpl.substitute(
        navy=colors["navy"],
        gold=colors["gold"],
        ice=colors["ice"],
        maroon=colors["maroon"],
        amber=colors["amber"],
        # "text_dark" is the dark ink used on light surfaces, and vice versa.
        text_dark=colors["text_on_light"],
        text_light=colors["text_on_dark"],
        # RGB triplets for the translucent rgba() surfaces.
        navy_rgb=_rgb(colors["navy"]),
        gold_rgb=_rgb(colors["gold"]),
        text_light_rgb=_rgb(colors["text_on_dark"]),
    )
|
| 128 |
|
| 129 |
+
# =========================
|
| 130 |
+
# Engine config
|
| 131 |
+
# =========================
|
| 132 |
EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
|
| 133 |
CHUNK_SIZE = 1200
|
| 134 |
CHUNK_OVERLAP = 200
|
|
|
|
| 149 |
faiss_index = None
|
| 150 |
docstore: List[Dict[str, Any]] = []
|
| 151 |
|
| 152 |
+
# =========================
|
| 153 |
+
# PDF utils
|
| 154 |
+
# =========================
|
| 155 |
def extract_text_from_pdf(pdf_path: str) -> List[Tuple[int, str]]:
|
| 156 |
pages = []
|
| 157 |
with fitz.open(pdf_path) as doc:
|
|
|
|
| 160 |
if not txt.strip():
|
| 161 |
blocks = page.get_text("blocks")
|
| 162 |
if isinstance(blocks, list):
|
| 163 |
+
txt = "\n".join(b[4] for b in blocks if isinstance(b, (list, tuple)) and len(b) > 4)
|
|
|
|
|
|
|
|
|
|
| 164 |
pages.append((i, txt or ""))
|
| 165 |
return pages
|
| 166 |
|
|
|
|
| 175 |
start = max(end - overlap, start + 1)
|
| 176 |
return out
|
| 177 |
|
| 178 |
+
# =========================
|
| 179 |
+
# Embeddings / FAISS
|
| 180 |
+
# =========================
|
| 181 |
def load_embedder():
|
| 182 |
global embedder
|
| 183 |
if embedder is None:
|
|
|
|
| 220 |
return True
|
| 221 |
return False
|
| 222 |
|
| 223 |
+
# =========================
|
| 224 |
+
# Ingest
|
| 225 |
+
# =========================
|
| 226 |
def _collect_pdf_paths(upload_paths: List[str]) -> List[str]:
|
| 227 |
"""Accept PDFs and ZIPs of PDFs."""
|
| 228 |
if not upload_paths:
|
|
|
|
| 255 |
continue
|
| 256 |
for ci, ch in enumerate(chunk_text(ptxt)):
|
| 257 |
entries.append({
|
| 258 |
+
"text": ch, "source": base,
|
| 259 |
+
"page_start": pno, "page_end": pno,
|
|
|
|
|
|
|
| 260 |
"chunk_id": f"{base}::p{pno}::c{ci}",
|
| 261 |
})
|
| 262 |
except Exception as e:
|
|
|
|
| 268 |
index = build_faiss(embs)
|
| 269 |
return index, entries
|
| 270 |
|
| 271 |
+
# =========================
|
| 272 |
+
# Retrieval
|
| 273 |
+
# =========================
|
| 274 |
def retrieve(query: str, top_k=5, must_contain: str = ""):
|
| 275 |
global faiss_index, docstore
|
| 276 |
if faiss_index is None or not docstore:
|
|
|
|
| 300 |
hits.append(item)
|
| 301 |
return hits
|
| 302 |
|
| 303 |
+
# =========================
|
| 304 |
+
# Groq LLM
|
| 305 |
+
# =========================
|
| 306 |
def groq_answer(query: str, contexts, model_name="llama-3.3-70b-versatile", temperature=0.2, max_tokens=1000):
|
| 307 |
try:
|
| 308 |
if not os.environ.get("GROQ_API_KEY"):
|
|
|
|
| 329 |
)
|
| 330 |
|
| 331 |
resp = client.chat.completions.create(
|
| 332 |
+
model=model_name, temperature=float(temperature), max_tokens=int(max_tokens),
|
|
|
|
|
|
|
| 333 |
messages=[{"role":"system","content":system_prompt},{"role":"user","content":user_prompt}],
|
| 334 |
)
|
| 335 |
return resp.choices[0].message.content.strip()
|
|
|
|
| 337 |
import traceback
|
| 338 |
return f"Groq API error: {e}\n```\n{traceback.format_exc()}\n```"
|
| 339 |
|
| 340 |
+
# =========================
|
| 341 |
+
# Export helpers
|
| 342 |
+
# =========================
|
| 343 |
def export_answer_to_docx(question: str, answer_md: str, rows: List[List[str]]) -> str:
|
|
|
|
| 344 |
doc = Document()
|
|
|
|
| 345 |
try:
|
| 346 |
+
styles = doc.styles
|
| 347 |
styles['Normal'].font.name = 'Calibri'
|
| 348 |
styles['Normal'].font.size = Pt(11)
|
| 349 |
except Exception:
|
|
|
|
| 359 |
doc.add_heading("References (Top Passages)", level=2)
|
| 360 |
table = doc.add_table(rows=1, cols=4)
|
| 361 |
hdr = table.rows[0].cells
|
| 362 |
+
hdr[0].text = "Source"; hdr[1].text = "Page"; hdr[2].text = "Score"; hdr[3].text = "Snippet"
|
|
|
|
|
|
|
|
|
|
| 363 |
for r in rows:
|
| 364 |
row = table.add_row().cells
|
| 365 |
for i, val in enumerate(r):
|
|
|
|
| 369 |
doc.save(path)
|
| 370 |
return path
|
| 371 |
|
| 372 |
+
# =========================
|
| 373 |
+
# UI helpers
|
| 374 |
+
# =========================
|
| 375 |
def build_index_from_uploads(paths: List[str]) -> str:
|
| 376 |
global faiss_index, docstore
|
| 377 |
pdfs = _collect_pdf_paths(paths)
|
|
|
|
| 429 |
if not answer_md or not sources_rows:
|
| 430 |
return None
|
| 431 |
try:
|
| 432 |
+
return export_answer_to_docx(question, answer_md, sources_rows)
|
|
|
|
| 433 |
except Exception:
|
| 434 |
return None
|
| 435 |
|
| 436 |
+
# =========================
|
| 437 |
+
# UI
|
| 438 |
+
# =========================
|
| 439 |
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="blue", neutral_hue="slate")
|
| 440 |
|
| 441 |
+
with gr.Blocks(title=f"{APP_NAME} | RAG over PDFs", theme=theme, css=build_custom_css()) as demo:
|
| 442 |
+
# Header / Hero
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
with gr.Group(elem_id="hero"):
|
| 444 |
+
gr.Markdown(f"""
|
|
|
|
| 445 |
<div style="display:flex;align-items:center;gap:16px;">
|
| 446 |
<div style="font-size:36px">📚🔎 <b>{APP_NAME}</b></div>
|
| 447 |
<div style="opacity:.9;">{TAGLINE}</div>
|
| 448 |
</div>
|
| 449 |
<p style="opacity:.85;margin-top:6px;">
|
| 450 |
Upload your papers, build an index, and ask research questions with verifiable, page-level citations.
|
| 451 |
+
</p>""")
|
|
|
|
| 452 |
|
| 453 |
+
# KPI Row
|
| 454 |
with gr.Row():
|
| 455 |
gr.Markdown("**Meaning-aware retrieval**<br><span class='kpi'>E5 + FAISS</span>", elem_classes=["kpi"])
|
| 456 |
gr.Markdown("**Cited answers**<br><span class='kpi'>Page-level proof</span>", elem_classes=["kpi"])
|
| 457 |
gr.Markdown("**Runs anywhere**<br><span class='kpi'>HF Spaces or Colab</span>", elem_classes=["kpi"])
|
| 458 |
|
| 459 |
+
# API Key
|
| 460 |
with gr.Row():
|
| 461 |
api_box = gr.Textbox(label="(Optional) Set GROQ_API_KEY", type="password", placeholder="sk_...")
|
| 462 |
set_btn = gr.Button("Set Key")
|
|
|
|
| 464 |
set_btn.click(set_api_key, inputs=[api_box], outputs=[set_out])
|
| 465 |
|
| 466 |
with gr.Tabs():
|
| 467 |
+
# Tab 1: Build / Load
|
| 468 |
with gr.Tab("1) Build or Load Index"):
|
| 469 |
gr.Markdown("Upload PDFs or a ZIP of PDFs, then click **Build Index**.")
|
| 470 |
file_u = gr.Files(label="Upload PDFs or ZIP", file_types=[".pdf", ".zip"], type="filepath")
|
|
|
|
| 486 |
zpath = gr.File(label="Index bundle", interactive=False)
|
| 487 |
download_btn.click(fn=download_index_zip, outputs=[zpath])
|
| 488 |
|
| 489 |
+
# Tab 2: Ask
|
| 490 |
with gr.Tab("2) Ask Questions"):
|
| 491 |
with gr.Row():
|
| 492 |
with gr.Column(scale=1):
|
|
|
|
| 505 |
["List camera model, sensor type, resolution, and FPS across studies. Cite pages.", "camera, fps, resolution"],
|
| 506 |
["Extract limitations and future work across the corpus, with page references.", ""],
|
| 507 |
["Compare GTAW setups: current range, travel speed, torch standoff, sensors.", "GTAW, current, speed, torch"],
|
| 508 |
+
["Summarize results tables with metrics and page citations.", "table, accuracy, mAP, F1"]
|
| 509 |
],
|
| 510 |
inputs=[q, must],
|
| 511 |
label="Quick examples",
|
|
|
|
| 519 |
export_btn = gr.Button("Export Answer to DOCX", visible=False)
|
| 520 |
exported = gr.File(label="Download answer", visible=True)
|
| 521 |
|
|
|
|
| 522 |
ask_btn.click(fn=ask_rag, inputs=[q, topk, model_dd, temp, must], outputs=[ans, src, snippets_md, export_btn])
|
| 523 |
export_btn.click(fn=do_export_docx, inputs=[q, ans, src], outputs=[exported])
|
| 524 |
clear_btn.click(lambda: ("", [], "", gr.update(visible=False)), outputs=[ans, src, snippets_md, export_btn])
|
| 525 |
|
| 526 |
+
# Tab 3: About
|
| 527 |
with gr.Tab("About"):
|
| 528 |
+
gr.Markdown("""
|
|
|
|
| 529 |
**ScholarLens** helps researchers move from reading to results with answers grounded in the papers you upload.
|
| 530 |
|
| 531 |
- Meaning-aware retrieval (E5 + FAISS)
|
|
|
|
| 535 |
- Powered by Groq models
|
| 536 |
|
| 537 |
*Privacy note:* your files stay on this Space. Only the Groq call is external.
|
| 538 |
+
""")
|
|
|
|
| 539 |
|
| 540 |
+
# Run
|
| 541 |
demo.queue()
|
| 542 |
if __name__ == "__main__":
|
| 543 |
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
|