Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,7 +13,7 @@ from docx.shared import Pt
|
|
| 13 |
APP_NAME = "ScholarLens"
|
| 14 |
TAGLINE = "Query your literature, get page-level proof"
|
| 15 |
|
| 16 |
-
# ---------- Config
|
| 17 |
EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
|
| 18 |
CHUNK_SIZE = 1200
|
| 19 |
CHUNK_OVERLAP = 200
|
|
@@ -23,9 +23,8 @@ MAX_CONTEXT_CHARS = 16000
|
|
| 23 |
INDEX_PATH = "rag_index.faiss"
|
| 24 |
STORE_PATH = "rag_store.pkl"
|
| 25 |
|
| 26 |
-
# You can edit the default model here. All are selectable in the UI.
|
| 27 |
MODEL_CHOICES = [
|
| 28 |
-
"llama-3.
|
| 29 |
"llama-3.1-8b-instant",
|
| 30 |
"mixtral-8x7b-32768",
|
| 31 |
]
|
|
@@ -44,7 +43,10 @@ def extract_text_from_pdf(pdf_path: str) -> List[Tuple[int, str]]:
|
|
| 44 |
if not txt.strip():
|
| 45 |
blocks = page.get_text("blocks")
|
| 46 |
if isinstance(blocks, list):
|
| 47 |
-
txt = "\n".join(
|
|
|
|
|
|
|
|
|
|
| 48 |
pages.append((i, txt or ""))
|
| 49 |
return pages
|
| 50 |
|
|
@@ -182,7 +184,7 @@ def retrieve(query: str, top_k=5, must_contain: str = ""):
|
|
| 182 |
return hits
|
| 183 |
|
| 184 |
# ---------- Groq LLM ----------
|
| 185 |
-
def groq_answer(query: str, contexts, model_name="llama-3.
|
| 186 |
try:
|
| 187 |
if not os.environ.get("GROQ_API_KEY"):
|
| 188 |
return "GROQ_API_KEY is not set. Add it in your Space secrets or the key box."
|
|
@@ -236,7 +238,6 @@ def export_answer_to_docx(question: str, answer_md: str, rows: List[List[str]])
|
|
| 236 |
doc.add_paragraph(f"Question: {question}")
|
| 237 |
|
| 238 |
doc.add_heading("Answer", level=2)
|
| 239 |
-
# Write as plain text to keep it simple in Word
|
| 240 |
for line in answer_md.splitlines():
|
| 241 |
doc.add_paragraph(line)
|
| 242 |
|
|
@@ -278,13 +279,11 @@ def ask_rag(question: str, top_k, model_name: str, temperature: float, must_cont
|
|
| 278 |
ctx = retrieve(question, top_k=int(top_k) if top_k else TOP_K_DEFAULT, must_contain=must_contain)
|
| 279 |
ans = groq_answer(question, ctx, model_name=model_name, temperature=temperature)
|
| 280 |
|
| 281 |
-
# sources table
|
| 282 |
rows = []
|
| 283 |
for c in ctx:
|
| 284 |
preview = c["text"][:200].replace("\n"," ") + ("..." if len(c["text"])>200 else "")
|
| 285 |
rows.append([c["source"], str(c["page_start"]), f"{c['score']:.3f}", preview])
|
| 286 |
|
| 287 |
-
# snippets pretty print
|
| 288 |
details = []
|
| 289 |
for c in ctx:
|
| 290 |
details.append(f"**{c['source']} p.{c['page_start']}**\n> {c['text'].strip()[:1000]}")
|
|
@@ -321,28 +320,32 @@ def do_export_docx(question, answer_md, sources_rows):
|
|
| 321 |
except Exception:
|
| 322 |
return None
|
| 323 |
|
| 324 |
-
# ---------- Theme ----------
|
| 325 |
theme = gr.themes.Soft(
|
| 326 |
primary_hue="indigo",
|
| 327 |
secondary_hue="blue",
|
| 328 |
neutral_hue="slate",
|
| 329 |
-
).set(
|
| 330 |
-
body_background_fill="#0B1220", # dark-friendly hero
|
| 331 |
-
block_background_fill="#0F172A",
|
| 332 |
-
block_shadow="*shadow-lg",
|
| 333 |
-
radius_size="8px",
|
| 334 |
)
|
| 335 |
|
| 336 |
# ---------- Gradio UI ----------
|
| 337 |
-
with gr.Blocks(
|
|
|
|
|
|
|
|
|
|
| 338 |
#hero {
|
| 339 |
background: radial-gradient(1200px 600px at 20% -10%, rgba(99,102,241,.25), transparent),
|
| 340 |
radial-gradient(1000px 500px at 120% 10%, rgba(14,165,233,.20), transparent);
|
| 341 |
border: 1px solid rgba(99,102,241,.20);
|
|
|
|
|
|
|
| 342 |
}
|
| 343 |
.kpi {text-align:center;padding:12px;border-radius:10px;border:1px solid rgba(255,255,255,.08);}
|
| 344 |
.footer {opacity:.8;}
|
| 345 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
# --- Header / Hero ---
|
| 347 |
with gr.Group(elem_id="hero"):
|
| 348 |
gr.Markdown(
|
|
@@ -396,7 +399,7 @@ Upload your papers, build an index, and ask research questions with verifiable,
|
|
| 396 |
with gr.Tab("2) Ask Questions"):
|
| 397 |
with gr.Row():
|
| 398 |
with gr.Column(scale=1):
|
| 399 |
-
q = gr.Textbox(label="Your question", lines=3, placeholder="e.g., Compare GTAW
|
| 400 |
must = gr.Textbox(label="Must contain (comma-separated keywords)", placeholder="camera, CMOS, frame rate")
|
| 401 |
with gr.Accordion("Advanced settings", open=False):
|
| 402 |
topk = gr.Slider(1, 20, value=TOP_K_DEFAULT, step=1, label="Top-K passages")
|
|
@@ -446,7 +449,7 @@ Upload your papers, build an index, and ask research questions with verifiable,
|
|
| 446 |
"""
|
| 447 |
)
|
| 448 |
|
| 449 |
-
#
|
| 450 |
demo.queue()
|
| 451 |
if __name__ == "__main__":
|
| 452 |
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
|
|
|
|
| 13 |
APP_NAME = "ScholarLens"
|
| 14 |
TAGLINE = "Query your literature, get page-level proof"
|
| 15 |
|
| 16 |
+
# ---------- Config ----------
|
| 17 |
EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
|
| 18 |
CHUNK_SIZE = 1200
|
| 19 |
CHUNK_OVERLAP = 200
|
|
|
|
| 23 |
INDEX_PATH = "rag_index.faiss"
|
| 24 |
STORE_PATH = "rag_store.pkl"
|
| 25 |
|
|
|
|
| 26 |
MODEL_CHOICES = [
|
| 27 |
+
"llama-3.1-70b-versatile",
|
| 28 |
"llama-3.1-8b-instant",
|
| 29 |
"mixtral-8x7b-32768",
|
| 30 |
]
|
|
|
|
| 43 |
if not txt.strip():
|
| 44 |
blocks = page.get_text("blocks")
|
| 45 |
if isinstance(blocks, list):
|
| 46 |
+
txt = "\n".join(
|
| 47 |
+
b[4] for b in blocks
|
| 48 |
+
if isinstance(b, (list, tuple)) and len(b) > 4
|
| 49 |
+
)
|
| 50 |
pages.append((i, txt or ""))
|
| 51 |
return pages
|
| 52 |
|
|
|
|
| 184 |
return hits
|
| 185 |
|
| 186 |
# ---------- Groq LLM ----------
|
| 187 |
+
def groq_answer(query: str, contexts, model_name="llama-3.1-70b-versatile", temperature=0.2, max_tokens=1000):
|
| 188 |
try:
|
| 189 |
if not os.environ.get("GROQ_API_KEY"):
|
| 190 |
return "GROQ_API_KEY is not set. Add it in your Space secrets or the key box."
|
|
|
|
| 238 |
doc.add_paragraph(f"Question: {question}")
|
| 239 |
|
| 240 |
doc.add_heading("Answer", level=2)
|
|
|
|
| 241 |
for line in answer_md.splitlines():
|
| 242 |
doc.add_paragraph(line)
|
| 243 |
|
|
|
|
| 279 |
ctx = retrieve(question, top_k=int(top_k) if top_k else TOP_K_DEFAULT, must_contain=must_contain)
|
| 280 |
ans = groq_answer(question, ctx, model_name=model_name, temperature=temperature)
|
| 281 |
|
|
|
|
| 282 |
rows = []
|
| 283 |
for c in ctx:
|
| 284 |
preview = c["text"][:200].replace("\n"," ") + ("..." if len(c["text"])>200 else "")
|
| 285 |
rows.append([c["source"], str(c["page_start"]), f"{c['score']:.3f}", preview])
|
| 286 |
|
|
|
|
| 287 |
details = []
|
| 288 |
for c in ctx:
|
| 289 |
details.append(f"**{c['source']} p.{c['page_start']}**\n> {c['text'].strip()[:1000]}")
|
|
|
|
| 320 |
except Exception:
|
| 321 |
return None
|
| 322 |
|
| 323 |
+
# ---------- Theme (no .set used; styling via CSS) ----------
|
| 324 |
theme = gr.themes.Soft(
|
| 325 |
primary_hue="indigo",
|
| 326 |
secondary_hue="blue",
|
| 327 |
neutral_hue="slate",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
)
|
| 329 |
|
| 330 |
# ---------- Gradio UI ----------
|
| 331 |
+
with gr.Blocks(
|
| 332 |
+
title=f"{APP_NAME} | RAG over PDFs",
|
| 333 |
+
theme=theme,
|
| 334 |
+
css="""
|
| 335 |
#hero {
|
| 336 |
background: radial-gradient(1200px 600px at 20% -10%, rgba(99,102,241,.25), transparent),
|
| 337 |
radial-gradient(1000px 500px at 120% 10%, rgba(14,165,233,.20), transparent);
|
| 338 |
border: 1px solid rgba(99,102,241,.20);
|
| 339 |
+
border-radius: 12px;
|
| 340 |
+
padding: 14px 16px;
|
| 341 |
}
|
| 342 |
.kpi {text-align:center;padding:12px;border-radius:10px;border:1px solid rgba(255,255,255,.08);}
|
| 343 |
.footer {opacity:.8;}
|
| 344 |
+
/* Dark-friendly background */
|
| 345 |
+
body, .gradio-container { background: #0B1220 !important; }
|
| 346 |
+
.gradio-container .block, .gradio-container .tabs { background: #0F172A !important; }
|
| 347 |
+
"""
|
| 348 |
+
) as demo:
|
| 349 |
# --- Header / Hero ---
|
| 350 |
with gr.Group(elem_id="hero"):
|
| 351 |
gr.Markdown(
|
|
|
|
| 399 |
with gr.Tab("2) Ask Questions"):
|
| 400 |
with gr.Row():
|
| 401 |
with gr.Column(scale=1):
|
| 402 |
+
q = gr.Textbox(label="Your question", lines=3, placeholder="e.g., Compare GTAW parameters with citations")
|
| 403 |
must = gr.Textbox(label="Must contain (comma-separated keywords)", placeholder="camera, CMOS, frame rate")
|
| 404 |
with gr.Accordion("Advanced settings", open=False):
|
| 405 |
topk = gr.Slider(1, 20, value=TOP_K_DEFAULT, step=1, label="Top-K passages")
|
|
|
|
| 449 |
"""
|
| 450 |
)
|
| 451 |
|
| 452 |
+
# Broad compatibility for Spaces
|
| 453 |
demo.queue()
|
| 454 |
if __name__ == "__main__":
|
| 455 |
demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
|