Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,139 +4,47 @@ import fitz
|
|
| 4 |
import re
|
| 5 |
import numpy as np
|
| 6 |
import faiss
|
| 7 |
-
import os
|
| 8 |
from sentence_transformers import SentenceTransformer
|
| 9 |
from groq import Groq
|
|
|
|
| 10 |
|
| 11 |
# =========================
|
| 12 |
# INITIALIZE MODELS
|
| 13 |
# =========================
|
| 14 |
|
| 15 |
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
|
|
|
|
| 16 |
|
| 17 |
-
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
global whisper_model
|
| 21 |
-
if whisper_model is None:
|
| 22 |
-
from faster_whisper import WhisperModel
|
| 23 |
-
whisper_model = WhisperModel("base", compute_type="int8")
|
| 24 |
-
return whisper_model
|
| 25 |
-
|
| 26 |
-
GROQ_API_KEY = os.environ.get("YOUR_GROQ_API_KEY", "")
|
| 27 |
-
if not GROQ_API_KEY:
|
| 28 |
-
raise ValueError("YOUR_GROQ_API_KEY secret is not set. Add it in HF Space Settings β Secrets.")
|
| 29 |
-
|
| 30 |
-
client = Groq(api_key=GROQ_API_KEY)
|
| 31 |
MODEL_NAME = "llama-3.3-70b-versatile"
|
| 32 |
|
|
|
|
| 33 |
sections = {}
|
| 34 |
section_texts = []
|
| 35 |
index = None
|
| 36 |
|
|
|
|
| 37 |
# =========================
|
| 38 |
# PDF FUNCTIONS
|
| 39 |
# =========================
|
| 40 |
|
| 41 |
-
def
|
| 42 |
-
"""
|
| 43 |
-
Given a DOI, try multiple strategies to find a downloadable PDF:
|
| 44 |
-
1. Unpaywall API β finds legal open-access PDFs for any DOI (no key needed)
|
| 45 |
-
2. Direct arXiv β if the DOI belongs to an arXiv paper
|
| 46 |
-
3. Europe PMC β broad biomedical / life-science coverage
|
| 47 |
-
Returns (pdf_url, paper_title).
|
| 48 |
-
"""
|
| 49 |
-
doi = doi.strip()
|
| 50 |
-
# Strip common DOI prefixes so bare DOI always works
|
| 51 |
-
for prefix in ("https://doi.org/", "http://doi.org/", "doi:", "DOI:"):
|
| 52 |
-
if doi.startswith(prefix):
|
| 53 |
-
doi = doi[len(prefix):]
|
| 54 |
-
break
|
| 55 |
-
|
| 56 |
-
title = None
|
| 57 |
-
|
| 58 |
-
# ββ Strategy 1: Unpaywall (free, ~85% OA coverage) ββββββββββββββ
|
| 59 |
try:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
)
|
| 64 |
-
if r.status_code == 200:
|
| 65 |
-
data = r.json()
|
| 66 |
-
title = data.get("title")
|
| 67 |
-
best = data.get("best_oa_location")
|
| 68 |
-
if best:
|
| 69 |
-
pdf_url = best.get("url_for_pdf") or best.get("url")
|
| 70 |
-
if pdf_url:
|
| 71 |
-
return pdf_url, title
|
| 72 |
-
for loc in data.get("oa_locations", []):
|
| 73 |
-
pdf_url = loc.get("url_for_pdf") or loc.get("url")
|
| 74 |
-
if pdf_url:
|
| 75 |
-
return pdf_url, title
|
| 76 |
-
except Exception as e:
|
| 77 |
-
print(f"Unpaywall error: {e}")
|
| 78 |
-
|
| 79 |
-
# ββ Strategy 2: arXiv DOI pattern βββββββββββββββββββββββββββββββ
|
| 80 |
-
try:
|
| 81 |
-
arxiv_match = re.search(r"arXiv[\.:](\d{4}\.\d{4,5})", doi, re.IGNORECASE)
|
| 82 |
-
if arxiv_match:
|
| 83 |
-
arxiv_id = arxiv_match.group(1)
|
| 84 |
-
return f"https://arxiv.org/pdf/{arxiv_id}.pdf", title
|
| 85 |
-
except Exception as e:
|
| 86 |
-
print(f"arXiv DOI parse error: {e}")
|
| 87 |
-
|
| 88 |
-
# ββ Strategy 3: Europe PMC βββββββββββββββββββββββββββββββββββββββ
|
| 89 |
-
try:
|
| 90 |
-
r = requests.get(
|
| 91 |
-
f"https://www.ebi.ac.uk/europepmc/webservices/rest/search"
|
| 92 |
-
f"?query=DOI:{doi}&format=json&resultType=core",
|
| 93 |
-
timeout=15,
|
| 94 |
-
)
|
| 95 |
-
if r.status_code == 200:
|
| 96 |
-
results = r.json().get("resultList", {}).get("result", [])
|
| 97 |
-
if results:
|
| 98 |
-
item = results[0]
|
| 99 |
-
title = title or item.get("title")
|
| 100 |
-
pmcid = item.get("pmcid")
|
| 101 |
-
if pmcid:
|
| 102 |
-
pdf_url = (
|
| 103 |
-
f"https://europepmc.org/backend/ptpmcrender.fcgi"
|
| 104 |
-
f"?accid={pmcid}&blobtype=pdf"
|
| 105 |
-
)
|
| 106 |
-
return pdf_url, title
|
| 107 |
-
except Exception as e:
|
| 108 |
-
print(f"Europe PMC error: {e}")
|
| 109 |
-
|
| 110 |
-
return None, title
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
def download_pdf_from_doi(doi):
|
| 114 |
-
"""Resolve DOI β PDF URL β download to /tmp. Returns (file_path, paper_title)."""
|
| 115 |
-
try:
|
| 116 |
-
pdf_url, title = doi_to_pdf_url(doi)
|
| 117 |
-
if not pdf_url:
|
| 118 |
-
return None, title
|
| 119 |
-
|
| 120 |
-
safe_name = re.sub(r"[^\w\-]", "_", doi)[:60]
|
| 121 |
-
file_path = f"/tmp/{safe_name}.pdf"
|
| 122 |
-
|
| 123 |
-
headers = {"User-Agent": "Mozilla/5.0 (ResearchAssistant/1.0)"}
|
| 124 |
-
r = requests.get(pdf_url, timeout=40, headers=headers, allow_redirects=True)
|
| 125 |
-
r.raise_for_status()
|
| 126 |
-
|
| 127 |
-
# Verify it's actually a PDF
|
| 128 |
-
if b"%PDF" not in r.content[:16]:
|
| 129 |
-
print(f"Response is not a PDF from {pdf_url}")
|
| 130 |
-
return None, title
|
| 131 |
|
|
|
|
| 132 |
with open(file_path, "wb") as f:
|
| 133 |
-
f.write(
|
| 134 |
|
| 135 |
-
return file_path
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
except Exception as e:
|
| 138 |
-
print(f"PDF download error: {e}")
|
| 139 |
-
return None, None
|
| 140 |
|
| 141 |
def extract_text_from_pdf(pdf_path):
|
| 142 |
doc = fitz.open(pdf_path)
|
|
@@ -145,641 +53,247 @@ def extract_text_from_pdf(pdf_path):
|
|
| 145 |
text += page.get_text()
|
| 146 |
return text
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
def extract_sections(text):
|
|
|
|
| 149 |
patterns = [
|
| 150 |
-
r"\n([IVX]+\.\s+[A-Z][A-Z\s]+)",
|
| 151 |
-
r"\n(\d+\.\d+\.\d+\s+[^\n]+)",
|
| 152 |
-
r"\n(\d+\.\d+\s+[^\n]+)",
|
| 153 |
-
r"\n(\d+\.\s+[^\n]+)",
|
| 154 |
-
r"\n(\d+\s+[^\n]+)",
|
| 155 |
-
r"\n([A-Z][A-Z\s]{4,})\n"
|
| 156 |
]
|
|
|
|
| 157 |
matches = []
|
| 158 |
for p in patterns:
|
| 159 |
matches.extend(list(re.finditer(p, text)))
|
|
|
|
| 160 |
matches = sorted(matches, key=lambda x: x.start())
|
|
|
|
| 161 |
extracted = {}
|
|
|
|
| 162 |
for i, match in enumerate(matches):
|
| 163 |
title = match.group(1).strip()
|
|
|
|
| 164 |
start = match.end()
|
| 165 |
-
end = matches[i
|
|
|
|
| 166 |
content = text[start:end].strip()
|
|
|
|
| 167 |
if len(content) > 4000:
|
| 168 |
content = content[:4000]
|
|
|
|
| 169 |
extracted[title] = content
|
|
|
|
|
|
|
| 170 |
abstract_match = re.search(r"Abstract(.*?)\n", text, re.DOTALL)
|
| 171 |
if abstract_match:
|
| 172 |
extracted["Abstract"] = abstract_match.group(1).strip()
|
|
|
|
| 173 |
return extracted
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
def build_vector_store(sections_dict):
|
| 176 |
global index, section_texts
|
|
|
|
| 177 |
section_texts = list(sections_dict.values())
|
|
|
|
| 178 |
if len(section_texts) == 0:
|
| 179 |
index = None
|
| 180 |
return
|
|
|
|
| 181 |
embeddings = embedding_model.encode(section_texts)
|
| 182 |
embeddings = np.array(embeddings).astype("float32")
|
|
|
|
| 183 |
dim = embeddings.shape[1]
|
| 184 |
index = faiss.IndexFlatL2(dim)
|
| 185 |
index.add(embeddings)
|
| 186 |
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
global sections
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
if pdf_path is None:
|
| 193 |
-
return gr.update(choices=[]),
|
| 194 |
-
|
| 195 |
-
"**Tips:** Make sure the paper is open-access. "
|
| 196 |
-
"Try formats like `10.1038/s41586-021-03819-2` or paste the full `https://doi.org/...` URL."
|
| 197 |
-
)
|
| 198 |
text = extract_text_from_pdf(pdf_path)
|
| 199 |
sections = extract_sections(text)
|
| 200 |
-
|
| 201 |
-
return gr.update(choices=[]), "β οΈ PDF downloaded but no sections could be detected."
|
| 202 |
build_vector_store(sections)
|
| 203 |
-
|
| 204 |
-
return gr.update(choices=list(sections.keys())),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
def summarize_section(section_title):
|
| 207 |
try:
|
| 208 |
if not sections:
|
| 209 |
-
return "
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
|
|
|
| 213 |
if not content:
|
| 214 |
-
return "
|
| 215 |
-
prompt = f"""You are a research assistant. Summarize the following section from a research paper in a clear, structured way.
|
| 216 |
|
| 217 |
-
|
| 218 |
-
β’ **Main Idea** β What is this section about?
|
| 219 |
-
β’ **Key Concepts** β What are the important terms or methods?
|
| 220 |
-
β’ **Simple Explanation** β Explain it as if to a graduate student unfamiliar with the topic
|
| 221 |
-
β’ **Why It Matters** β What is the contribution or significance?
|
| 222 |
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
Content:
|
| 226 |
{content}
|
| 227 |
"""
|
|
|
|
| 228 |
response = client.chat.completions.create(
|
| 229 |
model=MODEL_NAME,
|
| 230 |
messages=[{"role": "user", "content": prompt}],
|
| 231 |
temperature=0.3
|
| 232 |
)
|
|
|
|
| 233 |
return response.choices[0].message.content
|
|
|
|
| 234 |
except Exception as e:
|
| 235 |
-
return f"β Error:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
def rag_chat(message, history):
|
| 238 |
try:
|
| 239 |
global index
|
| 240 |
-
|
| 241 |
-
return history, ""
|
| 242 |
if index is None:
|
| 243 |
-
history
|
| 244 |
return history, ""
|
|
|
|
| 245 |
query_embedding = embedding_model.encode([message])
|
| 246 |
query_embedding = np.array(query_embedding).astype("float32")
|
|
|
|
| 247 |
D, I = index.search(query_embedding, k=3)
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
| 251 |
|
| 252 |
Context:
|
| 253 |
{retrieved}
|
| 254 |
|
| 255 |
-
Question:
|
|
|
|
| 256 |
"""
|
|
|
|
| 257 |
response = client.chat.completions.create(
|
| 258 |
model=MODEL_NAME,
|
| 259 |
messages=[{"role": "user", "content": prompt}],
|
| 260 |
temperature=0.2
|
| 261 |
)
|
|
|
|
| 262 |
answer = response.choices[0].message.content
|
| 263 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
return history, ""
|
|
|
|
| 265 |
except Exception as e:
|
| 266 |
-
history
|
| 267 |
return history, ""
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
def voice_chat(audio, history):
|
| 270 |
try:
|
| 271 |
if audio is None:
|
| 272 |
-
history = history + [[None, "β οΈ No audio received. Please record a voice message."]]
|
| 273 |
-
return history, ""
|
| 274 |
-
model = get_whisper()
|
| 275 |
-
segments, _ = model.transcribe(audio)
|
| 276 |
-
text = " ".join([seg.text for seg in segments]).strip()
|
| 277 |
-
if not text:
|
| 278 |
-
history = history + [[None, "β οΈ Could not transcribe audio. Please try again."]]
|
| 279 |
return history, ""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
return rag_chat(text, history)
|
|
|
|
| 281 |
except Exception as e:
|
| 282 |
-
history
|
| 283 |
return history, ""
|
| 284 |
|
|
|
|
| 285 |
# =========================
|
| 286 |
-
#
|
| 287 |
-
# Ethos: High-Density Functionalism meets Dark Luxury
|
| 288 |
-
# Trends: Bento Grid Β· Glassmorphism Β· Kinetic borders Β· Layered depth
|
| 289 |
# =========================
|
| 290 |
|
| 291 |
-
|
| 292 |
-
@import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Geist:wght@300;400;500;600;700&family=Geist+Mono:wght@400;500&display=swap');
|
| 293 |
-
|
| 294 |
-
/* βββββββββββ DESIGN TOKENS βββββββββββ */
|
| 295 |
-
:root {
|
| 296 |
-
--void: #07080c;
|
| 297 |
-
--bg-0: #0b0d13;
|
| 298 |
-
--bg-1: #0f1119;
|
| 299 |
-
--bg-2: #13161f;
|
| 300 |
-
--bg-3: #181c28;
|
| 301 |
-
--bg-4: #1e2333;
|
| 302 |
-
--bg-5: #242840;
|
| 303 |
-
--b-0: #1c2030;
|
| 304 |
-
--b-1: #252a3c;
|
| 305 |
-
--b-2: #303650;
|
| 306 |
-
--b-hi: #404868;
|
| 307 |
-
--gold: #d4a853;
|
| 308 |
-
--gold-lt: #e8c47a;
|
| 309 |
-
--gold-dk: #9e7535;
|
| 310 |
-
--gold-muted: rgba(212,168,83,0.08);
|
| 311 |
-
--gold-glow: rgba(212,168,83,0.16);
|
| 312 |
-
--gold-ring: 0 0 0 2.5px rgba(212,168,83,0.22);
|
| 313 |
-
--green: #3ecf8e;
|
| 314 |
-
--blue: #4d94ff;
|
| 315 |
-
--red: #f06565;
|
| 316 |
-
--amber: #f0a443;
|
| 317 |
-
--tx-1: #f0ece2;
|
| 318 |
-
--tx-2: #8a8578;
|
| 319 |
-
--tx-3: #4a4740;
|
| 320 |
-
--r-xs: 6px; --r-sm: 10px; --r: 16px; --r-lg: 22px; --r-xl: 28px;
|
| 321 |
-
--sh-sm: 0 2px 12px rgba(0,0,0,.4);
|
| 322 |
-
--sh: 0 4px 32px rgba(0,0,0,.55);
|
| 323 |
-
--sh-lg: 0 8px 64px rgba(0,0,0,.7);
|
| 324 |
-
--sh-xl: 0 16px 80px rgba(0,0,0,.8);
|
| 325 |
-
--sh-gold: 0 4px 28px rgba(212,168,83,0.2);
|
| 326 |
-
}
|
| 327 |
-
|
| 328 |
-
*, *::before, *::after { box-sizing: border-box; margin: 0; }
|
| 329 |
-
html { scroll-behavior: smooth; }
|
| 330 |
-
|
| 331 |
-
body, .gradio-container {
|
| 332 |
-
background: var(--bg-0) !important;
|
| 333 |
-
font-family: 'Geist', system-ui, sans-serif !important;
|
| 334 |
-
color: var(--tx-1) !important;
|
| 335 |
-
min-height: 100vh;
|
| 336 |
-
-webkit-font-smoothing: antialiased;
|
| 337 |
-
}
|
| 338 |
-
|
| 339 |
-
/* Dot-matrix background */
|
| 340 |
-
.gradio-container {
|
| 341 |
-
background-image: radial-gradient(circle, rgba(212,168,83,0.05) 1px, transparent 1px) !important;
|
| 342 |
-
background-size: 28px 28px !important;
|
| 343 |
-
background-attachment: fixed !important;
|
| 344 |
-
}
|
| 345 |
-
|
| 346 |
-
/* Vignette overlay */
|
| 347 |
-
.gradio-container::before {
|
| 348 |
-
content: '';
|
| 349 |
-
position: fixed; inset: 0;
|
| 350 |
-
background: radial-gradient(ellipse 80% 60% at 50% 0%, transparent 40%, rgba(7,8,12,0.75) 100%);
|
| 351 |
-
pointer-events: none; z-index: 0;
|
| 352 |
-
}
|
| 353 |
-
|
| 354 |
-
/* βββββββββββ HERO HEADER βββββββββββ */
|
| 355 |
-
#hero {
|
| 356 |
-
position: relative;
|
| 357 |
-
background: linear-gradient(145deg, #0f1220 0%, #131828 45%, #0c0f1a 100%);
|
| 358 |
-
border: 1px solid var(--b-2);
|
| 359 |
-
border-radius: var(--r-xl);
|
| 360 |
-
padding: 56px 48px 44px;
|
| 361 |
-
text-align: center;
|
| 362 |
-
overflow: hidden;
|
| 363 |
-
box-shadow: var(--sh-xl);
|
| 364 |
-
margin-bottom: 2px;
|
| 365 |
-
}
|
| 366 |
-
|
| 367 |
-
#hero::before {
|
| 368 |
-
content: '';
|
| 369 |
-
position: absolute;
|
| 370 |
-
top: -120px; left: 50%; transform: translateX(-50%);
|
| 371 |
-
width: 600px; height: 400px;
|
| 372 |
-
background: conic-gradient(from 180deg at 50% 50%, rgba(212,168,83,0) 0deg, rgba(212,168,83,0.07) 60deg, rgba(77,148,255,0.04) 120deg, rgba(212,168,83,0) 180deg, rgba(212,168,83,0.06) 240deg, rgba(77,148,255,0.03) 300deg, rgba(212,168,83,0) 360deg);
|
| 373 |
-
animation: aurora 12s linear infinite;
|
| 374 |
-
pointer-events: none; border-radius: 50%;
|
| 375 |
-
}
|
| 376 |
-
|
| 377 |
-
@keyframes aurora {
|
| 378 |
-
from { transform: translateX(-50%) rotate(0deg); }
|
| 379 |
-
to { transform: translateX(-50%) rotate(360deg); }
|
| 380 |
-
}
|
| 381 |
-
|
| 382 |
-
#hero::after {
|
| 383 |
-
content: ''; position: absolute;
|
| 384 |
-
bottom: 0; left: 8%; right: 8%; height: 1px;
|
| 385 |
-
background: linear-gradient(90deg, transparent, var(--b-2) 30%, var(--b-hi) 50%, var(--b-2) 70%, transparent);
|
| 386 |
-
}
|
| 387 |
-
|
| 388 |
-
.hero-corner { position: absolute; width: 36px; height: 36px; opacity: 0.35; }
|
| 389 |
-
.hero-corner-tl { top: 18px; left: 18px; border-top: 1.5px solid var(--gold); border-left: 1.5px solid var(--gold); border-radius: 4px 0 0 0; }
|
| 390 |
-
.hero-corner-tr { top: 18px; right: 18px; border-top: 1.5px solid var(--gold); border-right: 1.5px solid var(--gold); border-radius: 0 4px 0 0; }
|
| 391 |
-
.hero-corner-bl { bottom: 18px; left: 18px; border-bottom: 1.5px solid var(--gold); border-left: 1.5px solid var(--gold); border-radius: 0 0 0 4px; }
|
| 392 |
-
.hero-corner-br { bottom: 18px; right: 18px; border-bottom: 1.5px solid var(--gold); border-right: 1.5px solid var(--gold); border-radius: 0 0 4px 0; }
|
| 393 |
-
|
| 394 |
-
.hero-eyebrow {
|
| 395 |
-
display: inline-flex; align-items: center; gap: 8px;
|
| 396 |
-
background: rgba(212,168,83,0.07); border: 1px solid rgba(212,168,83,0.2);
|
| 397 |
-
border-radius: 99px; padding: 5px 14px;
|
| 398 |
-
font-family: 'Geist Mono', monospace; font-size: 0.67rem;
|
| 399 |
-
color: var(--gold); letter-spacing: 2px; text-transform: uppercase;
|
| 400 |
-
margin-bottom: 20px; animation: fadeDown 0.6s ease both;
|
| 401 |
-
}
|
| 402 |
-
|
| 403 |
-
.hero-eyebrow::before {
|
| 404 |
-
content: ''; width: 5px; height: 5px; background: var(--gold);
|
| 405 |
-
border-radius: 50%; box-shadow: 0 0 6px var(--gold);
|
| 406 |
-
animation: pulse-dot 2s ease-in-out infinite;
|
| 407 |
-
}
|
| 408 |
-
|
| 409 |
-
@keyframes pulse-dot {
|
| 410 |
-
0%, 100% { opacity: 1; transform: scale(1); }
|
| 411 |
-
50% { opacity: 0.4; transform: scale(0.65); }
|
| 412 |
-
}
|
| 413 |
-
|
| 414 |
-
.hero-title {
|
| 415 |
-
font-family: 'Instrument Serif', Georgia, serif !important;
|
| 416 |
-
font-size: 3.4rem !important; font-weight: 400 !important;
|
| 417 |
-
color: var(--tx-1) !important; letter-spacing: -1.5px;
|
| 418 |
-
line-height: 1.05; margin-bottom: 16px !important;
|
| 419 |
-
animation: fadeDown 0.6s 0.1s ease both;
|
| 420 |
-
}
|
| 421 |
-
|
| 422 |
-
.hero-title em {
|
| 423 |
-
font-style: italic; color: var(--gold);
|
| 424 |
-
text-shadow: 0 0 40px rgba(212,168,83,0.3);
|
| 425 |
-
}
|
| 426 |
-
|
| 427 |
-
.hero-sub {
|
| 428 |
-
font-size: 1rem; color: var(--tx-2); font-weight: 300;
|
| 429 |
-
letter-spacing: 0.2px; max-width: 560px; margin: 0 auto 28px;
|
| 430 |
-
line-height: 1.6; animation: fadeDown 0.6s 0.2s ease both;
|
| 431 |
-
}
|
| 432 |
-
|
| 433 |
-
.pill-row {
|
| 434 |
-
display: flex; gap: 8px; justify-content: center;
|
| 435 |
-
flex-wrap: wrap; animation: fadeDown 0.6s 0.3s ease both;
|
| 436 |
-
}
|
| 437 |
-
|
| 438 |
-
.pill {
|
| 439 |
-
display: inline-flex; align-items: center; gap: 6px;
|
| 440 |
-
background: rgba(255,255,255,0.03); border: 1px solid var(--b-2);
|
| 441 |
-
border-radius: 99px; padding: 6px 14px;
|
| 442 |
-
font-size: 0.76rem; font-weight: 500;
|
| 443 |
-
color: var(--tx-2); letter-spacing: 0.2px;
|
| 444 |
-
transition: all 0.2s; cursor: default;
|
| 445 |
-
}
|
| 446 |
-
|
| 447 |
-
.pill:hover { border-color: var(--gold); color: var(--gold-lt); background: var(--gold-muted); transform: translateY(-1px); }
|
| 448 |
-
.pill-dot { width: 5px; height: 5px; border-radius: 50%; flex-shrink: 0; }
|
| 449 |
-
|
| 450 |
-
/* βββββββββββ PANEL CARDS βββββββββββ */
|
| 451 |
-
.gr-group {
|
| 452 |
-
background: rgba(19,22,31,0.88) !important;
|
| 453 |
-
backdrop-filter: blur(20px) saturate(140%) !important;
|
| 454 |
-
-webkit-backdrop-filter: blur(20px) saturate(140%) !important;
|
| 455 |
-
border: 1px solid var(--b-1) !important;
|
| 456 |
-
border-radius: var(--r-lg) !important;
|
| 457 |
-
padding: 28px 32px !important;
|
| 458 |
-
box-shadow: var(--sh) !important;
|
| 459 |
-
transition: border-color 0.3s, box-shadow 0.3s !important;
|
| 460 |
-
position: relative; overflow: hidden;
|
| 461 |
-
}
|
| 462 |
-
|
| 463 |
-
.gr-group::before {
|
| 464 |
-
content: ''; position: absolute;
|
| 465 |
-
top: 0; left: 0; right: 0; height: 1px;
|
| 466 |
-
background: linear-gradient(90deg, transparent, rgba(255,255,255,0.05) 40%, rgba(255,255,255,0.05) 60%, transparent);
|
| 467 |
-
pointer-events: none;
|
| 468 |
-
}
|
| 469 |
-
|
| 470 |
-
.gr-group:hover { border-color: var(--b-2) !important; box-shadow: var(--sh-lg) !important; }
|
| 471 |
-
|
| 472 |
-
/* βββββββββββ STEP HEADERS βββββββββββ */
|
| 473 |
-
.step-header { display: flex; align-items: center; gap: 14px; margin-bottom: 20px; }
|
| 474 |
-
|
| 475 |
-
.step-num {
|
| 476 |
-
width: 30px; height: 30px; background: var(--gold-muted);
|
| 477 |
-
border: 1px solid rgba(212,168,83,0.25); border-radius: 8px;
|
| 478 |
-
display: flex; align-items: center; justify-content: center;
|
| 479 |
-
font-family: 'Geist Mono', monospace; font-size: 0.72rem;
|
| 480 |
-
font-weight: 500; color: var(--gold); flex-shrink: 0;
|
| 481 |
-
}
|
| 482 |
-
|
| 483 |
-
.step-title { font-size: 0.95rem; font-weight: 600; color: var(--tx-1); letter-spacing: -0.2px; }
|
| 484 |
-
.step-desc { font-size: 0.76rem; color: var(--tx-3); margin-left: auto; }
|
| 485 |
-
.step-divider { height: 1px; background: linear-gradient(90deg, var(--b-1), transparent); margin-bottom: 20px; }
|
| 486 |
-
|
| 487 |
-
/* βββββββββββ FORM CONTROLS βββββββββββ */
|
| 488 |
-
label span, .block label span {
|
| 489 |
-
font-family: 'Geist', sans-serif !important; font-size: 0.73rem !important;
|
| 490 |
-
font-weight: 600 !important; color: var(--tx-3) !important;
|
| 491 |
-
letter-spacing: 0.8px !important; text-transform: uppercase !important;
|
| 492 |
-
}
|
| 493 |
-
|
| 494 |
-
textarea, input[type="text"], .gr-textbox textarea, .gr-textbox input {
|
| 495 |
-
background: var(--bg-3) !important; border: 1px solid var(--b-1) !important;
|
| 496 |
-
border-radius: var(--r-sm) !important; color: var(--tx-1) !important;
|
| 497 |
-
font-family: 'Geist Mono', monospace !important; font-size: 0.88rem !important;
|
| 498 |
-
padding: 14px 18px !important;
|
| 499 |
-
transition: border-color 0.2s, box-shadow 0.2s, background 0.2s !important;
|
| 500 |
-
resize: none !important; line-height: 1.5 !important;
|
| 501 |
-
}
|
| 502 |
-
|
| 503 |
-
textarea:focus, input[type="text"]:focus {
|
| 504 |
-
border-color: var(--gold) !important; box-shadow: var(--gold-ring) !important;
|
| 505 |
-
background: var(--bg-4) !important; outline: none !important;
|
| 506 |
-
}
|
| 507 |
-
|
| 508 |
-
textarea::placeholder, input::placeholder {
|
| 509 |
-
color: var(--tx-3) !important; font-style: italic;
|
| 510 |
-
font-family: 'Geist', sans-serif !important; font-size: 0.84rem !important;
|
| 511 |
-
}
|
| 512 |
-
|
| 513 |
-
select, .gr-dropdown select {
|
| 514 |
-
background: var(--bg-3) !important; border: 1px solid var(--b-1) !important;
|
| 515 |
-
border-radius: var(--r-sm) !important; color: var(--tx-1) !important;
|
| 516 |
-
font-family: 'Geist', sans-serif !important; font-size: 0.88rem !important;
|
| 517 |
-
padding: 14px 18px !important; transition: border-color 0.2s, box-shadow 0.2s !important;
|
| 518 |
-
}
|
| 519 |
-
|
| 520 |
-
select:focus { border-color: var(--gold) !important; box-shadow: var(--gold-ring) !important; outline: none !important; }
|
| 521 |
-
|
| 522 |
-
/* βββββββββββ BUTTONS βββββββββββ */
|
| 523 |
-
.gr-button {
|
| 524 |
-
font-family: 'Geist', sans-serif !important; font-size: 0.85rem !important;
|
| 525 |
-
font-weight: 600 !important; letter-spacing: 0.1px !important;
|
| 526 |
-
border-radius: var(--r-sm) !important; cursor: pointer !important;
|
| 527 |
-
transition: all 0.18s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
|
| 528 |
-
padding: 13px 24px !important; position: relative; overflow: hidden;
|
| 529 |
-
}
|
| 530 |
-
|
| 531 |
-
.gr-button-primary::after {
|
| 532 |
-
content: ''; position: absolute;
|
| 533 |
-
top: 0; left: -100%; width: 100%; height: 100%;
|
| 534 |
-
background: linear-gradient(90deg, transparent, rgba(255,255,255,0.12), transparent);
|
| 535 |
-
transition: left 0.45s ease;
|
| 536 |
-
}
|
| 537 |
-
.gr-button-primary:hover::after { left: 100%; }
|
| 538 |
-
|
| 539 |
-
.gr-button-primary {
|
| 540 |
-
background: linear-gradient(135deg, #d4a853 0%, #b8893a 60%, #9e7535 100%) !important;
|
| 541 |
-
color: #070810 !important; border: none !important;
|
| 542 |
-
box-shadow: 0 2px 16px rgba(212,168,83,0.3), inset 0 1px 0 rgba(255,255,255,0.15) !important;
|
| 543 |
-
}
|
| 544 |
-
|
| 545 |
-
.gr-button-primary:hover {
|
| 546 |
-
transform: translateY(-2px) scale(1.01) !important;
|
| 547 |
-
box-shadow: 0 8px 28px rgba(212,168,83,0.42), inset 0 1px 0 rgba(255,255,255,0.2) !important;
|
| 548 |
-
filter: brightness(1.06) !important;
|
| 549 |
-
}
|
| 550 |
-
|
| 551 |
-
.gr-button-primary:active { transform: translateY(0) scale(0.99) !important; }
|
| 552 |
-
|
| 553 |
-
.gr-button-secondary {
|
| 554 |
-
background: var(--bg-4) !important; color: var(--tx-2) !important;
|
| 555 |
-
border: 1px solid var(--b-2) !important;
|
| 556 |
-
}
|
| 557 |
-
|
| 558 |
-
.gr-button-secondary:hover {
|
| 559 |
-
border-color: var(--gold) !important; color: var(--gold-lt) !important;
|
| 560 |
-
background: var(--gold-muted) !important; transform: translateY(-1px) !important;
|
| 561 |
-
box-shadow: var(--sh-gold) !important;
|
| 562 |
-
}
|
| 563 |
-
|
| 564 |
-
/* βββββββββββ STATUS BOX βββββββββββ */
|
| 565 |
-
#status-box .prose p, #status-box p {
|
| 566 |
-
font-family: 'Geist Mono', monospace !important; font-size: 0.8rem !important;
|
| 567 |
-
color: var(--tx-2) !important; background: var(--bg-3) !important;
|
| 568 |
-
border: 1px solid var(--b-1) !important; border-left: 3px solid var(--gold) !important;
|
| 569 |
-
border-radius: 0 var(--r-xs) var(--r-xs) 0 !important;
|
| 570 |
-
padding: 11px 16px !important; margin: 0 !important; line-height: 1.55 !important;
|
| 571 |
-
}
|
| 572 |
-
|
| 573 |
-
/* βββββββββββ SUMMARY OUTPUT βββββββββββ */
|
| 574 |
-
#summary-out .prose strong, #summary-out strong { color: var(--gold-lt) !important; font-weight: 600 !important; }
|
| 575 |
-
#summary-out .prose p { margin-bottom: 10px !important; }
|
| 576 |
-
#summary-out .prose ul { padding-left: 20px !important; }
|
| 577 |
-
#summary-out .prose li { margin-bottom: 5px !important; color: var(--tx-1) !important; }
|
| 578 |
-
#summary-out .prose em { color: var(--tx-2) !important; }
|
| 579 |
-
|
| 580 |
-
/* βββββββββββ CHATBOT βββββββββββ */
|
| 581 |
-
.gr-chatbot, [data-testid="chatbot"] {
|
| 582 |
-
background: var(--bg-1) !important; border: 1px solid var(--b-1) !important;
|
| 583 |
-
border-radius: var(--r) !important; padding: 12px !important;
|
| 584 |
-
}
|
| 585 |
-
|
| 586 |
-
.gr-chatbot .message {
|
| 587 |
-
font-family: 'Geist', sans-serif !important; font-size: 0.9rem !important;
|
| 588 |
-
line-height: 1.7 !important; padding: 13px 18px !important;
|
| 589 |
-
border-radius: 12px !important; max-width: 82% !important;
|
| 590 |
-
animation: msgIn 0.25s cubic-bezier(0.34,1.56,0.64,1) both;
|
| 591 |
-
}
|
| 592 |
-
|
| 593 |
-
@keyframes msgIn {
|
| 594 |
-
from { opacity: 0; transform: translateY(8px) scale(0.97); }
|
| 595 |
-
to { opacity: 1; transform: translateY(0) scale(1); }
|
| 596 |
-
}
|
| 597 |
-
|
| 598 |
-
.gr-chatbot .message.user {
|
| 599 |
-
background: linear-gradient(135deg, var(--bg-4), var(--bg-5)) !important;
|
| 600 |
-
border: 1px solid var(--b-2) !important; color: var(--tx-1) !important; margin-left: auto !important;
|
| 601 |
-
}
|
| 602 |
-
|
| 603 |
-
.gr-chatbot .message.bot {
|
| 604 |
-
background: var(--bg-2) !important; border: 1px solid var(--b-1) !important; color: var(--tx-1) !important;
|
| 605 |
-
}
|
| 606 |
-
|
| 607 |
-
/* βββββββββββ AUDIO βββββββββββ */
|
| 608 |
-
.gr-audio, [data-testid="audio"] {
|
| 609 |
-
background: var(--bg-3) !important; border: 1px solid var(--b-1) !important;
|
| 610 |
-
border-radius: var(--r-sm) !important; padding: 8px !important;
|
| 611 |
-
}
|
| 612 |
-
|
| 613 |
-
/* βββββββββββ VOICE DESC βββββββββββ */
|
| 614 |
-
.voice-desc {
|
| 615 |
-
font-size: 0.84rem; color: var(--tx-2); margin: 0 0 16px; line-height: 1.6;
|
| 616 |
-
padding: 10px 14px; background: var(--bg-3); border-radius: var(--r-xs);
|
| 617 |
-
border-left: 2px solid var(--b-2);
|
| 618 |
-
}
|
| 619 |
-
|
| 620 |
-
/* βββββββββββ FOOTER βββββββββββ */
|
| 621 |
-
#footer-bar {
|
| 622 |
-
display: flex; align-items: center; justify-content: space-between;
|
| 623 |
-
padding: 14px 20px; background: rgba(15,17,25,0.6);
|
| 624 |
-
border: 1px solid var(--b-0); border-radius: var(--r); margin-top: 4px;
|
| 625 |
-
flex-wrap: wrap; gap: 10px;
|
| 626 |
-
}
|
| 627 |
-
|
| 628 |
-
.footer-copy { font-family: 'Geist Mono', monospace; font-size: 0.68rem; color: var(--tx-3); }
|
| 629 |
-
.footer-stack { display: flex; gap: 6px; flex-wrap: wrap; }
|
| 630 |
-
.footer-tag {
|
| 631 |
-
font-family: 'Geist Mono', monospace; font-size: 0.64rem; color: var(--tx-3);
|
| 632 |
-
background: var(--bg-2); border: 1px solid var(--b-0); border-radius: 4px; padding: 2px 7px;
|
| 633 |
-
}
|
| 634 |
-
|
| 635 |
-
/* βββββββββββ SCROLLBAR βββββββββββ */
|
| 636 |
-
::-webkit-scrollbar { width: 4px; height: 4px; }
|
| 637 |
-
::-webkit-scrollbar-track { background: transparent; }
|
| 638 |
-
::-webkit-scrollbar-thumb { background: var(--b-2); border-radius: 99px; }
|
| 639 |
-
::-webkit-scrollbar-thumb:hover { background: var(--gold-dk); }
|
| 640 |
-
|
| 641 |
-
/* βββββββββββ ANIMATIONS βββββββββββ */
|
| 642 |
-
@keyframes fadeDown { from { opacity: 0; transform: translateY(-10px); } to { opacity: 1; transform: translateY(0); } }
|
| 643 |
-
@keyframes fadeUp { from { opacity: 0; transform: translateY(16px); } to { opacity: 1; transform: translateY(0); } }
|
| 644 |
-
@keyframes fadeIn { from { opacity: 0; } to { opacity: 1; } }
|
| 645 |
-
|
| 646 |
-
.gradio-container .gr-group { animation: fadeUp 0.5s cubic-bezier(0.22, 1, 0.36, 1) both; }
|
| 647 |
-
|
| 648 |
-
::selection { background: rgba(212,168,83,0.25); color: var(--tx-1); }
|
| 649 |
-
"""
|
| 650 |
|
| 651 |
-
# =
|
| 652 |
-
# BUILD UI
|
| 653 |
-
# =========================
|
| 654 |
|
| 655 |
-
with gr.
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
gr.HTML("""
|
| 659 |
-
<div id="hero">
|
| 660 |
-
<div class="hero-corner hero-corner-tl"></div>
|
| 661 |
-
<div class="hero-corner hero-corner-tr"></div>
|
| 662 |
-
<div class="hero-corner hero-corner-bl"></div>
|
| 663 |
-
<div class="hero-corner hero-corner-br"></div>
|
| 664 |
-
|
| 665 |
-
<div class="hero-eyebrow">Research Intelligence Platform</div>
|
| 666 |
-
|
| 667 |
-
<div class="hero-title">Your <em>AI Research</em> Partner</div>
|
| 668 |
-
|
| 669 |
-
<div class="hero-sub">
|
| 670 |
-
Paste any paper DOI β instantly fetch, index, summarize, and interrogate
|
| 671 |
-
research literature with LLaMA 3.3 and semantic search.
|
| 672 |
-
</div>
|
| 673 |
-
|
| 674 |
-
<div class="pill-row">
|
| 675 |
-
<span class="pill"><span class="pill-dot" style="background:#3ecf8e"></span>RAG Architecture</span>
|
| 676 |
-
<span class="pill"><span class="pill-dot" style="background:#d4a853"></span>LLaMA 3.3 Β· 70B</span>
|
| 677 |
-
<span class="pill"><span class="pill-dot" style="background:#4d94ff"></span>Whisper ASR</span>
|
| 678 |
-
<span class="pill"><span class="pill-dot" style="background:#f0a443"></span>FAISS Semantic Index</span>
|
| 679 |
-
<span class="pill"><span class="pill-dot" style="background:#c084fc"></span>DOI Resolver</span>
|
| 680 |
-
</div>
|
| 681 |
-
</div>
|
| 682 |
-
""")
|
| 683 |
-
|
| 684 |
-
# βββ STEP 01 β LOAD PAPER βββββββββββββββββββββββββββββββββββββββββ
|
| 685 |
-
with gr.Group():
|
| 686 |
-
gr.HTML("""
|
| 687 |
-
<div class="step-header">
|
| 688 |
-
<div class="step-num">01</div>
|
| 689 |
-
<div><div class="step-title">Load Paper</div></div>
|
| 690 |
-
<div class="step-desc">Paste DOI → fetch → index</div>
|
| 691 |
-
</div>
|
| 692 |
-
<div class="step-divider"></div>
|
| 693 |
-
""")
|
| 694 |
-
with gr.Row(equal_height=True):
|
| 695 |
-
arxiv_input = gr.Textbox(
|
| 696 |
-
label="Paper DOI",
|
| 697 |
-
placeholder="10.1038/s41586-021-03819-2 Β· https://doi.org/10.48550/arXiv.1706.03762",
|
| 698 |
-
scale=5,
|
| 699 |
-
)
|
| 700 |
-
load_btn = gr.Button("Load Paper β", variant="primary", scale=1, min_width=160)
|
| 701 |
-
status = gr.Markdown(
|
| 702 |
-
value="*Paste a DOI above and click **Load Paper** β the full text will be fetched and indexed automatically.*",
|
| 703 |
-
elem_id="status-box",
|
| 704 |
-
)
|
| 705 |
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
</div>
|
| 714 |
-
<div class="step-divider"></div>
|
| 715 |
-
""")
|
| 716 |
-
with gr.Row(equal_height=True):
|
| 717 |
-
section_dropdown = gr.Dropdown(
|
| 718 |
-
label="Select Section", choices=[], scale=5, interactive=True,
|
| 719 |
-
)
|
| 720 |
-
summarize_btn = gr.Button("β¦ Summarize", variant="secondary", scale=1, min_width=150)
|
| 721 |
-
summary_output = gr.Markdown(
|
| 722 |
-
value="*Select a section from the dropdown, then click **Summarize** for a structured AI breakdown.*",
|
| 723 |
-
elem_id="summary-out",
|
| 724 |
-
)
|
| 725 |
|
| 726 |
-
#
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
placeholder="What is the main contribution? Β· How does the method work? Β· What datasets were used?",
|
| 741 |
-
scale=5, lines=1, max_lines=4,
|
| 742 |
-
)
|
| 743 |
-
send_btn = gr.Button("Send β", variant="primary", scale=1, min_width=120)
|
| 744 |
-
|
| 745 |
-
# βββ STEP 04 β VOICE ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 746 |
-
with gr.Group():
|
| 747 |
-
gr.HTML("""
|
| 748 |
-
<div class="step-header">
|
| 749 |
-
<div class="step-num">04</div>
|
| 750 |
-
<div><div class="step-title">Voice Query</div></div>
|
| 751 |
-
<div class="step-desc">Speak → transcribe → search</div>
|
| 752 |
-
</div>
|
| 753 |
-
<div class="step-divider"></div>
|
| 754 |
-
<p class="voice-desc">
|
| 755 |
-
Record or upload an audio question β Whisper ASR transcribes it and runs semantic search automatically.
|
| 756 |
-
</p>
|
| 757 |
-
""")
|
| 758 |
-
with gr.Row(equal_height=True):
|
| 759 |
-
audio = gr.Audio(type="filepath", label="Record or Upload Audio", scale=5)
|
| 760 |
-
voice_btn = gr.Button("π Transcribe & Ask", variant="secondary", scale=1, min_width=170)
|
| 761 |
-
|
| 762 |
-
# βββ FOOTER βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 763 |
-
gr.HTML("""
|
| 764 |
-
<div id="footer-bar">
|
| 765 |
-
<span class="footer-copy">Research Intelligence Platform · 2026</span>
|
| 766 |
-
<div class="footer-stack">
|
| 767 |
-
<span class="footer-tag">Gradio</span>
|
| 768 |
-
<span class="footer-tag">Groq LLaMA 3.3</span>
|
| 769 |
-
<span class="footer-tag">FAISS</span>
|
| 770 |
-
<span class="footer-tag">Sentence Transformers</span>
|
| 771 |
-
<span class="footer-tag">faster-whisper</span>
|
| 772 |
-
<span class="footer-tag">PyMuPDF</span>
|
| 773 |
-
<span class="footer-tag">Unpaywall</span>
|
| 774 |
-
</div>
|
| 775 |
-
</div>
|
| 776 |
-
""")
|
| 777 |
-
|
| 778 |
-
# βββ BINDINGS βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 779 |
load_btn.click(load_paper, inputs=arxiv_input, outputs=[section_dropdown, status])
|
| 780 |
summarize_btn.click(summarize_section, inputs=section_dropdown, outputs=summary_output)
|
| 781 |
send_btn.click(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
|
| 782 |
-
msg.submit(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
|
| 783 |
voice_btn.click(voice_chat, inputs=[audio, chatbot], outputs=[chatbot, msg])
|
| 784 |
|
| 785 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import re
|
| 5 |
import numpy as np
|
| 6 |
import faiss
|
|
|
|
| 7 |
from sentence_transformers import SentenceTransformer
|
| 8 |
from groq import Groq
|
| 9 |
+
from faster_whisper import WhisperModel
|
| 10 |
|
| 11 |
# =========================
|
| 12 |
# INITIALIZE MODELS
|
| 13 |
# =========================
|
| 14 |
|
| 15 |
import os

# Models are instantiated once, at import time, and shared by every callback.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
whisper_model = WhisperModel("base", compute_type="int8")

# SECURITY: the API key must never be committed to source control. It is read
# from the environment instead (on Hugging Face Spaces: Settings -> Secrets).
# "YOUR_GROQ_API_KEY" is still honoured because the previous revision of this
# file used that secret name. If neither variable is set, api_key is None and
# the Groq client applies its own missing-key handling.
# NOTE(review): the key that was previously hard-coded here has been published
# and should be revoked.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY") or os.environ.get("YOUR_GROQ_API_KEY")
)

# Groq chat-completion model used for both section summaries and RAG answers.
MODEL_NAME = "llama-3.3-70b-versatile"

# Global storage shared by the Gradio callbacks.
sections = {}       # section title -> section text of the currently loaded paper
section_texts = []  # section texts, in the order they were added to `index`
index = None        # FAISS index over `section_texts`; None until a paper is loaded
|
| 28 |
|
| 29 |
+
|
| 30 |
# =========================
|
| 31 |
# PDF FUNCTIONS
|
| 32 |
# =========================
|
| 33 |
|
| 34 |
+
# New-style ("2101.12345", "2101.12345v2") and old-style ("hep-th/9901001")
# arXiv identifiers. Anything else is rejected before it can reach the URL or
# the local file name.
_ARXIV_ID_RE = re.compile(r"[A-Za-z0-9][A-Za-z0-9.\-]*(?:/[0-9]+)?(?:v[0-9]+)?")


def download_arxiv_pdf(arxiv_id):
    """Download the PDF of an arXiv paper into the current working directory.

    Args:
        arxiv_id: arXiv identifier as typed by the user. Surrounding
            whitespace is ignored.

    Returns:
        Path of the saved PDF, or None when the identifier is malformed, the
        HTTP request fails, or the file cannot be written.
    """
    if not isinstance(arxiv_id, str):
        return None

    arxiv_id = arxiv_id.strip()
    # The ID is user input that ends up in a file path: refuse anything that
    # is not a plain identifier (path separators other than the single
    # old-style "/", "..", spaces, shell metacharacters, ...).
    if not _ARXIV_ID_RE.fullmatch(arxiv_id) or ".." in arxiv_id:
        return None

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # Old-style IDs contain a "/", which must not become a directory.
    file_path = arxiv_id.replace("/", "_") + ".pdf"

    try:
        # Without a timeout a stalled connection would block the UI forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with open(file_path, "wb") as f:
            f.write(response.content)
    except (requests.RequestException, OSError):
        # Callers treat None as "could not fetch this paper".
        return None

    return file_path
|
| 47 |
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, concatenated.

    The document is opened as a context manager so the underlying file handle
    is released even if text extraction fails part-way through.
    """
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
|
| 55 |
|
| 56 |
+
|
| 57 |
+
# =========================
|
| 58 |
+
# ROBUST SECTION EXTRACTION
|
| 59 |
+
# =========================
|
| 60 |
+
|
| 61 |
# Heading patterns, each anchored at a line start. Only horizontal whitespace
# ([ \t]) is allowed inside a heading so a match can never run into the next
# line (which would glue body text onto the title, or turn a bare page number
# followed by a paragraph into a "heading").
_SECTION_HEADING_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"\n([IVX]+\.[ \t]+[A-Z][A-Z \t]+)",  # Roman:   "II. RELATED WORK"
        r"\n(\d+\.\d+\.\d+[ \t]+[^\n]+)",     # 1.1.1
        r"\n(\d+\.\d+[ \t]+[^\n]+)",          # 1.1
        r"\n(\d+\.[ \t]+[^\n]+)",             # 1.
        r"\n(\d+[ \t]+[^\n]+)",               # 1
        r"\n([A-Z][A-Z \t]{4,})(?=\n)",       # ALL CAPS line
    )
)

# Abstract body: everything after the word "Abstract" up to the first blank
# line, an "Introduction" heading, or the end of the text.
_ABSTRACT_PATTERN = re.compile(
    r"(?:Abstract|ABSTRACT)[\s:.\-]*"
    r"(.+?)"
    r"(?=\n[ \t]*\n"
    r"|\n[ \t]*(?:(?:\d+|[IVX]+)\.?[ \t]+)?(?:Introduction|INTRODUCTION)\b"
    r"|\Z)",
    re.DOTALL,
)

# Upper bound on stored section length (characters).
_MAX_SECTION_CHARS = 4000


def extract_sections(text):
    """Split raw paper text into sections keyed by their heading.

    Args:
        text: Full text of the paper.

    Returns:
        dict mapping heading -> section body. Each body runs from the end of
        its heading to the start of the next detected heading (or the end of
        the text), is stripped, and is truncated to 4000 characters. If an
        abstract is found it is stored under the key "Abstract". A heading
        that occurs more than once keeps the body of its last occurrence.
    """
    matches = sorted(
        (m for pattern in _SECTION_HEADING_PATTERNS for m in pattern.finditer(text)),
        key=lambda m: (m.start(), -m.end()),
    )

    # Drop any match that starts inside a heading already accepted, so a
    # section body can never get a negative-length slice.
    headings = []
    last_end = -1
    for m in matches:
        if m.start() < last_end:
            continue
        headings.append(m)
        last_end = m.end()

    extracted = {}
    for i, m in enumerate(headings):
        title = m.group(1).strip()
        end = headings[i + 1].start() if i + 1 < len(headings) else len(text)
        extracted[title] = text[m.end():end].strip()[:_MAX_SECTION_CHARS]

    # Add the abstract manually: it usually has no numbered heading.
    abstract = _ABSTRACT_PATTERN.search(text)
    if abstract:
        body = abstract.group(1).strip()[:_MAX_SECTION_CHARS]
        if body:
            extracted["Abstract"] = body

    return extracted
|
| 99 |
|
| 100 |
+
|
| 101 |
+
# =========================
|
| 102 |
+
# VECTOR STORE
|
| 103 |
+
# =========================
|
| 104 |
+
|
| 105 |
def build_vector_store(sections_dict):
    """Rebuild the global FAISS index from the values of `sections_dict`.

    The section bodies are stored in the global `section_texts` (in dict
    order) and embedded with `embedding_model`; the global `index` becomes an
    L2 flat index over those embeddings. With no sections, `index` is reset to
    None and nothing is embedded.
    """
    global index, section_texts

    section_texts = [*sections_dict.values()]
    if not section_texts:
        index = None
        return

    vectors = np.array(embedding_model.encode(section_texts)).astype("float32")

    # The index dimensionality is the embedding width (second axis).
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
|
| 120 |
|
| 121 |
+
|
| 122 |
+
# =========================
|
| 123 |
+
# LOAD PAPER
|
| 124 |
+
# =========================
|
| 125 |
+
|
| 126 |
+
def load_paper(arxiv_id):
    """Download a paper, split it into sections and index it for retrieval.

    Args:
        arxiv_id: arXiv identifier entered by the user.

    Returns:
        A pair for the (section dropdown, status) outputs: a `gr.update` with
        the section titles as choices, and a status message. On failure the
        choices are emptied and the status carries the error.
    """
    global sections

    pdf_path = download_arxiv_pdf(arxiv_id)
    if pdf_path is None:
        return gr.update(choices=[]), "❌ Invalid arXiv ID"

    try:
        text = extract_text_from_pdf(pdf_path)
        parsed = extract_sections(text)
        build_vector_store(parsed)
    except Exception as e:
        # A corrupt or non-PDF download must surface as a status message, not
        # as an unhandled exception in the UI callback.
        return gr.update(choices=[]), f"❌ Error:\n{str(e)}"

    # Only publish the new sections once indexing has succeeded.
    sections = parsed

    return gr.update(choices=list(sections.keys())), "✅ Paper Loaded Successfully"
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
# =========================
|
| 143 |
+
# SUMMARY FUNCTION
|
| 144 |
+
# =========================
|
| 145 |
|
| 146 |
def summarize_section(section_title):
    """Ask the LLM for a structured summary of one section of the loaded paper.

    Args:
        section_title: Key into the global `sections` dict (the dropdown value).

    Returns:
        The model's summary text, or a "❌ ..." message when no paper is
        loaded, the section is unknown or empty, or the API call fails.
    """
    try:
        if not sections:
            return "❌ Load paper first"

        if section_title not in sections:
            return "❌ Section not found"

        content = sections[section_title]

        if not content:
            return "❌ Empty section"

        # Keep the prompt bounded regardless of how the section was stored.
        content = content[:4000]

        prompt = (
            "\n"
            "Summarize this research section:\n"
            "\n"
            "- Main idea\n"
            "- Key concepts\n"
            "- Simple explanation\n"
            "- Importance\n"
            "\n"
            f"Section: {section_title}\n"
            "\n"
            "Content:\n"
            f"{content}\n"
        )

        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )

        # The output is rendered as Markdown, so never hand back None.
        return response.choices[0].message.content or ""

    except Exception as e:
        # UI boundary: report the failure in the summary pane instead of raising.
        return f"❌ Error:\n{str(e)}"
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# =========================
|
| 188 |
+
# RAG CHAT
|
| 189 |
+
# =========================
|
| 190 |
|
| 191 |
def rag_chat(message, history):
    """Answer a question about the loaded paper using retrieved sections.

    Args:
        message: The user's question.
        history: Chat history as a list of {"role", "content"} dicts; None is
            treated as an empty history.

    Returns:
        (history, ""): the updated history for the chatbot and an empty string
        that clears the input textbox.
    """
    if history is None:
        history = []

    # Nothing to ask: leave the conversation untouched.
    if not message or not message.strip():
        return history, ""

    try:
        if index is None:
            history.append({"role": "assistant", "content": "❌ Load paper first"})
            return history, ""

        query_embedding = embedding_model.encode([message])
        query_embedding = np.array(query_embedding).astype("float32")

        # Never ask for more neighbours than there are sections: FAISS pads
        # missing results with -1, which would silently index the LAST section.
        k = max(1, min(3, len(section_texts)))
        _, neighbours = index.search(query_embedding, k=k)

        retrieved = "\n\n".join(
            section_texts[i] for i in neighbours[0] if 0 <= i < len(section_texts)
        )

        prompt = (
            "\n"
            "Answer using ONLY this context.\n"
            "\n"
            "Context:\n"
            f"{retrieved}\n"
            "\n"
            "Question:\n"
            f"{message}\n"
        )

        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2
        )

        answer = response.choices[0].message.content

        # Role/content dicts are the history format this app's Chatbot is fed.
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": answer})

        return history, ""

    except Exception as e:
        # UI boundary: show the failure in the chat instead of raising.
        history.append({"role": "assistant", "content": f"❌ Error:\n{str(e)}"})
        return history, ""
|
| 233 |
|
| 234 |
+
|
| 235 |
+
# =========================
|
| 236 |
+
# VOICE CHAT
|
| 237 |
+
# =========================
|
| 238 |
+
|
| 239 |
def voice_chat(audio, history):
    """Transcribe a spoken question and answer it through `rag_chat`.

    Args:
        audio: Path of the recorded or uploaded audio file, or None when
            nothing was provided.
        history: Chat history as a list of {"role", "content"} dicts; None is
            treated as an empty history.

    Returns:
        (history, ""), the same shape `rag_chat` returns.
    """
    if history is None:
        history = []

    if audio is None:
        return history, ""

    try:
        segments, _ = whisper_model.transcribe(audio)
        text = " ".join(seg.text.strip() for seg in segments).strip()
    except Exception as e:
        # UI boundary: show the failure in the chat instead of raising.
        history.append({"role": "assistant", "content": f"❌ Error:\n{str(e)}"})
        return history, ""

    if not text:
        # Silence or unintelligible audio: do not send a blank question on.
        history.append(
            {"role": "assistant", "content": "❌ Could not transcribe any speech"}
        )
        return history, ""

    # rag_chat reports its own errors into the history.
    return rag_chat(text, history)
|
| 252 |
|
| 253 |
+
|
| 254 |
# =========================
|
| 255 |
+
# UI
|
|
|
|
|
|
|
| 256 |
# =========================
|
| 257 |
|
| 258 |
+
# Single-page layout: load a paper, summarise a section, chat about it by text
# or by voice.
with gr.Blocks() as demo:

    gr.Markdown("# 📄 ArXiv Research Assistant", elem_id="title")

    # --- Paper loading ---
    with gr.Row():
        arxiv_input = gr.Textbox(label="Enter arXiv ID", scale=4)
        load_btn = gr.Button("Load Paper", variant="primary", scale=1)

    status = gr.Markdown()

    # --- Section summary ---
    with gr.Row():
        section_dropdown = gr.Dropdown(label="Sections", scale=3)
        summarize_btn = gr.Button("Generate Summary", variant="secondary", scale=1)

    summary_output = gr.Markdown()

    # --- Text chat ---
    gr.Markdown("## 💬 Chat with Paper")
    chatbot = gr.Chatbot(height=400)

    with gr.Row():
        msg = gr.Textbox(label="Ask a question", scale=4)
        send_btn = gr.Button("Send", variant="primary", scale=1)

    # --- Voice chat ---
    gr.Markdown("## 🎤 Voice Query")

    with gr.Row():
        audio = gr.Audio(type="filepath", scale=4)
        voice_btn = gr.Button("Ask via Voice", scale=1)

    # Actions
    load_btn.click(load_paper, inputs=arxiv_input, outputs=[section_dropdown, status])
    summarize_btn.click(summarize_section, inputs=section_dropdown, outputs=summary_output)
    send_btn.click(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
    # Pressing Enter in the question box sends the message, same as the button.
    msg.submit(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
    voice_btn.click(voice_chat, inputs=[audio, chatbot], outputs=[chatbot, msg])


demo.launch(
    theme=gr.themes.Soft(),
    css="""
#title {text-align:center}
"""
)
|