MalikShehram committed on
Commit
db63cbd
·
verified ·
1 Parent(s): 55417b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -647
app.py CHANGED
@@ -4,139 +4,47 @@ import fitz
4
  import re
5
  import numpy as np
6
  import faiss
7
- import os
8
  from sentence_transformers import SentenceTransformer
9
  from groq import Groq
 
10
 
11
  # =========================
12
  # INITIALIZE MODELS
13
  # =========================
14
 
15
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 
16
 
17
- whisper_model = None
 
18
 
19
- def get_whisper():
20
- global whisper_model
21
- if whisper_model is None:
22
- from faster_whisper import WhisperModel
23
- whisper_model = WhisperModel("base", compute_type="int8")
24
- return whisper_model
25
-
26
- GROQ_API_KEY = os.environ.get("YOUR_GROQ_API_KEY", "")
27
- if not GROQ_API_KEY:
28
- raise ValueError("YOUR_GROQ_API_KEY secret is not set. Add it in HF Space Settings β†’ Secrets.")
29
-
30
- client = Groq(api_key=GROQ_API_KEY)
31
  MODEL_NAME = "llama-3.3-70b-versatile"
32
 
 
33
  sections = {}
34
  section_texts = []
35
  index = None
36
 
 
37
  # =========================
38
  # PDF FUNCTIONS
39
  # =========================
40
 
41
- def doi_to_pdf_url(doi):
42
- """
43
- Given a DOI, try multiple strategies to find a downloadable PDF:
44
- 1. Unpaywall API — finds legal open-access PDFs for any DOI (no key needed)
45
- 2. Direct arXiv — if the DOI belongs to an arXiv paper
46
- 3. Europe PMC — broad biomedical / life-science coverage
47
- Returns (pdf_url, paper_title).
48
- """
49
- doi = doi.strip()
50
- # Strip common DOI prefixes so bare DOI always works
51
- for prefix in ("https://doi.org/", "http://doi.org/", "doi:", "DOI:"):
52
- if doi.startswith(prefix):
53
- doi = doi[len(prefix):]
54
- break
55
-
56
- title = None
57
-
58
- # ── Strategy 1: Unpaywall (free, ~85% OA coverage) ──────────────
59
  try:
60
- email = "research@assistant.app"
61
- r = requests.get(
62
- f"https://api.unpaywall.org/v2/{doi}?email={email}", timeout=15
63
- )
64
- if r.status_code == 200:
65
- data = r.json()
66
- title = data.get("title")
67
- best = data.get("best_oa_location")
68
- if best:
69
- pdf_url = best.get("url_for_pdf") or best.get("url")
70
- if pdf_url:
71
- return pdf_url, title
72
- for loc in data.get("oa_locations", []):
73
- pdf_url = loc.get("url_for_pdf") or loc.get("url")
74
- if pdf_url:
75
- return pdf_url, title
76
- except Exception as e:
77
- print(f"Unpaywall error: {e}")
78
-
79
- # ── Strategy 2: arXiv DOI pattern ───────────────────────────────
80
- try:
81
- arxiv_match = re.search(r"arXiv[\.:](\d{4}\.\d{4,5})", doi, re.IGNORECASE)
82
- if arxiv_match:
83
- arxiv_id = arxiv_match.group(1)
84
- return f"https://arxiv.org/pdf/{arxiv_id}.pdf", title
85
- except Exception as e:
86
- print(f"arXiv DOI parse error: {e}")
87
-
88
- # ── Strategy 3: Europe PMC ───────────────────────────────────────
89
- try:
90
- r = requests.get(
91
- f"https://www.ebi.ac.uk/europepmc/webservices/rest/search"
92
- f"?query=DOI:{doi}&format=json&resultType=core",
93
- timeout=15,
94
- )
95
- if r.status_code == 200:
96
- results = r.json().get("resultList", {}).get("result", [])
97
- if results:
98
- item = results[0]
99
- title = title or item.get("title")
100
- pmcid = item.get("pmcid")
101
- if pmcid:
102
- pdf_url = (
103
- f"https://europepmc.org/backend/ptpmcrender.fcgi"
104
- f"?accid={pmcid}&blobtype=pdf"
105
- )
106
- return pdf_url, title
107
- except Exception as e:
108
- print(f"Europe PMC error: {e}")
109
-
110
- return None, title
111
-
112
-
113
- def download_pdf_from_doi(doi):
114
- """Resolve DOI → PDF URL → download to /tmp. Returns (file_path, paper_title)."""
115
- try:
116
- pdf_url, title = doi_to_pdf_url(doi)
117
- if not pdf_url:
118
- return None, title
119
-
120
- safe_name = re.sub(r"[^\w\-]", "_", doi)[:60]
121
- file_path = f"/tmp/{safe_name}.pdf"
122
-
123
- headers = {"User-Agent": "Mozilla/5.0 (ResearchAssistant/1.0)"}
124
- r = requests.get(pdf_url, timeout=40, headers=headers, allow_redirects=True)
125
- r.raise_for_status()
126
-
127
- # Verify it's actually a PDF
128
- if b"%PDF" not in r.content[:16]:
129
- print(f"Response is not a PDF from {pdf_url}")
130
- return None, title
131
 
 
132
  with open(file_path, "wb") as f:
133
- f.write(r.content)
134
 
135
- return file_path, title
 
 
136
 
137
- except Exception as e:
138
- print(f"PDF download error: {e}")
139
- return None, None
140
 
141
  def extract_text_from_pdf(pdf_path):
142
  doc = fitz.open(pdf_path)
@@ -145,641 +53,247 @@ def extract_text_from_pdf(pdf_path):
145
  text += page.get_text()
146
  return text
147
 
 
 
 
 
 
148
  def extract_sections(text):
 
149
  patterns = [
150
- r"\n([IVX]+\.\s+[A-Z][A-Z\s]+)",
151
- r"\n(\d+\.\d+\.\d+\s+[^\n]+)",
152
- r"\n(\d+\.\d+\s+[^\n]+)",
153
- r"\n(\d+\.\s+[^\n]+)",
154
- r"\n(\d+\s+[^\n]+)",
155
- r"\n([A-Z][A-Z\s]{4,})\n"
156
  ]
 
157
  matches = []
158
  for p in patterns:
159
  matches.extend(list(re.finditer(p, text)))
 
160
  matches = sorted(matches, key=lambda x: x.start())
 
161
  extracted = {}
 
162
  for i, match in enumerate(matches):
163
  title = match.group(1).strip()
 
164
  start = match.end()
165
- end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
 
166
  content = text[start:end].strip()
 
167
  if len(content) > 4000:
168
  content = content[:4000]
 
169
  extracted[title] = content
 
 
170
  abstract_match = re.search(r"Abstract(.*?)\n", text, re.DOTALL)
171
  if abstract_match:
172
  extracted["Abstract"] = abstract_match.group(1).strip()
 
173
  return extracted
174
 
 
 
 
 
 
175
  def build_vector_store(sections_dict):
176
  global index, section_texts
 
177
  section_texts = list(sections_dict.values())
 
178
  if len(section_texts) == 0:
179
  index = None
180
  return
 
181
  embeddings = embedding_model.encode(section_texts)
182
  embeddings = np.array(embeddings).astype("float32")
 
183
  dim = embeddings.shape[1]
184
  index = faiss.IndexFlatL2(dim)
185
  index.add(embeddings)
186
 
187
- def load_paper(doi):
 
 
 
 
 
188
  global sections
189
- if not doi or not doi.strip():
190
- return gr.update(choices=[]), "⚠️ Please enter a valid DOI"
191
- pdf_path, title = download_pdf_from_doi(doi)
192
  if pdf_path is None:
193
- return gr.update(choices=[]), (
194
- "❌ Could not find or download a PDF for this DOI.\n\n"
195
- "**Tips:** Make sure the paper is open-access. "
196
- "Try formats like `10.1038/s41586-021-03819-2` or paste the full `https://doi.org/...` URL."
197
- )
198
  text = extract_text_from_pdf(pdf_path)
199
  sections = extract_sections(text)
200
- if not sections:
201
- return gr.update(choices=[]), "⚠️ PDF downloaded but no sections could be detected."
202
  build_vector_store(sections)
203
- label = f"βœ… **{title}**\n\n{len(sections)} sections indexed and ready" if title else f"βœ… Paper loaded β€” {len(sections)} sections indexed and ready"
204
- return gr.update(choices=list(sections.keys())), label
 
 
 
 
 
205
 
206
  def summarize_section(section_title):
207
  try:
208
  if not sections:
209
- return "⚠️ Please load a paper first."
210
- if not section_title or section_title not in sections:
211
- return "⚠️ Please select a section from the dropdown."
212
- content = sections[section_title][:4000]
 
 
 
213
  if not content:
214
- return "⚠️ This section appears to be empty."
215
- prompt = f"""You are a research assistant. Summarize the following section from a research paper in a clear, structured way.
216
 
217
- Provide:
218
- β€’ **Main Idea** β€” What is this section about?
219
- β€’ **Key Concepts** β€” What are the important terms or methods?
220
- β€’ **Simple Explanation** β€” Explain it as if to a graduate student unfamiliar with the topic
221
- β€’ **Why It Matters** β€” What is the contribution or significance?
222
 
223
- Section Title: {section_title}
 
 
 
 
 
 
 
 
224
 
225
  Content:
226
  {content}
227
  """
 
228
  response = client.chat.completions.create(
229
  model=MODEL_NAME,
230
  messages=[{"role": "user", "content": prompt}],
231
  temperature=0.3
232
  )
 
233
  return response.choices[0].message.content
 
234
  except Exception as e:
235
- return f"❌ Error: {str(e)}"
 
 
 
 
 
236
 
237
  def rag_chat(message, history):
238
  try:
239
  global index
240
- if not message or not message.strip():
241
- return history, ""
242
  if index is None:
243
- history = history + [[None, "⚠️ Please load a paper first."]]
244
  return history, ""
 
245
  query_embedding = embedding_model.encode([message])
246
  query_embedding = np.array(query_embedding).astype("float32")
 
247
  D, I = index.search(query_embedding, k=3)
248
- retrieved = "\n\n".join([section_texts[i] for i in I[0] if i < len(section_texts)])
249
- prompt = f"""You are an expert research assistant. Answer the question using ONLY the provided context from the paper.
250
- If the answer cannot be found in the context, say so clearly. Be precise and cite relevant details.
 
 
251
 
252
  Context:
253
  {retrieved}
254
 
255
- Question: {message}
 
256
  """
 
257
  response = client.chat.completions.create(
258
  model=MODEL_NAME,
259
  messages=[{"role": "user", "content": prompt}],
260
  temperature=0.2
261
  )
 
262
  answer = response.choices[0].message.content
263
- history = history + [[message, answer]]
 
 
 
 
264
  return history, ""
 
265
  except Exception as e:
266
- history = history + [[message, f"❌ Error: {str(e)}"]]
267
  return history, ""
268
 
 
 
 
 
 
269
  def voice_chat(audio, history):
270
  try:
271
  if audio is None:
272
- history = history + [[None, "⚠️ No audio received. Please record a voice message."]]
273
- return history, ""
274
- model = get_whisper()
275
- segments, _ = model.transcribe(audio)
276
- text = " ".join([seg.text for seg in segments]).strip()
277
- if not text:
278
- history = history + [[None, "⚠️ Could not transcribe audio. Please try again."]]
279
  return history, ""
 
 
 
 
280
  return rag_chat(text, history)
 
281
  except Exception as e:
282
- history = history + [[None, f"❌ Error: {str(e)}"]]
283
  return history, ""
284
 
 
285
  # =========================
286
- # CSS — "Obsidian Intelligence" Premium 2026 Research Platform
287
- # Ethos: High-Density Functionalism meets Dark Luxury
288
- # Trends: Bento Grid · Glassmorphism · Kinetic borders · Layered depth
289
  # =========================
290
 
291
- CSS = """
292
- @import url('https://fonts.googleapis.com/css2?family=Instrument+Serif:ital@0;1&family=Geist:wght@300;400;500;600;700&family=Geist+Mono:wght@400;500&display=swap');
293
-
294
- /* ═══════════ DESIGN TOKENS ═══════════ */
295
- :root {
296
- --void: #07080c;
297
- --bg-0: #0b0d13;
298
- --bg-1: #0f1119;
299
- --bg-2: #13161f;
300
- --bg-3: #181c28;
301
- --bg-4: #1e2333;
302
- --bg-5: #242840;
303
- --b-0: #1c2030;
304
- --b-1: #252a3c;
305
- --b-2: #303650;
306
- --b-hi: #404868;
307
- --gold: #d4a853;
308
- --gold-lt: #e8c47a;
309
- --gold-dk: #9e7535;
310
- --gold-muted: rgba(212,168,83,0.08);
311
- --gold-glow: rgba(212,168,83,0.16);
312
- --gold-ring: 0 0 0 2.5px rgba(212,168,83,0.22);
313
- --green: #3ecf8e;
314
- --blue: #4d94ff;
315
- --red: #f06565;
316
- --amber: #f0a443;
317
- --tx-1: #f0ece2;
318
- --tx-2: #8a8578;
319
- --tx-3: #4a4740;
320
- --r-xs: 6px; --r-sm: 10px; --r: 16px; --r-lg: 22px; --r-xl: 28px;
321
- --sh-sm: 0 2px 12px rgba(0,0,0,.4);
322
- --sh: 0 4px 32px rgba(0,0,0,.55);
323
- --sh-lg: 0 8px 64px rgba(0,0,0,.7);
324
- --sh-xl: 0 16px 80px rgba(0,0,0,.8);
325
- --sh-gold: 0 4px 28px rgba(212,168,83,0.2);
326
- }
327
-
328
- *, *::before, *::after { box-sizing: border-box; margin: 0; }
329
- html { scroll-behavior: smooth; }
330
-
331
- body, .gradio-container {
332
- background: var(--bg-0) !important;
333
- font-family: 'Geist', system-ui, sans-serif !important;
334
- color: var(--tx-1) !important;
335
- min-height: 100vh;
336
- -webkit-font-smoothing: antialiased;
337
- }
338
-
339
- /* Dot-matrix background */
340
- .gradio-container {
341
- background-image: radial-gradient(circle, rgba(212,168,83,0.05) 1px, transparent 1px) !important;
342
- background-size: 28px 28px !important;
343
- background-attachment: fixed !important;
344
- }
345
-
346
- /* Vignette overlay */
347
- .gradio-container::before {
348
- content: '';
349
- position: fixed; inset: 0;
350
- background: radial-gradient(ellipse 80% 60% at 50% 0%, transparent 40%, rgba(7,8,12,0.75) 100%);
351
- pointer-events: none; z-index: 0;
352
- }
353
-
354
- /* ═══════════ HERO HEADER ═══════════ */
355
- #hero {
356
- position: relative;
357
- background: linear-gradient(145deg, #0f1220 0%, #131828 45%, #0c0f1a 100%);
358
- border: 1px solid var(--b-2);
359
- border-radius: var(--r-xl);
360
- padding: 56px 48px 44px;
361
- text-align: center;
362
- overflow: hidden;
363
- box-shadow: var(--sh-xl);
364
- margin-bottom: 2px;
365
- }
366
-
367
- #hero::before {
368
- content: '';
369
- position: absolute;
370
- top: -120px; left: 50%; transform: translateX(-50%);
371
- width: 600px; height: 400px;
372
- background: conic-gradient(from 180deg at 50% 50%, rgba(212,168,83,0) 0deg, rgba(212,168,83,0.07) 60deg, rgba(77,148,255,0.04) 120deg, rgba(212,168,83,0) 180deg, rgba(212,168,83,0.06) 240deg, rgba(77,148,255,0.03) 300deg, rgba(212,168,83,0) 360deg);
373
- animation: aurora 12s linear infinite;
374
- pointer-events: none; border-radius: 50%;
375
- }
376
-
377
- @keyframes aurora {
378
- from { transform: translateX(-50%) rotate(0deg); }
379
- to { transform: translateX(-50%) rotate(360deg); }
380
- }
381
-
382
- #hero::after {
383
- content: ''; position: absolute;
384
- bottom: 0; left: 8%; right: 8%; height: 1px;
385
- background: linear-gradient(90deg, transparent, var(--b-2) 30%, var(--b-hi) 50%, var(--b-2) 70%, transparent);
386
- }
387
-
388
- .hero-corner { position: absolute; width: 36px; height: 36px; opacity: 0.35; }
389
- .hero-corner-tl { top: 18px; left: 18px; border-top: 1.5px solid var(--gold); border-left: 1.5px solid var(--gold); border-radius: 4px 0 0 0; }
390
- .hero-corner-tr { top: 18px; right: 18px; border-top: 1.5px solid var(--gold); border-right: 1.5px solid var(--gold); border-radius: 0 4px 0 0; }
391
- .hero-corner-bl { bottom: 18px; left: 18px; border-bottom: 1.5px solid var(--gold); border-left: 1.5px solid var(--gold); border-radius: 0 0 0 4px; }
392
- .hero-corner-br { bottom: 18px; right: 18px; border-bottom: 1.5px solid var(--gold); border-right: 1.5px solid var(--gold); border-radius: 0 0 4px 0; }
393
-
394
- .hero-eyebrow {
395
- display: inline-flex; align-items: center; gap: 8px;
396
- background: rgba(212,168,83,0.07); border: 1px solid rgba(212,168,83,0.2);
397
- border-radius: 99px; padding: 5px 14px;
398
- font-family: 'Geist Mono', monospace; font-size: 0.67rem;
399
- color: var(--gold); letter-spacing: 2px; text-transform: uppercase;
400
- margin-bottom: 20px; animation: fadeDown 0.6s ease both;
401
- }
402
-
403
- .hero-eyebrow::before {
404
- content: ''; width: 5px; height: 5px; background: var(--gold);
405
- border-radius: 50%; box-shadow: 0 0 6px var(--gold);
406
- animation: pulse-dot 2s ease-in-out infinite;
407
- }
408
-
409
- @keyframes pulse-dot {
410
- 0%, 100% { opacity: 1; transform: scale(1); }
411
- 50% { opacity: 0.4; transform: scale(0.65); }
412
- }
413
-
414
- .hero-title {
415
- font-family: 'Instrument Serif', Georgia, serif !important;
416
- font-size: 3.4rem !important; font-weight: 400 !important;
417
- color: var(--tx-1) !important; letter-spacing: -1.5px;
418
- line-height: 1.05; margin-bottom: 16px !important;
419
- animation: fadeDown 0.6s 0.1s ease both;
420
- }
421
-
422
- .hero-title em {
423
- font-style: italic; color: var(--gold);
424
- text-shadow: 0 0 40px rgba(212,168,83,0.3);
425
- }
426
-
427
- .hero-sub {
428
- font-size: 1rem; color: var(--tx-2); font-weight: 300;
429
- letter-spacing: 0.2px; max-width: 560px; margin: 0 auto 28px;
430
- line-height: 1.6; animation: fadeDown 0.6s 0.2s ease both;
431
- }
432
-
433
- .pill-row {
434
- display: flex; gap: 8px; justify-content: center;
435
- flex-wrap: wrap; animation: fadeDown 0.6s 0.3s ease both;
436
- }
437
-
438
- .pill {
439
- display: inline-flex; align-items: center; gap: 6px;
440
- background: rgba(255,255,255,0.03); border: 1px solid var(--b-2);
441
- border-radius: 99px; padding: 6px 14px;
442
- font-size: 0.76rem; font-weight: 500;
443
- color: var(--tx-2); letter-spacing: 0.2px;
444
- transition: all 0.2s; cursor: default;
445
- }
446
-
447
- .pill:hover { border-color: var(--gold); color: var(--gold-lt); background: var(--gold-muted); transform: translateY(-1px); }
448
- .pill-dot { width: 5px; height: 5px; border-radius: 50%; flex-shrink: 0; }
449
-
450
- /* ═══════════ PANEL CARDS ═══════════ */
451
- .gr-group {
452
- background: rgba(19,22,31,0.88) !important;
453
- backdrop-filter: blur(20px) saturate(140%) !important;
454
- -webkit-backdrop-filter: blur(20px) saturate(140%) !important;
455
- border: 1px solid var(--b-1) !important;
456
- border-radius: var(--r-lg) !important;
457
- padding: 28px 32px !important;
458
- box-shadow: var(--sh) !important;
459
- transition: border-color 0.3s, box-shadow 0.3s !important;
460
- position: relative; overflow: hidden;
461
- }
462
-
463
- .gr-group::before {
464
- content: ''; position: absolute;
465
- top: 0; left: 0; right: 0; height: 1px;
466
- background: linear-gradient(90deg, transparent, rgba(255,255,255,0.05) 40%, rgba(255,255,255,0.05) 60%, transparent);
467
- pointer-events: none;
468
- }
469
-
470
- .gr-group:hover { border-color: var(--b-2) !important; box-shadow: var(--sh-lg) !important; }
471
-
472
- /* ═══════════ STEP HEADERS ═══════════ */
473
- .step-header { display: flex; align-items: center; gap: 14px; margin-bottom: 20px; }
474
-
475
- .step-num {
476
- width: 30px; height: 30px; background: var(--gold-muted);
477
- border: 1px solid rgba(212,168,83,0.25); border-radius: 8px;
478
- display: flex; align-items: center; justify-content: center;
479
- font-family: 'Geist Mono', monospace; font-size: 0.72rem;
480
- font-weight: 500; color: var(--gold); flex-shrink: 0;
481
- }
482
-
483
- .step-title { font-size: 0.95rem; font-weight: 600; color: var(--tx-1); letter-spacing: -0.2px; }
484
- .step-desc { font-size: 0.76rem; color: var(--tx-3); margin-left: auto; }
485
- .step-divider { height: 1px; background: linear-gradient(90deg, var(--b-1), transparent); margin-bottom: 20px; }
486
-
487
- /* ═══════════ FORM CONTROLS ═══════════ */
488
- label span, .block label span {
489
- font-family: 'Geist', sans-serif !important; font-size: 0.73rem !important;
490
- font-weight: 600 !important; color: var(--tx-3) !important;
491
- letter-spacing: 0.8px !important; text-transform: uppercase !important;
492
- }
493
-
494
- textarea, input[type="text"], .gr-textbox textarea, .gr-textbox input {
495
- background: var(--bg-3) !important; border: 1px solid var(--b-1) !important;
496
- border-radius: var(--r-sm) !important; color: var(--tx-1) !important;
497
- font-family: 'Geist Mono', monospace !important; font-size: 0.88rem !important;
498
- padding: 14px 18px !important;
499
- transition: border-color 0.2s, box-shadow 0.2s, background 0.2s !important;
500
- resize: none !important; line-height: 1.5 !important;
501
- }
502
-
503
- textarea:focus, input[type="text"]:focus {
504
- border-color: var(--gold) !important; box-shadow: var(--gold-ring) !important;
505
- background: var(--bg-4) !important; outline: none !important;
506
- }
507
-
508
- textarea::placeholder, input::placeholder {
509
- color: var(--tx-3) !important; font-style: italic;
510
- font-family: 'Geist', sans-serif !important; font-size: 0.84rem !important;
511
- }
512
-
513
- select, .gr-dropdown select {
514
- background: var(--bg-3) !important; border: 1px solid var(--b-1) !important;
515
- border-radius: var(--r-sm) !important; color: var(--tx-1) !important;
516
- font-family: 'Geist', sans-serif !important; font-size: 0.88rem !important;
517
- padding: 14px 18px !important; transition: border-color 0.2s, box-shadow 0.2s !important;
518
- }
519
-
520
- select:focus { border-color: var(--gold) !important; box-shadow: var(--gold-ring) !important; outline: none !important; }
521
-
522
- /* ═══════════ BUTTONS ═══════════ */
523
- .gr-button {
524
- font-family: 'Geist', sans-serif !important; font-size: 0.85rem !important;
525
- font-weight: 600 !important; letter-spacing: 0.1px !important;
526
- border-radius: var(--r-sm) !important; cursor: pointer !important;
527
- transition: all 0.18s cubic-bezier(0.34, 1.56, 0.64, 1) !important;
528
- padding: 13px 24px !important; position: relative; overflow: hidden;
529
- }
530
-
531
- .gr-button-primary::after {
532
- content: ''; position: absolute;
533
- top: 0; left: -100%; width: 100%; height: 100%;
534
- background: linear-gradient(90deg, transparent, rgba(255,255,255,0.12), transparent);
535
- transition: left 0.45s ease;
536
- }
537
- .gr-button-primary:hover::after { left: 100%; }
538
-
539
- .gr-button-primary {
540
- background: linear-gradient(135deg, #d4a853 0%, #b8893a 60%, #9e7535 100%) !important;
541
- color: #070810 !important; border: none !important;
542
- box-shadow: 0 2px 16px rgba(212,168,83,0.3), inset 0 1px 0 rgba(255,255,255,0.15) !important;
543
- }
544
-
545
- .gr-button-primary:hover {
546
- transform: translateY(-2px) scale(1.01) !important;
547
- box-shadow: 0 8px 28px rgba(212,168,83,0.42), inset 0 1px 0 rgba(255,255,255,0.2) !important;
548
- filter: brightness(1.06) !important;
549
- }
550
-
551
- .gr-button-primary:active { transform: translateY(0) scale(0.99) !important; }
552
-
553
- .gr-button-secondary {
554
- background: var(--bg-4) !important; color: var(--tx-2) !important;
555
- border: 1px solid var(--b-2) !important;
556
- }
557
-
558
- .gr-button-secondary:hover {
559
- border-color: var(--gold) !important; color: var(--gold-lt) !important;
560
- background: var(--gold-muted) !important; transform: translateY(-1px) !important;
561
- box-shadow: var(--sh-gold) !important;
562
- }
563
-
564
- /* ═══════════ STATUS BOX ═══════════ */
565
- #status-box .prose p, #status-box p {
566
- font-family: 'Geist Mono', monospace !important; font-size: 0.8rem !important;
567
- color: var(--tx-2) !important; background: var(--bg-3) !important;
568
- border: 1px solid var(--b-1) !important; border-left: 3px solid var(--gold) !important;
569
- border-radius: 0 var(--r-xs) var(--r-xs) 0 !important;
570
- padding: 11px 16px !important; margin: 0 !important; line-height: 1.55 !important;
571
- }
572
-
573
- /* ═══════════ SUMMARY OUTPUT ═══════════ */
574
- #summary-out .prose strong, #summary-out strong { color: var(--gold-lt) !important; font-weight: 600 !important; }
575
- #summary-out .prose p { margin-bottom: 10px !important; }
576
- #summary-out .prose ul { padding-left: 20px !important; }
577
- #summary-out .prose li { margin-bottom: 5px !important; color: var(--tx-1) !important; }
578
- #summary-out .prose em { color: var(--tx-2) !important; }
579
-
580
- /* ═══════════ CHATBOT ═══════════ */
581
- .gr-chatbot, [data-testid="chatbot"] {
582
- background: var(--bg-1) !important; border: 1px solid var(--b-1) !important;
583
- border-radius: var(--r) !important; padding: 12px !important;
584
- }
585
-
586
- .gr-chatbot .message {
587
- font-family: 'Geist', sans-serif !important; font-size: 0.9rem !important;
588
- line-height: 1.7 !important; padding: 13px 18px !important;
589
- border-radius: 12px !important; max-width: 82% !important;
590
- animation: msgIn 0.25s cubic-bezier(0.34,1.56,0.64,1) both;
591
- }
592
-
593
- @keyframes msgIn {
594
- from { opacity: 0; transform: translateY(8px) scale(0.97); }
595
- to { opacity: 1; transform: translateY(0) scale(1); }
596
- }
597
-
598
- .gr-chatbot .message.user {
599
- background: linear-gradient(135deg, var(--bg-4), var(--bg-5)) !important;
600
- border: 1px solid var(--b-2) !important; color: var(--tx-1) !important; margin-left: auto !important;
601
- }
602
-
603
- .gr-chatbot .message.bot {
604
- background: var(--bg-2) !important; border: 1px solid var(--b-1) !important; color: var(--tx-1) !important;
605
- }
606
-
607
- /* ═══════════ AUDIO ═══════════ */
608
- .gr-audio, [data-testid="audio"] {
609
- background: var(--bg-3) !important; border: 1px solid var(--b-1) !important;
610
- border-radius: var(--r-sm) !important; padding: 8px !important;
611
- }
612
-
613
- /* ═══════════ VOICE DESC ═══════════ */
614
- .voice-desc {
615
- font-size: 0.84rem; color: var(--tx-2); margin: 0 0 16px; line-height: 1.6;
616
- padding: 10px 14px; background: var(--bg-3); border-radius: var(--r-xs);
617
- border-left: 2px solid var(--b-2);
618
- }
619
-
620
- /* ═══════════ FOOTER ═══════════ */
621
- #footer-bar {
622
- display: flex; align-items: center; justify-content: space-between;
623
- padding: 14px 20px; background: rgba(15,17,25,0.6);
624
- border: 1px solid var(--b-0); border-radius: var(--r); margin-top: 4px;
625
- flex-wrap: wrap; gap: 10px;
626
- }
627
-
628
- .footer-copy { font-family: 'Geist Mono', monospace; font-size: 0.68rem; color: var(--tx-3); }
629
- .footer-stack { display: flex; gap: 6px; flex-wrap: wrap; }
630
- .footer-tag {
631
- font-family: 'Geist Mono', monospace; font-size: 0.64rem; color: var(--tx-3);
632
- background: var(--bg-2); border: 1px solid var(--b-0); border-radius: 4px; padding: 2px 7px;
633
- }
634
-
635
- /* ═══════════ SCROLLBAR ═══════════ */
636
- ::-webkit-scrollbar { width: 4px; height: 4px; }
637
- ::-webkit-scrollbar-track { background: transparent; }
638
- ::-webkit-scrollbar-thumb { background: var(--b-2); border-radius: 99px; }
639
- ::-webkit-scrollbar-thumb:hover { background: var(--gold-dk); }
640
-
641
- /* ═══════════ ANIMATIONS ═══════════ */
642
- @keyframes fadeDown { from { opacity: 0; transform: translateY(-10px); } to { opacity: 1; transform: translateY(0); } }
643
- @keyframes fadeUp { from { opacity: 0; transform: translateY(16px); } to { opacity: 1; transform: translateY(0); } }
644
- @keyframes fadeIn { from { opacity: 0; } to { opacity: 1; } }
645
-
646
- .gradio-container .gr-group { animation: fadeUp 0.5s cubic-bezier(0.22, 1, 0.36, 1) both; }
647
-
648
- ::selection { background: rgba(212,168,83,0.25); color: var(--tx-1); }
649
- """
650
 
651
- # =========================
652
- # BUILD UI
653
- # =========================
654
 
655
- with gr.Blocks(title="Research Intelligence Platform") as demo:
656
-
657
- # ─── HERO HEADER ──────────────────────────────────────────────────
658
- gr.HTML("""
659
- <div id="hero">
660
- <div class="hero-corner hero-corner-tl"></div>
661
- <div class="hero-corner hero-corner-tr"></div>
662
- <div class="hero-corner hero-corner-bl"></div>
663
- <div class="hero-corner hero-corner-br"></div>
664
-
665
- <div class="hero-eyebrow">Research Intelligence Platform</div>
666
-
667
- <div class="hero-title">Your <em>AI Research</em> Partner</div>
668
-
669
- <div class="hero-sub">
670
- Paste any paper DOI β€” instantly fetch, index, summarize, and interrogate
671
- research literature with LLaMA 3.3 and semantic search.
672
- </div>
673
-
674
- <div class="pill-row">
675
- <span class="pill"><span class="pill-dot" style="background:#3ecf8e"></span>RAG Architecture</span>
676
- <span class="pill"><span class="pill-dot" style="background:#d4a853"></span>LLaMA 3.3 Β· 70B</span>
677
- <span class="pill"><span class="pill-dot" style="background:#4d94ff"></span>Whisper ASR</span>
678
- <span class="pill"><span class="pill-dot" style="background:#f0a443"></span>FAISS Semantic Index</span>
679
- <span class="pill"><span class="pill-dot" style="background:#c084fc"></span>DOI Resolver</span>
680
- </div>
681
- </div>
682
- """)
683
-
684
- # ─── STEP 01 β€” LOAD PAPER ─────────────────────────────────────────
685
- with gr.Group():
686
- gr.HTML("""
687
- <div class="step-header">
688
- <div class="step-num">01</div>
689
- <div><div class="step-title">Load Paper</div></div>
690
- <div class="step-desc">Paste DOI &rarr; fetch &rarr; index</div>
691
- </div>
692
- <div class="step-divider"></div>
693
- """)
694
- with gr.Row(equal_height=True):
695
- arxiv_input = gr.Textbox(
696
- label="Paper DOI",
697
- placeholder="10.1038/s41586-021-03819-2 Β· https://doi.org/10.48550/arXiv.1706.03762",
698
- scale=5,
699
- )
700
- load_btn = gr.Button("Load Paper β†’", variant="primary", scale=1, min_width=160)
701
- status = gr.Markdown(
702
- value="*Paste a DOI above and click **Load Paper** β€” the full text will be fetched and indexed automatically.*",
703
- elem_id="status-box",
704
- )
705
 
706
- # ─── STEP 02 β€” SECTION SUMMARY ────────────────────────────────────
707
- with gr.Group():
708
- gr.HTML("""
709
- <div class="step-header">
710
- <div class="step-num">02</div>
711
- <div><div class="step-title">Section Summary</div></div>
712
- <div class="step-desc">AI-powered breakdown</div>
713
- </div>
714
- <div class="step-divider"></div>
715
- """)
716
- with gr.Row(equal_height=True):
717
- section_dropdown = gr.Dropdown(
718
- label="Select Section", choices=[], scale=5, interactive=True,
719
- )
720
- summarize_btn = gr.Button("✦ Summarize", variant="secondary", scale=1, min_width=150)
721
- summary_output = gr.Markdown(
722
- value="*Select a section from the dropdown, then click **Summarize** for a structured AI breakdown.*",
723
- elem_id="summary-out",
724
- )
725
 
726
- # ─── STEP 03 β€” CHAT ───────────────────────────────────────────────
727
- with gr.Group():
728
- gr.HTML("""
729
- <div class="step-header">
730
- <div class="step-num">03</div>
731
- <div><div class="step-title">Chat with Paper</div></div>
732
- <div class="step-desc">Context-aware Q&A</div>
733
- </div>
734
- <div class="step-divider"></div>
735
- """)
736
- chatbot = gr.Chatbot(value=[], height=440, show_label=False)
737
- with gr.Row(equal_height=True):
738
- msg = gr.Textbox(
739
- label="Ask a Question",
740
- placeholder="What is the main contribution? Β· How does the method work? Β· What datasets were used?",
741
- scale=5, lines=1, max_lines=4,
742
- )
743
- send_btn = gr.Button("Send β†’", variant="primary", scale=1, min_width=120)
744
-
745
- # ─── STEP 04 β€” VOICE ──────────────────────────────────────────────
746
- with gr.Group():
747
- gr.HTML("""
748
- <div class="step-header">
749
- <div class="step-num">04</div>
750
- <div><div class="step-title">Voice Query</div></div>
751
- <div class="step-desc">Speak &rarr; transcribe &rarr; search</div>
752
- </div>
753
- <div class="step-divider"></div>
754
- <p class="voice-desc">
755
- Record or upload an audio question β€” Whisper ASR transcribes it and runs semantic search automatically.
756
- </p>
757
- """)
758
- with gr.Row(equal_height=True):
759
- audio = gr.Audio(type="filepath", label="Record or Upload Audio", scale=5)
760
- voice_btn = gr.Button("πŸŽ™ Transcribe & Ask", variant="secondary", scale=1, min_width=170)
761
-
762
- # ─── FOOTER ───────────────────────────────────────────────────────
763
- gr.HTML("""
764
- <div id="footer-bar">
765
- <span class="footer-copy">Research Intelligence Platform &middot; 2026</span>
766
- <div class="footer-stack">
767
- <span class="footer-tag">Gradio</span>
768
- <span class="footer-tag">Groq LLaMA 3.3</span>
769
- <span class="footer-tag">FAISS</span>
770
- <span class="footer-tag">Sentence Transformers</span>
771
- <span class="footer-tag">faster-whisper</span>
772
- <span class="footer-tag">PyMuPDF</span>
773
- <span class="footer-tag">Unpaywall</span>
774
- </div>
775
- </div>
776
- """)
777
-
778
- # ─── BINDINGS ─────────────────────────────────────────────────────
779
  load_btn.click(load_paper, inputs=arxiv_input, outputs=[section_dropdown, status])
780
  summarize_btn.click(summarize_section, inputs=section_dropdown, outputs=summary_output)
781
  send_btn.click(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
782
- msg.submit(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
783
  voice_btn.click(voice_chat, inputs=[audio, chatbot], outputs=[chatbot, msg])
784
 
785
- demo.launch(css=CSS)
 
 
 
 
 
 
 
4
  import re
5
  import numpy as np
6
  import faiss
 
7
  from sentence_transformers import SentenceTransformer
8
  from groq import Groq
9
+ from faster_whisper import WhisperModel
10
 
11
  # =========================
12
  # INITIALIZE MODELS
13
  # =========================
14
 
15
# Imported here because the secret lookup below needs it and this revision no
# longer imports ``os`` in the file's import block.
import os

# Sentence-embedding model used to index section texts and to encode questions.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Speech-to-text model used by voice_chat().
whisper_model = WhisperModel("base", compute_type="int8")

# SECURITY: the API key must never be committed to source control. Any key
# that was ever committed has to be treated as leaked and revoked.
# The key is read from the environment: ``GROQ_API_KEY`` first, then
# ``YOUR_GROQ_API_KEY`` (the secret name the previous revision of this file used).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or os.environ.get("YOUR_GROQ_API_KEY", "")
if not GROQ_API_KEY:
    raise ValueError(
        "Groq API key is not set. Define GROQ_API_KEY (or YOUR_GROQ_API_KEY) "
        "as an environment variable / Space secret."
    )

client = Groq(api_key=GROQ_API_KEY)

# Chat-completion model used for both summaries and RAG answers.
MODEL_NAME = "llama-3.3-70b-versatile"

# Module-level state shared by the Gradio callbacks.
sections = {}        # section title -> section text, filled by load_paper()
section_texts = []   # section texts in the same order as the FAISS vectors
index = None         # FAISS index over section_texts; None until a paper is loaded
28
 
29
+
30
  # =========================
31
  # PDF FUNCTIONS
32
  # =========================
33
 
34
def download_arxiv_pdf(arxiv_id):
    """Download the PDF of an arXiv paper and return the local file path.

    Args:
        arxiv_id: arXiv identifier as typed by the user, e.g. ``2301.00001``
            or an old-style id such as ``hep-th/9901001``.

    Returns:
        Path of the saved PDF, or ``None`` when the identifier is blank, the
        HTTP request fails, the response is not a PDF, or the file cannot be
        written.
    """
    import os
    import tempfile

    arxiv_id = (arxiv_id or "").strip()
    if not arxiv_id:
        return None

    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    try:
        # A timeout keeps a stalled connection from blocking the UI callback.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException:
        return None

    # Reject error pages and other non-PDF payloads before they reach the parser.
    if b"%PDF" not in response.content[:1024]:
        return None

    # The id is user input and old-style ids contain "/", so it must not be
    # used verbatim as a file name.
    safe_name = re.sub(r"[^A-Za-z0-9._-]", "_", arxiv_id)
    file_path = os.path.join(tempfile.gettempdir(), f"{safe_name}.pdf")
    try:
        with open(file_path, "wb") as f:
            f.write(response.content)
    except OSError:
        return None

    return file_path
47
 
 
 
 
48
 
49
  def extract_text_from_pdf(pdf_path):
50
  doc = fitz.open(pdf_path)
 
53
  text += page.get_text()
54
  return text
55
 
56
+
57
+ # =========================
58
+ # ROBUST SECTION EXTRACTION
59
+ # =========================
60
+
61
def extract_sections(text):
    """Split the plain text of a paper into ``{heading: body}`` sections.

    Headings are found with a set of regular expressions (Roman-numeral,
    numbered ``1`` / ``1.`` / ``1.1`` / ``1.1.1`` and ALL-CAPS lines). The body
    of a section is the text between its heading and the next detected
    heading, stripped and truncated to 4000 characters. When the word
    ``Abstract`` occurs, the text that follows it up to the next heading is
    added under the key ``"Abstract"``.

    Args:
        text: Full text of the paper.

    Returns:
        Dict mapping heading text to section body, in document order. A later
        section with an identical heading overwrites the earlier one.
    """
    max_chars = 4000

    # Title characters are limited to letters, spaces and tabs ("[A-Z \t]")
    # so that a heading can never run across a newline and swallow the first
    # capital letter of the following paragraph. The "\s+" after a section
    # number is kept so a number and its title on separate lines still match.
    heading_patterns = (
        r"\n([IVX]+\.\s+[A-Z][A-Z \t]+)",   # Roman: "IV. RESULTS"
        r"\n(\d+\.\d+\.\d+\s+[^\n]+)",      # 1.1.1
        r"\n(\d+\.\d+\s+[^\n]+)",           # 1.1
        r"\n(\d+\.\s+[^\n]+)",              # 1.
        r"\n(\d+\s+[^\n]+)",                # 1
        r"\n([A-Z][A-Z \t]{4,})\n",         # ALL CAPS line
    )

    matches = sorted(
        (m for pattern in heading_patterns for m in re.finditer(pattern, text)),
        key=lambda m: m.start(),
    )

    # Each section ends where the next heading starts; the last one runs to EOF.
    ends = [m.start() for m in matches[1:]]
    ends.append(len(text))

    extracted = {}
    for match, end in zip(matches, ends):
        title = match.group(1).strip()
        extracted[title] = text[match.end():end].strip()[:max_chars]

    # The abstract usually has no numbered heading, so it is located by the
    # word itself and read up to the first heading that follows it.
    abstract = re.search(r"Abstract\b[ \t]*[:.\u2013\u2014-]?", text)
    if abstract:
        start = abstract.end()
        stop = next((m.start() for m in matches if m.start() >= start), len(text))
        body = text[start:stop].strip()[:max_chars]
        if body:
            extracted["Abstract"] = body

    return extracted
99
 
100
+
101
+ # =========================
102
+ # VECTOR STORE
103
+ # =========================
104
+
105
def build_vector_store(sections_dict):
    """Rebuild the module-level FAISS index from the section bodies.

    The values of ``sections_dict`` are stored in the global ``section_texts``
    and embedded with ``embedding_model``; the vectors are added to a new
    ``faiss.IndexFlatL2`` kept in the global ``index``. When there are no
    sections, ``index`` is reset to ``None``.

    Args:
        sections_dict: Mapping of section title to section text.
    """
    global index, section_texts

    section_texts = [*sections_dict.values()]

    # Nothing to index: drop any previous index and stop.
    if not section_texts:
        index = None
        return

    vectors = np.array(embedding_model.encode(section_texts)).astype("float32")

    # The index dimensionality is the embedding width.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
120
 
121
+
122
+ # =========================
123
+ # LOAD PAPER
124
+ # =========================
125
+
126
def load_paper(arxiv_id):
    """Download a paper, split it into sections and index them for retrieval.

    Updates the module-level ``sections`` and, through build_vector_store(),
    the retrieval index.

    Args:
        arxiv_id: arXiv identifier entered in the UI.

    Returns:
        Tuple ``(dropdown_update, status_markdown)``: a ``gr.update`` carrying
        the section titles as dropdown choices and a status message.
    """
    global sections

    pdf_path = download_arxiv_pdf((arxiv_id or "").strip())
    if pdf_path is None:
        return gr.update(choices=[]), "❌ Invalid arXiv ID"

    # A damaged or unreadable PDF must produce a status message, not an
    # unhandled exception in the UI callback.
    try:
        text = extract_text_from_pdf(pdf_path)
    except Exception as e:
        return gr.update(choices=[]), f"❌ Error:\n{e}"

    sections = extract_sections(text)
    build_vector_store(sections)

    # With no sections there is nothing to summarize or search, so do not
    # report success.
    if not sections:
        return gr.update(choices=[]), "❌ No sections could be extracted from this paper"

    return gr.update(choices=list(sections.keys())), "✅ Paper Loaded Successfully"
140
+
141
+
142
+ # =========================
143
+ # SUMMARY FUNCTION
144
+ # =========================
145
 
146
def summarize_section(section_title):
    """Ask the LLM for a structured summary of one section of the loaded paper.

    Args:
        section_title: Key of the section in the module-level ``sections``.

    Returns:
        The summary text, or a message starting with "❌" when no paper is
        loaded, the section is unknown or empty, or the request fails.
    """
    try:
        if not sections:
            return "❌ Load paper first"

        if section_title not in sections:
            return "❌ Section not found"

        content = sections[section_title]

        # Whitespace-only text is treated as empty rather than sent to the model.
        if not content or not content.strip():
            return "❌ Empty section"

        content = content[:4000]

        # Built line by line so the prompt does not depend on source indentation.
        prompt = "\n".join([
            "Summarize this research section:",
            "",
            "- Main idea",
            "- Key concepts",
            "- Simple explanation",
            "- Importance",
            "",
            f"Section: {section_title}",
            "",
            "Content:",
            content,
        ])

        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )

        # Guard against an empty completion so the UI never receives None.
        return response.choices[0].message.content or "❌ Empty response from model"

    except Exception as e:
        # UI boundary: show the failure in the summary pane.
        return f"❌ Error:\n{e}"
185
+
186
+
187
+ # =========================
188
+ # RAG CHAT
189
+ # =========================
190
 
191
def rag_chat(message, history):
    """Answer a question about the loaded paper using retrieved sections.

    The question is embedded, the nearest section texts (at most 3) are
    fetched from the FAISS index, and the LLM is asked to answer from that
    context only.

    Args:
        message: The user's question.
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            ``None`` is treated as an empty history.

    Returns:
        Tuple ``(history, "")``: the updated history and an empty string that
        clears the question box bound as the second output.
    """
    history = list(history or [])
    message = (message or "").strip()

    # Nothing was asked: leave the conversation untouched.
    if not message:
        return history, ""

    # Record the question first so it stays visible on every path below,
    # including the "not loaded" and error paths.
    history.append({"role": "user", "content": message})

    try:
        if index is None or index.ntotal == 0:
            history.append({"role": "assistant", "content": "❌ Load paper first"})
            return history, ""

        query_embedding = embedding_model.encode([message])
        query_embedding = np.array(query_embedding).astype("float32")

        # Asking for more neighbours than there are vectors makes FAISS pad
        # the result with -1, which would silently index the last section.
        top_k = min(3, index.ntotal)
        _, neighbours = index.search(query_embedding, k=top_k)

        retrieved = "\n\n".join(
            section_texts[i] for i in neighbours[0] if 0 <= i < len(section_texts)
        )

        prompt = "\n".join([
            "Answer using ONLY this context.",
            "",
            "Context:",
            retrieved,
            "",
            "Question:",
            message,
        ])

        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2
        )

        answer = response.choices[0].message.content or ""
        history.append({"role": "assistant", "content": answer})
        return history, ""

    except Exception as e:
        # UI boundary: report the failure in the conversation.
        history.append({"role": "assistant", "content": f"❌ Error:\n{e}"})
        return history, ""
233
 
234
+
235
+ # =========================
236
+ # VOICE CHAT
237
+ # =========================
238
+
239
def voice_chat(audio, history):
    """Transcribe a recorded question and answer it through rag_chat().

    Args:
        audio: Path of the audio file from the ``gr.Audio`` component, or
            ``None`` when nothing was recorded or uploaded.
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            ``None`` is treated as an empty history.

    Returns:
        Tuple ``(history, "")`` in the same shape rag_chat() returns.
    """
    history = list(history or [])

    if audio is None:
        return history, ""

    try:
        segments, _ = whisper_model.transcribe(audio)
        # Segment texts are stripped before joining to avoid doubled spaces.
        text = " ".join(seg.text.strip() for seg in segments).strip()
    except Exception as e:
        history.append({"role": "assistant", "content": f"❌ Error:\n{e}"})
        return history, ""

    # Silence or unintelligible audio: say so instead of sending an empty question.
    if not text:
        history.append({"role": "assistant", "content": "❌ No speech detected in the audio"})
        return history, ""

    return rag_chat(text, history)
252
 
253
+
254
  # =========================
255
+ # UI
 
 
256
  # =========================
257
 
258
# Layout: paper loader, section summary, text chat and voice query.
with gr.Blocks() as demo:

    gr.Markdown("# 📚 ArXiv Research Assistant", elem_id="title")

    # Paper loader
    with gr.Row():
        arxiv_input = gr.Textbox(label="Enter arXiv ID", scale=4)
        load_btn = gr.Button("Load Paper", variant="primary", scale=1)

    status = gr.Markdown()

    # Section summary
    with gr.Row():
        section_dropdown = gr.Dropdown(label="Sections", scale=3)
        summarize_btn = gr.Button("Generate Summary", variant="secondary", scale=1)

    summary_output = gr.Markdown()

    # Text chat
    gr.Markdown("## 💬 Chat with Paper")
    chatbot = gr.Chatbot(height=400)

    with gr.Row():
        msg = gr.Textbox(label="Ask a question", scale=4)
        send_btn = gr.Button("Send", variant="primary", scale=1)

    # Voice query
    gr.Markdown("## 🎙 Voice Query")

    with gr.Row():
        audio = gr.Audio(type="filepath", scale=4)
        voice_btn = gr.Button("Ask via Voice", scale=1)

    # Actions
    load_btn.click(load_paper, inputs=arxiv_input, outputs=[section_dropdown, status])
    summarize_btn.click(summarize_section, inputs=section_dropdown, outputs=summary_output)
    send_btn.click(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
    # Pressing Enter in the question box sends the message, like the Send button.
    msg.submit(rag_chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
    voice_btn.click(voice_chat, inputs=[audio, chatbot], outputs=[chatbot, msg])


demo.launch(
    theme=gr.themes.Soft(),
    css="""
#title {text-align:center}
""",
)