sitayeb commited on
Commit
566a5de
·
verified ·
1 Parent(s): 48e63cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +593 -537
app.py CHANGED
@@ -1,7 +1,5 @@
1
  # ================================================================
2
- # 🔬 Scientific Paper Discovery Bot v7.4
3
- # FIX: Global Search now uses arXiv relevance sort (not date)
4
- # → searching "Attention is All You Need" returns the correct paper
5
  # ================================================================
6
  import os, re, time, json, pickle, threading
7
  import requests
@@ -32,10 +30,10 @@ GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
32
  S2_API_KEY = os.environ.get("S2_API_KEY", "")
33
  groq_client = Groq(api_key=GROQ_API_KEY)
34
 
35
- print("Loading embedder...")
36
  embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
37
  _ = embedder.encode(["warmup"])
38
- print("Embedder ready!")
39
 
40
  PAPERS = []
41
  ACTIVE_PAPERS = []
@@ -45,57 +43,52 @@ AUTO_LOG = []
45
  CURRENT_YEAR = datetime.now().year
46
 
47
  PERSIST_DIR = "/tmp"
48
- FAVORITES_PATH = f"{PERSIST_DIR}/favorites.pkl"
49
- SEEN_IDS_PATH = f"{PERSIST_DIR}/seen_ids.json"
50
  os.makedirs(PERSIST_DIR, exist_ok=True)
51
 
52
  CATEGORIES = {
53
- "🌐 الكل / All": "",
54
- "📊 الاقتصاد / Economics": "econ",
55
- "💰 المالية الكمية / Quant Finance": "q-fin",
56
- "🤖 الذكاء الاصطناعي / AI": "cs.AI",
57
- "🧠 تعلم الآلة / ML": "cs.LG",
58
- "💬 معالجة اللغة / NLP": "cs.CL",
59
- "📈 الإحصاء / Statistics": "stat",
60
- "🔬 علم الأحياء / Bio": "q-bio",
61
- "⚛️ الفيزياء / Physics": "physics",
62
- "📐 الرياضيات / Math": "math",
63
- "💻 علوم الحاسوب / CS": "cs",
64
  }
65
  CROSSREF_SUBJECTS = {
66
- "🌐 الكل / All": "",
67
- "📊 الاقتصاد / Economics": "economics",
68
- "💰 المالية الكمية / Quant Finance": "finance",
69
- "🤖 الذكاء الاصطناعي / AI": "artificial intelligence",
70
- "🧠 تعلم الآلة / ML": "machine learning",
71
- "💬 معالجة اللغة / NLP": "natural language processing",
72
- "📈 الإحصاء / Statistics": "statistics",
73
- "🔬 علم الأحياء / Bio": "biology",
74
- "⚛️ الفيزياء / Physics": "physics",
75
- "📐 الرياضيات / Math": "mathematics",
76
- "💻 علوم الحاسوب / CS": "computer science",
77
  }
78
- LANG_CHOICES = ["🇸🇦 عربي / Arabic", "🇬🇧 English"]
79
- SORT_CHOICES = [
80
- "📅 الأحدث / Newest",
81
- "📅 الأقدم / Oldest",
82
- "🏆 الأكثر اقتباساً / Most Cited",
83
- "📊 الأقل اقتباساً / Least Cited",
84
- ]
85
- AR_FORMAT_RULES = """
86
- قواعد التنسيق:
87
- - ابدأ كل قسم بـ ## على سطر منفرد مع سطر فارغ قبله وبعده
88
- - اكتب كل قسم في فقرة من 3-4 جمل بالعربية الفصحى
89
  - لا تكرر عنوان القسم داخل النص
90
- - لا تضف --- أو *** أو رموز زائدة
91
  """
92
 
93
  # ================================================================
94
  # HELPERS
95
  # ================================================================
96
def detect_lang(text):
    """Return "ar" if *text* is detected as Arabic, else "en".

    Only the first 300 characters are sampled for speed. Any detection
    failure (empty, too-short, or ambiguous input) falls back to English.
    """
    try:
        return "ar" if detect(str(text)[:300]).startswith("ar") else "en"
    # Narrowed from a bare ``except:`` — a bare except also swallows
    # KeyboardInterrupt/SystemExit; langdetect signals failure with an
    # ordinary Exception subclass, so Exception is the right net here.
    except Exception:
        return "en"
 
 
99
 
100
  def clean_md(text):
101
  text = re.sub(r"[#*`>\[\]!_~]", "", text)
@@ -109,37 +102,39 @@ def fix_ar_format(text):
109
def cit_badge(n):
    """Format a citation count as a short emoji badge string.

    ``None``/empty input yields an em dash, zero yields a middle dot,
    otherwise the count is prefixed with a tier icon.
    """
    if n is None or n == "":
        return "—"
    count = int(n)
    # Tier icons, highest threshold first.
    for threshold, icon in ((1000, "🥇"), (100, "🏆"), (10, "⭐")):
        if count >= threshold:
            return f"{icon} {count:,}"
    if count > 0:
        return f"📄 {count}"
    return "·"
117
 
118
def build_table(papers_list):
    """Render papers as a Markdown table plus dropdown choice labels.

    Returns ``(markdown_table, choices)`` where ``choices`` is a list of
    "N. title" strings aligned with the table rows.
    """
    header = ("| # | 🏷️ العنوان | 👥 مؤلف رئيسي | 📅 التاريخ | 📊 الاقتباسات | 📡 المصدر |\n"
              "|---|---|---|---|---|---|\n")
    lines = []
    choices = []
    for idx, paper in enumerate(papers_list, start=1):
        lead_author = paper["authors"][0] if paper["authors"] else "N/A"
        marker = "🆕" if paper.get("recent") else "📄"
        lines.append(
            f"| {idx} | {marker} {paper['title']} | {lead_author} | "
            f"{paper['published']} | {cit_badge(paper.get('citations'))} | "
            f"{paper.get('source','arXiv')} |\n"
        )
        choices.append(f"{idx}. {paper['title']}")
    return header + "".join(lines), choices
130
 
131
def s2_headers():
    """Build HTTP headers for Semantic Scholar requests.

    The ``x-api-key`` header is attached only when an API key was
    configured via the environment.
    """
    headers = {"User-Agent": "ScientificPaperBot/7.4"}
    if S2_API_KEY:
        headers["x-api-key"] = S2_API_KEY
    return headers
135
 
136
def cr_headers():
    """HTTP headers for CrossRef; the mailto contact opts into the polite pool."""
    user_agent = "ScientificPaperBot/7.4 (mailto:researcher@example.com)"
    return {"User-Agent": user_agent}
138
 
139
  # ================================================================
140
- # CrossRef date parser — rejects garbage years (2048, 2116...)
141
  # ================================================================
142
- def parse_crossref_date(item: dict) -> str:
143
  for field in ["issued", "published", "published-print", "published-online", "created"]:
144
  dp = (item.get(field) or {}).get("date-parts", [[]])
145
  if not dp or not dp[0]: continue
@@ -149,7 +144,7 @@ def parse_crossref_date(item: dict) -> str:
149
  if not (1900 <= year <= CURRENT_YEAR + 1): continue
150
  month = max(1, min(12, int(pts[1]) if len(pts) >= 2 else 1))
151
  day = max(1, min(31, int(pts[2]) if len(pts) >= 3 else 1))
152
- return f"{year:04d}-{month:02d}-{day:02d}"
153
  except (ValueError, TypeError, IndexError):
154
  continue
155
  return "N/A"
@@ -175,48 +170,52 @@ def save_favorite(paper):
175
  if paper["id"] not in {p["id"] for p in favs}:
176
  favs.append(paper)
177
  with open(FAVORITES_PATH, "wb") as f: pickle.dump(favs, f)
178
- return f" تم الحفظ: {paper['title']}"
179
- return "ℹ️ موجودة بالفعل."
180
 
181
def export_favorites_csv():
    """Dump saved favorites to a CSV file and return its path.

    Returns ``None`` when there are no favorites. The file is written
    with a UTF-8 BOM (``utf-8-sig``) so spreadsheet apps render the
    Arabic/Unicode titles correctly.
    """
    favorites = load_favorites()
    if not favorites:
        return None
    records = [
        {
            "Title": p["title"],
            "Authors": ", ".join(p["authors"][:3]),
            "Date": p["published"],
            "Citations": p.get("citations", "N/A"),
            "URL": p["url"],
            "Source": p.get("source", "arXiv"),
        }
        for p in favorites
    ]
    out_path = f"{PERSIST_DIR}/favorites.csv"
    pd.DataFrame(records).to_csv(out_path, index=False, encoding="utf-8-sig")
    return out_path
190
 
191
  def gr_export_fav(): return export_favorites_csv()
192
 
193
  # ================================================================
194
- # PDF EXPORT
195
  # ================================================================
196
  def export_explanation_pdf(explanation_text, paper_title="paper"):
197
  if not explanation_text or len(explanation_text) < 30: return None
198
  safe = re.sub(r"[^\w\s-]", "", paper_title)[:50].strip().replace(" ", "_")
199
- path = f"{PERSIST_DIR}/explanation_{safe}.pdf"
200
  doc = SimpleDocTemplate(path, pagesize=A4,
201
  rightMargin=2*cm, leftMargin=2*cm,
202
- topMargin=2*cm, bottomMargin=2*cm)
203
- styles = getSampleStyleSheet()
204
- h2_style = ParagraphStyle("H2", parent=styles["Heading2"],
205
- fontSize=11, textColor=colors.HexColor("#2563eb"),
206
- spaceBefore=14, spaceAfter=6)
207
- body_style = ParagraphStyle("Body", parent=styles["Normal"],
208
- fontSize=10, leading=16, spaceAfter=8)
209
- meta_style = ParagraphStyle("Meta", parent=styles["Normal"],
210
- fontSize=9, textColor=colors.HexColor("#64748b"),
211
- spaceAfter=10)
212
  story = []
213
  for line in explanation_text.split("\n"):
214
  line = line.strip()
215
  if not line: story.append(Spacer(1, 6)); continue
216
  clean = re.sub(r"\*\*(.+?)\*\*", r"\1", line)
217
- clean = re.sub(r"\*(.+?)\*", r"\1", clean)
218
- clean = re.sub(r"`(.+?)`", r"\1", clean)
219
- clean = re.sub(r"^#{1,6}\s*", "", clean)
220
  clean = re.sub(r"[🎯❓🔧📊🌟🔗📄👥📅📡🤖#*_~]", "", clean).strip()
221
  if not clean: continue
222
  if line.startswith("## ") or line.startswith("# "):
@@ -224,95 +223,109 @@ def export_explanation_pdf(explanation_text, paper_title="paper"):
224
  color=colors.HexColor("#e2e8f0"), spaceAfter=4))
225
  story.append(Paragraph(clean, h2_style))
226
  elif line.startswith(">"):
227
- q_style = ParagraphStyle("Q", parent=styles["Normal"],
228
- fontSize=9, leftIndent=20,
229
- textColor=colors.HexColor("#475569"), leading=14)
230
- story.append(Paragraph(re.sub(r"[🎯❓🔧📊🌟🔗📄👥📅📡🤖#*_~]","",
231
- line.lstrip(">").strip()), q_style))
 
232
  else:
233
- story.append(Paragraph(clean, body_style))
234
- story += [Spacer(1,20),
235
- HRFlowable(width="100%", thickness=0.5, color=colors.HexColor("#e2e8f0")),
236
- Paragraph(f"Generated by 🔬 Paper Discovery v7.4 — {datetime.now().strftime('%Y-%m-%d %H:%M')}",
237
- meta_style)]
 
 
238
  try:
239
  doc.build(story); return path
240
  except Exception as e:
241
- print(f"PDF error: {e}"); return None
242
 
243
def gr_export_pdf(explanation_text, choice):
    """Gradio callback: render an explanation to PDF.

    Returns ``(path, status_message)``; ``path`` is ``None`` when the
    explanation is too short or PDF generation failed.
    """
    # Guard: require a substantive explanation before attempting export.
    if not explanation_text or len(explanation_text) < 50:
        return None, "⚠️ اشرح الورقة أولاً ثم صدّر PDF."
    # Dropdown choices look like "3. Some Title" — strip the numeric prefix.
    title = choice.split(". ", 1)[-1] if choice else "paper"
    path = export_explanation_pdf(explanation_text, title)
    if path:
        return path, "✅ تم إنشاء PDF!"
    return None, "❌ فشل إنشاء PDF."
249
 
250
  # ================================================================
251
  # SOURCE 1 — arXiv
252
- # KEY FIX: sort_by parameter
253
- # Browse mode → "submittedDate" (latest papers in topic)
254
- # Search mode → "relevance" (most relevant to query/title)
255
  # ================================================================
256
  def fetch_arxiv_papers(query, category, max_results=20, days_back=365,
257
  sort_by="submittedDate"):
258
  parts = []
259
- # ✅ If query looks like a paper title (>3 words), use ti: prefix for precision
260
  words = query.strip().split()
261
  if len(words) >= 3 and sort_by == "relevance":
262
- parts.append(f'ti:"{query.strip()}"') # exact title search
263
  elif query.strip():
264
- parts.append(f"all:{query.strip()}")
265
  if category.strip():
266
- parts.append(f"cat:{category.strip()}")
267
- sq = " AND ".join(parts) if parts else "all:machine learning"
268
- params = {"search_query": sq, "start": 0, "max_results": max_results,
269
- "sortBy": sort_by, "sortOrder": "descending"}
 
 
 
 
 
270
  try:
271
  resp = requests.get("http://export.arxiv.org/api/query", params=params, timeout=30)
272
  resp.raise_for_status()
273
- except Exception as e: print(f"arXiv: {e}"); return []
 
274
 
275
  ns_a = "http://www.w3.org/2005/Atom"
276
  ns_x = "http://arxiv.org/schemas/atom"
277
  root = ET.fromstring(resp.content)
278
  cutoff = datetime.now() - timedelta(days=days_back)
279
  papers = []
280
- for entry in root.findall(f"{{{ns_a}}}entry"):
281
  try:
282
- pid = entry.find(f"{{{ns_a}}}id").text.split("/abs/")[-1].strip()
283
- title = entry.find(f"{{{ns_a}}}title").text.strip().replace("\n"," ")
284
- abstract = entry.find(f"{{{ns_a}}}summary").text.strip().replace("\n"," ")
285
- published = entry.find(f"{{{ns_a}}}published").text[:10]
286
- authors = [a.find(f"{{{ns_a}}}name").text
287
- for a in entry.findall(f"{{{ns_a}}}author")]
288
  cats = set()
289
- pc = entry.find(f"{{{ns_x}}}primary_category")
290
  if pc is not None: cats.add(pc.get("term",""))
291
- for c in entry.findall(f"{{{ns_x}}}category"): cats.add(c.get("term",""))
292
  cats.discard("")
293
  papers.append({
294
- "id": pid, "title": title, "authors": authors[:6],
295
- "abstract": abstract[:1200], "published": published,
296
- "categories": list(cats)[:4], "citations": None,
297
- "url": f"https://arxiv.org/abs/{pid}",
298
- "pdf_url": f"https://arxiv.org/pdf/{pid}",
299
- "recent": datetime.strptime(published,"%Y-%m-%d") >= cutoff,
300
- "source": "arXiv",
 
 
 
 
301
  })
302
- except Exception as e: print(f"arXiv parse: {e}")
 
303
  return papers
304
 
305
  # ================================================================
306
- # SOURCE 2 — CrossRef (fixed date parser + title filter)
307
  # ================================================================
308
- def fetch_crossref_papers(query, category_label="", max_results=20, days_back=365,
309
- use_title=False):
310
  subject = CROSSREF_SUBJECTS.get(category_label, "")
311
- full_query = f"{query} {subject}".strip() if subject else query
 
312
  params = {
313
- "query.title" if use_title else "query": full_query,
314
- "rows": min(max_results * 3, 200),
315
- "sort": "relevance",
316
  "select": ("title,author,abstract,published,published-print,"
317
  "published-online,issued,created,DOI,"
318
  "is-referenced-by-count,link,subject"),
@@ -325,8 +338,9 @@ def fetch_crossref_papers(query, category_label="", max_results=20, days_back=36
325
  if r.status_code == 200:
326
  items = r.json().get("message",{}).get("items",[]); break
327
  if r.status_code == 429: time.sleep(2**attempt); continue
328
- print(f"CrossRef {r.status_code}"); return []
329
- except Exception as e: print(f"CrossRef {attempt+1}: {e}"); time.sleep(1)
 
330
 
331
  cutoff = datetime.now() - timedelta(days=days_back)
332
  papers, seen_ids = [], set()
@@ -339,71 +353,76 @@ def fetch_crossref_papers(query, category_label="", max_results=20, days_back=36
339
  pub = parse_crossref_date(item)
340
  if pub == "N/A": continue
341
  cit = int(item.get("is-referenced-by-count", 0) or 0)
342
- authors = [f"{a.get('given','').strip()} {a.get('family','').strip()}".strip()
343
- for a in item.get("author",[])[:6]]
 
 
344
  authors = [a for a in authors if a.strip()] or ["Unknown"]
345
- abstract = re.sub(r"<[^>]+>","", item.get("abstract","No abstract.")).strip()[:1200]
346
- doi = item.get("DOI","")
347
- url = f"https://doi.org/{doi}" if doi else "#"
348
- pid = doi or re.sub(r"\W","",title)[:40]
 
349
  if pid in seen_ids: continue
350
  seen_ids.add(pid)
351
- pdf_url = next((l.get("URL","") for l in item.get("link",[])
352
- if "pdf" in l.get("content-type","").lower()), "")
353
- try: recent = datetime.strptime(pub[:10],"%Y-%m-%d") >= cutoff
354
  except: recent = False
355
  papers.append({
356
- "id": pid, "title": title, "authors": authors,
357
- "abstract": abstract, "published": pub[:10],
 
 
 
358
  "categories": item.get("subject",[])[:3],
359
- "citations": cit, "url": url, "pdf_url": pdf_url,
360
- "recent": recent, "source": "CrossRef",
 
 
 
361
  })
362
  papers.sort(key=lambda x: x["citations"], reverse=True)
363
  return papers
364
 
365
  # ================================================================
366
- # GLOBAL PAPER SEARCH — title-aware, relevance-sorted
367
  # ================================================================
368
  def global_paper_search(query, source_choice, max_results=10):
369
  if not query or not query.strip():
370
- return "⚠️ أدخل عنوان أو كلمات مفتاحية للبحث."
371
- q = query.strip()
372
- papers = []
373
-
374
- if source_choice in ("arXiv", "كلاهما / Both"):
375
- # ✅ sort_by="relevance" → returns most relevant, not newest
376
  papers += fetch_arxiv_papers(q, "", int(max_results), 3650,
377
  sort_by="relevance")
378
-
379
- if source_choice in ("CrossRef", "كلاهما / Both"):
380
- # ✅ use_title=True → uses query.title for precise title match
381
  papers += fetch_crossref_papers(q, "", int(max_results), 3650,
382
  use_title=True)
383
-
384
  if not papers:
385
- return f" لا نتائج لـ `{q}`. جرب كلمات مختلفة."
386
 
387
- # Deduplicate
388
  seen, unique = set(), []
389
  for p in papers:
390
  key = re.sub(r"\W","",p["title"].lower())[:60]
391
  if key not in seen: seen.add(key); unique.append(p)
392
-
393
- # Sort by citation count (most cited first for well-known papers)
394
  unique.sort(key=lambda x: x.get("citations") or 0, reverse=True)
395
 
396
- md = f"## 🔎 نتائج البحث — `{q}`\n\n**{len(unique)}** ورقة\n\n---\n\n"
 
 
397
  for i, p in enumerate(unique, 1):
398
- cit = f" | {cit_badge(p.get('citations'))}" if p.get("citations") is not None else ""
399
- cats = " · ".join(p.get("categories",[])[:2])
400
- md += (f"### {i}. {p['title']}\n\n"
401
- f"👥 {', '.join(p['authors'][:3])} | 📅 {p['published']}{cit}"
402
- f" | 📡 {p.get('source','')} | 🏷️ {cats}\n\n"
403
- f"> {p['abstract'][:450]}...\n\n"
404
- f"🔗 [View]({p['url']})"
405
- +(f" 📥 [PDF]({p['pdf_url']})" if p.get("pdf_url") else "")
406
- +"\n\n---\n\n")
 
 
 
407
  return md
408
 
409
  # ================================================================
@@ -420,7 +439,8 @@ def enrich_citations(papers):
420
  id_map, batch_ids = {}, []
421
  for p in arxiv_papers:
422
  clean = re.sub(r"v\d+$","", p["id"].split("/")[-1].strip())
423
- id_map[clean] = p; batch_ids.append(f"arXiv:{clean}")
 
424
  for i in range(0, len(batch_ids), 500):
425
  try:
426
  r = requests.post(
@@ -432,43 +452,47 @@ def enrich_citations(papers):
432
  for item in r.json():
433
  if not item: continue
434
  ext = item.get("externalIds") or {}
435
- clean = re.sub(r"v\d+$","", ext.get("ArXiv","").split("/")[-1].strip())
 
436
  if clean and clean in id_map:
437
  c = item.get("citationCount")
438
  if c is not None: id_map[clean]["citations"] = int(c)
439
  elif r.status_code == 429: time.sleep(4)
440
- except Exception as e: print(f"S2 batch: {e}")
441
  for p in [x for x in arxiv_papers if (x.get("citations") or 0)==0][:15]:
442
  clean = re.sub(r"v\d+$","", p["id"].split("/")[-1].strip())
443
  for attempt in range(2):
444
  try:
445
  r = requests.get(
446
- f"https://api.semanticscholar.org/graph/v1/paper/arXiv:{clean}",
447
- params={"fields":"citationCount"}, headers=s2_headers(), timeout=10)
448
- if r.status_code==200:
449
- c=r.json().get("citationCount"); p["citations"]=int(c) if c else 0; break
450
- if r.status_code==429: time.sleep(2**attempt); continue
451
- p["citations"]=0; break
452
- except: p["citations"]=0; break
 
 
453
  time.sleep(0.12)
454
  for p in [x for x in arxiv_papers if (x.get("citations") or 0)==0]:
455
  try:
456
  r = requests.get("https://api.crossref.org/works",
457
- params={"query.title":p["title"],"rows":1,
458
- "select":"is-referenced-by-count,title"},
459
  headers=cr_headers(), timeout=8)
460
- if r.status_code==200:
461
- items=r.json().get("message",{}).get("items",[])
462
  if items:
463
- found=(items[0].get("title") or [""])[0].lower()
464
- qw=set(p["title"].lower().split()[:5])
465
- fw=set(found.split()[:10])
466
- p["citations"]=(int(items[0].get("is-referenced-by-count",0) or 0)
467
- if len(qw&fw)>=2 else 0)
468
- else: p["citations"]=0
469
- else: p["citations"]=0
 
470
  time.sleep(0.12)
471
- except: p["citations"]=0
472
  for p in papers:
473
  if p.get("citations") is None: p["citations"] = 0
474
  return papers
@@ -480,10 +504,11 @@ def build_papers_index(papers):
480
  global FAISS_INDEX, PAPERS
481
  PAPERS = papers
482
  if not papers: FAISS_INDEX = None; return
483
- texts = [f"{p['title']} {p['abstract']}" for p in papers]
484
  embs = embedder.encode(texts, convert_to_numpy=True,
485
  normalize_embeddings=True).astype("float32")
486
- idx = faiss.IndexFlatIP(embs.shape[1]); idx.add(embs)
 
487
  FAISS_INDEX = idx
488
 
489
  def search_papers(query, top_k=5):
@@ -491,8 +516,8 @@ def search_papers(query, top_k=5):
491
  qe = embedder.encode([query], convert_to_numpy=True,
492
  normalize_embeddings=True).astype("float32")
493
  scores, ids = FAISS_INDEX.search(qe, min(top_k, len(PAPERS)))
494
- return [{"paper":PAPERS[i],"score":float(s)}
495
- for s,i in zip(scores[0],ids[0]) if i>=0 and float(s)>0.1]
496
 
497
  # ================================================================
498
  # AUTO-FETCH
@@ -507,142 +532,154 @@ def auto_fetch_worker(query, category, interval):
507
  new_ps = [p for p in papers if p["id"] not in seen]
508
  if new_ps:
509
  save_seen_ids(seen | {p["id"] for p in papers})
510
- AUTO_LOG.append(f"[{datetime.now().strftime('%H:%M')}] 🆕 {len(new_ps)} — {query}")
511
- if len(AUTO_LOG)>20: AUTO_LOG.pop(0)
 
 
512
 
513
def start_auto_fetch(query, cat_label, interval_min):
    """Launch the background auto-fetch worker thread.

    No-op (with a warning message) when a worker is already running.
    Returns a user-facing status string.
    """
    global AUTO_RUNNING
    if AUTO_RUNNING:
        return "⚠️ يعمل بالفعل."
    AUTO_RUNNING = True
    # Daemon thread so it never blocks process shutdown.
    worker_args = (query, CATEGORIES.get(cat_label, ""), int(interval_min) * 60)
    worker = threading.Thread(target=auto_fetch_worker, args=worker_args, daemon=True)
    worker.start()
    return f"✅ كل **{interval_min} دقيقة** — يراقب: `{query}`"
 
521
 
522
def stop_auto_fetch():
    """Signal the auto-fetch worker loop to stop; returns a status string."""
    global AUTO_RUNNING
    AUTO_RUNNING = False
    return "🛑 تم الإيقاف."
524
 
525
def get_auto_log():
    """Return the last 10 auto-fetch log entries, newest first, as Markdown."""
    if not AUTO_LOG:
        return "_لا يوجد سجل._"
    return "\n\n".join(reversed(AUTO_LOG[-10:]))
527
 
528
  # ================================================================
529
  # TRENDS
530
  # ================================================================
531
  def analyze_trends(papers):
532
- if not papers: return None, "⚠️ لا توجد أوراق."
533
  date_counts = Counter(p["published"][:7] for p in papers if p["published"]!="N/A")
534
  stopwords = {"the","a","an","of","in","for","on","with","and","or","to","using",
535
  "based","via","from","by","is","are","our","we","this","that","which",
536
  "towards","approach","method","new","into","over","learning","deep",
537
  "model","models","data","neural","large","language","paper","study",
538
  "analysis","results","show","also","can","used","two","its","their"}
539
- all_words = [w.lower() for p in papers for w in re.findall(r"[a-zA-Z]{4,}",p["title"])
540
- if w.lower() not in stopwords]
541
- top_words = Counter(all_words).most_common(15)
542
- sources = Counter(p.get("source","arXiv") for p in papers)
543
- cit_papers = [p for p in papers if (p.get("citations") or 0)>0]
544
- top_cited = sorted(cit_papers, key=lambda x:x["citations"], reverse=True)[:10]
545
- all_auth = [a for p in papers for a in p["authors"][:3]]
546
- top_authors= Counter(all_auth).most_common(10)
547
- cvals = [p["citations"] for p in cit_papers]
548
- buckets=[0,1,5,10,50,100,500,10000]; blabels=["0","1–4","5–9","10–49","50–99","100–499","500+"]
549
- bcounts=([sum(1 for c in cvals if buckets[i]<=c<buckets[i+1])
550
- for i in range(len(buckets)-1)] if cvals else [0]*7)
551
- avg_cit = round(sum(cvals)/max(len(cvals),1),1) if cvals else 0
552
- total_cit= sum(p.get("citations") or 0 for p in papers)
 
 
553
  C = ["#3b82f6","#8b5cf6","#10b981","#f59e0b","#ef4444","#06b6d4",
554
  "#ec4899","#14b8a6","#f97316","#a855f7","#22d3ee","#84cc16",
555
  "#fbbf24","#34d399","#f87171"]
556
  BG,PNL,BR,W = "#0f172a","#1e293b","#334155","white"
557
- fig,axes = plt.subplots(2,3,figsize=(20,12))
558
  fig.patch.set_facecolor(BG)
559
- fig.suptitle("📊 Research Trends Dashboard",color=W,fontsize=16,fontweight="bold",y=1.01)
560
  def style(ax):
561
  ax.set_facecolor(PNL)
562
  for sp in ax.spines.values(): sp.set_edgecolor(BR)
563
  ax.tick_params(colors=W, labelsize=8)
564
- ax=axes[0,0]; style(ax)
565
  if date_counts:
566
- ms,cs=zip(*sorted(date_counts.items())); ms,cs=list(ms),list(cs)
567
- bars=ax.bar(ms,cs,color=C[0],edgecolor="#60a5fa",lw=0.8)
 
568
  for b,c in zip(bars,cs):
569
- ax.text(b.get_x()+b.get_width()/2,b.get_height()+.05,str(c),
570
- ha="center",va="bottom",color=W,fontsize=8)
571
- if len(cs)>2:
572
- z=np.polyfit(range(len(cs)),cs,1)
573
- ax.plot(ms,np.poly1d(z)(range(len(cs))),"--",color="#f59e0b",
574
- lw=1.5,alpha=.8,label="Trend")
575
- ax.legend(fontsize=8,facecolor=PNL,labelcolor=W)
576
- ax.set_title("📅 Papers per Month",color=W,fontsize=12,fontweight="bold",pad=10)
577
- ax.set_ylabel("Count",color=W,fontsize=9); ax.tick_params(rotation=45)
578
- ax=axes[0,1]; style(ax)
 
579
  if top_words:
580
- wds,wcts=zip(*top_words)
581
- ax.barh(list(wds),list(wcts),color=C[:len(wds)],edgecolor="#475569",lw=.6)
582
- for b,c in zip(ax.patches,wcts):
583
- ax.text(b.get_width()+.1,b.get_y()+b.get_height()/2,str(c),
584
- va="center",color=W,fontsize=8)
585
- ax.set_title("🔑 Top Keywords",color=W,fontsize=12,fontweight="bold",pad=10)
586
- ax.set_xlabel("Frequency",color=W,fontsize=9)
587
- ax=axes[0,2]; ax.set_facecolor(PNL)
588
  if sources:
589
- sl,sv=zip(*sources.items())
590
- _,txts,ats=ax.pie(sv,labels=sl,autopct="%1.0f%%",colors=C[:len(sl)],startangle=90,
591
- textprops={"color":W,"fontsize":10},
592
- wedgeprops={"edgecolor":BR,"linewidth":1.5})
 
593
  for at in ats: at.set_color(W); at.set_fontsize(9)
594
- ax.set_title("📡 Source Distribution",color=W,fontsize=12,fontweight="bold",pad=10)
595
- ax=axes[1,0]; style(ax)
596
  if top_cited:
597
- lbls=[p["title"][:35]+"" if len(p["title"])>35 else p["title"] for p in top_cited]
598
- cv=[p["citations"] for p in top_cited]
599
- ax.barh(lbls[::-1],cv[::-1],color=C[1],edgecolor="#475569",lw=.6)
600
- mx=max(cv) if cv else 1
601
- for b,c in zip(ax.patches,cv[::-1]):
602
- ax.text(b.get_width()+mx*.01,b.get_y()+b.get_height()/2,f"{c:,}",
603
- va="center",color=W,fontsize=8)
604
- ax.set_xlabel("Citations",color=W,fontsize=9)
 
605
  else:
606
- ax.text(.5,.5,"No citation data",ha="center",va="center",
607
- color="#94a3b8",fontsize=11,transform=ax.transAxes)
608
- ax.set_title("🏆 Top 10 Cited",color=W,fontsize=12,fontweight="bold",pad=10)
609
- ax=axes[1,1]; style(ax)
610
  if any(bcounts):
611
- ax.bar(blabels,bcounts,color=C[2],edgecolor="#475569",lw=.8)
612
- for b,c in zip(ax.patches,bcounts):
613
- if c>0: ax.text(b.get_x()+b.get_width()/2,b.get_height()+.1,str(c),
614
- ha="center",va="bottom",color=W,fontsize=9)
615
- ax.set_xlabel("Citation Range",color=W,fontsize=9)
616
- ax.set_ylabel("Papers",color=W,fontsize=9)
617
- ax.annotate(f"Avg {avg_cit} | Total {total_cit:,}",
618
- xy=(.98,.96),xycoords="axes fraction",
619
- ha="right",va="top",color="#94a3b8",fontsize=8)
 
620
  else:
621
- ax.text(.5,.5,"No citation data",ha="center",va="center",
622
- color="#94a3b8",fontsize=11,transform=ax.transAxes)
623
- ax.set_title("📊 Citation Distribution",color=W,fontsize=12,fontweight="bold",pad=10)
624
- ax=axes[1,2]; style(ax)
625
  if top_authors:
626
- an,ac=zip(*top_authors)
627
- ax.barh(list(an)[::-1],list(ac)[::-1],color=C[3],edgecolor="#475569",lw=.6)
628
- for b,c in zip(ax.patches,list(ac)[::-1]):
629
- ax.text(b.get_width()+.05,b.get_y()+b.get_height()/2,str(c),
630
- va="center",color=W,fontsize=8)
631
- ax.set_xlabel("Papers",color=W,fontsize=9)
632
- ax.set_title("👥 Top Authors",color=W,fontsize=12,fontweight="bold",pad=10)
633
  plt.tight_layout(pad=3)
634
- path=f"{PERSIST_DIR}/trends.png"
635
- plt.savefig(path,bbox_inches="tight",dpi=150,facecolor=BG); plt.close()
636
- top5 = sorted(cit_papers,key=lambda x:x["citations"],reverse=True)[:5]
637
- stats = (f"### 📊 إحصائيات\n\n| المؤشر | القيمة |\n|---|---|\n"
638
- f"| 📦 الإجمالي | **{len(papers)}** |\n"
639
- f"| 🆕 جديدة | **{sum(1 for p in papers if p.get('recent'))}** |\n"
640
- f"| 🔢 الاقتباسات | **{total_cit:,}** |\n"
641
- f"| 📈 متوسط | **{avg_cit}** |\n\n")
 
642
  if top5:
643
- stats += "### 🏆 الأكثر اقتباساً\n\n"
644
  for i,p in enumerate(top5,1):
645
- stats += f"{i}. [{p['title']}]({p['url']}) **{p['citations']:,}**\n\n"
 
646
  return path, stats
647
 
648
  # ================================================================
@@ -654,50 +691,53 @@ def _llm(messages, max_tokens=1200):
654
  model="llama-3.3-70b-versatile",
655
  messages=messages, temperature=0.3, max_tokens=max_tokens)
656
  return r.choices[0].message.content.strip()
657
- except Exception as e: return f"⚠️ LLM Error: {e}"
658
 
659
  def explain_paper(paper, lang="ar"):
660
  cit = paper.get("citations","N/A")
661
- if lang=="ar":
662
  return fix_ar_format(_llm([
663
- {"role":"system","content":
664
- f"أنت خبير أكاديمي يشرح الأبحاث بالعربية الفصحى.\n{AR_FORMAT_RULES}"},
665
  {"role":"user","content":
666
- f"اشرح الورقة:\nالعنوان: {paper['title']}\n"
667
- f"المؤلفون: {', '.join(paper['authors'][:3])}\n"
668
- f"التاريخ: {paper['published']} | الاقتباسات: {cit}\n"
669
- f"الملخص: {paper['abstract']}\n\n"
670
- "## 🎯 موضوع الورقة\n\n## المشكلة\n\n## 🔧 المنهجية\n\n"
671
- "## 📊 النتائج\n\n## 🌟 الأهمية\n\n## 🔗 التطبيقات"}]))
672
  return _llm([{"role":"user","content":
673
- f"Explain:\nTitle: {paper['title']}\nAuthors: {', '.join(paper['authors'][:3])}\n"
674
- f"Date: {paper['published']} | Citations: {cit}\nAbstract: {paper['abstract']}\n\n"
675
- "## 🎯 Topic\n## Problem\n## 🔧 Methodology\n## 📊 Findings\n"
676
- "## 🌟 Contribution\n## 🔗 Applications"}])
677
 
678
  def compare_papers(pa, pb, lang="ar"):
679
- body = (f"الأولى: {pa['title']} | اقتباسات: {pa.get('citations','N/A')}\n"
680
- f"{pa['abstract'][:500]}\n\n"
681
- f"الثانية: {pb['title']} | اقتباسات: {pb.get('citations','N/A')}\n"
682
- f"{pb['abstract'][:500]}")
683
- if lang=="ar":
684
  return fix_ar_format(_llm([{"role":"user","content":
685
- f"قارن بين الورقتين.\n{AR_FORMAT_RULES}\n\n{body}\n\n"
686
- "## 🎯 الهدف\n\n## 🔧 المنهجية\n\n## 📊 النتائج\n\n"
687
- "## 💪 القوة\n\n## ⚠️ القيود\n\n## 🏆 الخلاصة"}], 1400))
688
  return _llm([{"role":"user","content":
689
- f"Compare:\n{body}\n\n## Topic\n## Methodology\n## Results\n"
690
- "## Strengths\n## Limits\n## Verdict"}], 1400)
691
 
692
  def summarize_papers(papers, topic, lang="ar"):
693
- text = "".join(f"{i}. {p['title']} ({p['published']}): {p['abstract'][:300]}...\n\n"
694
- for i,p in enumerate(papers[:8],1))
695
- if lang=="ar":
 
 
696
  return fix_ar_format(_llm([{"role":"user","content":
697
- f"نظرة عامة أكاديمية حول \"{topic}\".\n{AR_FORMAT_RULES}\n\n{text}\n\n"
698
- "## 1. الاتجاهات\n\n## 2. أبرز الأوراق\n\n## 3. المواضيع المشتركة\n\n## 4. الفجوات"}], 900))
 
 
699
  return _llm([{"role":"user","content":
700
- f"Academic overview of \"{topic}\":\n{text}\n\n"
701
  "## Trends\n## Key Papers\n## Themes\n## Gaps"}], 900)
702
 
703
  def generate_bibliography(papers, style="APA"):
@@ -706,38 +746,44 @@ def generate_bibliography(papers, style="APA"):
706
  auth = ", ".join(p["authors"][:6]) + (" et al." if len(p["authors"])>6 else "")
707
  year = p["published"][:4] if p["published"] not in ("N/A","") else "n.d."
708
  t,u = p["title"], p["url"]
709
- if style=="APA": entries.append(f"{i}. {auth} ({year}). *{t}*. {u}")
710
- elif style=="IEEE":
 
711
  ae = " and ".join(p["authors"][:3]) + (" et al." if len(p["authors"])>3 else "")
712
- entries.append(f'[{i}] {ae}, "{t}," {year}. [Online]: {u}')
713
- elif style=="Chicago": entries.append(f'{i}. {auth}. "{t}." ({year}). {u}')
 
714
  else:
715
  key = re.sub(r"\W","", (p["authors"][0].split()[-1]
716
- if p["authors"] else "Auth"))+year
717
- entries.append(f"@article{{{key}{i},\n title={{{t}}},\n "
718
- f"author={{{auth}}},\n year={{{year}}},\n url={{{u}}}\n}}")
 
719
  bib = "\n\n".join(entries)
720
- path = f"{PERSIST_DIR}/bibliography_{style}.txt"
721
- with open(path,"w",encoding="utf-8") as f: f.write(bib)
722
  return bib, path
723
 
724
  def chat_about_papers(question, history):
725
  if not PAPERS:
726
- return ("⚠️ يرجى جلب الأوراق أولاً."
727
- if detect_lang(question)=="ar" else "⚠️ Fetch papers first.")
728
- lang=detect_lang(question); relevant=search_papers(question, top_k=4); context=""
 
 
729
  if relevant:
730
- context = "الأوراق ذات الصلة:\n\n" if lang=="ar" else "Relevant papers:\n\n"
731
  for r in relevant:
732
- p=r["paper"]
733
- cit=f" | {p['citations']:,} citations" if p.get("citations") else ""
734
- context += f"**{p['title']}** ({p['published']}){cit}\n{p['abstract'][:400]}\n🔗 {p['url']}\n\n"
735
- sys_msg = (f"أنت مساعد بحثي. أجب بالعربية الفصحى.\n{AR_FORMAT_RULES}"
736
- if lang=="ar" else "You are an academic assistant. Answer in English.")
 
737
  msgs = [{"role":"system","content":sys_msg}]
738
  for t in history[-4:]: msgs.append({"role":t["role"],"content":t["content"]})
739
  msgs.append({"role":"user","content":
740
- f"{context}\nسؤال: {question}" if context else question})
741
  out = _llm(msgs, 800)
742
  return fix_ar_format(out) if lang=="ar" else out
743
 
@@ -745,9 +791,10 @@ def text_to_audio(text, lang="ar"):
745
  clean = clean_md(text)
746
  if not clean: return None
747
  try:
748
- tts=gTTS(text=clean, lang=lang, slow=False)
749
- path=f"{PERSIST_DIR}/audio_{lang}.mp3"; tts.save(path); return path
750
- except Exception as e: print(f"TTS: {e}"); return None
 
751
 
752
  # ================================================================
753
  # GRADIO HANDLERS
@@ -755,17 +802,17 @@ def text_to_audio(text, lang="ar"):
755
  def gr_fetch(query, category_label, max_results, days_back, source_choice,
756
  progress=gr.Progress()):
757
  global ACTIVE_PAPERS
758
- progress(0.05, desc="🌐 Connecting...")
759
  papers, warn = [], ""
760
- if source_choice in ("arXiv","كلاهما / Both"):
761
- progress(0.15, desc="📡 Fetching arXiv...")
762
  papers += fetch_arxiv_papers(query, CATEGORIES.get(category_label,""),
763
  int(max_results), int(days_back),
764
  sort_by="submittedDate")
765
- if source_choice in ("CrossRef","كلاهما / Both"):
766
- progress(0.35, desc="📚 Fetching CrossRef...")
767
  cr = fetch_crossref_papers(query, category_label, int(max_results), int(days_back))
768
- if not cr: warn = "\n\n> ⚠️ CrossRef: لا نتائج — جرب موضوعاً مختلفاً."
769
  papers += cr
770
  seen, unique = set(), []
771
  for p in papers:
@@ -773,30 +820,32 @@ def gr_fetch(query, category_label, max_results, days_back, source_choice,
773
  if key not in seen: seen.add(key); unique.append(p)
774
  papers = unique
775
  if not papers:
776
- return (" لا توجد نتائج."+warn,
777
- gr.update(choices=[],value=None), gr.update(choices=[],value=None),
778
- gr.update(choices=[],value=None), gr.update(choices=[],value=None), "❌ 0")
779
- progress(0.60, desc="📊 جلب الاقتباسات (3-layer)...")
 
780
  papers = enrich_citations(papers)
781
- progress(0.85, desc="🔢 FAISS indexing...")
782
  build_papers_index(papers)
783
  ACTIVE_PAPERS = list(papers)
784
  tbl, choices = build_table(papers)
785
  recent = sum(1 for p in papers if p.get("recent"))
786
  tot_cit = sum(p.get("citations") or 0 for p in papers)
787
  zero_cit = sum(1 for p in papers if (p.get("citations") or 0)==0)
788
- note = (f"\n\n> ℹ️ **{zero_cit}** ورقة بدون اقتباسات (جديدة أو غير مفهرسة)."
789
  if zero_cit else "")
790
- md = (f"## تم جلب **{len(papers)}** ورقة\n\n"
791
- f"🆕 جديدة: **{recent}** &nbsp;|&nbsp; 📊 الاقتباسات: **{tot_cit:,}**"
792
- +warn+note+f"\n\n---\n\n{tbl}")
 
793
  upd = gr.update(choices=choices, value=choices[0] if choices else None)
794
  progress(1.0)
795
- return md, upd, upd, upd, upd, f"✅ {len(papers)} | 🆕 {recent} | 📊 {tot_cit:,}"
796
 
797
  def gr_filter_papers(year_from, year_to, cit_min, cit_max, sort_by):
798
  global ACTIVE_PAPERS
799
- if not PAPERS: return "⚠️ اجلب الأوراق أولاً.", gr.update(), "⚠️"
800
  filtered = []
801
  for p in PAPERS:
802
  try:
@@ -806,114 +855,121 @@ def gr_filter_papers(year_from, year_to, cit_min, cit_max, sort_by):
806
  cit = int(p.get("citations") or 0)
807
  if cit < int(cit_min) or cit > int(cit_max): continue
808
  filtered.append(p)
809
- if "Newest" in sort_by or "الأحدث" in sort_by:
810
- filtered.sort(key=lambda x:x["published"], reverse=True)
811
- elif "Oldest" in sort_by or "الأقدم" in sort_by:
812
- filtered.sort(key=lambda x:x["published"])
813
- elif "Most" in sort_by or "الأكثر" in sort_by:
814
- filtered.sort(key=lambda x:x.get("citations") or 0, reverse=True)
815
- elif "Least" in sort_by or "الأقل" in sort_by:
816
- filtered.sort(key=lambda x:x.get("citations") or 0)
817
  if not filtered:
818
- ACTIVE_PAPERS=[]
819
- return " لا توجد أوراق تطابق الفلتر.", gr.update(choices=[],value=None), "0"
820
  ACTIVE_PAPERS = list(filtered)
821
  tbl, choices = build_table(filtered)
822
  tot = sum(p.get("citations") or 0 for p in filtered)
823
- md = (f"## 🔽 **{len(filtered)}** من **{len(PAPERS)}** &nbsp;|&nbsp; "
824
- f"📅 {year_from}–{year_to} &nbsp;|&nbsp; "
825
- f"📊 {cit_min}–{cit_max} &nbsp;|&nbsp; مجموع {tot:,}\n\n---\n\n{tbl}")
826
- return md, gr.update(choices=choices, value=choices[0] if choices else None), \
827
- f"🔽 {len(filtered)}/{len(PAPERS)}"
828
 
829
  def gr_search_fetched(query):
830
- if not query or not query.strip(): return "⚠️ أدخل كلمة بحث."
831
- if not PAPERS: return "⚠️ اجلب الأوراق أولاً."
832
  results = search_papers(query.strip(), top_k=8)
833
- if not results: return f" لا نتائج لـ `{query}`."
834
- md = f"## 🔍 `{query}` — {len(results)} نتائج\n\n"
 
835
  for r in results:
836
- p,s = r["paper"],r["score"]
837
- bar = "🟩"*round(s*10)+"⬜"*(10-round(s*10))
838
- cit = f" | {cit_badge(p.get('citations'))}" if p.get("citations") else ""
839
- md += (f"### {bar} `{s*100:.0f}%` — {p['title']}\n\n"
840
- f"👥 {', '.join(p['authors'][:2])} | 📅 {p['published']}{cit}"
841
- f" | {p.get('source','')}\n\n"
842
- f"> {p['abstract'][:350]}...\n\n"
843
- f"🔗 [View]({p['url']})"
844
- +(f" 📥 [PDF]({p['pdf_url']})" if p.get("pdf_url") else "")
845
- +"\n\n---\n\n")
846
  return md
847
 
848
  def _get_paper(choice):
849
  pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
850
- try: return pool[int(choice.split(".")[0])-1]
851
  except: return None
852
 
853
  def gr_explain(choice, lang_choice):
854
- if not choice: return "⚠️ اجلب الأوراق ثم اختر ورقة."
855
  paper = _get_paper(choice)
856
- if not paper: return "⚠️ خطأ في الاختيار."
857
- lang = "ar" if "عربي" in lang_choice else "en"
858
- header = (f"# 📄 {paper['title']}\n\n"
859
- f"**👥** {', '.join(paper['authors'])} &nbsp;|&nbsp; "
860
- f"**📅** {paper['published']} &nbsp;|&nbsp; "
861
- f"**📊** {cit_badge(paper.get('citations'))} &nbsp;|&nbsp; "
862
- f"**📡** {paper.get('source','arXiv')}\n\n"
863
- f"🔗 [View Paper]({paper['url']})"
864
- +(f" 📥 [PDF]({paper['pdf_url']})" if paper.get("pdf_url") else "")
865
- f"\n\n---\n\n> {paper['abstract']}\n\n---\n\n"
866
- f"## 🤖 {'الشرح' if lang=='ar' else 'Explanation'} Llama 3.3 70B\n\n")
 
 
 
 
867
  return header + explain_paper(paper, lang)
868
 
869
  def gr_audio(txt, lang_choice):
870
- if not txt or len(txt)<50: return None
871
- return text_to_audio(txt, "ar" if "عربي" in lang_choice else "en")
872
 
873
  def gr_save_fav(choice):
874
- if not choice: return "⚠️ اختر ورقة أولاً."
875
  paper = _get_paper(choice)
876
- return save_favorite(paper) if paper else "⚠️ خطأ."
877
 
878
  def gr_show_favs():
879
  favs = load_favorites()
880
- if not favs: return "_لا توجد أوراق محفوظة._"
881
- lines = [f"⭐ **{p['title']}**\n👥 {p['authors'][0] if p['authors'] else 'N/A'} | "
882
- f"📅 {p['published']} | 📡 {p.get('source','')} | "
883
- f"📊 {cit_badge(p.get('citations'))} | 🔗 [رابط]({p['url']})"
 
 
 
884
  for p in favs]
885
- return f"### المفضلة {len(favs)} ورقة\n\n" + "\n\n---\n\n".join(lines)
 
886
 
887
  def gr_compare(ca, cb, lc):
888
- if not ca or not cb: return "⚠️ اختر ورقتين أولاً."
889
  pa = _get_paper(ca); pb = _get_paper(cb)
890
- if not pa or not pb: return "⚠️ خطأ في الاختيار."
891
- if pa["id"]==pb["id"]: return "⚠️ اختر ورقتين مختلفتين."
892
- return compare_papers(pa, pb, "ar" if "عربي" in lc else "en")
893
 
894
  def gr_overview(query, lc):
895
- if not PAPERS: return "⚠️ اجلب الأوراق أولاً."
896
  pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
897
- return (f"## 🌐 نظرة عامة\n\n"
898
- f"{summarize_papers(pool, query or 'research', 'ar' if 'عربي' in lc else 'en')}")
 
899
 
900
  def gr_trends():
901
- if not PAPERS: return None, "⚠️ اجلب الأوراق أولاً."
902
  return analyze_trends(ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS)
903
 
904
  def gr_bib(style, progress=gr.Progress()):
905
- if not PAPERS: return "⚠️ اجلب الأوراق أولاً.", None
906
- progress(0.5, desc="توليد...")
907
  pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
908
  text, path = generate_bibliography(pool, style)
909
  progress(1.0)
910
- return f"```\n{text[:3000]+('...' if len(text)>3000 else '')}\n```", path
 
911
 
912
  def gr_chat_fn(message, history):
913
  if not message.strip(): return history, ""
914
  hd = []
915
  for pair in history:
916
- if pair[0]: hd.append({"role":"user","content":pair[0]})
917
  if pair[1]: hd.append({"role":"assistant","content":pair[1]})
918
  history.append((message, chat_about_papers(message, hd)))
919
  return history, ""
@@ -925,185 +981,185 @@ CSS = """
925
  footer{display:none!important}
926
  h1{text-align:center}
927
  .status-bar{font-size:.85rem;color:#94a3b8;padding:2px 0}
928
- .legend{font-size:.8rem;color:#cbd5e1;background:#1e293b;border-radius:8px;padding:6px 14px;margin-bottom:6px}
929
- .filter-box{background:#1e293b;border-radius:10px;padding:12px 16px;margin-top:8px}
930
- .gs-box{background:#1e293b;border-radius:10px;padding:14px 18px;margin-bottom:10px;border:1px solid #334155}
 
 
 
931
  """
932
 
933
  with gr.Blocks(
934
  theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
935
- title="🔬 Paper Discovery v7.4", css=CSS
936
  ) as demo:
937
 
938
- gr.Markdown("# 🔬 Scientific Paper Discovery v7.4\n**arXiv · CrossRef · Llama-3.3-70B · FAISS**")
939
- gr.Markdown(
940
- "📊 **الاقتباسات:** 🥇 ≥1,000 &nbsp;|&nbsp; 🏆 ≥100 &nbsp;|&nbsp; ⭐ ≥10 &nbsp;|&nbsp; 📄 <10 &nbsp;|&nbsp; · = 0",
941
- elem_classes="legend")
942
- status_bar = gr.Markdown("_لم يتم جلب أوراق بعد_", elem_classes="status-bar")
943
 
944
  with gr.Tabs():
945
 
946
- # ── TAB 1: BROWSE / FETCH ──────────────────────────
947
- with gr.Tab("🔍 البحث / Browse"):
948
  with gr.Row():
949
  with gr.Column(scale=3):
950
- t_query = gr.Textbox(label="🔎 الموضوع",
951
  placeholder="ARIMA, inflation, LLM...",
952
  value="economic forecasting")
953
- t_category = gr.Dropdown(label="📂 التصنيف",
954
  choices=list(CATEGORIES.keys()),
955
- value="📊 الاقتصاد / Economics")
956
- t_source = gr.Radio(label="📡 المصدر",
957
- choices=["arXiv","CrossRef","كلاهما / Both"],
958
  value="arXiv")
959
  with gr.Column(scale=1):
960
- t_max = gr.Slider(5, 50, value=15, step=5, label="📊 عدد الأوراق")
961
- t_days = gr.Slider(1, 1500, value=365, step=30, label="📅 آخر N يوم")
962
- btn_fetch = gr.Button("🚀 جلب الأوراق", variant="primary", size="lg")
963
- papers_table_md = gr.Markdown("_ستظهر النتائج هنا._")
964
- paper_selector = gr.Dropdown(label="📄 اختر ورقة", choices=[], interactive=True)
965
  with gr.Group(elem_classes="filter-box"):
966
- gr.Markdown("### 🔽 فلترة وترتيب")
967
  with gr.Row():
968
- f_year_from = gr.Slider(2000,2026,value=2020,step=1,label="📅 من سنة")
969
- f_year_to = gr.Slider(2000,2026,value=2026,step=1,label="📅 إلى سنة")
970
  with gr.Row():
971
- f_cit_min = gr.Slider(0,5000,value=0, step=5,label="📊 اقتباسات من")
972
- f_cit_max = gr.Slider(0,5000,value=5000,step=5,label="📊 اقتباسات إلى")
973
  with gr.Row():
974
  f_sort = gr.Dropdown(choices=SORT_CHOICES,
975
- value=SORT_CHOICES[2],label="🔃 الترتيب",scale=3)
976
- btn_filter = gr.Button("✅ تطبيق",variant="primary",scale=1)
977
- gr.Markdown("---\n### 🔍 بحث دلالي داخلي (FAISS)")
978
  with gr.Row():
979
- search_in_box = gr.Textbox(label="🔍 ابحث في الأوراق المحملة",
980
  placeholder="ARIMA, transformer...",scale=5)
981
- btn_search_in = gr.Button("بحث 🔍",scale=1)
982
  search_in_out = gr.Markdown()
983
 
984
- # ── TAB 2: GLOBAL SEARCH ─────────────────────────
985
- with gr.Tab("🌐 بحث عالمي / Global Search"):
986
  gr.Markdown(
987
- "### 🌐 ابحث عن أي ورقة بالعنوان أو الكلمات المفتاحية\n\n"
988
- "> يستخدم **arXiv relevance** و **CrossRef title search** "
989
- "للعثور على الورقة الصحيحة مباشرة."
990
  )
991
  with gr.Group(elem_classes="gs-box"):
992
  with gr.Row():
993
  gs_query = gr.Textbox(
994
- label="🔎 العنوان أو الكلمات المفتاحية",
995
- placeholder="Attention is All You Need | ARIMA inflation Algeria ...",
996
  scale=4)
997
- gs_source = gr.Radio(
998
- label="📡 المصدر",
999
- choices=["arXiv","CrossRef","كلاهما / Both"],
1000
- value="كلاهما / Both", scale=2)
1001
- gs_max = gr.Slider(5,30,value=10,step=5,label="📊 عدد النتائج",scale=1)
1002
- btn_gs = gr.Button("🔎 بحث الآن", variant="primary", size="lg")
1003
- gs_out = gr.Markdown("_أدخل عنوان ورقة أو كلمات مفتاحية..._")
1004
 
1005
  # ── TAB 3: EXPLAIN ─────────────────────────────────
1006
- with gr.Tab("📖 الشرح / Explain"):
1007
  with gr.Row():
1008
- paper_sel2 = gr.Dropdown(label="📄 اختر الورقة",
1009
  choices=[], interactive=True, scale=4)
1010
- lang_exp = gr.Radio(LANG_CHOICES, value=LANG_CHOICES[0],
1011
- label="🌐 اللغة", scale=1)
1012
  with gr.Row():
1013
- btn_explain = gr.Button("📖 اشرح", variant="primary")
1014
- btn_fav = gr.Button(" حفظ")
1015
- btn_audio = gr.Button("🔊 استمع")
1016
- btn_export_pdf = gr.Button("📄 تصدير PDF", variant="secondary")
1017
  with gr.Row():
1018
  fav_status = gr.Markdown()
1019
  pdf_status = gr.Markdown()
1020
- explanation_out = gr.Markdown("_اجلب الأوراق واختر ورقة._")
1021
- audio_out = gr.Audio(label="🔊", type="filepath")
1022
- pdf_out = gr.File(label="📄 تحميل PDF")
1023
 
1024
  # ── TAB 4: COMPARE ─────────────────────────────────
1025
- with gr.Tab("⚖️ المقارنة / Compare"):
1026
  with gr.Row():
1027
- cmp_a = gr.Dropdown(label="📄 الأولى", choices=[], interactive=True)
1028
- cmp_b = gr.Dropdown(label="📄 الثانية", choices=[], interactive=True)
1029
- lang_cmp = gr.Radio(LANG_CHOICES, value=LANG_CHOICES[0],
1030
- label="🌐 اللغة", scale=1)
1031
- btn_compare = gr.Button("⚖️ قارن الآن", variant="primary")
1032
- compare_out = gr.Markdown("_اختر ورقتين._")
1033
 
1034
  # ── TAB 5: OVERVIEW ────────────────────────────────
1035
- with gr.Tab("🌐 نظرة عامة"):
1036
  with gr.Row():
1037
- lang_ov = gr.Radio(LANG_CHOICES, value=LANG_CHOICES[0],
1038
- label="🌐 اللغة", scale=1)
1039
- btn_overview = gr.Button("🤖 توليد التقرير", variant="primary", scale=3)
1040
- overview_out = gr.Markdown("_اجلب الأوراق أولاً._")
1041
 
1042
  # ── TAB 6: TRENDS ──────────────────────────────────
1043
- with gr.Tab("📊 الاتجاهات / Trends"):
1044
- btn_trends = gr.Button("📊 تحليل الاتجاهات", variant="primary", size="lg")
1045
- trend_chart = gr.Image(label="📊 لوحة الاتجاهات", type="filepath")
1046
- trend_stats = gr.Markdown("_اجلب الأوراق أولاً._")
1047
 
1048
  # ── TAB 7: BIBLIOGRAPHY ────────────────────────────
1049
- with gr.Tab("📚 المراجع"):
1050
- bib_style = gr.Radio(["APA","IEEE","Chicago","BibTeX"], value="APA",
1051
- label="📐 النمط")
1052
- btn_bib = gr.Button("📚 توليد المراجع", variant="primary")
1053
  bib_out = gr.Markdown()
1054
- bib_file = gr.File(label="📥 تحميل")
1055
 
1056
  # ── TAB 8: FAVORITES ───────────────────────────────
1057
- with gr.Tab("⭐ المفضلة"):
1058
- btn_show_fav = gr.Button("📋 عرض المفضلة")
1059
- favs_md = gr.Markdown("_اضغط عرض._")
1060
- btn_export_fav = gr.Button("📥 تصدير CSV", variant="secondary")
1061
- fav_csv_file = gr.File(label="📄 CSV")
1062
 
1063
  # ── TAB 9: AUTO-FETCH ──────────────────────────────
1064
- with gr.Tab("🔔 تحديث تلقائي"):
1065
  with gr.Row():
1066
- auto_q = gr.Textbox(label="🔎 الموضوع",
1067
  value="economic forecasting", scale=3)
1068
- auto_cat = gr.Dropdown(label="📂 التصنيف",
1069
  choices=list(CATEGORIES.keys()),
1070
- value="📊 الاقتصاد / Economics", scale=2)
1071
  auto_interval = gr.Slider(5,120,value=60,step=5,
1072
- label="⏱️ كل (دقيقة)",scale=1)
1073
  with gr.Row():
1074
- btn_start_auto = gr.Button("▶️ بدء", variant="primary")
1075
- btn_stop_auto = gr.Button("⏹️ إيقاف", variant="stop")
1076
- btn_refresh_log = gr.Button("🔄 السجل")
1077
  auto_status = gr.Markdown()
1078
- auto_log_md = gr.Markdown("_لا يوجد سجل._")
1079
 
1080
  # ── TAB 10: CHAT ───────────────────────────────────
1081
- with gr.Tab("💬 محادثة / Chat"):
1082
- chatbot_ui = gr.Chatbot(label="مساعد الأبحاث", height=480,
1083
- bubble_full_width=False)
1084
  with gr.Row():
1085
- chat_in = gr.Textbox(label="سؤالك", scale=5,
1086
- placeholder="🇸🇦 ما أبرز النتائج؟ | 🇬🇧 Key findings?")
1087
- btn_send = gr.Button("إرسال ✉️", variant="primary", scale=1)
1088
- btn_clear = gr.Button("🗑️ مسح", size="sm")
1089
 
1090
  # ── TAB 11: ABOUT ──────────────────────────────────
1091
- with gr.Tab("ℹ️ حول"):
1092
  gr.Markdown("""
1093
- ## 🔬 Scientific Paper Discovery — v7.4
1094
-
1095
- ### ✅ جديد في v7.4
1096
- | الميزة | التفاصيل |
1097
- |---|---|
1098
- | 🌐 بحث عالمي محسّن | يستخدم `sort_by="relevance"` + `ti:"..."` للعثور على الورقة بالعنوان الدقيق |
1099
- | 📄 تصدير PDF | شرح كامل بتنسيق احترافي بزر واحد |
1100
 
1101
- ### 🔧 مقارنة أوضاع البحث
1102
- | الوضع | يستخدم | مناسب لـ |
1103
  |---|---|---|
1104
- | 🔍 Browse | `sortBy=submittedDate` | استكشاف أحدث أوراق موضوع |
1105
- | 🌐 Global Search | `sortBy=relevance` + `ti:` | البحث عن ورقة بعنوانها |
1106
- | 🔍 FAISS (داخلي) | Cosine similarity | البحث في الأوراق المحملة |
 
 
 
 
 
1107
  """)
1108
 
1109
  # ── WIRING ──────────────────────────────────────────────
@@ -1125,17 +1181,17 @@ with gr.Blocks(
1125
  btn_gs.click(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
1126
  gs_query.submit(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
1127
 
1128
- btn_explain.click(gr_explain, inputs=[paper_sel2, lang_exp], outputs=[explanation_out])
1129
- btn_fav.click(gr_save_fav, inputs=[paper_sel2], outputs=[fav_status])
1130
- btn_audio.click(gr_audio, inputs=[explanation_out, lang_exp], outputs=[audio_out])
1131
  btn_export_pdf.click(gr_export_pdf,
1132
  inputs=[explanation_out, paper_sel2],
1133
  outputs=[pdf_out, pdf_status])
1134
 
1135
- btn_compare.click(gr_compare, inputs=[cmp_a, cmp_b, lang_cmp], outputs=[compare_out])
1136
- btn_overview.click(gr_overview, inputs=[t_query, lang_ov], outputs=[overview_out])
1137
  btn_trends.click(gr_trends, outputs=[trend_chart, trend_stats])
1138
- btn_bib.click(gr_bib, inputs=[bib_style], outputs=[bib_out, bib_file])
1139
 
1140
  btn_show_fav.click(gr_show_favs, outputs=[favs_md])
1141
  btn_export_fav.click(gr_export_fav, outputs=[fav_csv_file])
 
1
  # ================================================================
2
+ # Scientific Paper Discovery Bot v7.4 — SyntaxError FIXED
 
 
3
  # ================================================================
4
  import os, re, time, json, pickle, threading
5
  import requests
 
30
  S2_API_KEY = os.environ.get("S2_API_KEY", "")
31
  groq_client = Groq(api_key=GROQ_API_KEY)
32
 
33
+ print("Loading embedder...")
34
  embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
35
  _ = embedder.encode(["warmup"])
36
+ print("Embedder ready!")
37
 
38
  PAPERS = []
39
  ACTIVE_PAPERS = []
 
43
  CURRENT_YEAR = datetime.now().year
44
 
45
  PERSIST_DIR = "/tmp"
46
+ FAVORITES_PATH = PERSIST_DIR + "/favorites.pkl"
47
+ SEEN_IDS_PATH = PERSIST_DIR + "/seen_ids.json"
48
  os.makedirs(PERSIST_DIR, exist_ok=True)
49
 
50
  CATEGORIES = {
51
+ "All": "",
52
+ "Economics": "econ",
53
+ "Quant Fin": "q-fin",
54
+ "AI": "cs.AI",
55
+ "ML": "cs.LG",
56
+ "NLP": "cs.CL",
57
+ "Statistics": "stat",
58
+ "Biology": "q-bio",
59
+ "Physics": "physics",
60
+ "Math": "math",
61
+ "CS": "cs",
62
  }
63
  CROSSREF_SUBJECTS = {
64
+ "All": "",
65
+ "Economics": "economics",
66
+ "Quant Fin": "finance",
67
+ "AI": "artificial intelligence",
68
+ "ML": "machine learning",
69
+ "NLP": "natural language processing",
70
+ "Statistics": "statistics",
71
+ "Biology": "biology",
72
+ "Physics": "physics",
73
+ "Math": "mathematics",
74
+ "CS": "computer science",
75
  }
76
+ LANG_CHOICES = ["Arabic", "English"]
77
+ SORT_CHOICES = ["Newest", "Oldest", "Most Cited", "Least Cited"]
78
+ AR_RULES = """
79
+ - ابدأ كل قسم بـ ## مع سطر فارغ قبله وبعده
80
+ - اكتب كل قسم في فقرة 3-4 جمل بالعربية الفصحى
 
 
 
 
 
 
81
  - لا تكرر عنوان القسم داخل النص
 
82
  """
83
 
84
  # ================================================================
85
  # HELPERS
86
  # ================================================================
87
  def detect_lang(text):
88
+ try:
89
+ return "ar" if detect(str(text)[:300]).startswith("ar") else "en"
90
+ except:
91
+ return "en"
92
 
93
  def clean_md(text):
94
  text = re.sub(r"[#*`>\[\]!_~]", "", text)
 
102
  def cit_badge(n):
103
  if n is None or n == "": return "—"
104
  n = int(n)
105
+ if n >= 1000: return "🥇 " + "{:,}".format(n)
106
+ if n >= 100: return "🏆 " + "{:,}".format(n)
107
+ if n >= 10: return "⭐ " + "{:,}".format(n)
108
+ if n > 0: return "📄 " + str(n)
109
  return "·"
110
 
111
  def build_table(papers_list):
112
+ rows = "| # | Title | Author | Date | Citations | Source |\n"
113
  rows += "|---|---|---|---|---|---|\n"
114
  choices = []
115
  for i, p in enumerate(papers_list):
116
+ first = p["authors"][0] if p["authors"] else "N/A"
117
+ badge = "NEW" if p.get("recent") else "📄"
118
+ rows += "| {} | {} {} | {} | {} | {} | {} |\n".format(
119
+ i+1, badge, p["title"], first,
120
+ p["published"], cit_badge(p.get("citations")),
121
+ p.get("source","arXiv"))
122
+ choices.append("{}. {}".format(i+1, p["title"]))
123
  return rows, choices
124
 
125
  def s2_headers():
126
  h = {"User-Agent": "ScientificPaperBot/7.4"}
127
+ if S2_API_KEY:
128
+ h["x-api-key"] = S2_API_KEY
129
  return h
130
 
131
  def cr_headers():
132
  return {"User-Agent": "ScientificPaperBot/7.4 (mailto:researcher@example.com)"}
133
 
134
  # ================================================================
135
+ # CrossRef date parser — rejects garbage years
136
  # ================================================================
137
+ def parse_crossref_date(item):
138
  for field in ["issued", "published", "published-print", "published-online", "created"]:
139
  dp = (item.get(field) or {}).get("date-parts", [[]])
140
  if not dp or not dp[0]: continue
 
144
  if not (1900 <= year <= CURRENT_YEAR + 1): continue
145
  month = max(1, min(12, int(pts[1]) if len(pts) >= 2 else 1))
146
  day = max(1, min(31, int(pts[2]) if len(pts) >= 3 else 1))
147
+ return "{:04d}-{:02d}-{:02d}".format(year, month, day)
148
  except (ValueError, TypeError, IndexError):
149
  continue
150
  return "N/A"
 
170
  if paper["id"] not in {p["id"] for p in favs}:
171
  favs.append(paper)
172
  with open(FAVORITES_PATH, "wb") as f: pickle.dump(favs, f)
173
+ return "Saved: " + paper["title"]
174
+ return "Already saved."
175
 
176
  def export_favorites_csv():
177
  favs = load_favorites()
178
  if not favs: return None
179
+ df = pd.DataFrame([{
180
+ "Title": p["title"],
181
+ "Authors": ", ".join(p["authors"][:3]),
182
+ "Date": p["published"],
183
+ "Citations": p.get("citations","N/A"),
184
+ "URL": p["url"],
185
+ "Source": p.get("source","arXiv")
186
+ } for p in favs])
187
+ path = PERSIST_DIR + "/favorites.csv"
188
  df.to_csv(path, index=False, encoding="utf-8-sig")
189
  return path
190
 
191
  def gr_export_fav(): return export_favorites_csv()
192
 
193
  # ================================================================
194
+ # PDF EXPORT
195
  # ================================================================
196
  def export_explanation_pdf(explanation_text, paper_title="paper"):
197
  if not explanation_text or len(explanation_text) < 30: return None
198
  safe = re.sub(r"[^\w\s-]", "", paper_title)[:50].strip().replace(" ", "_")
199
+ path = PERSIST_DIR + "/explanation_" + safe + ".pdf"
200
  doc = SimpleDocTemplate(path, pagesize=A4,
201
  rightMargin=2*cm, leftMargin=2*cm,
202
+ topMargin=2*cm, bottomMargin=2*cm)
203
+ styles = getSampleStyleSheet()
204
+ h2_style = ParagraphStyle("H2", parent=styles["Heading2"],
205
+ fontSize=11, textColor=colors.HexColor("#2563eb"),
206
+ spaceBefore=14, spaceAfter=6)
207
+ bd_style = ParagraphStyle("BD", parent=styles["Normal"],
208
+ fontSize=10, leading=16, spaceAfter=8)
209
+ mt_style = ParagraphStyle("MT", parent=styles["Normal"],
210
+ fontSize=9, textColor=colors.HexColor("#64748b"))
 
211
  story = []
212
  for line in explanation_text.split("\n"):
213
  line = line.strip()
214
  if not line: story.append(Spacer(1, 6)); continue
215
  clean = re.sub(r"\*\*(.+?)\*\*", r"\1", line)
216
+ clean = re.sub(r"\*(.+?)\*", r"\1", clean)
217
+ clean = re.sub(r"`(.+?)`", r"\1", clean)
218
+ clean = re.sub(r"^#{1,6}\s*", "", clean)
219
  clean = re.sub(r"[🎯❓🔧📊🌟🔗📄👥📅📡🤖#*_~]", "", clean).strip()
220
  if not clean: continue
221
  if line.startswith("## ") or line.startswith("# "):
 
223
  color=colors.HexColor("#e2e8f0"), spaceAfter=4))
224
  story.append(Paragraph(clean, h2_style))
225
  elif line.startswith(">"):
226
+ q_st = ParagraphStyle("Q", parent=styles["Normal"],
227
+ fontSize=9, leftIndent=20,
228
+ textColor=colors.HexColor("#475569"), leading=14)
229
+ story.append(Paragraph(
230
+ re.sub(r"[🎯❓🔧📊🌟🔗📄👥📅📡🤖#*_~]","",line.lstrip(">").strip()),
231
+ q_st))
232
  else:
233
+ story.append(Paragraph(clean, bd_style))
234
+ story += [
235
+ Spacer(1, 20),
236
+ HRFlowable(width="100%", thickness=0.5, color=colors.HexColor("#e2e8f0")),
237
+ Paragraph("Generated by Paper Discovery v7.4 — " +
238
+ datetime.now().strftime("%Y-%m-%d %H:%M"), mt_style)
239
+ ]
240
  try:
241
  doc.build(story); return path
242
  except Exception as e:
243
+ print("PDF error: " + str(e)); return None
244
 
245
  def gr_export_pdf(explanation_text, choice):
246
  if not explanation_text or len(explanation_text) < 50:
247
+ return None, "Explain a paper first."
248
  title = choice.split(". ", 1)[-1] if choice else "paper"
249
  path = export_explanation_pdf(explanation_text, title)
250
+ return (path, "PDF ready!") if path else (None, "PDF failed.")
251
 
252
  # ================================================================
253
  # SOURCE 1 — arXiv
254
+ # KEY FIX: sort_by parameter
255
+ # Browse → "submittedDate" latest papers
256
+ # Global → "relevance" exact title match
257
  # ================================================================
258
  def fetch_arxiv_papers(query, category, max_results=20, days_back=365,
259
  sort_by="submittedDate"):
260
  parts = []
 
261
  words = query.strip().split()
262
  if len(words) >= 3 and sort_by == "relevance":
263
+ parts.append('ti:"' + query.strip() + '"')
264
  elif query.strip():
265
+ parts.append("all:" + query.strip())
266
  if category.strip():
267
+ parts.append("cat:" + category.strip())
268
+ sq = " AND ".join(parts) if parts else "all:machine learning"
269
+ params = {
270
+ "search_query": sq,
271
+ "start": 0,
272
+ "max_results": max_results,
273
+ "sortBy": sort_by,
274
+ "sortOrder": "descending",
275
+ }
276
  try:
277
  resp = requests.get("http://export.arxiv.org/api/query", params=params, timeout=30)
278
  resp.raise_for_status()
279
+ except Exception as e:
280
+ print("arXiv error: " + str(e)); return []
281
 
282
  ns_a = "http://www.w3.org/2005/Atom"
283
  ns_x = "http://arxiv.org/schemas/atom"
284
  root = ET.fromstring(resp.content)
285
  cutoff = datetime.now() - timedelta(days=days_back)
286
  papers = []
287
+ for entry in root.findall("{" + ns_a + "}entry"):
288
  try:
289
+ pid = entry.find("{" + ns_a + "}id").text.split("/abs/")[-1].strip()
290
+ title = entry.find("{" + ns_a + "}title").text.strip().replace("\n"," ")
291
+ abstract = entry.find("{" + ns_a + "}summary").text.strip().replace("\n"," ")
292
+ published = entry.find("{" + ns_a + "}published").text[:10]
293
+ authors = [a.find("{" + ns_a + "}name").text
294
+ for a in entry.findall("{" + ns_a + "}author")]
295
  cats = set()
296
+ pc = entry.find("{" + ns_x + "}primary_category")
297
  if pc is not None: cats.add(pc.get("term",""))
298
+ for c in entry.findall("{" + ns_x + "}category"): cats.add(c.get("term",""))
299
  cats.discard("")
300
  papers.append({
301
+ "id": pid,
302
+ "title": title,
303
+ "authors": authors[:6],
304
+ "abstract": abstract[:1200],
305
+ "published": published,
306
+ "categories": list(cats)[:4],
307
+ "citations": None,
308
+ "url": "https://arxiv.org/abs/" + pid,
309
+ "pdf_url": "https://arxiv.org/pdf/" + pid,
310
+ "recent": datetime.strptime(published, "%Y-%m-%d") >= cutoff,
311
+ "source": "arXiv",
312
  })
313
+ except Exception as e:
314
+ print("arXiv parse: " + str(e))
315
  return papers
316
 
317
  # ================================================================
318
+ # SOURCE 2 — CrossRef
319
  # ================================================================
320
+ def fetch_crossref_papers(query, category_label="", max_results=20,
321
+ days_back=365, use_title=False):
322
  subject = CROSSREF_SUBJECTS.get(category_label, "")
323
+ full_query = (query + " " + subject).strip() if subject else query
324
+ key = "query.title" if use_title else "query"
325
  params = {
326
+ key: full_query,
327
+ "rows": min(max_results * 3, 200),
328
+ "sort": "relevance",
329
  "select": ("title,author,abstract,published,published-print,"
330
  "published-online,issued,created,DOI,"
331
  "is-referenced-by-count,link,subject"),
 
338
  if r.status_code == 200:
339
  items = r.json().get("message",{}).get("items",[]); break
340
  if r.status_code == 429: time.sleep(2**attempt); continue
341
+ print("CrossRef " + str(r.status_code)); return []
342
+ except Exception as e:
343
+ print("CrossRef attempt " + str(attempt) + ": " + str(e)); time.sleep(1)
344
 
345
  cutoff = datetime.now() - timedelta(days=days_back)
346
  papers, seen_ids = [], set()
 
353
  pub = parse_crossref_date(item)
354
  if pub == "N/A": continue
355
  cit = int(item.get("is-referenced-by-count", 0) or 0)
356
+ authors = [
357
+ (a.get("given","") + " " + a.get("family","")).strip()
358
+ for a in item.get("author",[])[:6]
359
+ ]
360
  authors = [a for a in authors if a.strip()] or ["Unknown"]
361
+ abstract = re.sub(r"<[^>]+>","",
362
+ item.get("abstract","No abstract.")).strip()[:1200]
363
+ doi = item.get("DOI","")
364
+ url = "https://doi.org/" + doi if doi else "#"
365
+ pid = doi or re.sub(r"\W","",title)[:40]
366
  if pid in seen_ids: continue
367
  seen_ids.add(pid)
368
+ pdf_url = next((l.get("URL","") for l in item.get("link",[])
369
+ if "pdf" in l.get("content-type","").lower()), "")
370
+ try: recent = datetime.strptime(pub[:10], "%Y-%m-%d") >= cutoff
371
  except: recent = False
372
  papers.append({
373
+ "id": pid,
374
+ "title": title,
375
+ "authors": authors,
376
+ "abstract": abstract,
377
+ "published": pub[:10],
378
  "categories": item.get("subject",[])[:3],
379
+ "citations": cit,
380
+ "url": url,
381
+ "pdf_url": pdf_url,
382
+ "recent": recent,
383
+ "source": "CrossRef",
384
  })
385
  papers.sort(key=lambda x: x["citations"], reverse=True)
386
  return papers
387
 
388
  # ================================================================
389
+ # GLOBAL PAPER SEARCH — relevance sorted
390
  # ================================================================
391
  def global_paper_search(query, source_choice, max_results=10):
392
  if not query or not query.strip():
393
+ return "Enter a title or keywords."
394
+ q = query.strip(); papers = []
395
+ if source_choice in ("arXiv", "Both"):
 
 
 
396
  papers += fetch_arxiv_papers(q, "", int(max_results), 3650,
397
  sort_by="relevance")
398
+ if source_choice in ("CrossRef", "Both"):
 
 
399
  papers += fetch_crossref_papers(q, "", int(max_results), 3650,
400
  use_title=True)
 
401
  if not papers:
402
+ return "No results for: " + q
403
 
 
404
  seen, unique = set(), []
405
  for p in papers:
406
  key = re.sub(r"\W","",p["title"].lower())[:60]
407
  if key not in seen: seen.add(key); unique.append(p)
 
 
408
  unique.sort(key=lambda x: x.get("citations") or 0, reverse=True)
409
 
410
+ NL = "\n"
411
+ md = "## Search Results: " + q + NL + NL
412
+ md += "**" + str(len(unique)) + " papers found**" + NL + NL + "---" + NL + NL
413
  for i, p in enumerate(unique, 1):
414
+ cit = (" | " + cit_badge(p.get("citations"))) if p.get("citations") else ""
415
+ cats = " | ".join(p.get("categories",[])[:2])
416
+ auth = ", ".join(p["authors"][:3])
417
+ abst = p["abstract"][:450]
418
+ link = "[View](" + p["url"] + ")"
419
+ pdf = (" [PDF](" + p["pdf_url"] + ")") if p.get("pdf_url") else ""
420
+ src = p.get("source","")
421
+ md += ("### " + str(i) + ". " + p["title"] + NL + NL +
422
+ auth + " | " + p["published"] + cit + " | " + src +
423
+ (" | " + cats if cats else "") + NL + NL +
424
+ "> " + abst + "..." + NL + NL +
425
+ link + pdf + NL + NL + "---" + NL + NL)
426
  return md
427
 
428
  # ================================================================
 
439
  id_map, batch_ids = {}, []
440
  for p in arxiv_papers:
441
  clean = re.sub(r"v\d+$","", p["id"].split("/")[-1].strip())
442
+ id_map[clean] = p
443
+ batch_ids.append("arXiv:" + clean)
444
  for i in range(0, len(batch_ids), 500):
445
  try:
446
  r = requests.post(
 
452
  for item in r.json():
453
  if not item: continue
454
  ext = item.get("externalIds") or {}
455
+ clean = re.sub(r"v\d+$","",
456
+ ext.get("ArXiv","").split("/")[-1].strip())
457
  if clean and clean in id_map:
458
  c = item.get("citationCount")
459
  if c is not None: id_map[clean]["citations"] = int(c)
460
  elif r.status_code == 429: time.sleep(4)
461
+ except Exception as e: print("S2 batch: " + str(e))
462
  for p in [x for x in arxiv_papers if (x.get("citations") or 0)==0][:15]:
463
  clean = re.sub(r"v\d+$","", p["id"].split("/")[-1].strip())
464
  for attempt in range(2):
465
  try:
466
  r = requests.get(
467
+ "https://api.semanticscholar.org/graph/v1/paper/arXiv:" + clean,
468
+ params={"fields":"citationCount"},
469
+ headers=s2_headers(), timeout=10)
470
+ if r.status_code == 200:
471
+ c = r.json().get("citationCount")
472
+ p["citations"] = int(c) if c else 0; break
473
+ if r.status_code == 429: time.sleep(2**attempt); continue
474
+ p["citations"] = 0; break
475
+ except: p["citations"] = 0; break
476
  time.sleep(0.12)
477
  for p in [x for x in arxiv_papers if (x.get("citations") or 0)==0]:
478
  try:
479
  r = requests.get("https://api.crossref.org/works",
480
+ params={"query.title": p["title"], "rows": 1,
481
+ "select": "is-referenced-by-count,title"},
482
  headers=cr_headers(), timeout=8)
483
+ if r.status_code == 200:
484
+ items = r.json().get("message",{}).get("items",[])
485
  if items:
486
+ found = (items[0].get("title") or [""])[0].lower()
487
+ qw = set(p["title"].lower().split()[:5])
488
+ fw = set(found.split()[:10])
489
+ p["citations"] = (
490
+ int(items[0].get("is-referenced-by-count",0) or 0)
491
+ if len(qw & fw) >= 2 else 0)
492
+ else: p["citations"] = 0
493
+ else: p["citations"] = 0
494
  time.sleep(0.12)
495
+ except: p["citations"] = 0
496
  for p in papers:
497
  if p.get("citations") is None: p["citations"] = 0
498
  return papers
 
504
  global FAISS_INDEX, PAPERS
505
  PAPERS = papers
506
  if not papers: FAISS_INDEX = None; return
507
+ texts = [p["title"] + " " + p["abstract"] for p in papers]
508
  embs = embedder.encode(texts, convert_to_numpy=True,
509
  normalize_embeddings=True).astype("float32")
510
+ idx = faiss.IndexFlatIP(embs.shape[1])
511
+ idx.add(embs)
512
  FAISS_INDEX = idx
513
 
514
  def search_papers(query, top_k=5):
 
516
  qe = embedder.encode([query], convert_to_numpy=True,
517
  normalize_embeddings=True).astype("float32")
518
  scores, ids = FAISS_INDEX.search(qe, min(top_k, len(PAPERS)))
519
+ return [{"paper": PAPERS[i], "score": float(s)}
520
+ for s, i in zip(scores[0], ids[0]) if i >= 0 and float(s) > 0.1]
521
 
522
  # ================================================================
523
  # AUTO-FETCH
 
532
  new_ps = [p for p in papers if p["id"] not in seen]
533
  if new_ps:
534
  save_seen_ids(seen | {p["id"] for p in papers})
535
+ AUTO_LOG.append(
536
+ "[" + datetime.now().strftime("%H:%M") + "] NEW " +
537
+ str(len(new_ps)) + " — " + query)
538
+ if len(AUTO_LOG) > 20: AUTO_LOG.pop(0)
539
 
540
def start_auto_fetch(query, cat_label, interval_min):
    """Launch the background auto-fetch worker thread.

    No-op (with a message) if a worker is already running. The thread is
    daemonized so it never blocks interpreter shutdown.
    """
    global AUTO_RUNNING
    if AUTO_RUNNING:
        return "Already running."
    AUTO_RUNNING = True
    # Resolve UI label -> arXiv category code, and minutes -> seconds.
    category = CATEGORIES.get(cat_label, "")
    interval_sec = int(interval_min) * 60
    worker = threading.Thread(
        target=auto_fetch_worker,
        args=(query, category, interval_sec),
        daemon=True)
    worker.start()
    return "Auto-fetch started every " + str(interval_min) + " min for: " + query
549
 
550
def stop_auto_fetch():
    """Signal the auto-fetch worker loop to stop on its next iteration."""
    global AUTO_RUNNING
    AUTO_RUNNING = False
    return "Stopped."
552
 
553
def get_auto_log():
    """Return the last 10 auto-fetch log entries, newest first, as markdown."""
    if not AUTO_LOG:
        return "No log."
    recent_entries = AUTO_LOG[-10:]
    return "\n\n".join(reversed(recent_entries))
555
 
556
  # ================================================================
557
  # TRENDS
558
  # ================================================================
559
def analyze_trends(papers):
    """Render a 2x3 dark-themed matplotlib dashboard for the given papers.

    Panels: papers/month (with linear trend line), top title keywords,
    source pie, top-10 cited bar, citation histogram, top authors.
    Returns (png_path, stats_markdown); (None, "No papers.") for empty input.
    """
    if not papers: return None, "No papers."
    # Monthly counts keyed by the YYYY-MM prefix of the published date.
    date_counts = Counter(p["published"][:7] for p in papers if p["published"]!="N/A")
    stopwords = {"the","a","an","of","in","for","on","with","and","or","to","using",
                 "based","via","from","by","is","are","our","we","this","that","which",
                 "towards","approach","method","new","into","over","learning","deep",
                 "model","models","data","neural","large","language","paper","study",
                 "analysis","results","show","also","can","used","two","its","their"}
    # Keyword frequencies from titles: words of 4+ letters, stopwords removed.
    all_words = [w.lower() for p in papers
                 for w in re.findall(r"[a-zA-Z]{4,}", p["title"])
                 if w.lower() not in stopwords]
    top_words = Counter(all_words).most_common(15)
    sources = Counter(p.get("source","arXiv") for p in papers)
    # Citation aggregates consider only papers with at least one citation.
    cit_papers = [p for p in papers if (p.get("citations") or 0)>0]
    top_cited = sorted(cit_papers, key=lambda x:x["citations"], reverse=True)[:10]
    # Author counts use only the first three authors of each paper.
    all_auth = [a for p in papers for a in p["authors"][:3]]
    top_authors = Counter(all_auth).most_common(10)
    cvals = [p["citations"] for p in cit_papers]
    # Half-open histogram buckets [lo, hi); 10000 acts as the open upper bound.
    buckets = [0,1,5,10,50,100,500,10000]
    blabels = ["0","1-4","5-9","10-49","50-99","100-499","500+"]
    bcounts = ([sum(1 for c in cvals if buckets[i]<=c<buckets[i+1])
                for i in range(len(buckets)-1)] if cvals else [0]*7)
    avg_cit = round(sum(cvals)/max(len(cvals),1),1) if cvals else 0
    total_cit = sum(p.get("citations") or 0 for p in papers)
    # Color palette and dark-theme constants (background/panel/border/text).
    C = ["#3b82f6","#8b5cf6","#10b981","#f59e0b","#ef4444","#06b6d4",
         "#ec4899","#14b8a6","#f97316","#a855f7","#22d3ee","#84cc16",
         "#fbbf24","#34d399","#f87171"]
    BG,PNL,BR,W = "#0f172a","#1e293b","#334155","white"
    fig, axes = plt.subplots(2, 3, figsize=(20,12))
    fig.patch.set_facecolor(BG)
    fig.suptitle("Research Trends", color=W, fontsize=16, fontweight="bold", y=1.01)
    def style(ax):
        # Apply the shared dark panel style to one axes.
        ax.set_facecolor(PNL)
        for sp in ax.spines.values(): sp.set_edgecolor(BR)
        ax.tick_params(colors=W, labelsize=8)
    # Panel (0,0): papers per month with value labels and a trend line.
    ax = axes[0,0]; style(ax)
    if date_counts:
        ms,cs = zip(*sorted(date_counts.items()))
        ms,cs = list(ms), list(cs)
        bars = ax.bar(ms, cs, color=C[0], edgecolor="#60a5fa", lw=0.8)
        for b,c in zip(bars,cs):
            ax.text(b.get_x()+b.get_width()/2, b.get_height()+.05, str(c),
                    ha="center", va="bottom", color=W, fontsize=8)
        if len(cs) > 2:
            # Degree-1 least-squares fit over month indices as the trend line.
            z = np.polyfit(range(len(cs)), cs, 1)
            ax.plot(ms, np.poly1d(z)(range(len(cs))), "--",
                    color="#f59e0b", lw=1.5, alpha=.8, label="Trend")
            ax.legend(fontsize=8, facecolor=PNL, labelcolor=W)
    ax.set_title("Papers per Month", color=W, fontsize=12, fontweight="bold", pad=10)
    ax.set_ylabel("Count", color=W, fontsize=9)
    ax.tick_params(rotation=45)
    # Panel (0,1): horizontal bars of top title keywords.
    ax = axes[0,1]; style(ax)
    if top_words:
        wds,wcts = zip(*top_words)
        ax.barh(list(wds), list(wcts), color=C[:len(wds)], edgecolor="#475569", lw=.6)
        for b,c in zip(ax.patches, wcts):
            ax.text(b.get_width()+.1, b.get_y()+b.get_height()/2, str(c),
                    va="center", color=W, fontsize=8)
    ax.set_title("Top Keywords", color=W, fontsize=12, fontweight="bold", pad=10)
    ax.set_xlabel("Frequency", color=W, fontsize=9)
    # Panel (0,2): pie chart of paper sources (arXiv vs CrossRef etc.).
    ax = axes[0,2]; ax.set_facecolor(PNL)
    if sources:
        sl,sv = zip(*sources.items())
        _,txts,ats = ax.pie(sv, labels=sl, autopct="%1.0f%%",
                            colors=C[:len(sl)], startangle=90,
                            textprops={"color":W,"fontsize":10},
                            wedgeprops={"edgecolor":BR,"linewidth":1.5})
        for at in ats: at.set_color(W); at.set_fontsize(9)
    ax.set_title("Source Distribution", color=W, fontsize=12, fontweight="bold", pad=10)
    # Panel (1,0): top-10 cited papers; titles truncated to 35 chars.
    ax = axes[1,0]; style(ax)
    if top_cited:
        lbls = [(p["title"][:35]+"..." if len(p["title"])>35 else p["title"])
                for p in top_cited]
        cv = [p["citations"] for p in top_cited]
        # Reversed so the highest-cited paper renders at the top of the barh.
        ax.barh(lbls[::-1], cv[::-1], color=C[1], edgecolor="#475569", lw=.6)
        mx = max(cv) if cv else 1
        for b,c in zip(ax.patches, cv[::-1]):
            ax.text(b.get_width()+mx*.01, b.get_y()+b.get_height()/2,
                    "{:,}".format(c), va="center", color=W, fontsize=8)
        ax.set_xlabel("Citations", color=W, fontsize=9)
    else:
        ax.text(.5,.5,"No citation data", ha="center", va="center",
                color="#94a3b8", fontsize=11, transform=ax.transAxes)
    ax.set_title("Top 10 Cited", color=W, fontsize=12, fontweight="bold", pad=10)
    # Panel (1,1): citation histogram with an avg/total annotation.
    ax = axes[1,1]; style(ax)
    if any(bcounts):
        ax.bar(blabels, bcounts, color=C[2], edgecolor="#475569", lw=.8)
        for b,c in zip(ax.patches, bcounts):
            if c > 0:
                ax.text(b.get_x()+b.get_width()/2, b.get_height()+.1, str(c),
                        ha="center", va="bottom", color=W, fontsize=9)
        ax.set_xlabel("Citation Range", color=W, fontsize=9)
        ax.set_ylabel("Papers", color=W, fontsize=9)
        ax.annotate("Avg " + str(avg_cit) + " | Total " + "{:,}".format(total_cit),
                    xy=(.98,.96), xycoords="axes fraction",
                    ha="right", va="top", color="#94a3b8", fontsize=8)
    else:
        ax.text(.5,.5,"No citation data", ha="center", va="center",
                color="#94a3b8", fontsize=11, transform=ax.transAxes)
    ax.set_title("Citation Distribution", color=W, fontsize=12, fontweight="bold", pad=10)
    # Panel (1,2): most frequent (first-three) authors.
    ax = axes[1,2]; style(ax)
    if top_authors:
        an,ac = zip(*top_authors)
        ax.barh(list(an)[::-1], list(ac)[::-1], color=C[3], edgecolor="#475569", lw=.6)
        for b,c in zip(ax.patches, list(ac)[::-1]):
            ax.text(b.get_width()+.05, b.get_y()+b.get_height()/2, str(c),
                    va="center", color=W, fontsize=8)
        ax.set_xlabel("Papers", color=W, fontsize=9)
    ax.set_title("Top Authors", color=W, fontsize=12, fontweight="bold", pad=10)
    plt.tight_layout(pad=3)
    # Persist the figure to the writable tmp dir; overwritten on each call.
    path = PERSIST_DIR + "/trends.png"
    plt.savefig(path, bbox_inches="tight", dpi=150, facecolor=BG)
    plt.close()
    # Markdown sidebar: summary table plus the five most-cited papers.
    top5 = sorted(cit_papers, key=lambda x:x["citations"], reverse=True)[:5]
    stats = ("### Stats\n\n| Metric | Value |\n|---|---|\n" +
             "| Total | **" + str(len(papers)) + "** |\n" +
             "| New | **" + str(sum(1 for p in papers if p.get("recent"))) + "** |\n" +
             "| Citations | **" + "{:,}".format(total_cit) + "** |\n" +
             "| Average | **" + str(avg_cit) + "** |\n\n")
    if top5:
        stats += "### Top Cited\n\n"
        for i,p in enumerate(top5,1):
            stats += (str(i) + ". [" + p["title"] + "](" + p["url"] + ")" +
                      " — **" + "{:,}".format(p["citations"]) + "**\n\n")
    return path, stats
684
 
685
  # ================================================================
 
691
  model="llama-3.3-70b-versatile",
692
  messages=messages, temperature=0.3, max_tokens=max_tokens)
693
  return r.choices[0].message.content.strip()
694
+ except Exception as e: return "LLM Error: " + str(e)
695
 
696
def explain_paper(paper, lang="ar"):
    """Ask the LLM for a structured explanation of one paper.

    lang == "ar" sends an Arabic system+user prompt (output post-processed
    by fix_ar_format); anything else sends the English prompt. The prompt
    embeds title, first 3 authors, date, citation count and abstract, and
    dictates the section headings of the answer.
    """
    cit = paper.get("citations","N/A")
    if lang == "ar":
        return fix_ar_format(_llm([
            {"role":"system","content": "أنت خبير أكاديمي يشرح الأبحاث بالعربية الفصحى.\n" + AR_RULES},
            {"role":"user","content":
                "اشرح الورقة:\nالعنوان: " + paper["title"] + "\n" +
                "المؤلفون: " + ", ".join(paper["authors"][:3]) + "\n" +
                "التاريخ: " + paper["published"] + " | الاقتباسات: " + str(cit) + "\n" +
                "الملخص: " + paper["abstract"] + "\n\n" +
                "## موضوع الورقة\n\n## المشكلة\n\n## المنهجية\n\n" +
                "## النتائج\n\n## الأهمية\n\n## التطبيقات"}]))
    return _llm([{"role":"user","content":
        "Explain:\nTitle: " + paper["title"] + "\nAuthors: " +
        ", ".join(paper["authors"][:3]) + "\nDate: " + paper["published"] +
        " | Citations: " + str(cit) + "\nAbstract: " + paper["abstract"] + "\n\n" +
        "## Topic\n## Problem\n## Methodology\n## Findings\n## Contribution\n## Applications"}])
713
 
714
def compare_papers(pa, pb, lang="ar"):
    """LLM comparison of two papers (title, citations, 500-char abstracts).

    The shared English `body` is embedded in either an Arabic or English
    prompt that fixes the section headings of the answer; Arabic output is
    normalized with fix_ar_format. Uses a 1400-token completion budget.
    """
    body = ("Paper A: " + pa["title"] + " | Citations: " + str(pa.get("citations","N/A")) +
            "\n" + pa["abstract"][:500] + "\n\nPaper B: " +
            pb["title"] + " | Citations: " + str(pb.get("citations","N/A")) +
            "\n" + pb["abstract"][:500])
    if lang == "ar":
        return fix_ar_format(_llm([{"role":"user","content":
            "قارن بين الورقتين.\n" + AR_RULES + "\n\n" + body + "\n\n" +
            "## الهدف\n\n## المنهجية\n\n## النتائج\n\n" +
            "## القوة\n\n## القيود\n\n## الخلاصة"}], 1400))
    return _llm([{"role":"user","content":
        "Compare:\n" + body + "\n\n" +
        "## Topic\n## Methodology\n## Results\n## Strengths\n## Limits\n## Verdict"}], 1400)
727
 
728
def summarize_papers(papers, topic, lang="ar"):
    """LLM literature overview of up to the first 8 papers on `topic`.

    Each paper contributes a numbered "title (date): abstract[:300]..." line;
    the Arabic/English prompt then dictates the overview's section headings.
    Uses a 900-token completion budget; Arabic output goes through
    fix_ar_format.
    """
    text = "".join(
        str(i) + ". " + p["title"] + " (" + p["published"] + "): " +
        p["abstract"][:300] + "...\n\n"
        for i,p in enumerate(papers[:8],1))
    if lang == "ar":
        return fix_ar_format(_llm([{"role":"user","content":
            "نظرة عامة أكاديمية حول \"" + topic + "\".\n" + AR_RULES +
            "\n\n" + text + "\n\n" +
            "## الاتجاهات\n\n## أبرز الأوراق\n\n" +
            "## المواضيع المشتركة\n\n## الفجوات"}], 900))
    return _llm([{"role":"user","content":
        "Academic overview of \"" + topic + "\":\n" + text + "\n\n" +
        "## Trends\n## Key Papers\n## Themes\n## Gaps"}], 900)
742
 
743
  def generate_bibliography(papers, style="APA"):
 
746
  auth = ", ".join(p["authors"][:6]) + (" et al." if len(p["authors"])>6 else "")
747
  year = p["published"][:4] if p["published"] not in ("N/A","") else "n.d."
748
  t,u = p["title"], p["url"]
749
+ if style == "APA":
750
+ entries.append(str(i) + ". " + auth + " (" + year + "). *" + t + "*. " + u)
751
+ elif style == "IEEE":
752
  ae = " and ".join(p["authors"][:3]) + (" et al." if len(p["authors"])>3 else "")
753
+ entries.append("[" + str(i) + "] " + ae + ', "' + t + '," ' + year + ". [Online]: " + u)
754
+ elif style == "Chicago":
755
+ entries.append(str(i) + ". " + auth + '. "' + t + '." (' + year + "). " + u)
756
  else:
757
  key = re.sub(r"\W","", (p["authors"][0].split()[-1]
758
+ if p["authors"] else "Auth")) + year
759
+ entries.append("@article{" + key + str(i) + ",\n title={" + t +
760
+ "},\n author={" + auth + "},\n year={" + year +
761
+ "},\n url={" + u + "}\n}")
762
  bib = "\n\n".join(entries)
763
+ path = PERSIST_DIR + "/bibliography_" + style + ".txt"
764
+ with open(path, "w", encoding="utf-8") as f: f.write(bib)
765
  return bib, path
766
 
767
def chat_about_papers(question, history):
    """RAG-style chat over the fetched papers.

    Detects the question language, retrieves the top-4 FAISS matches as
    context, keeps the last 4 turns of `history` (dicts with role/content),
    and answers via the LLM. Returns a plain string; Arabic answers are
    post-processed with fix_ar_format.
    """
    if not PAPERS:
        # Nothing indexed yet — prompt the user in their own language.
        return ("يرجى جلب الأوراق أولاً." if detect_lang(question)=="ar"
                else "Fetch papers first.")
    lang = detect_lang(question)
    relevant = search_papers(question, top_k=4)
    context = ""
    if relevant:
        context = ("الأوراق ذات الصلة:\n\n" if lang=="ar" else "Relevant papers:\n\n")
        for r in relevant:
            p = r["paper"]
            cit = (" | " + str(p["citations"]) + " citations") if p.get("citations") else ""
            # Per-paper context: title, date, citations, truncated abstract, URL.
            context += ("**" + p["title"] + "** (" + p["published"] + ")" + cit +
                        "\n" + p["abstract"][:400] + "\n🔗 " + p["url"] + "\n\n")
    sys_msg = (("أنت مساعد بحثي. أجب بالعربية الفصحى.\n" + AR_RULES) if lang=="ar"
               else "You are an academic assistant. Answer in English.")
    msgs = [{"role":"system","content":sys_msg}]
    # Only the last 4 turns are forwarded to keep the prompt small.
    for t in history[-4:]: msgs.append({"role":t["role"],"content":t["content"]})
    msgs.append({"role":"user","content":
        (context + "\nسؤال: " + question) if context else question})
    out = _llm(msgs, 800)
    return fix_ar_format(out) if lang=="ar" else out
789
 
 
791
  clean = clean_md(text)
792
  if not clean: return None
793
  try:
794
+ tts = gTTS(text=clean, lang=lang, slow=False)
795
+ path = PERSIST_DIR + "/audio_" + lang + ".mp3"
796
+ tts.save(path); return path
797
+ except Exception as e: print("TTS: " + str(e)); return None
798
 
799
  # ================================================================
800
  # GRADIO HANDLERS
 
802
  def gr_fetch(query, category_label, max_results, days_back, source_choice,
803
  progress=gr.Progress()):
804
  global ACTIVE_PAPERS
805
+ progress(0.05, desc="Connecting...")
806
  papers, warn = [], ""
807
+ if source_choice in ("arXiv", "Both"):
808
+ progress(0.15, desc="Fetching arXiv...")
809
  papers += fetch_arxiv_papers(query, CATEGORIES.get(category_label,""),
810
  int(max_results), int(days_back),
811
  sort_by="submittedDate")
812
+ if source_choice in ("CrossRef", "Both"):
813
+ progress(0.35, desc="Fetching CrossRef...")
814
  cr = fetch_crossref_papers(query, category_label, int(max_results), int(days_back))
815
+ if not cr: warn = "\n\n> CrossRef: no results."
816
  papers += cr
817
  seen, unique = set(), []
818
  for p in papers:
 
820
  if key not in seen: seen.add(key); unique.append(p)
821
  papers = unique
822
  if not papers:
823
+ return ("No results." + warn,
824
+ gr.update(choices=[], value=None), gr.update(choices=[], value=None),
825
+ gr.update(choices=[], value=None), gr.update(choices=[], value=None),
826
+ "0 papers")
827
+ progress(0.60, desc="Fetching citations...")
828
  papers = enrich_citations(papers)
829
+ progress(0.85, desc="FAISS indexing...")
830
  build_papers_index(papers)
831
  ACTIVE_PAPERS = list(papers)
832
  tbl, choices = build_table(papers)
833
  recent = sum(1 for p in papers if p.get("recent"))
834
  tot_cit = sum(p.get("citations") or 0 for p in papers)
835
  zero_cit = sum(1 for p in papers if (p.get("citations") or 0)==0)
836
+ note = ("\n\n> " + str(zero_cit) + " papers with 0 citations (new/unindexed)."
837
  if zero_cit else "")
838
+ md = ("## Fetched **" + str(len(papers)) + "** papers\n\n" +
839
+ "New: **" + str(recent) + "** | Citations: **" +
840
+ "{:,}".format(tot_cit) + "**" + warn + note +
841
+ "\n\n---\n\n" + tbl)
842
  upd = gr.update(choices=choices, value=choices[0] if choices else None)
843
  progress(1.0)
844
+ return md, upd, upd, upd, upd, str(len(papers)) + " papers | " + "{:,}".format(tot_cit) + " cit."
845
 
846
  def gr_filter_papers(year_from, year_to, cit_min, cit_max, sort_by):
847
  global ACTIVE_PAPERS
848
+ if not PAPERS: return "Fetch papers first.", gr.update(), "0"
849
  filtered = []
850
  for p in PAPERS:
851
  try:
 
855
  cit = int(p.get("citations") or 0)
856
  if cit < int(cit_min) or cit > int(cit_max): continue
857
  filtered.append(p)
858
+ if sort_by == "Newest": filtered.sort(key=lambda x: x["published"], reverse=True)
859
+ elif sort_by == "Oldest": filtered.sort(key=lambda x: x["published"])
860
+ elif sort_by == "Most Cited": filtered.sort(key=lambda x: x.get("citations") or 0, reverse=True)
861
+ elif sort_by == "Least Cited":filtered.sort(key=lambda x: x.get("citations") or 0)
 
 
 
 
862
  if not filtered:
863
+ ACTIVE_PAPERS = []
864
+ return "No matching papers.", gr.update(choices=[], value=None), "0"
865
  ACTIVE_PAPERS = list(filtered)
866
  tbl, choices = build_table(filtered)
867
  tot = sum(p.get("citations") or 0 for p in filtered)
868
+ md = ("## " + str(len(filtered)) + "/" + str(len(PAPERS)) + " papers" +
869
+ " | " + str(year_from) + "-" + str(year_to) +
870
+ " | cit " + str(cit_min) + "-" + str(cit_max) +
871
+ " | total " + "{:,}".format(tot) + "\n\n---\n\n" + tbl)
872
+ return md, gr.update(choices=choices, value=choices[0] if choices else None), str(len(filtered)) + "/" + str(len(PAPERS))
873
 
874
def gr_search_fetched(query):
    """Semantic (FAISS) search over the already-fetched papers.

    Returns a markdown listing of up to 8 matches with similarity %, authors,
    date, citation badge, source, truncated abstract and links. Returns a
    short instruction string when the query is blank, no papers are loaded,
    or nothing matches.
    """
    if not query or not query.strip(): return "Enter a query."
    if not PAPERS: return "Fetch papers first."
    results = search_papers(query.strip(), top_k=8)
    if not results: return "No results for: " + query
    NL = "\n"
    md = "## Search: " + query + " — " + str(len(results)) + " results" + NL + NL
    for r in results:
        p,s = r["paper"], r["score"]
        # (removed unused score-bar string that was computed but never rendered)
        cit = (" | " + cit_badge(p.get("citations"))) if p.get("citations") else ""
        link = "[View](" + p["url"] + ")"
        pdf = (" [PDF](" + p["pdf_url"] + ")") if p.get("pdf_url") else ""
        md += ("### " + "{:.0f}".format(s*100) + "% — " + p["title"] + NL + NL +
               ", ".join(p["authors"][:2]) + " | " + p["published"] + cit +
               " | " + p.get("source","") + NL + NL +
               "> " + p["abstract"][:350] + "..." + NL + NL +
               link + pdf + NL + NL + "---" + NL + NL)
    return md
893
 
894
def _get_paper(choice):
    """Resolve a dropdown choice like "3. Some title" to a paper dict.

    Parses the leading 1-based index and looks it up in the filtered pool
    (falling back to all papers). Returns None for a missing/ill-formed
    choice or a stale index. Narrowed from a bare `except:` to the specific
    failures the lookup can produce, so real bugs are no longer swallowed.
    """
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    try:
        return pool[int(choice.split(".")[0]) - 1]
    except (AttributeError, ValueError, IndexError):
        # choice is None / not "N. ..." / index out of range.
        return None
898
 
899
def gr_explain(choice, lang_choice):
    """Build the metadata header for the selected paper and append the
    LLM-generated explanation in the chosen language."""
    if not choice:
        return "Fetch papers and select one."
    paper = _get_paper(choice)
    if not paper:
        return "Selection error."
    lang = "ar" if "Arabic" in lang_choice else "en"
    pdf_link = (" [PDF](" + paper["pdf_url"] + ")") if paper.get("pdf_url") else ""
    # Assemble the header as blank-line-separated markdown segments.
    meta = ("**Date:** " + paper["published"] +
            " | **Citations:** " + cit_badge(paper.get("citations")) +
            " | **Source:** " + paper.get("source","arXiv"))
    segments = [
        "# " + paper["title"],
        "**Authors:** " + ", ".join(paper["authors"]),
        meta,
        "[View Paper](" + paper["url"] + ")" + pdf_link,
        "---",
        "> " + paper["abstract"],
        "---",
        "## Explanation (Llama 3.3 70B)",
    ]
    header = "\n\n".join(segments) + "\n\n"
    return header + explain_paper(paper, lang)
918
 
919
def gr_audio(txt, lang_choice):
    """Narrate the explanation text via TTS; skips empty/very short text."""
    if not txt or len(txt) < 50:
        return None
    tts_lang = "ar" if "Arabic" in lang_choice else "en"
    return text_to_audio(txt, tts_lang)
922
 
923
def gr_save_fav(choice):
    """Persist the selected paper to the favorites store; returns a status string."""
    if not choice:
        return "Select a paper first."
    paper = _get_paper(choice)
    if not paper:
        return "Error."
    return save_favorite(paper)
927
 
928
def gr_show_favs():
    """Render the saved favorites as a markdown list.

    Fix: the heading previously concatenated "### Favorites" directly with
    the count, rendering as e.g. "Favorites3 papers"; a separator is added.
    """
    favs = load_favorites()
    if not favs: return "No saved papers."
    NL = "\n"
    lines = [("**" + p["title"] + "**" + NL +
              (p["authors"][0] if p["authors"] else "N/A") +
              " | " + p["published"] + " | " + p.get("source","") +
              " | " + cit_badge(p.get("citations")) +
              " | [Link](" + p["url"] + ")")
             for p in favs]
    return ("### Favorites — " + str(len(favs)) + " papers" + NL + NL +
            (NL + NL + "---" + NL + NL).join(lines))
940
 
941
def gr_compare(ca, cb, lc):
    """Run the LLM comparison between two distinct selected papers."""
    if not ca or not cb:
        return "Select two papers."
    first = _get_paper(ca)
    second = _get_paper(cb)
    if not first or not second:
        return "Selection error."
    if first["id"] == second["id"]:
        return "Select two different papers."
    target_lang = "ar" if "Arabic" in lc else "en"
    return compare_papers(first, second, target_lang)
947
 
948
def gr_overview(query, lc):
    """Generate an LLM literature overview of the active (or full) paper pool."""
    if not PAPERS:
        return "Fetch papers first."
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    topic = query or "research"
    target_lang = "ar" if "Arabic" in lc else "en"
    return "## Overview\n\n" + summarize_papers(pool, topic, target_lang)
954
 
955
def gr_trends():
    """Build the trends dashboard for the active (or full) paper pool."""
    if not PAPERS:
        return None, "Fetch papers first."
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    return analyze_trends(pool)
958
 
959
def gr_bib(style, progress=gr.Progress()):
    """Generate a bibliography in the chosen style.

    Returns (markdown code-block preview truncated to 3000 chars, file path).
    """
    if not PAPERS:
        return "Fetch papers first.", None
    progress(0.5, desc="Generating...")
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    text, path = generate_bibliography(pool, style)
    progress(1.0)
    preview = text[:3000]
    if len(text) > 3000:
        preview += "..."
    return "```\n" + preview + "\n```", path
967
 
968
def gr_chat_fn(message, history):
    """Handle one chat turn: convert tuple history to role/content dicts,
    ask the paper-chat backend, and append the new (user, bot) pair."""
    if not message.strip():
        return history, ""
    dict_history = []
    for user_msg, bot_msg in history:
        if user_msg:
            dict_history.append({"role": "user", "content": user_msg})
        if bot_msg:
            dict_history.append({"role": "assistant", "content": bot_msg})
    reply = chat_about_papers(message, dict_history)
    history.append((message, reply))
    return history, ""
 
981
  footer{display:none!important}
982
  h1{text-align:center}
983
  .status-bar{font-size:.85rem;color:#94a3b8;padding:2px 0}
984
+ .legend{font-size:.8rem;color:#cbd5e1;background:#1e293b;
985
+ border-radius:8px;padding:6px 14px;margin-bottom:6px}
986
+ .filter-box{background:#1e293b;border-radius:10px;
987
+ padding:12px 16px;margin-top:8px}
988
+ .gs-box{background:#1e293b;border-radius:10px;padding:14px 18px;
989
+ margin-bottom:10px;border:1px solid #334155}
990
  """
991
 
992
  with gr.Blocks(
993
  theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
994
+ title="Scientific Paper Discovery v7.4", css=CSS
995
  ) as demo:
996
 
997
+ gr.Markdown("# Scientific Paper Discovery v7.4\narXiv · CrossRef · Llama-3.3-70B · FAISS")
998
+ gr.Markdown("Citations: 🥇 >=1000 | 🏆 >=100 | ⭐ >=10 | 📄 <10 | · = 0",
999
+ elem_classes="legend")
1000
+ status_bar = gr.Markdown("No papers loaded yet.", elem_classes="status-bar")
 
1001
 
1002
  with gr.Tabs():
1003
 
1004
+ # ── TAB 1: BROWSE ──────────────────────────────────
1005
+ with gr.Tab("Browse / Search"):
1006
  with gr.Row():
1007
  with gr.Column(scale=3):
1008
+ t_query = gr.Textbox(label="Topic",
1009
  placeholder="ARIMA, inflation, LLM...",
1010
  value="economic forecasting")
1011
+ t_category = gr.Dropdown(label="Category",
1012
  choices=list(CATEGORIES.keys()),
1013
+ value="Economics")
1014
+ t_source = gr.Radio(label="Source",
1015
+ choices=["arXiv","CrossRef","Both"],
1016
  value="arXiv")
1017
  with gr.Column(scale=1):
1018
+ t_max = gr.Slider(5, 50, value=15, step=5, label="Max papers")
1019
+ t_days = gr.Slider(1, 1500, value=365, step=30, label="Last N days")
1020
+ btn_fetch = gr.Button("Fetch Papers", variant="primary", size="lg")
1021
+ papers_table_md = gr.Markdown("Results appear here.")
1022
+ paper_selector = gr.Dropdown(label="Select paper", choices=[], interactive=True)
1023
  with gr.Group(elem_classes="filter-box"):
1024
+ gr.Markdown("### Filter & Sort")
1025
  with gr.Row():
1026
+ f_year_from = gr.Slider(2000,2026,value=2020,step=1,label="Year from")
1027
+ f_year_to = gr.Slider(2000,2026,value=2026,step=1,label="Year to")
1028
  with gr.Row():
1029
+ f_cit_min = gr.Slider(0,5000,value=0, step=5,label="Citations min")
1030
+ f_cit_max = gr.Slider(0,5000,value=5000,step=5,label="Citations max")
1031
  with gr.Row():
1032
  f_sort = gr.Dropdown(choices=SORT_CHOICES,
1033
+ value="Most Cited",label="Sort",scale=3)
1034
+ btn_filter = gr.Button("Apply",variant="primary",scale=1)
1035
+ gr.Markdown("---\n### Semantic Search (FAISS — in loaded papers)")
1036
  with gr.Row():
1037
+ search_in_box = gr.Textbox(label="Search in loaded papers",
1038
  placeholder="ARIMA, transformer...",scale=5)
1039
+ btn_search_in = gr.Button("Search",scale=1)
1040
  search_in_out = gr.Markdown()
1041
 
1042
+ # ── TAB 2: GLOBAL SEARCH ───────────────────────────
1043
+ with gr.Tab("Global Search"):
1044
  gr.Markdown(
1045
+ "### Search any paper by title or keywords\n\n"
1046
+ "> Uses arXiv **relevance** sort + CrossRef **title** search.\n"
1047
+ "> Example: `Attention is All You Need`"
1048
  )
1049
  with gr.Group(elem_classes="gs-box"):
1050
  with gr.Row():
1051
  gs_query = gr.Textbox(
1052
+ label="Title or keywords",
1053
+ placeholder="Attention is All You Need | ARIMA forecasting ...",
1054
  scale=4)
1055
+ gs_source = gr.Radio(label="Source",
1056
+ choices=["arXiv","CrossRef","Both"],
1057
+ value="Both", scale=2)
1058
+ gs_max = gr.Slider(5,30,value=10,step=5,label="Max results",scale=1)
1059
+ btn_gs = gr.Button("Search Now", variant="primary", size="lg")
1060
+ gs_out = gr.Markdown("Enter a title or keywords...")
 
1061
 
1062
  # ── TAB 3: EXPLAIN ─────────────────────────────────
1063
+ with gr.Tab("Explain"):
1064
  with gr.Row():
1065
+ paper_sel2 = gr.Dropdown(label="Select paper",
1066
  choices=[], interactive=True, scale=4)
1067
+ lang_exp = gr.Radio(LANG_CHOICES, value="Arabic",
1068
+ label="Language", scale=1)
1069
  with gr.Row():
1070
+ btn_explain = gr.Button("Explain", variant="primary")
1071
+ btn_fav = gr.Button("Save Fav")
1072
+ btn_audio = gr.Button("Listen")
1073
+ btn_export_pdf = gr.Button("Export PDF", variant="secondary")
1074
  with gr.Row():
1075
  fav_status = gr.Markdown()
1076
  pdf_status = gr.Markdown()
1077
+ explanation_out = gr.Markdown("Fetch papers and select one.")
1078
+ audio_out = gr.Audio(label="Audio", type="filepath")
1079
+ pdf_out = gr.File(label="Download PDF")
1080
 
1081
  # ── TAB 4: COMPARE ─────────────────────────────────
1082
+ with gr.Tab("Compare"):
1083
  with gr.Row():
1084
+ cmp_a = gr.Dropdown(label="Paper A", choices=[], interactive=True)
1085
+ cmp_b = gr.Dropdown(label="Paper B", choices=[], interactive=True)
1086
+ lang_cmp = gr.Radio(LANG_CHOICES, value="Arabic",
1087
+ label="Language", scale=1)
1088
+ btn_compare = gr.Button("Compare", variant="primary")
1089
+ compare_out = gr.Markdown("Select two papers.")
1090
 
1091
  # ── TAB 5: OVERVIEW ────────────────────────────────
1092
+ with gr.Tab("Overview"):
1093
  with gr.Row():
1094
+ lang_ov = gr.Radio(LANG_CHOICES, value="Arabic",
1095
+ label="Language", scale=1)
1096
+ btn_overview = gr.Button("Generate Report", variant="primary", scale=3)
1097
+ overview_out = gr.Markdown("Fetch papers first.")
1098
 
1099
  # ── TAB 6: TRENDS ──────────────────────────────────
1100
+ with gr.Tab("Trends"):
1101
+ btn_trends = gr.Button("Analyze Trends", variant="primary", size="lg")
1102
+ trend_chart = gr.Image(label="Trends Dashboard", type="filepath")
1103
+ trend_stats = gr.Markdown("Fetch papers first.")
1104
 
1105
  # ── TAB 7: BIBLIOGRAPHY ────────────────────────────
1106
+ with gr.Tab("Bibliography"):
1107
+ bib_style = gr.Radio(["APA","IEEE","Chicago","BibTeX"],
1108
+ value="APA", label="Style")
1109
+ btn_bib = gr.Button("Generate Bibliography", variant="primary")
1110
  bib_out = gr.Markdown()
1111
+ bib_file = gr.File(label="Download")
1112
 
1113
  # ── TAB 8: FAVORITES ───────────────────────────────
1114
+ with gr.Tab("Favorites"):
1115
+ btn_show_fav = gr.Button("Show Favorites")
1116
+ favs_md = gr.Markdown("Press to show.")
1117
+ btn_export_fav = gr.Button("Export CSV", variant="secondary")
1118
+ fav_csv_file = gr.File(label="CSV File")
1119
 
1120
  # ── TAB 9: AUTO-FETCH ──────────────────────────────
1121
+ with gr.Tab("Auto-Fetch"):
1122
  with gr.Row():
1123
+ auto_q = gr.Textbox(label="Topic",
1124
  value="economic forecasting", scale=3)
1125
+ auto_cat = gr.Dropdown(label="Category",
1126
  choices=list(CATEGORIES.keys()),
1127
+ value="Economics", scale=2)
1128
  auto_interval = gr.Slider(5,120,value=60,step=5,
1129
+ label="Every (min)",scale=1)
1130
  with gr.Row():
1131
+ btn_start_auto = gr.Button("Start", variant="primary")
1132
+ btn_stop_auto = gr.Button("Stop", variant="stop")
1133
+ btn_refresh_log = gr.Button("Refresh Log")
1134
  auto_status = gr.Markdown()
1135
+ auto_log_md = gr.Markdown("No log.")
1136
 
1137
  # ── TAB 10: CHAT ───────────────────────────────────
1138
+ with gr.Tab("Chat"):
1139
+ chatbot_ui = gr.Chatbot(label="Research Assistant",
1140
+ height=480, bubble_full_width=False)
1141
  with gr.Row():
1142
+ chat_in = gr.Textbox(label="Question", scale=5,
1143
+ placeholder="Key findings? | ما أبرز النتائج؟")
1144
+ btn_send = gr.Button("Send", variant="primary", scale=1)
1145
+ btn_clear = gr.Button("Clear", size="sm")
1146
 
1147
  # ── TAB 11: ABOUT ──────────────────────────────────
1148
+ with gr.Tab("About"):
1149
  gr.Markdown("""
1150
+ ## Scientific Paper Discovery — v7.4
 
 
 
 
 
 
1151
 
1152
+ ### Search Mode Comparison
1153
+ | Mode | sortBy | Best for |
1154
  |---|---|---|
1155
+ | Browse tab | `submittedDate` | Latest papers on a topic |
1156
+ | Global Search | `relevance` + `ti:` | Finding a paper by exact title |
1157
+ | FAISS (internal) | Cosine similarity | Semantic search in loaded papers |
1158
+
1159
+ ### v7.4 Fixes
1160
+ - **arXiv Global Search** now uses `sortBy=relevance` + `ti:"..."` prefix
1161
+ - **CrossRef Global Search** now uses `query.title` for precise title matching
1162
+ - **SyntaxError fix**: removed backslashes from inside f-strings
1163
  """)
1164
 
1165
  # ── WIRING ──────────────────────────────────────────────
 
1181
  btn_gs.click(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
1182
  gs_query.submit(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
1183
 
1184
+ btn_explain.click(gr_explain, inputs=[paper_sel2, lang_exp], outputs=[explanation_out])
1185
+ btn_fav.click(gr_save_fav, inputs=[paper_sel2], outputs=[fav_status])
1186
+ btn_audio.click(gr_audio, inputs=[explanation_out, lang_exp], outputs=[audio_out])
1187
  btn_export_pdf.click(gr_export_pdf,
1188
  inputs=[explanation_out, paper_sel2],
1189
  outputs=[pdf_out, pdf_status])
1190
 
1191
+ btn_compare.click(gr_compare, inputs=[cmp_a, cmp_b, lang_cmp], outputs=[compare_out])
1192
+ btn_overview.click(gr_overview, inputs=[t_query, lang_ov], outputs=[overview_out])
1193
  btn_trends.click(gr_trends, outputs=[trend_chart, trend_stats])
1194
+ btn_bib.click(gr_bib, inputs=[bib_style], outputs=[bib_out, bib_file])
1195
 
1196
  btn_show_fav.click(gr_show_favs, outputs=[favs_md])
1197
  btn_export_fav.click(gr_export_fav, outputs=[fav_csv_file])