| | |
| | |
| | |
| | import os, re, time, json, pickle, threading |
| | import requests |
| | import xml.etree.ElementTree as ET |
| | from datetime import datetime, timedelta |
| | from collections import Counter |
| |
|
| | import numpy as np |
| | import faiss |
| | import pandas as pd |
| | import matplotlib |
| | matplotlib.use("Agg") |
| | import matplotlib.pyplot as plt |
| | import gradio as gr |
| | from sentence_transformers import SentenceTransformer |
| | from groq import Groq |
| | from gtts import gTTS |
| | from langdetect import detect, DetectorFactory |
| | from reportlab.lib.pagesizes import A4 |
| | from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle |
| | from reportlab.lib.units import cm |
| | from reportlab.lib import colors |
| | from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable |
| |
|
# ---- Module-level setup: runs once at import time ----
# Fix langdetect's seed; the langdetect README documents this as the way to
# get reproducible detection results.
| | DetectorFactory.seed = 0 |
| |
|
# API credentials come from the environment; both default to "" when unset.
# S2_API_KEY is only attached to Semantic Scholar requests when non-empty
# (see s2_headers).
| | GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "") |
| | S2_API_KEY = os.environ.get("S2_API_KEY", "") |
| | groq_client = Groq(api_key=GROQ_API_KEY) |
| |
|
# The sentence-embedding model is created eagerly, followed by one throwaway
# encode call (result discarded) as a warm-up.
| | print("Loading embedder...") |
| | embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") |
| | _ = embedder.encode(["warmup"]) |
| | print("Embedder ready!") |
| |
|
# Mutable module-wide state shared by the functions below:
#   PAPERS         - list indexed by build_papers_index, read by search_papers
#   ACTIVE_PAPERS  - assigned by gr_fetch
#   FAISS_INDEX    - built by build_papers_index (None while nothing is indexed)
#   AUTO_RUNNING   - flag toggled by start_auto_fetch / stop_auto_fetch
#   AUTO_LOG       - log lines appended by auto_fetch_worker
#   CURRENT_YEAR   - captured once at import; upper bound in parse_crossref_date
| | PAPERS = [] |
| | ACTIVE_PAPERS = [] |
| | FAISS_INDEX = None |
| | AUTO_RUNNING = False |
| | AUTO_LOG = [] |
| | CURRENT_YEAR = datetime.now().year |
| |
|
# Every generated artifact (favorites, seen IDs, CSV/PDF/PNG/MP3 exports)
# is written under PERSIST_DIR.
| | PERSIST_DIR = "/tmp" |
| | FAVORITES_PATH = PERSIST_DIR + "/favorites.pkl" |
| | SEEN_IDS_PATH = PERSIST_DIR + "/seen_ids.json" |
| | os.makedirs(PERSIST_DIR, exist_ok=True) |
| |
|
# NOTE(review): the non-ASCII text in the literals below (emoji prefixes and
# the Arabic rules) looks like UTF-8 that was decoded with a Thai code page,
# and AR_RULES is additionally broken across lines by stray control
# characters. Restore from the original source before relying on these
# strings; they are left untouched here.

# UI category label -> arXiv category code ("" means no category filter).
# Looked up with CATEGORIES.get(label, "") before querying arXiv.
| | CATEGORIES = { |
| | "๐ All": "", |
| | "๐ Economics": "econ", |
| | "๐ฐ Quant Finance": "q-fin", |
| | "๐ค AI": "cs.AI", |
| | "๐ง Machine Learning":"cs.LG", |
| | "๐ฌ NLP": "cs.CL", |
| | "๐ Statistics": "stat", |
| | "๐ฌ Biology": "q-bio", |
| | "โ๏ธ Physics": "physics", |
| | "๐ Mathematics": "math", |
| | "๐ป Computer Science":"cs", |
| | } |
# Same UI labels -> free-text subject that fetch_crossref_papers appends to
# the CrossRef query. Keys must stay identical to those of CATEGORIES.
| | CROSSREF_SUBJECTS = { |
| | "๐ All": "", |
| | "๐ Economics": "economics", |
| | "๐ฐ Quant Finance": "finance", |
| | "๐ค AI": "artificial intelligence", |
| | "๐ง Machine Learning":"machine learning", |
| | "๐ฌ NLP": "natural language processing", |
| | "๐ Statistics": "statistics", |
| | "๐ฌ Biology": "biology", |
| | "โ๏ธ Physics": "physics", |
| | "๐ Mathematics": "mathematics", |
| | "๐ป Computer Science":"computer science", |
| | } |
# Option lists, presumably for UI dropdowns defined outside this chunk.
| | LANG_CHOICES = ["Arabic", "English"] |
| | SORT_CHOICES = ["Newest", "Oldest", "Most Cited", "Least Cited"] |
# Formatting rules (Arabic) that are concatenated into the Arabic LLM prompts
# in explain_paper, compare_papers, summarize_papers and chat_about_papers.
| | AR_RULES = """ |
| | - ุงุจุฏุฃ ูู ูุณู
ุจู ## ู
ุน ุณุทุฑ ูุงุฑุบ ูุจูู ูุจุนุฏู |
| | - ุงูุชุจ ูู ูุณู
ูู ููุฑุฉ 3-4 ุฌู
ู ุจุงูุนุฑุจูุฉ ุงููุตุญู |
| | - ูุง ุชูุฑุฑ ุนููุงู ุงููุณู
ุฏุงุฎู ุงููุต |
| | """ |
| |
|
| | |
| | |
| | |
def detect_lang(text):
    """Return "ar" when langdetect classifies *text* as Arabic, else "en".

    Only the first 300 characters of ``str(text)`` are examined. Any
    detection failure falls back to "en".
    """
    try:
        code = detect(str(text)[:300])
    except Exception:
        # Was a bare ``except``, which also swallowed KeyboardInterrupt and
        # SystemExit. Detection errors still default to English.
        return "en"
    return "ar" if code.startswith("ar") else "en"
| |
|
def clean_md(text):
    """Strip Markdown punctuation from *text* and flatten it to one line.

    Removes the characters ``# * ` > [ ] ! _ ~``, replaces every run of
    newlines with a single space, trims surrounding whitespace and caps the
    result at 2500 characters.
    """
    without_markup = re.sub(r"[#*`>\[\]!_~]", "", text)
    single_line = re.sub(r"\n+", " ", without_markup)
    return single_line.strip()[:2500]
| |
|
def fix_ar_format(text):
    """Normalise blank lines around ``##`` headings in LLM output.

    Ensures a blank line before every ``##`` heading that follows a newline
    and after every ``## title`` line that is directly followed by body
    text, then collapses runs of three or more newlines to exactly two and
    trims the result.
    """
    # Blank line before a heading.
    text = re.sub(r"\n(##)", r"\n\n\1", text)
    # Blank line after a heading that runs straight into body text.
    text = re.sub(r"(## [^\n]+)\n([^\n#])", r"\1\n\n\2", text)
    # The first substitution can over-pad; squeeze back to one blank line.
    return re.sub(r"\n{3,}", "\n\n", text).strip()
| |
|
def cit_badge(n):
    """Return a short display badge for a citation count.

    ``None``, ``""`` and values that cannot be converted to ``int`` (for
    example ``"N/A"``) yield the "unknown" badge instead of raising.
    Otherwise the badge prefix depends on the tier (>=1000, >=100, >=10,
    >0) and counts of 10 or more are shown with thousands separators.
    Zero or negative counts yield the "none" badge.

    NOTE(review): the badge prefixes are mis-decoded emoji in this file;
    they are reproduced unchanged.
    """
    if n is None or n == "":
        return "โ"
    try:
        count = int(n)
    except (TypeError, ValueError):
        # Previously propagated as a crash from the table renderer.
        return "โ"
    if count >= 1000:
        return "๐ฅ " + "{:,}".format(count)
    if count >= 100:
        return "๐ " + "{:,}".format(count)
    if count >= 10:
        return "โญ " + "{:,}".format(count)
    if count > 0:
        return "๐ " + str(count)
    return "ยท"
| |
|
def build_table(papers_list):
    """Render papers as a Markdown table and build matching dropdown labels.

    Args:
        papers_list: list of paper dicts with ``title``, ``authors`` and
            ``published`` keys; ``recent``, ``citations`` and ``source`` are
            optional.

    Returns:
        ``(table_markdown, choices)`` where ``choices`` holds one
        ``"<n>. <title>"`` label per paper, numbered from 1. The labels use
        the raw title because callers split them on ``". "``.
    """
    def cell(value):
        # A raw "|" or newline inside a cell would break the Markdown row.
        return str(value).replace("|", "\\|").replace("\n", " ")

    lines = [
        "| # | Title | Author | Date | Citations | Source |",
        "|---|---|---|---|---|---|",
    ]
    choices = []
    for number, paper in enumerate(papers_list, 1):
        first_author = paper["authors"][0] if paper["authors"] else "N/A"
        badge = "NEW" if paper.get("recent") else "๐"
        lines.append("| {} | {} {} | {} | {} | {} | {} |".format(
            number, badge, cell(paper["title"]), cell(first_author),
            paper["published"], cit_badge(paper.get("citations")),
            paper.get("source", "arXiv")))
        choices.append("{}. {}".format(number, paper["title"]))
    return "\n".join(lines) + "\n", choices
| |
|
def s2_headers():
    """Return HTTP headers for Semantic Scholar requests.

    The ``x-api-key`` header is added only when ``S2_API_KEY`` is non-empty.
    """
    headers = {"User-Agent": "ScientificPaperBot/7.4"}
    if S2_API_KEY:
        headers["x-api-key"] = S2_API_KEY
    return headers
| |
|
def cr_headers():
    """Return HTTP headers for CrossRef requests.

    The User-Agent carries a ``mailto:`` contact address.
    """
    return {"User-Agent": "ScientificPaperBot/7.4 (mailto:researcher@example.com)"}
| |
|
| | |
| | |
| | |
def parse_crossref_date(item):
    """Extract a ``YYYY-MM-DD`` date string from a CrossRef work item.

    The fields ``issued``, ``published``, ``published-print``,
    ``published-online`` and ``created`` are tried in that order; the first
    one whose ``date-parts`` holds a year between 1900 and
    ``CURRENT_YEAR + 1`` wins. A missing month or day defaults to 1.
    Out-of-range months are clamped to 1..12 and days to the real length of
    that month, so the result is always a valid calendar date.

    Returns:
        The formatted date, or ``"N/A"`` when no field is usable.
    """
    import calendar  # local import: only needed for the month-length lookup

    for field in ("issued", "published", "published-print",
                  "published-online", "created"):
        date_parts = (item.get(field) or {}).get("date-parts") or [[]]
        if not date_parts[0]:
            continue
        parts = date_parts[0]
        try:
            year = int(parts[0])
            if not 1900 <= year <= CURRENT_YEAR + 1:
                continue
            month = max(1, min(12, int(parts[1]) if len(parts) >= 2 else 1))
            # Clamping to 31 could emit impossible dates such as 2023-02-31.
            last_day = calendar.monthrange(year, month)[1]
            day = max(1, min(last_day, int(parts[2]) if len(parts) >= 3 else 1))
        except (ValueError, TypeError, IndexError):
            # Malformed parts (e.g. None or non-numeric): try the next field.
            continue
        return "{:04d}-{:02d}-{:02d}".format(year, month, day)
    return "N/A"
| |
|
| | |
| | |
| | |
def load_seen_ids():
    """Load the set of already-seen paper IDs from ``SEEN_IDS_PATH``.

    Returns an empty set when the file is missing, unreadable or does not
    contain a JSON array of hashable values.
    """
    try:
        with open(SEEN_IDS_PATH) as f:
            return set(json.load(f))
    except (OSError, ValueError, TypeError):
        # OSError: missing/unreadable file; ValueError: invalid JSON;
        # TypeError: payload is not an iterable of hashables.
        return set()
| |
|
def save_seen_ids(ids):
    """Write *ids* to ``SEEN_IDS_PATH`` as a JSON array, replacing the file."""
    with open(SEEN_IDS_PATH, "w") as f:
        json.dump(list(ids), f)
| |
|
def load_favorites():
    """Load the saved favorites list from ``FAVORITES_PATH``.

    Returns an empty list when the file is missing or cannot be unpickled.

    NOTE(review): ``pickle.load`` executes code embedded in the file. This
    is only safe while FAVORITES_PATH is written exclusively by this
    application; consider JSON for this data.
    """
    try:
        with open(FAVORITES_PATH, "rb") as f:
            return pickle.load(f)
    except Exception:
        # Unpickling can fail with many exception types; treat all of them
        # as "no favorites yet". Was a bare ``except``.
        return []
| |
|
def save_favorite(paper):
    """Add *paper* to the pickled favorites unless its ``id`` is already there.

    Returns:
        ``"Saved: <title>"`` after writing, or ``"Already saved."`` when a
        favorite with the same ``id`` exists.
    """
    favorites = load_favorites()
    if any(saved["id"] == paper["id"] for saved in favorites):
        return "Already saved."
    favorites.append(paper)
    with open(FAVORITES_PATH, "wb") as f:
        pickle.dump(favorites, f)
    return "Saved: " + paper["title"]
| |
|
def export_favorites_csv():
    """Export saved favorites to ``PERSIST_DIR/favorites.csv``.

    At most three authors per paper are written. The file is encoded as
    ``utf-8-sig``.

    Returns:
        The CSV path, or ``None`` when there are no favorites.
    """
    favorites = load_favorites()
    if not favorites:
        return None
    records = []
    for paper in favorites:
        records.append({
            "Title": paper["title"],
            "Authors": ", ".join(paper["authors"][:3]),
            "Date": paper["published"],
            "Citations": paper.get("citations", "N/A"),
            "URL": paper["url"],
            "Source": paper.get("source", "arXiv"),
        })
    path = PERSIST_DIR + "/favorites.csv"
    pd.DataFrame(records).to_csv(path, index=False, encoding="utf-8-sig")
    return path
| |
|
def gr_export_fav():
    """UI callback: export favorites to CSV and return the path (or None)."""
    return export_favorites_csv()
| |
|
| | |
| | |
| | |
# export_explanation_pdf(explanation_text, paper_title="paper")
#
# Render a Markdown-like explanation into an A4 PDF under PERSIST_DIR.
#   explanation_text: text to render; fewer than 30 characters returns None.
#   paper_title: used for the file name after removing non-word characters,
#                truncating to 50 characters and replacing spaces with "_".
# Returns the PDF path, or None when the input is too short or the build
# fails (the error is printed).
#
# NOTE(review): the character classes in the two emoji-stripping regexes
# below are mis-decoded and broken across lines by a control character;
# restore them from the original source. They are left untouched here.
| | def export_explanation_pdf(explanation_text, paper_title="paper"): |
| | if not explanation_text or len(explanation_text) < 30: return None |
| | safe = re.sub(r"[^\w\s-]", "", paper_title)[:50].strip().replace(" ", "_") |
| | path = PERSIST_DIR + "/explanation_" + safe + ".pdf" |
| | doc = SimpleDocTemplate(path, pagesize=A4, |
| | rightMargin=2*cm, leftMargin=2*cm, |
| | topMargin=2*cm, bottomMargin=2*cm) |
| | styles = getSampleStyleSheet() |
# Paragraph styles: h2 = section headings, bd = body text, mt = footer line.
| | h2_style = ParagraphStyle("H2", parent=styles["Heading2"], |
| | fontSize=11, textColor=colors.HexColor("#2563eb"), |
| | spaceBefore=14, spaceAfter=6) |
| | bd_style = ParagraphStyle("BD", parent=styles["Normal"], |
| | fontSize=10, leading=16, spaceAfter=8) |
| | mt_style = ParagraphStyle("MT", parent=styles["Normal"], |
| | fontSize=9, textColor=colors.HexColor("#64748b")) |
| | story = [] |
# Translate the text line by line into flowables; blank lines become spacers.
| | for line in explanation_text.split("\n"): |
| | line = line.strip() |
| | if not line: story.append(Spacer(1, 6)); continue |
# Strip bold/italic/code markers, leading "#"s and decoration characters.
| | clean = re.sub(r"\*\*(.+?)\*\*", r"\1", line) |
| | clean = re.sub(r"\*(.+?)\*", r"\1", clean) |
| | clean = re.sub(r"`(.+?)`", r"\1", clean) |
| | clean = re.sub(r"^#{1,6}\s*", "", clean) |
| | clean = re.sub(r"[๐ฏโ๐ง๐๐๐๐๐ฅ๐
๐ก๐ค#*_~]", "", clean).strip() |
| | if not clean: continue |
# The style is chosen from the ORIGINAL line prefix: "# "/"## " -> heading
# preceded by a rule, ">" -> indented quote, anything else -> body text.
| | if line.startswith("## ") or line.startswith("# "): |
| | story.append(HRFlowable(width="100%", thickness=0.5, |
| | color=colors.HexColor("#e2e8f0"), spaceAfter=4)) |
| | story.append(Paragraph(clean, h2_style)) |
| | elif line.startswith(">"): |
| | q_st = ParagraphStyle("Q", parent=styles["Normal"], |
| | fontSize=9, leftIndent=20, |
| | textColor=colors.HexColor("#475569"), leading=14) |
| | story.append(Paragraph( |
| | re.sub(r"[๐ฏโ๐ง๐๐๐๐๐ฅ๐
๐ก๐ค#*_~]","",line.lstrip(">").strip()), |
| | q_st)) |
| | else: |
| | story.append(Paragraph(clean, bd_style)) |
# Footer: spacer, rule and a generation timestamp.
| | story += [ |
| | Spacer(1, 20), |
| | HRFlowable(width="100%", thickness=0.5, color=colors.HexColor("#e2e8f0")), |
| | Paragraph("Generated by Paper Discovery v7.4 โ " + |
| | datetime.now().strftime("%Y-%m-%d %H:%M"), mt_style) |
| | ] |
# Build failures are reported on stdout and signalled with None.
| | try: |
| | doc.build(story); return path |
| | except Exception as e: |
| | print("PDF error: " + str(e)); return None |
| |
|
def gr_export_pdf(explanation_text, choice):
    """UI callback: export the current explanation as a PDF.

    Args:
        explanation_text: explanation to render; fewer than 50 characters
            is treated as "nothing to export".
        choice: dropdown label of the form ``"<n>. <title>"``; the title
            part is used for the file name ("paper" when empty).

    Returns:
        ``(path, status_message)``; ``path`` is ``None`` on failure.
    """
    if not explanation_text or len(explanation_text) < 50:
        return None, "Explain a paper first."
    title = choice.split(". ", 1)[-1] if choice else "paper"
    path = export_explanation_pdf(explanation_text, title)
    if path:
        return path, "PDF ready!"
    return None, "PDF failed."
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
def fetch_arxiv_papers(query, category, max_results=20, days_back=365,
                       sort_by="submittedDate"):
    """Query the arXiv API and return a list of normalised paper dicts.

    Args:
        query: free-text query. With ``sort_by == "relevance"`` and three or
            more words it is sent as an exact title phrase (``ti:"..."``);
            otherwise as ``all:<query>``.
        category: arXiv category code; ``""`` disables the filter.
        max_results: maximum number of entries requested.
        days_back: a paper is flagged ``recent`` when published within this
            many days. Older papers are still returned.
        sort_by: arXiv ``sortBy`` value; order is always descending.

    Returns:
        A list of dicts with keys ``id, title, authors, abstract, published,
        categories, citations, url, pdf_url, recent, source``. Returns ``[]``
        on a network, HTTP or XML error; entries that fail to parse are
        skipped and reported on stdout.
    """
    query = query.strip()
    category = category.strip()
    terms = []
    if len(query.split()) >= 3 and sort_by == "relevance":
        terms.append('ti:"' + query + '"')
    elif query:
        terms.append("all:" + query)
    if category:
        terms.append("cat:" + category)
    params = {
        "search_query": " AND ".join(terms) if terms else "all:machine learning",
        "start": 0,
        "max_results": max_results,
        "sortBy": sort_by,
        "sortOrder": "descending",
    }
    try:
        resp = requests.get("http://export.arxiv.org/api/query",
                            params=params, timeout=30)
        resp.raise_for_status()
        # Parsed inside the guard: a malformed body used to raise ParseError.
        root = ET.fromstring(resp.content)
    except Exception as e:
        print("arXiv error: " + str(e))
        return []

    atom = "{http://www.w3.org/2005/Atom}"
    arxiv = "{http://arxiv.org/schemas/atom}"
    cutoff = datetime.now() - timedelta(days=days_back)
    papers = []
    for entry in root.findall(atom + "entry"):
        try:
            pid = entry.find(atom + "id").text.split("/abs/")[-1].strip()
            title = entry.find(atom + "title").text.strip().replace("\n", " ")
            abstract = entry.find(atom + "summary").text.strip().replace("\n", " ")
            published = entry.find(atom + "published").text[:10]
            authors = [a.find(atom + "name").text
                       for a in entry.findall(atom + "author")]
            cats = set()
            primary = entry.find(arxiv + "primary_category")
            if primary is not None:
                cats.add(primary.get("term", ""))
            # The feed emits <category> in the Atom namespace; only
            # <primary_category> is in the arXiv namespace. The old lookup
            # used the arXiv namespace and never matched. Both are checked.
            for tag in (atom + "category", arxiv + "category"):
                for c in entry.findall(tag):
                    cats.add(c.get("term", ""))
            cats.discard("")
            papers.append({
                "id": pid,
                "title": title,
                "authors": authors[:6],
                "abstract": abstract[:1200],
                "published": published,
                "categories": list(cats)[:4],
                "citations": None,  # filled in later by enrich_citations
                "url": "https://arxiv.org/abs/" + pid,
                "pdf_url": "https://arxiv.org/pdf/" + pid,
                "recent": datetime.strptime(published, "%Y-%m-%d") >= cutoff,
                "source": "arXiv",
            })
        except Exception as e:
            # One malformed entry must not discard the rest of the feed.
            print("arXiv parse: " + str(e))
    return papers
| |
|
| | |
| | |
| | |
def fetch_crossref_papers(query, category_label="", max_results=20,
                          days_back=365, use_title=False):
    """Query the CrossRef works API and return normalised paper dicts.

    Args:
        query: free-text query.
        category_label: UI category label; its ``CROSSREF_SUBJECTS`` entry
            (if any) is appended to the query text.
        max_results: maximum number of papers returned. Three times as many
            rows (capped at 200) are requested because items without a
            usable title or date are dropped.
        days_back: a paper is flagged ``recent`` when published within this
            many days. Older papers are still returned.
        use_title: send the text as ``query.title`` instead of ``query``.

    Returns:
        Papers sorted by citation count, descending. Returns ``[]`` on a
        non-200/non-429 response or when all three attempts fail.
    """
    subject = CROSSREF_SUBJECTS.get(category_label, "")
    full_query = (query + " " + subject).strip() if subject else query
    params = {
        "query.title" if use_title else "query": full_query,
        "rows": min(max_results * 3, 200),
        "sort": "relevance",
        "select": ("title,author,abstract,published,published-print,"
                   "published-online,issued,created,DOI,"
                   "is-referenced-by-count,link,subject"),
    }
    items = []
    for attempt in range(3):
        try:
            r = requests.get("https://api.crossref.org/works",
                             params=params, headers=cr_headers(), timeout=30)
            if r.status_code == 200:
                items = r.json().get("message", {}).get("items", [])
                break
            if r.status_code == 429:
                time.sleep(2 ** attempt)  # exponential back-off when throttled
                continue
            print("CrossRef " + str(r.status_code))
            return []
        except Exception as e:
            print("CrossRef attempt " + str(attempt) + ": " + str(e))
            time.sleep(1)

    cutoff = datetime.now() - timedelta(days=days_back)
    papers, seen_ids = [], set()
    for item in items:
        if len(papers) >= max_results:
            break
        title_list = item.get("title") or []
        if not title_list:
            continue
        title = (title_list[0] or "").strip()
        if not title or title.lower().startswith("title pending"):
            continue
        pub = parse_crossref_date(item)
        if pub == "N/A":
            continue
        cit = int(item.get("is-referenced-by-count", 0) or 0)
        authors = [
            (a.get("given", "") + " " + a.get("family", "")).strip()
            for a in item.get("author", [])[:6]
        ]
        authors = [a for a in authors if a.strip()] or ["Unknown"]
        # ``or`` also covers an explicit null abstract, which would have
        # crashed re.sub. Abstracts carry JATS/XML tags; strip them.
        abstract = re.sub(r"<[^>]+>", "",
                          item.get("abstract") or "No abstract.").strip()[:1200]
        doi = item.get("DOI", "")
        url = "https://doi.org/" + doi if doi else "#"
        # Without a DOI, fall back to a key derived from the title.
        pid = doi or re.sub(r"\W", "", title)[:40]
        if pid in seen_ids:
            continue
        seen_ids.add(pid)
        pdf_url = next((l.get("URL", "") for l in item.get("link", [])
                        if "pdf" in l.get("content-type", "").lower()), "")
        try:
            recent = datetime.strptime(pub[:10], "%Y-%m-%d") >= cutoff
        except ValueError:
            recent = False
        papers.append({
            "id": pid,
            "title": title,
            "authors": authors,
            "abstract": abstract,
            "published": pub[:10],
            "categories": item.get("subject", [])[:3],
            "citations": cit,
            "url": url,
            "pdf_url": pdf_url,
            "recent": recent,
            "source": "CrossRef",
        })
    papers.sort(key=lambda x: x["citations"], reverse=True)
    return papers
| |
|
| | |
| | |
| | |
def global_paper_search(query, source_choice, max_results=10):
    """Search arXiv and/or CrossRef by title/keywords and return Markdown.

    Args:
        query: title or keywords; blank input returns a prompt message.
        source_choice: ``"arXiv"``, ``"CrossRef"`` or ``"Both"``.
        max_results: per-source result limit (converted with ``int``).

    Returns:
        A Markdown report of the de-duplicated results ordered by citation
        count (descending), or a short message when there is no input or no
        result. Duplicates are detected on the first 60 word characters of
        the lower-cased title.
    """
    if not query or not query.strip():
        return "Enter a title or keywords."
    q = query.strip()
    limit = int(max_results)
    papers = []
    if source_choice in ("arXiv", "Both"):
        papers += fetch_arxiv_papers(q, "", limit, 3650, sort_by="relevance")
    if source_choice in ("CrossRef", "Both"):
        papers += fetch_crossref_papers(q, "", limit, 3650, use_title=True)
    if not papers:
        return "No results for: " + q

    seen, unique = set(), []
    for p in papers:
        key = re.sub(r"\W", "", p["title"].lower())[:60]
        if key not in seen:
            seen.add(key)
            unique.append(p)
    unique.sort(key=lambda x: x.get("citations") or 0, reverse=True)

    NL = "\n"
    chunks = [
        "## Search Results: " + q + NL + NL,
        "**" + str(len(unique)) + " papers found**" + NL + NL + "---" + NL + NL,
    ]
    for i, p in enumerate(unique, 1):
        # The badge is shown only for a known, non-zero citation count.
        cit = (" | " + cit_badge(p.get("citations"))) if p.get("citations") else ""
        cats = " | ".join(p.get("categories", [])[:2])
        auth = ", ".join(p["authors"][:3])
        link = "[View](" + p["url"] + ")"
        pdf = (" [PDF](" + p["pdf_url"] + ")") if p.get("pdf_url") else ""
        chunks.append(
            "### " + str(i) + ". " + p["title"] + NL + NL +
            auth + " | " + p["published"] + cit + " | " + p.get("source", "") +
            (" | " + cats if cats else "") + NL + NL +
            "> " + p["abstract"][:450] + "..." + NL + NL +
            link + pdf + NL + NL + "---" + NL + NL)
    return "".join(chunks)
| |
|
| | |
| | |
| | |
def enrich_citations(papers):
    """Fill in citation counts for arXiv papers, mutating *papers* in place.

    Only papers with ``source == "arXiv"`` and a missing or zero count are
    looked up, in three passes:

    1. Semantic Scholar batch endpoint (chunks of 500 IDs).
    2. Semantic Scholar single-paper endpoint for up to 15 papers that are
       still at zero, with one retry after HTTP 429.
    3. CrossRef title search for whatever is still at zero; the hit is
       accepted only when at least two of the first five title words appear
       among the first ten words of the returned title.

    Every paper ends up with an integer ``citations`` value (0 if unknown).

    Returns:
        The same *papers* list.
    """
    def bare_id(raw):
        # "1234.5678v2" / "cs/0101001v1" -> ID without the version suffix.
        return re.sub(r"v\d+$", "", raw.split("/")[-1].strip())

    def zero_fill():
        for p in papers:
            if p.get("citations") is None:
                p["citations"] = 0
        return papers

    def still_zero(candidates):
        return [x for x in candidates if (x.get("citations") or 0) == 0]

    arxiv_papers = [p for p in papers
                    if p.get("source") == "arXiv" and
                    (p.get("citations") is None or p.get("citations") == 0)]
    if not arxiv_papers:
        return zero_fill()

    # ---- Pass 1: Semantic Scholar batch lookup ----
    id_map, batch_ids = {}, []
    for p in arxiv_papers:
        clean = bare_id(p["id"])
        id_map[clean] = p
        batch_ids.append("arXiv:" + clean)
    for start in range(0, len(batch_ids), 500):
        try:
            r = requests.post(
                "https://api.semanticscholar.org/graph/v1/paper/batch",
                json={"ids": batch_ids[start:start + 500]},
                params={"fields": "citationCount,externalIds"},
                headers=s2_headers(), timeout=30)
            if r.status_code == 200:
                for item in r.json():
                    if not item:  # unknown IDs come back as null
                        continue
                    ext = item.get("externalIds") or {}
                    clean = bare_id(ext.get("ArXiv", ""))
                    if clean and clean in id_map:
                        c = item.get("citationCount")
                        if c is not None:
                            id_map[clean]["citations"] = int(c)
            elif r.status_code == 429:
                # Throttled: pause before the next chunk. This chunk is not
                # retried; passes 2 and 3 act as the fallback.
                time.sleep(4)
        except Exception as e:
            print("S2 batch: " + str(e))

    # ---- Pass 2: Semantic Scholar single lookups (capped at 15) ----
    for p in still_zero(arxiv_papers)[:15]:
        clean = bare_id(p["id"])
        for attempt in range(2):
            try:
                r = requests.get(
                    "https://api.semanticscholar.org/graph/v1/paper/arXiv:" + clean,
                    params={"fields": "citationCount"},
                    headers=s2_headers(), timeout=10)
                if r.status_code == 200:
                    c = r.json().get("citationCount")
                    p["citations"] = int(c) if c else 0
                    break
                if r.status_code == 429:
                    time.sleep(2 ** attempt)
                    continue
                p["citations"] = 0
                break
            except Exception:
                p["citations"] = 0
                break
        time.sleep(0.12)  # pause between single-paper requests

    # ---- Pass 3: CrossRef title match ----
    for p in still_zero(arxiv_papers):
        try:
            r = requests.get("https://api.crossref.org/works",
                             params={"query.title": p["title"], "rows": 1,
                                     "select": "is-referenced-by-count,title"},
                             headers=cr_headers(), timeout=8)
            if r.status_code == 200:
                items = r.json().get("message", {}).get("items", [])
                if items:
                    found = (items[0].get("title") or [""])[0].lower()
                    query_words = set(p["title"].lower().split()[:5])
                    found_words = set(found.split()[:10])
                    # Require some word overlap so an unrelated top hit
                    # does not donate its citation count.
                    p["citations"] = (
                        int(items[0].get("is-referenced-by-count", 0) or 0)
                        if len(query_words & found_words) >= 2 else 0)
                else:
                    p["citations"] = 0
            else:
                p["citations"] = 0
            time.sleep(0.12)
        except Exception:
            p["citations"] = 0

    return zero_fill()
| |
|
| | |
| | |
| | |
def build_papers_index(papers):
    """Replace the global paper list and rebuild the FAISS similarity index.

    Each paper is embedded from ``title + " " + abstract`` with normalised
    vectors and stored in an inner-product index. An empty *papers* list
    clears the index.
    """
    global FAISS_INDEX, PAPERS
    PAPERS = papers
    if not papers:
        FAISS_INDEX = None
        return
    texts = [p["title"] + " " + p["abstract"] for p in papers]
    vectors = embedder.encode(texts, convert_to_numpy=True,
                              normalize_embeddings=True).astype("float32")
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    FAISS_INDEX = index
| |
|
def search_papers(query, top_k=5):
    """Return the indexed papers most similar to *query*.

    Args:
        query: free-text query, embedded the same way as the papers.
        top_k: maximum number of hits (capped at the number of papers).

    Returns:
        A list of ``{"paper": <dict>, "score": <float>}`` in the order FAISS
        returns them, keeping only hits with a score above 0.1. Returns
        ``[]`` when nothing is indexed.
    """
    if FAISS_INDEX is None or not PAPERS:
        return []
    query_vec = embedder.encode([query], convert_to_numpy=True,
                                normalize_embeddings=True).astype("float32")
    scores, ids = FAISS_INDEX.search(query_vec, min(top_k, len(PAPERS)))
    hits = []
    for score, idx in zip(scores[0], ids[0]):
        # Negative indices are skipped, as in the original filter.
        if idx >= 0 and float(score) > 0.1:
            hits.append({"paper": PAPERS[idx], "score": float(score)})
    return hits
| |
|
| | |
| | |
| | |
def auto_fetch_worker(query, category, interval):
    """Background loop: poll arXiv and log newly seen papers.

    Runs while the module flag ``AUTO_RUNNING`` is true. Each cycle sleeps
    *interval* seconds, fetches up to 30 arXiv papers for *query* /
    *category*, and when some IDs are not yet in the seen-ID file, saves
    the merged ID set and appends one line to ``AUTO_LOG`` (trimmed to 20
    entries).
    """
    while AUTO_RUNNING:
        time.sleep(interval)
        if not AUTO_RUNNING:  # stopped while sleeping
            break
        papers = fetch_arxiv_papers(query, category, 30, 1)
        seen = load_seen_ids()
        new_papers = [p for p in papers if p["id"] not in seen]
        if not new_papers:
            continue
        try:
            save_seen_ids(seen | {p["id"] for p in papers})
        except OSError as e:
            # A failed write must not kill the worker thread silently.
            print("seen-ids save: " + str(e))
        AUTO_LOG.append(
            "[" + datetime.now().strftime("%H:%M") + "] NEW " +
            str(len(new_papers)) + " โ " + query)
        if len(AUTO_LOG) > 20:
            AUTO_LOG.pop(0)
| |
|
def start_auto_fetch(query, cat_label, interval_min):
    """Start the auto-fetch worker in a daemon thread.

    Args:
        query: arXiv query passed to the worker.
        cat_label: UI category label, mapped through ``CATEGORIES``.
        interval_min: polling interval in minutes (converted with ``int``).

    Returns:
        A status message; ``"Already running."`` when a worker is active.
    """
    global AUTO_RUNNING
    if AUTO_RUNNING:
        return "Already running."
    # Convert before setting the flag: a bad value used to raise after
    # AUTO_RUNNING was already True, leaving it stuck with no thread.
    interval_sec = int(interval_min) * 60
    AUTO_RUNNING = True
    worker = threading.Thread(
        target=auto_fetch_worker,
        args=(query, CATEGORIES.get(cat_label, ""), interval_sec),
        daemon=True)
    worker.start()
    return "Auto-fetch started every " + str(interval_min) + " min for: " + query
| |
|
def stop_auto_fetch():
    """Clear ``AUTO_RUNNING`` so the worker exits after its current sleep."""
    global AUTO_RUNNING
    AUTO_RUNNING = False
    return "Stopped."
| |
|
def get_auto_log():
    """Return the last ten auto-fetch log lines, newest first.

    Lines are separated by a blank line; ``"No log."`` when empty.
    """
    if not AUTO_LOG:
        return "No log."
    return "\n\n".join(reversed(AUTO_LOG[-10:]))
| |
|
| | |
| | |
| | |
def analyze_trends(papers):
    """Plot a 2x3 trends dashboard and build a Markdown statistics summary.

    Panels: papers per month (with a linear trend line when there are more
    than two months), top 15 title keywords, source distribution, top 10
    cited papers, citation histogram, and top 10 authors (first three
    authors of each paper are counted).

    Citation average and histogram only consider papers with at least one
    citation, so the "0" bucket stays empty.

    Args:
        papers: list of paper dicts as produced by the fetch functions.

    Returns:
        ``(png_path, stats_markdown)``, or ``(None, "No papers.")`` for
        empty input. The image is written to ``PERSIST_DIR/trends.png``.
    """
    if not papers:
        return None, "No papers."

    # ---- Aggregation ----
    date_counts = Counter(p["published"][:7] for p in papers
                          if p["published"] != "N/A")
    stopwords = {"the", "a", "an", "of", "in", "for", "on", "with", "and", "or",
                 "to", "using", "based", "via", "from", "by", "is", "are", "our",
                 "we", "this", "that", "which", "towards", "approach", "method",
                 "new", "into", "over", "learning", "deep", "model", "models",
                 "data", "neural", "large", "language", "paper", "study",
                 "analysis", "results", "show", "also", "can", "used", "two",
                 "its", "their"}
    all_words = [w.lower() for p in papers
                 for w in re.findall(r"[a-zA-Z]{4,}", p["title"])
                 if w.lower() not in stopwords]
    top_words = Counter(all_words).most_common(15)
    sources = Counter(p.get("source", "arXiv") for p in papers)
    cit_papers = [p for p in papers if (p.get("citations") or 0) > 0]
    ranked = sorted(cit_papers, key=lambda x: x["citations"], reverse=True)
    top_cited = ranked[:10]
    all_auth = [a for p in papers for a in p["authors"][:3]]
    top_authors = Counter(all_auth).most_common(10)
    cvals = [p["citations"] for p in cit_papers]
    # The last bucket is open-ended; the former upper bound of 10000 dropped
    # papers with >= 10,000 citations from the "500+" bar.
    buckets = [0, 1, 5, 10, 50, 100, 500, float("inf")]
    blabels = ["0", "1-4", "5-9", "10-49", "50-99", "100-499", "500+"]
    bcounts = ([sum(1 for c in cvals if lo <= c < hi)
                for lo, hi in zip(buckets, buckets[1:])] if cvals else [0] * 7)
    avg_cit = round(sum(cvals) / max(len(cvals), 1), 1) if cvals else 0
    total_cit = sum(p.get("citations") or 0 for p in papers)

    # ---- Figure ----
    C = ["#3b82f6", "#8b5cf6", "#10b981", "#f59e0b", "#ef4444", "#06b6d4",
         "#ec4899", "#14b8a6", "#f97316", "#a855f7", "#22d3ee", "#84cc16",
         "#fbbf24", "#34d399", "#f87171"]
    BG, PNL, BR, W = "#0f172a", "#1e293b", "#334155", "white"
    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    fig.patch.set_facecolor(BG)
    fig.suptitle("Research Trends", color=W, fontsize=16, fontweight="bold", y=1.01)

    def style(ax):
        ax.set_facecolor(PNL)
        for sp in ax.spines.values():
            sp.set_edgecolor(BR)
        ax.tick_params(colors=W, labelsize=8)

    # Panel 1: papers per month
    ax = axes[0, 0]
    style(ax)
    if date_counts:
        ms, cs = zip(*sorted(date_counts.items()))
        ms, cs = list(ms), list(cs)
        bars = ax.bar(ms, cs, color=C[0], edgecolor="#60a5fa", lw=0.8)
        for b, c in zip(bars, cs):
            ax.text(b.get_x() + b.get_width() / 2, b.get_height() + .05, str(c),
                    ha="center", va="bottom", color=W, fontsize=8)
        if len(cs) > 2:
            z = np.polyfit(range(len(cs)), cs, 1)
            ax.plot(ms, np.poly1d(z)(range(len(cs))), "--",
                    color="#f59e0b", lw=1.5, alpha=.8, label="Trend")
            ax.legend(fontsize=8, facecolor=PNL, labelcolor=W)
    ax.set_title("Papers per Month", color=W, fontsize=12, fontweight="bold", pad=10)
    ax.set_ylabel("Count", color=W, fontsize=9)
    ax.tick_params(rotation=45)

    # Panel 2: top keywords
    ax = axes[0, 1]
    style(ax)
    if top_words:
        wds, wcts = zip(*top_words)
        ax.barh(list(wds), list(wcts), color=C[:len(wds)], edgecolor="#475569", lw=.6)
        for b, c in zip(ax.patches, wcts):
            ax.text(b.get_width() + .1, b.get_y() + b.get_height() / 2, str(c),
                    va="center", color=W, fontsize=8)
    ax.set_title("Top Keywords", color=W, fontsize=12, fontweight="bold", pad=10)
    ax.set_xlabel("Frequency", color=W, fontsize=9)

    # Panel 3: source distribution (pie; only the face colour is set)
    ax = axes[0, 2]
    ax.set_facecolor(PNL)
    if sources:
        sl, sv = zip(*sources.items())
        _, txts, ats = ax.pie(sv, labels=sl, autopct="%1.0f%%",
                              colors=C[:len(sl)], startangle=90,
                              textprops={"color": W, "fontsize": 10},
                              wedgeprops={"edgecolor": BR, "linewidth": 1.5})
        for at in ats:
            at.set_color(W)
            at.set_fontsize(9)
    ax.set_title("Source Distribution", color=W, fontsize=12, fontweight="bold", pad=10)

    # Panel 4: top cited papers
    ax = axes[1, 0]
    style(ax)
    if top_cited:
        lbls = [(p["title"][:35] + "..." if len(p["title"]) > 35 else p["title"])
                for p in top_cited]
        cv = [p["citations"] for p in top_cited]
        # Reversed so the most-cited paper ends up at the top of the chart.
        ax.barh(lbls[::-1], cv[::-1], color=C[1], edgecolor="#475569", lw=.6)
        mx = max(cv) if cv else 1
        for b, c in zip(ax.patches, cv[::-1]):
            ax.text(b.get_width() + mx * .01, b.get_y() + b.get_height() / 2,
                    "{:,}".format(c), va="center", color=W, fontsize=8)
        ax.set_xlabel("Citations", color=W, fontsize=9)
    else:
        ax.text(.5, .5, "No citation data", ha="center", va="center",
                color="#94a3b8", fontsize=11, transform=ax.transAxes)
    ax.set_title("Top 10 Cited", color=W, fontsize=12, fontweight="bold", pad=10)

    # Panel 5: citation histogram
    ax = axes[1, 1]
    style(ax)
    if any(bcounts):
        ax.bar(blabels, bcounts, color=C[2], edgecolor="#475569", lw=.8)
        for b, c in zip(ax.patches, bcounts):
            if c > 0:
                ax.text(b.get_x() + b.get_width() / 2, b.get_height() + .1, str(c),
                        ha="center", va="bottom", color=W, fontsize=9)
        ax.set_xlabel("Citation Range", color=W, fontsize=9)
        ax.set_ylabel("Papers", color=W, fontsize=9)
        ax.annotate("Avg " + str(avg_cit) + " | Total " + "{:,}".format(total_cit),
                    xy=(.98, .96), xycoords="axes fraction",
                    ha="right", va="top", color="#94a3b8", fontsize=8)
    else:
        ax.text(.5, .5, "No citation data", ha="center", va="center",
                color="#94a3b8", fontsize=11, transform=ax.transAxes)
    ax.set_title("Citation Distribution", color=W, fontsize=12, fontweight="bold", pad=10)

    # Panel 6: top authors
    ax = axes[1, 2]
    style(ax)
    if top_authors:
        an, ac = zip(*top_authors)
        ax.barh(list(an)[::-1], list(ac)[::-1], color=C[3], edgecolor="#475569", lw=.6)
        for b, c in zip(ax.patches, list(ac)[::-1]):
            ax.text(b.get_width() + .05, b.get_y() + b.get_height() / 2, str(c),
                    va="center", color=W, fontsize=8)
        ax.set_xlabel("Papers", color=W, fontsize=9)
    ax.set_title("Top Authors", color=W, fontsize=12, fontweight="bold", pad=10)

    # Operate on this figure explicitly rather than pyplot's "current
    # figure", which is process-global state.
    fig.tight_layout(pad=3)
    path = PERSIST_DIR + "/trends.png"
    fig.savefig(path, bbox_inches="tight", dpi=150, facecolor=BG)
    plt.close(fig)

    # ---- Markdown summary ----
    stats = ("### Stats\n\n| Metric | Value |\n|---|---|\n" +
             "| Total | **" + str(len(papers)) + "** |\n" +
             "| New | **" + str(sum(1 for p in papers if p.get("recent"))) + "** |\n" +
             "| Citations | **" + "{:,}".format(total_cit) + "** |\n" +
             "| Average | **" + str(avg_cit) + "** |\n\n")
    top5 = ranked[:5]
    if top5:
        stats += "### Top Cited\n\n"
        for i, p in enumerate(top5, 1):
            stats += (str(i) + ". [" + p["title"] + "](" + p["url"] + ")" +
                      " โ **" + "{:,}".format(p["citations"]) + "**\n\n")
    return path, stats
| |
|
| | |
| | |
| | |
def _llm(messages, max_tokens=1200):
    """Send *messages* to the Groq chat model and return the reply text.

    Uses ``llama-3.3-70b-versatile`` at temperature 0.3. Failures are not
    raised: the function returns ``"LLM Error: <message>"`` so callers can
    display it like a normal reply.
    """
    try:
        response = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            temperature=0.3,
            max_tokens=max_tokens)
        return response.choices[0].message.content.strip()
    except Exception as e:
        return "LLM Error: " + str(e)
| |
|
# explain_paper(paper, lang="ar")
#
# Ask the LLM for a sectioned explanation of one paper.
#   paper: dict with "title", "authors", "published", "abstract"; "citations"
#          is optional and shown as "N/A" when absent.
#   lang:  "ar" sends an Arabic system message (including AR_RULES) plus an
#          Arabic prompt and post-processes the reply with fix_ar_format;
#          any other value sends a single English user prompt.
# Returns the LLM reply text. _llm returns "LLM Error: ..." on failure, so
# that string can also come back from here.
#
# NOTE(review): the Arabic literals are mis-decoded and broken across lines
# by control characters; restore them from the original source. They are
# left untouched here.
| | def explain_paper(paper, lang="ar"): |
| | cit = paper.get("citations","N/A") |
| | if lang == "ar": |
| | return fix_ar_format(_llm([ |
| | {"role":"system","content": "ุฃูุช ุฎุจูุฑ ุฃูุงุฏูู
ู ูุดุฑุญ ุงูุฃุจุญุงุซ ุจุงูุนุฑุจูุฉ ุงููุตุญู.\n" + AR_RULES}, |
| | {"role":"user","content": |
| | "ุงุดุฑุญ ุงููุฑูุฉ:\nุงูุนููุงู: " + paper["title"] + "\n" + |
| | "ุงูู
ุคูููู: " + ", ".join(paper["authors"][:3]) + "\n" + |
| | "ุงูุชุงุฑูุฎ: " + paper["published"] + " | ุงูุงูุชุจุงุณุงุช: " + str(cit) + "\n" + |
| | "ุงูู
ูุฎุต: " + paper["abstract"] + "\n\n" + |
| | "## ู
ูุถูุน ุงููุฑูุฉ\n\n## ุงูู
ุดููุฉ\n\n## ุงูู
ููุฌูุฉ\n\n" + |
| | "## ุงููุชุงุฆุฌ\n\n## ุงูุฃูู
ูุฉ\n\n## ุงูุชุทุจููุงุช"}])) |
# English path: the prompt ends with the six section headings to fill in.
| | return _llm([{"role":"user","content": |
| | "Explain:\nTitle: " + paper["title"] + "\nAuthors: " + |
| | ", ".join(paper["authors"][:3]) + "\nDate: " + paper["published"] + |
| | " | Citations: " + str(cit) + "\nAbstract: " + paper["abstract"] + "\n\n" + |
| | "## Topic\n## Problem\n## Methodology\n## Findings\n## Contribution\n## Applications"}]) |
| |
|
# compare_papers(pa, pb, lang="ar")
#
# Ask the LLM for a sectioned comparison of two papers.
#   pa, pb: paper dicts; title, citation count ("N/A" when absent) and the
#           first 500 characters of each abstract go into the prompt.
#   lang:   "ar" uses the Arabic prompt with AR_RULES and runs the reply
#           through fix_ar_format; any other value uses the English prompt.
# Both paths allow up to 1400 completion tokens. Returns the LLM reply text.
#
# NOTE(review): the Arabic literals are mis-decoded and broken across lines;
# they are left untouched here.
| | def compare_papers(pa, pb, lang="ar"): |
| | body = ("Paper A: " + pa["title"] + " | Citations: " + str(pa.get("citations","N/A")) + |
| | "\n" + pa["abstract"][:500] + "\n\nPaper B: " + |
| | pb["title"] + " | Citations: " + str(pb.get("citations","N/A")) + |
| | "\n" + pb["abstract"][:500]) |
| | if lang == "ar": |
| | return fix_ar_format(_llm([{"role":"user","content": |
| | "ูุงุฑู ุจูู ุงููุฑูุชูู.\n" + AR_RULES + "\n\n" + body + "\n\n" + |
| | "## ุงููุฏู\n\n## ุงูู
ููุฌูุฉ\n\n## ุงููุชุงุฆุฌ\n\n" + |
| | "## ุงูููุฉ\n\n## ุงููููุฏ\n\n## ุงูุฎูุงุตุฉ"}], 1400)) |
| | return _llm([{"role":"user","content": |
| | "Compare:\n" + body + "\n\n" + |
| | "## Topic\n## Methodology\n## Results\n## Strengths\n## Limits\n## Verdict"}], 1400) |
| |
|
# summarize_papers(papers, topic, lang="ar")
#
# Ask the LLM for an academic overview of a topic based on fetched papers.
#   papers: list of paper dicts; only the first 8 are used, each contributing
#           its title, publication date and the first 300 abstract characters.
#   topic:  topic name, quoted inside the prompt.
#   lang:   "ar" uses the Arabic prompt with AR_RULES and runs the reply
#           through fix_ar_format; any other value uses the English prompt.
# Both paths allow up to 900 completion tokens. Returns the LLM reply text.
#
# NOTE(review): the Arabic literals are mis-decoded and broken across lines;
# they are left untouched here.
| | def summarize_papers(papers, topic, lang="ar"): |
| | text = "".join( |
| | str(i) + ". " + p["title"] + " (" + p["published"] + "): " + |
| | p["abstract"][:300] + "...\n\n" |
| | for i,p in enumerate(papers[:8],1)) |
| | if lang == "ar": |
| | return fix_ar_format(_llm([{"role":"user","content": |
| | "ูุธุฑุฉ ุนุงู
ุฉ ุฃูุงุฏูู
ูุฉ ุญูู \"" + topic + "\".\n" + AR_RULES + |
| | "\n\n" + text + "\n\n" + |
| | "## ุงูุงุชุฌุงูุงุช\n\n## ุฃุจุฑุฒ ุงูุฃูุฑุงู\n\n" + |
| | "## ุงูู
ูุงุถูุน ุงูู
ุดุชุฑูุฉ\n\n## ุงููุฌูุงุช"}], 900)) |
| | return _llm([{"role":"user","content": |
| | "Academic overview of \"" + topic + "\":\n" + text + "\n\n" + |
| | "## Trends\n## Key Papers\n## Themes\n## Gaps"}], 900) |
| |
|
def generate_bibliography(papers, style="APA"):
    """Format *papers* as a bibliography and write it to a text file.

    Args:
        papers: list of paper dicts (``authors``, ``published``, ``title``,
            ``url``).
        style: ``"APA"``, ``"IEEE"`` or ``"Chicago"``; any other value
            produces BibTeX ``@article`` entries.

    Returns:
        ``(bibliography_text, path)`` where the file is
        ``PERSIST_DIR/bibliography_<style>.txt``. Entries are separated by a
        blank line and numbered from 1; the year is ``"n.d."`` when the
        publication date is missing.
    """
    entries = []
    for i, p in enumerate(papers, 1):
        authors = p["authors"]
        auth = ", ".join(authors[:6]) + (" et al." if len(authors) > 6 else "")
        year = p["published"][:4] if p["published"] not in ("N/A", "") else "n.d."
        title, url = p["title"], p["url"]
        if style == "APA":
            entries.append(str(i) + ". " + auth + " (" + year + "). *" +
                           title + "*. " + url)
        elif style == "IEEE":
            ae = " and ".join(authors[:3]) + (" et al." if len(authors) > 3 else "")
            entries.append("[" + str(i) + "] " + ae + ', "' + title + '," ' +
                           year + ". [Online]: " + url)
        elif style == "Chicago":
            entries.append(str(i) + ". " + auth + '. "' + title + '." (' +
                           year + "). " + url)
        else:
            # BibTeX separates names with " and "; a comma-joined list is
            # parsed as one malformed name. "and others" marks truncation.
            bib_auth = " and ".join(authors[:6]) + (
                " and others" if len(authors) > 6 else "")
            # Cite key: last word of the first author's name + year + index.
            key = re.sub(r"\W", "", (authors[0].split()[-1]
                                     if authors else "Auth")) + year
            entries.append("@article{" + key + str(i) + ",\n title={" + title +
                           "},\n author={" + bib_auth + "},\n year={" + year +
                           "},\n url={" + url + "}\n}")
    bib = "\n\n".join(entries)
    path = PERSIST_DIR + "/bibliography_" + style + ".txt"
    with open(path, "w", encoding="utf-8") as f:
        f.write(bib)
    return bib, path
| |
|
# chat_about_papers(question, history)
#
# Answer a question using the indexed papers as context.
#   question: user question; its language is detected with detect_lang and
#             selects the Arabic or English system prompt.
#   history:  earlier chat turns; only the last 4 are forwarded, and each is
#             read as t["role"] / t["content"].
# Returns the LLM reply (passed through fix_ar_format for Arabic), or a
# "fetch papers first" message when no papers are loaded.
#
# NOTE(review): the question label prepended to the user message is the same
# (Arabic) literal for both languages - confirm that this is intended.
# NOTE(review): the Arabic literals are mis-decoded and broken across lines;
# they are left untouched here.
| | def chat_about_papers(question, history): |
| | if not PAPERS: |
| | return ("ูุฑุฌู ุฌูุจ ุงูุฃูุฑุงู ุฃููุงู." if detect_lang(question)=="ar" |
| | else "Fetch papers first.") |
| | lang = detect_lang(question) |
# Retrieve up to 4 similar papers and fold them into a context preamble.
| | relevant = search_papers(question, top_k=4) |
| | context = "" |
| | if relevant: |
| | context = ("ุงูุฃูุฑุงู ุฐุงุช ุงูุตูุฉ:\n\n" if lang=="ar" else "Relevant papers:\n\n") |
| | for r in relevant: |
| | p = r["paper"] |
| | cit = (" | " + str(p["citations"]) + " citations") if p.get("citations") else "" |
| | context += ("**" + p["title"] + "** (" + p["published"] + ")" + cit + |
| | "\n" + p["abstract"][:400] + "\n๐ " + p["url"] + "\n\n") |
| | sys_msg = (("ุฃูุช ู
ุณุงุนุฏ ุจุญุซู. ุฃุฌุจ ุจุงูุนุฑุจูุฉ ุงููุตุญู.\n" + AR_RULES) if lang=="ar" |
| | else "You are an academic assistant. Answer in English.") |
# Message order: system prompt, recent history, then context + question.
| | msgs = [{"role":"system","content":sys_msg}] |
| | for t in history[-4:]: msgs.append({"role":t["role"],"content":t["content"]}) |
| | msgs.append({"role":"user","content": |
| | (context + "\nุณุคุงู: " + question) if context else question}) |
| | out = _llm(msgs, 800) |
| | return fix_ar_format(out) if lang=="ar" else out |
| |
|
def text_to_audio(text, lang="ar"):
    """Synthesize *text* to an MP3 file with gTTS.

    The text is first passed through ``clean_md``; if nothing remains, no
    audio is produced. The file is written to
    ``PERSIST_DIR/audio_<lang>.mp3``.

    Returns:
        The path of the saved MP3, or ``None`` when the cleaned text is empty
        or synthesis/saving fails (the error is printed with a ``TTS:`` prefix).
    """
    spoken = clean_md(text)
    if not spoken:
        return None
    try:
        speech = gTTS(text=spoken, lang=lang, slow=False)
        out_path = PERSIST_DIR + "/audio_" + lang + ".mp3"
        speech.save(out_path)
    except Exception as e:
        # Best-effort feature: report the failure and show no audio.
        print("TTS: " + str(e))
        return None
    return out_path
| |
|
| | |
| | |
| | |
def gr_fetch(query, category_label, max_results, days_back, source_choice,
             progress=gr.Progress()):
    """Fetch papers from the chosen source(s) and prepare all UI outputs.

    Steps: query arXiv and/or CrossRef, drop duplicate titles, enrich with
    citation counts, rebuild the search index, and render the results table.

    Returns:
        A 6-tuple: results markdown, four dropdown updates (the same update
        object repeated for each paper selector), and a status-bar string.
    """
    global ACTIVE_PAPERS
    progress(0.05, desc="Connecting...")
    limit, window = int(max_results), int(days_back)
    papers = []
    warn = ""

    if source_choice in ("arXiv", "Both"):
        progress(0.15, desc="Fetching arXiv...")
        arxiv_category = CATEGORIES.get(category_label, "")
        papers += fetch_arxiv_papers(query, arxiv_category, limit, window,
                                     sort_by="submittedDate")

    if source_choice in ("CrossRef", "Both"):
        progress(0.35, desc="Fetching CrossRef...")
        crossref_hits = fetch_crossref_papers(query, category_label, limit, window)
        if not crossref_hits:
            warn = "\n\n> CrossRef: no results."
        papers += crossref_hits

    # De-duplicate on a normalised title prefix, keeping the first occurrence.
    by_title = {}
    for paper in papers:
        title_key = re.sub(r"\W", "", paper["title"].lower())[:60]
        by_title.setdefault(title_key, paper)
    papers = list(by_title.values())

    if not papers:
        cleared = [gr.update(choices=[], value=None) for _ in range(4)]
        return ("No results." + warn, *cleared, "0 papers")

    progress(0.60, desc="Fetching citations...")
    papers = enrich_citations(papers)
    progress(0.85, desc="FAISS indexing...")
    build_papers_index(papers)
    ACTIVE_PAPERS = list(papers)

    tbl, choices = build_table(papers)
    recent = sum(1 for p in papers if p.get("recent"))
    tot_cit = sum(p.get("citations") or 0 for p in papers)
    zero_cit = sum(1 for p in papers if (p.get("citations") or 0) == 0)

    note = ""
    if zero_cit:
        note = "\n\n> " + str(zero_cit) + " papers with 0 citations (new/unindexed)."

    md = (f"## Fetched **{len(papers)}** papers\n\n"
          f"New: **{recent}** | Citations: **{tot_cit:,}**"
          + warn + note + "\n\n---\n\n" + tbl)
    first_choice = choices[0] if choices else None
    upd = gr.update(choices=choices, value=first_choice)
    progress(1.0)
    status = f"{len(papers)} papers | {tot_cit:,} cit."
    return md, upd, upd, upd, upd, status
| |
|
def gr_filter_papers(year_from, year_to, cit_min, cit_max, sort_by):
    """Filter and sort the loaded papers by year and citation count.

    Papers whose publication year cannot be parsed are kept (only the
    citation filter applies to them). The result becomes ``ACTIVE_PAPERS``.

    Args:
        year_from, year_to: Inclusive publication-year bounds.
        cit_min, cit_max: Inclusive citation-count bounds.
        sort_by: One of "Newest", "Oldest", "Most Cited", "Least Cited";
            any other value leaves the order unchanged.

    Returns:
        A 3-tuple: results markdown, a dropdown update for the paper
        selector, and a status string.
    """
    global ACTIVE_PAPERS
    if not PAPERS:
        return "Fetch papers first.", gr.update(), "0"

    # Convert the bounds once instead of on every loop iteration.
    y_lo, y_hi = int(year_from), int(year_to)
    c_lo, c_hi = int(cit_min), int(cit_max)

    filtered = []
    for p in PAPERS:
        try:
            year = int(p["published"][:4])
        except (KeyError, TypeError, ValueError):
            year = None  # undated / malformed date: do not filter on year
        if year is not None and not (y_lo <= year <= y_hi):
            continue
        cit = int(p.get("citations") or 0)
        if not (c_lo <= cit <= c_hi):
            continue
        filtered.append(p)

    def by_date(x):
        # Tolerate a missing date, consistent with the year filter above.
        return x.get("published") or ""

    def by_citations(x):
        return x.get("citations") or 0

    sorters = {
        "Newest": (by_date, True),
        "Oldest": (by_date, False),
        "Most Cited": (by_citations, True),
        "Least Cited": (by_citations, False),
    }
    if sort_by in sorters:
        key, reverse = sorters[sort_by]
        filtered.sort(key=key, reverse=reverse)

    if not filtered:
        ACTIVE_PAPERS = []
        return "No matching papers.", gr.update(choices=[], value=None), "0"

    ACTIVE_PAPERS = list(filtered)
    tbl, choices = build_table(filtered)
    tot = sum(p.get("citations") or 0 for p in filtered)
    ratio = str(len(filtered)) + "/" + str(len(PAPERS))
    md = ("## " + ratio + " papers" +
          " | " + str(year_from) + "-" + str(year_to) +
          " | cit " + str(cit_min) + "-" + str(cit_max) +
          " | total " + "{:,}".format(tot) + "\n\n---\n\n" + tbl)
    selector = gr.update(choices=choices, value=choices[0] if choices else None)
    return md, selector, ratio
|
def gr_search_fetched(query):
    """Run a semantic search over the loaded papers and render it as markdown.

    Args:
        query: Free-text search string.

    Returns:
        A markdown report with up to eight hits (similarity percentage,
        title, first two authors, date, citation badge, source, abstract
        excerpt and links), or a short message when the query is empty, no
        papers are loaded, or nothing matches.
    """
    if not query or not query.strip():
        return "Enter a query."
    if not PAPERS:
        return "Fetch papers first."
    results = search_papers(query.strip(), top_k=8)
    if not results:
        return "No results for: " + query

    NL = "\n"
    parts = ["## Search: " + query + " — " + str(len(results)) + " results" + NL + NL]
    for hit in results:
        p, s = hit["paper"], hit["score"]
        cit = (" | " + cit_badge(p.get("citations"))) if p.get("citations") else ""
        link = "[View](" + p["url"] + ")"
        pdf = (" [PDF](" + p["pdf_url"] + ")") if p.get("pdf_url") else ""
        parts.append("### " + "{:.0f}".format(s * 100) + "% — " + p["title"] + NL + NL +
                     ", ".join(p["authors"][:2]) + " | " + p["published"] + cit +
                     " | " + p.get("source", "") + NL + NL +
                     "> " + p["abstract"][:350] + "..." + NL + NL +
                     link + pdf + NL + NL + "---" + NL + NL)
    return "".join(parts)
| |
|
def _get_paper(choice):
    """Resolve a dropdown choice to its paper dict.

    The choice text is expected to start with a 1-based position followed by
    a dot; the position indexes ``ACTIVE_PAPERS`` when it is non-empty,
    otherwise ``PAPERS``.

    Returns:
        The paper at that position, or ``None`` if the choice cannot be
        parsed or the position is out of range.
    """
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    try:
        position = int(choice.split(".")[0])
    except (AttributeError, ValueError):
        return None
    # Reject 0 / negative positions explicitly: plain indexing would wrap
    # around and silently return a paper from the end of the list.
    if 1 <= position <= len(pool):
        return pool[position - 1]
    return None
| |
|
def gr_explain(choice, lang_choice):
    """Build the explanation page for the selected paper.

    The page is a markdown header (title, authors, date, citation badge,
    source, links, full abstract) followed by the output of
    ``explain_paper`` in Arabic when *lang_choice* contains "Arabic",
    otherwise in English.

    Returns:
        The markdown text, or a short message when nothing is selected or
        the selection cannot be resolved.
    """
    if not choice:
        return "Fetch papers and select one."
    paper = _get_paper(choice)
    if not paper:
        return "Selection error."

    if "Arabic" in lang_choice:
        lang = "ar"
    else:
        lang = "en"

    gap = "\n" + "\n"
    pdf_link = ""
    if paper.get("pdf_url"):
        pdf_link = " [PDF](" + paper["pdf_url"] + ")"

    meta_line = ("**Date:** " + paper["published"] +
                 " | **Citations:** " + cit_badge(paper.get("citations")) +
                 " | **Source:** " + paper.get("source", "arXiv"))
    sections = [
        "# " + paper["title"],
        "**Authors:** " + ", ".join(paper["authors"]),
        meta_line,
        "[View Paper](" + paper["url"] + ")" + pdf_link,
        "---",
        "> " + paper["abstract"],
        "---",
        "## Explanation (Llama 3.3 70B)",
    ]
    header = gap.join(sections) + gap
    return header + explain_paper(paper, lang)
| |
|
def gr_audio(txt, lang_choice):
    """Turn the explanation text into speech.

    Text that is missing or shorter than 50 characters is ignored. The
    language is Arabic when *lang_choice* contains "Arabic", otherwise
    English.

    Returns:
        Whatever ``text_to_audio`` returns, or ``None`` for too-short input.
    """
    if not txt:
        return None
    if len(txt) < 50:
        return None
    tts_lang = "en"
    if "Arabic" in lang_choice:
        tts_lang = "ar"
    return text_to_audio(txt, tts_lang)
| |
|
def gr_save_fav(choice):
    """Save the selected paper to favorites.

    Returns:
        The status returned by ``save_favorite``, "Select a paper first."
        when nothing is selected, or "Error." when the selection cannot be
        resolved to a paper.
    """
    if not choice:
        return "Select a paper first."
    selected = _get_paper(choice)
    if not selected:
        return "Error."
    return save_favorite(selected)
| |
|
def gr_show_favs():
    """Render the saved favorites as a markdown list.

    Each entry shows the title, first author (or "N/A"), date, source,
    citation badge and a link; entries are separated by horizontal rules.

    Returns:
        The markdown text, or "No saved papers." when there are none.
    """
    favs = load_favorites()
    if not favs:
        return "No saved papers."
    NL = "\n"
    lines = [("**" + p["title"] + "**" + NL +
              (p["authors"][0] if p["authors"] else "N/A") +
              " | " + p["published"] + " | " + p.get("source", "") +
              " | " + cit_badge(p.get("citations")) +
              " | [Link](" + p["url"] + ")")
             for p in favs]
    # The heading separator was a mis-encoded dash in the source; restored.
    return ("### Favorites — " + str(len(favs)) + " papers" + NL + NL +
            (NL + NL + "---" + NL + NL).join(lines))
| |
|
def gr_compare(ca, cb, lc):
    """Compare the two selected papers.

    Args:
        ca, cb: Dropdown choices for paper A and paper B.
        lc: Language choice; "Arabic" in it selects Arabic, else English.

    Returns:
        The output of ``compare_papers``, or a short message when a choice is
        missing, unresolvable, or both choices are the same paper.
    """
    if not (ca and cb):
        return "Select two papers."
    first = _get_paper(ca)
    second = _get_paper(cb)
    if not first or not second:
        return "Selection error."
    if first["id"] == second["id"]:
        return "Select two different papers."
    lang = "ar" if "Arabic" in lc else "en"
    return compare_papers(first, second, lang)
| |
|
def gr_overview(query, lc):
    """Produce an overview report for the currently shown papers.

    Uses ``ACTIVE_PAPERS`` when non-empty, otherwise ``PAPERS``; an empty
    *query* is replaced by "research".

    Returns:
        Markdown starting with an "Overview" heading, or "Fetch papers
        first." when nothing is loaded.
    """
    if not PAPERS:
        return "Fetch papers first."
    pool = ACTIVE_PAPERS or PAPERS
    topic = query or "research"
    lang = "ar" if "Arabic" in lc else "en"
    report = summarize_papers(pool, topic, lang)
    return "## Overview\n\n" + report
| |
|
def gr_trends():
    """Run trend analysis on the currently shown papers.

    Returns:
        The result of ``analyze_trends`` on ``ACTIVE_PAPERS`` (or ``PAPERS``
        when that is empty), or ``(None, "Fetch papers first.")`` when no
        papers are loaded.
    """
    if not PAPERS:
        return None, "Fetch papers first."
    pool = ACTIVE_PAPERS or PAPERS
    return analyze_trends(pool)
| |
|
def gr_bib(style, progress=gr.Progress()):
    """Generate a bibliography for the currently shown papers.

    Returns:
        A 2-tuple: a fenced preview of the bibliography (truncated to 3000
        characters plus "..." when longer) and the path of the generated
        file; ``("Fetch papers first.", None)`` when no papers are loaded.
    """
    if not PAPERS:
        return "Fetch papers first.", None
    progress(0.5, desc="Generating...")
    pool = ACTIVE_PAPERS or PAPERS
    text, path = generate_bibliography(pool, style)
    progress(1.0)
    preview_limit = 3000
    short = text[:preview_limit]
    if len(text) > preview_limit:
        short += "..."
    return "```\n" + short + "\n```", path
| |
|
def gr_chat_fn(message, history):
    """Handle one chat submission.

    Args:
        message: The user's text; ``None`` or blank input is ignored.
        history: Chatbot history as ``(user, assistant)`` pairs; ``None`` is
            treated as an empty history.

    Returns:
        A 2-tuple ``(history, "")``: the history with the new exchange
        appended (unchanged for blank input) and an empty string to clear
        the input box.
    """
    if history is None:
        history = []
    # Guard against None as well as whitespace-only input.
    if not message or not message.strip():
        return history, ""

    # Flatten the (user, assistant) pairs into role/content dicts, skipping
    # empty sides of a pair.
    turns = []
    for pair in history:
        if pair[0]:
            turns.append({"role": "user", "content": pair[0]})
        if pair[1]:
            turns.append({"role": "assistant", "content": pair[1]})

    history.append((message, chat_about_papers(message, turns)))
    return history, ""
| |
|
| | |
| | |
| | |
# Custom stylesheet handed to gr.Blocks(css=CSS): hides the Gradio footer,
# centres h1 headings, and styles the elements tagged with the
# "status-bar", "legend", "filter-box" and "gs-box" elem_classes.
CSS = """
footer{display:none!important}
h1{text-align:center}
.status-bar{font-size:.85rem;color:#94a3b8;padding:2px 0}
.legend{font-size:.8rem;color:#cbd5e1;background:#1e293b;
border-radius:8px;padding:6px 14px;margin-bottom:6px}
.filter-box{background:#1e293b;border-radius:10px;
padding:12px 16px;margin-top:8px}
.gs-box{background:#1e293b;border-radius:10px;padding:14px 18px;
margin-bottom:10px;border:1px solid #334155}
"""
| |
|
# Gradio UI: layout of all tabs followed by the event wiring.
# NOTE(review): several display strings below contain mis-encoded glyphs
# (emoji and punctuation). They are reproduced as they appear in the source
# because not all of them can be recovered, and the category values must stay
# identical to the CATEGORIES keys. Restore them from a correctly encoded copy.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
    title="Scientific Paper Discovery v7.4", css=CSS
) as demo:

    gr.Markdown("# Scientific Paper Discovery v7.4\narXiv ยท CrossRef ยท Llama-3.3-70B ยท FAISS")
    gr.Markdown("Citations: ๐ฅ >=1000 | ๐ >=100 | โญ >=10 | ๐ <10 | ยท = 0",
                elem_classes="legend")
    status_bar = gr.Markdown("No papers loaded yet.", elem_classes="status-bar")

    # Upper bound for the year sliders: never below the previous fixed value,
    # but grows with the calendar so newer papers remain selectable.
    YEAR_MAX = max(2026, CURRENT_YEAR)

    with gr.Tabs():

        # ── Browse / Search ─────────────────────────────────────────────
        with gr.Tab("Browse / Search"):
            with gr.Row():
                with gr.Column(scale=3):
                    t_query = gr.Textbox(label="Topic",
                                         placeholder="ARIMA, inflation, LLM...",
                                         value="economic forecasting")
                    t_category = gr.Dropdown(label="Category",
                                             choices=list(CATEGORIES.keys()),
                                             value="๐ Economics")
                    t_source = gr.Radio(label="Source",
                                        choices=["arXiv", "CrossRef", "Both"],
                                        value="arXiv")
                with gr.Column(scale=1):
                    t_max = gr.Slider(5, 50, value=15, step=5, label="Max papers")
                    t_days = gr.Slider(1, 1500, value=365, step=30, label="Last N days")
            btn_fetch = gr.Button("Fetch Papers", variant="primary", size="lg")
            papers_table_md = gr.Markdown("Results appear here.")
            paper_selector = gr.Dropdown(label="Select paper", choices=[], interactive=True)
            with gr.Group(elem_classes="filter-box"):
                gr.Markdown("### Filter & Sort")
                with gr.Row():
                    f_year_from = gr.Slider(2000, YEAR_MAX, value=2020, step=1, label="Year from")
                    f_year_to = gr.Slider(2000, YEAR_MAX, value=YEAR_MAX, step=1, label="Year to")
                with gr.Row():
                    f_cit_min = gr.Slider(0, 5000, value=0, step=5, label="Citations min")
                    f_cit_max = gr.Slider(0, 5000, value=5000, step=5, label="Citations max")
                with gr.Row():
                    f_sort = gr.Dropdown(choices=SORT_CHOICES,
                                         value="Most Cited", label="Sort", scale=3)
                    btn_filter = gr.Button("Apply", variant="primary", scale=1)
            gr.Markdown("---\n### Semantic Search (FAISS โ in loaded papers)")
            with gr.Row():
                search_in_box = gr.Textbox(label="Search in loaded papers",
                                           placeholder="ARIMA, transformer...", scale=5)
                btn_search_in = gr.Button("Search", scale=1)
            search_in_out = gr.Markdown()

        # ── Global Search ───────────────────────────────────────────────
        with gr.Tab("Global Search"):
            gr.Markdown(
                "### Search any paper by title or keywords\n\n"
                "> Uses arXiv **relevance** sort + CrossRef **title** search.\n"
                "> Example: `Attention is All You Need`"
            )
            with gr.Group(elem_classes="gs-box"):
                with gr.Row():
                    gs_query = gr.Textbox(
                        label="Title or keywords",
                        placeholder="Attention is All You Need | ARIMA forecasting ...",
                        scale=4)
                    gs_source = gr.Radio(label="Source",
                                         choices=["arXiv", "CrossRef", "Both"],
                                         value="Both", scale=2)
                    gs_max = gr.Slider(5, 30, value=10, step=5, label="Max results", scale=1)
                btn_gs = gr.Button("Search Now", variant="primary", size="lg")
            gs_out = gr.Markdown("Enter a title or keywords...")

        # ── Explain ─────────────────────────────────────────────────────
        with gr.Tab("Explain"):
            with gr.Row():
                paper_sel2 = gr.Dropdown(label="Select paper",
                                         choices=[], interactive=True, scale=4)
                lang_exp = gr.Radio(LANG_CHOICES, value="Arabic",
                                    label="Language", scale=1)
            with gr.Row():
                btn_explain = gr.Button("Explain", variant="primary")
                btn_fav = gr.Button("Save Fav")
                btn_audio = gr.Button("Listen")
                btn_export_pdf = gr.Button("Export PDF", variant="secondary")
            with gr.Row():
                fav_status = gr.Markdown()
                pdf_status = gr.Markdown()
            explanation_out = gr.Markdown("Fetch papers and select one.")
            audio_out = gr.Audio(label="Audio", type="filepath")
            pdf_out = gr.File(label="Download PDF")

        # ── Compare ─────────────────────────────────────────────────────
        with gr.Tab("Compare"):
            with gr.Row():
                cmp_a = gr.Dropdown(label="Paper A", choices=[], interactive=True)
                cmp_b = gr.Dropdown(label="Paper B", choices=[], interactive=True)
                lang_cmp = gr.Radio(LANG_CHOICES, value="Arabic",
                                    label="Language", scale=1)
            btn_compare = gr.Button("Compare", variant="primary")
            compare_out = gr.Markdown("Select two papers.")

        # ── Chat ────────────────────────────────────────────────────────
        with gr.Tab("Chat"):
            chatbot_ui = gr.Chatbot(label="Research Assistant",
                                    height=480, bubble_full_width=False)
            with gr.Row():
                # The Arabic half of this placeholder was mis-encoded and split
                # across two lines in the source; restored to the intended text.
                chat_in = gr.Textbox(label="Question", scale=5,
                                     placeholder="Key findings? | ما أبرز النتائج؟")
                btn_send = gr.Button("Send", variant="primary", scale=1)
            btn_clear = gr.Button("Clear", size="sm")

        # ── Overview ────────────────────────────────────────────────────
        with gr.Tab("Overview"):
            with gr.Row():
                lang_ov = gr.Radio(LANG_CHOICES, value="Arabic",
                                   label="Language", scale=1)
                btn_overview = gr.Button("Generate Report", variant="primary", scale=3)
            overview_out = gr.Markdown("Fetch papers first.")

        # ── Trends ──────────────────────────────────────────────────────
        with gr.Tab("Trends"):
            btn_trends = gr.Button("Analyze Trends", variant="primary", size="lg")
            trend_chart = gr.Image(label="Trends Dashboard", type="filepath")
            trend_stats = gr.Markdown("Fetch papers first.")

        # ── Bibliography ────────────────────────────────────────────────
        with gr.Tab("Bibliography"):
            bib_style = gr.Radio(["APA", "IEEE", "Chicago", "BibTeX"],
                                 value="APA", label="Style")
            btn_bib = gr.Button("Generate Bibliography", variant="primary")
            bib_out = gr.Markdown()
            bib_file = gr.File(label="Download")

        # ── Favorites ───────────────────────────────────────────────────
        with gr.Tab("Favorites"):
            btn_show_fav = gr.Button("Show Favorites")
            favs_md = gr.Markdown("Press to show.")
            btn_export_fav = gr.Button("Export CSV", variant="secondary")
            fav_csv_file = gr.File(label="CSV File")

        # ── Auto-Fetch ──────────────────────────────────────────────────
        with gr.Tab("Auto-Fetch"):
            with gr.Row():
                auto_q = gr.Textbox(label="Topic",
                                    value="economic forecasting", scale=3)
                auto_cat = gr.Dropdown(label="Category",
                                       choices=list(CATEGORIES.keys()),
                                       value="๐ Economics", scale=2)
                auto_interval = gr.Slider(5, 120, value=60, step=5,
                                          label="Every (min)", scale=1)
            with gr.Row():
                btn_start_auto = gr.Button("Start", variant="primary")
                btn_stop_auto = gr.Button("Stop", variant="stop")
                btn_refresh_log = gr.Button("Refresh Log")
            auto_status = gr.Markdown()
            auto_log_md = gr.Markdown("No log.")

        # ── About ───────────────────────────────────────────────────────
        with gr.Tab("About"):
            gr.Markdown("""
# ๐ฌ Scientific Paper Discovery
### Version 7.4 โ Intelligent Research Assistant

---

## ๐ง About This Tool

**Scientific Paper Discovery** is an AI-powered academic research assistant that enables researchers, students, and scientists to **discover, understand, and organize** scientific literature with unprecedented ease. It combines state-of-the-art language models with multi-source academic APIs to deliver a seamless research experience.

---

## โ๏ธ Core Technologies

| Component | Technology | Role |
|---|---|---|
| ๐ค Language Model | **Llama 3.3 70B** via Groq API | Paper explanation, comparison & chat |
| ๐ Semantic Search | **FAISS** + MiniLM-L12-v2 | Vector similarity search |
| ๐ก Source 1 | **arXiv API** | Preprints across all sciences |
| ๐ Source 2 | **CrossRef API** | Peer-reviewed journal articles |
| ๐ Citations | **Semantic Scholar** (3-layer) | Real citation counts |
| ๐๏ธ Text-to-Speech | **gTTS** | Audio playback of explanations |
| ๐ PDF Export | **ReportLab** | Professional PDF generation |

---

## ๐๏ธ Feature Overview

| Tab | Feature | Description |
|---|---|---|
| ๐ Browse | Paper Fetching | Fetch latest papers by topic & category |
| ๐ Global Search | Title Search | Find any paper by exact title (relevance-sorted) |
| ๐ Explain | AI Explanation | Full structured explanation in Arabic or English |
| โ๏ธ Compare | Paper Comparison | Side-by-side AI comparison of two papers |
| ๐ฌ Chat | Research Chat | Ask questions about loaded papers |
| ๐ Overview | Batch Summary | Academic overview of all loaded papers |
| ๐ Trends | Analytics | Citation, keyword & author trend charts |
| ๐ Bibliography | Citation Export | APA, IEEE, Chicago, BibTeX formats |
| โญ Favorites | Saved Papers | Bookmark & export favorite papers |
| ๐ Auto-Fetch | Monitoring | Automatic periodic paper discovery |

---

## ๐ Search Mode Guide

| Mode | Algorithm | Best For |
|---|---|---|
| Browse | `sortBy=submittedDate` | Discovering latest papers on a topic |
| ๐ Global Search | `sortBy=relevance` + `ti:"..."` | Finding a specific paper by title |
| FAISS (internal) | Cosine similarity | Semantic search within loaded papers |

---

## ๐ Citation Badges

| Badge | Meaning |
|---|---|
| ๐ฅ | โฅ 1,000 citations โ Highly influential |
| ๐ | โฅ 100 citations โ Well-cited |
| โญ | โฅ 10 citations โ Notable |
| ๐ | < 10 citations โ Recent or niche |
| ยท | 0 citations โ New or unindexed |

---

*Built with โค๏ธ for the research community โ v7.4*
""")

    # ── Event wiring ────────────────────────────────────────────────────
    # Every handler that changes the shown paper list must refresh all four
    # paper dropdowns, because their numbered choices are resolved by position.
    FETCH_OUT = [papers_table_md, paper_selector, paper_sel2, cmp_a, cmp_b, status_bar]

    def _filter_and_sync(year_from, year_to, cit_min, cit_max, sort_by):
        """Apply the filter and fan its dropdown update out to all selectors.

        gr_filter_papers returns a single dropdown update. Previously only
        paper_selector received it, so the Explain/Compare dropdowns kept the
        pre-filter numbering while selections were resolved against the
        filtered list.
        """
        md, selector_update, status = gr_filter_papers(
            year_from, year_to, cit_min, cit_max, sort_by)
        return (md, selector_update, selector_update,
                selector_update, selector_update, status)

    btn_fetch.click(gr_fetch,
                    inputs=[t_query, t_category, t_max, t_days, t_source],
                    outputs=FETCH_OUT)
    btn_filter.click(_filter_and_sync,
                     inputs=[f_year_from, f_year_to, f_cit_min, f_cit_max, f_sort],
                     outputs=FETCH_OUT)
    # Mirror the Browse-tab selection into the Explain and Compare dropdowns.
    paper_selector.change(lambda x: [gr.update(value=x)]*3,
                          inputs=[paper_selector],
                          outputs=[paper_sel2, cmp_a, cmp_b])

    btn_search_in.click(gr_search_fetched, inputs=[search_in_box], outputs=[search_in_out])
    search_in_box.submit(gr_search_fetched, inputs=[search_in_box], outputs=[search_in_out])

    btn_gs.click(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
    gs_query.submit(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])

    btn_explain.click(gr_explain, inputs=[paper_sel2, lang_exp], outputs=[explanation_out])
    btn_fav.click(gr_save_fav, inputs=[paper_sel2], outputs=[fav_status])
    btn_audio.click(gr_audio, inputs=[explanation_out, lang_exp], outputs=[audio_out])
    btn_export_pdf.click(gr_export_pdf,
                         inputs=[explanation_out, paper_sel2],
                         outputs=[pdf_out, pdf_status])

    btn_compare.click(gr_compare, inputs=[cmp_a, cmp_b, lang_cmp], outputs=[compare_out])
    btn_overview.click(gr_overview, inputs=[t_query, lang_ov], outputs=[overview_out])
    btn_trends.click(gr_trends, outputs=[trend_chart, trend_stats])
    btn_bib.click(gr_bib, inputs=[bib_style], outputs=[bib_out, bib_file])

    btn_show_fav.click(gr_show_favs, outputs=[favs_md])
    btn_export_fav.click(gr_export_fav, outputs=[fav_csv_file])

    btn_start_auto.click(start_auto_fetch,
                         inputs=[auto_q, auto_cat, auto_interval],
                         outputs=[auto_status])
    btn_stop_auto.click(stop_auto_fetch, outputs=[auto_status])
    btn_refresh_log.click(get_auto_log, outputs=[auto_log_md])

    btn_send.click(gr_chat_fn, inputs=[chat_in, chatbot_ui], outputs=[chatbot_ui, chat_in])
    chat_in.submit(gr_chat_fn, inputs=[chat_in, chatbot_ui], outputs=[chatbot_ui, chat_in])
    btn_clear.click(lambda: ([], ""), outputs=[chatbot_ui, chat_in])
| |
|
# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()