sitayeb's picture
Update app.py
b54c8f9 verified
# ================================================================
# Scientific Paper Discovery Bot v7.4 — SyntaxError FIXED
# ================================================================
import os, re, time, json, pickle, threading
import requests
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from collections import Counter
import numpy as np
import faiss
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import gradio as gr
from sentence_transformers import SentenceTransformer
from groq import Groq
from gtts import gTTS
from langdetect import detect, DetectorFactory
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable
# Seed langdetect so repeated detections of the same text give the same answer
# (the seed assignment is the mechanism langdetect documents for this).
DetectorFactory.seed = 0
# API credentials come from the environment; each falls back to "" when unset.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
S2_API_KEY = os.environ.get("S2_API_KEY", "")
groq_client = Groq(api_key=GROQ_API_KEY)
# The sentence-embedding model is loaded once, at import time.
print("Loading embedder...")
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
# One throw-away encode call at start-up; the result is discarded.
_ = embedder.encode(["warmup"])
print("Embedder ready!")
# --- Module-level mutable state shared by the functions below ---
# Papers of the most recent fetch; build_papers_index() stores them here and
# search_papers() maps FAISS result ids back into this list.
PAPERS = []
# The list currently shown in the UI (set by gr_fetch / gr_filter_papers).
ACTIVE_PAPERS = []
# FAISS index over PAPERS, or None when nothing has been indexed.
FAISS_INDEX = None
# Flag polled by auto_fetch_worker(); start/stop_auto_fetch() toggle it.
AUTO_RUNNING = False
# Log lines appended by auto_fetch_worker().
AUTO_LOG = []
# Captured once at import time; used to reject implausible CrossRef years.
CURRENT_YEAR = datetime.now().year
# Directory where favorites, seen ids and generated exports are written.
PERSIST_DIR = "/tmp"
FAVORITES_PATH = PERSIST_DIR + "/favorites.pkl"
SEEN_IDS_PATH = PERSIST_DIR + "/seen_ids.json"
os.makedirs(PERSIST_DIR, exist_ok=True)
# UI label -> arXiv category code. An empty code means "no category filter"
# (fetch_arxiv_papers only adds a cat: clause for a non-empty value).
# NOTE(review): the label prefixes look like emoji that were mis-decoded
# (mojibake). They are left untouched because labels are looked up by exact
# key; repair them together with every place that uses these labels.
CATEGORIES = {
"๐ŸŒ All": "",
"๐Ÿ“Š Economics": "econ",
"๐Ÿ’ฐ Quant Finance": "q-fin",
"๐Ÿค– AI": "cs.AI",
"๐Ÿง  Machine Learning":"cs.LG",
"๐Ÿ’ฌ NLP": "cs.CL",
"๐Ÿ“ˆ Statistics": "stat",
"๐Ÿ”ฌ Biology": "q-bio",
"โš›๏ธ Physics": "physics",
"๐Ÿ“ Mathematics": "math",
"๐Ÿ’ป Computer Science":"cs",
}
# Same UI labels -> free-text subject that fetch_crossref_papers appends to
# the query string ("" appends nothing).
CROSSREF_SUBJECTS = {
"๐ŸŒ All": "",
"๐Ÿ“Š Economics": "economics",
"๐Ÿ’ฐ Quant Finance": "finance",
"๐Ÿค– AI": "artificial intelligence",
"๐Ÿง  Machine Learning":"machine learning",
"๐Ÿ’ฌ NLP": "natural language processing",
"๐Ÿ“ˆ Statistics": "statistics",
"๐Ÿ”ฌ Biology": "biology",
"โš›๏ธ Physics": "physics",
"๐Ÿ“ Mathematics": "mathematics",
"๐Ÿ’ป Computer Science":"computer science",
}
LANG_CHOICES = ["Arabic", "English"]
# These values are matched literally by the sort branches in gr_filter_papers.
SORT_CHOICES = ["Newest", "Oldest", "Most Cited", "Least Cited"]
# Formatting rules appended to the Arabic LLM prompts.
# NOTE(review): this text also appears mis-decoded; it is sent to the LLM as-is.
AR_RULES = """
- ุงุจุฏุฃ ูƒู„ ู‚ุณู… ุจู€ ## ู…ุน ุณุทุฑ ูุงุฑุบ ู‚ุจู„ู‡ ูˆุจุนุฏู‡
- ุงูƒุชุจ ูƒู„ ู‚ุณู… ููŠ ูู‚ุฑุฉ 3-4 ุฌู…ู„ ุจุงู„ุนุฑุจูŠุฉ ุงู„ูุตุญู‰
- ู„ุง ุชูƒุฑุฑ ุนู†ูˆุงู† ุงู„ู‚ุณู… ุฏุงุฎู„ ุงู„ู†ุต
"""
# ================================================================
# HELPERS
# ================================================================
def detect_lang(text):
    """Classify *text* as Arabic ("ar") or anything else ("en").

    Only the first 300 characters of ``str(text)`` are handed to langdetect's
    ``detect``. Any failure of the detection call falls back to "en".
    """
    try:
        code = detect(str(text)[:300])
    except Exception:
        # Best-effort fallback. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit still propagate.
        return "en"
    return "ar" if code.startswith("ar") else "en"
def clean_md(text):
    """Strip Markdown punctuation from *text* and flatten it to a single line.

    Removes the characters ``# * ` > [ ] ! _ ~``, replaces every run of
    newlines with one space, trims surrounding whitespace and finally caps
    the result at 2500 characters.
    """
    without_markup = re.sub(r"[#*`>\[\]!_~]", "", text)
    single_line = re.sub(r"\n+", " ", without_markup)
    trimmed = single_line.strip()
    return trimmed[:2500]
def fix_ar_format(text):
    """Normalise blank lines around ``##`` headings in LLM output.

    Applied in order:
      1. put an extra newline in front of every ``##`` that follows a newline;
      2. put a blank line between a ``## heading`` line and the text after it;
      3. collapse three or more consecutive newlines to exactly two.
    The result is stripped of leading/trailing whitespace.
    """
    substitutions = (
        (r"\n(##)", r"\n\n\1"),
        (r"(## [^\n]+)\n([^\n#])", r"\1\n\n\2"),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()
def cit_badge(n):
    """Return a short badge string for a citation count.

    ``None``, ``""`` and anything that cannot be converted to ``int`` (for
    example the "N/A" placeholder this file uses as a citations default)
    yield the "missing" dash instead of raising. Otherwise the badge depends
    on the magnitude: >= 1000, >= 100, >= 10, > 0, and a dot for zero or
    negative values. Counts of 10 and above are rendered with thousands
    separators.

    The glyphs were mis-decoded (mojibake) in the original literals; they are
    restored to the intended characters here.
    """
    if n is None or n == "":
        return "—"
    try:
        n = int(n)
    except (TypeError, ValueError):
        # Non-numeric placeholder: treat like a missing value.
        return "—"
    if n >= 1000:
        return "🥇 " + "{:,}".format(n)
    if n >= 100:
        return "🏆 " + "{:,}".format(n)
    if n >= 10:
        return "⭐ " + "{:,}".format(n)
    if n > 0:
        return "📄 " + str(n)
    return "·"
def build_table(papers_list):
    """Render papers as a Markdown table plus matching dropdown labels.

    Args:
        papers_list: list of paper dicts; each needs "title", "authors" and
            "published", and may carry "recent", "citations" and "source".

    Returns:
        ``(table_markdown, choices)`` where ``choices[i]`` is
        ``"<i+1>. <title>"`` for the paper in row ``i``.

    Title and first-author cells are made table-safe: line breaks become a
    space and ``|`` is backslash-escaped, because either one would otherwise
    split or terminate the Markdown row. The dropdown labels keep the raw
    title.
    """
    def _cell(value):
        flat = re.sub(r"\s*[\r\n]+\s*", " ", str(value))
        return flat.replace("|", "\\|")

    lines = [
        "| # | Title | Author | Date | Citations | Source |",
        "|---|---|---|---|---|---|",
    ]
    choices = []
    for number, paper in enumerate(papers_list, 1):
        first_author = paper["authors"][0] if paper["authors"] else "N/A"
        # "NEW" marks papers flagged as recent; other rows get a document glyph
        # (restored from a mis-decoded literal).
        badge = "NEW" if paper.get("recent") else "📄"
        lines.append("| {} | {} {} | {} | {} | {} | {} |".format(
            number, badge, _cell(paper["title"]), _cell(first_author),
            paper["published"], cit_badge(paper.get("citations")),
            paper.get("source", "arXiv")))
        choices.append("{}. {}".format(number, paper["title"]))
    return "\n".join(lines) + "\n", choices
def s2_headers():
    """Build the HTTP headers for Semantic Scholar requests.

    The ``x-api-key`` header is added only when ``S2_API_KEY`` is non-empty.
    """
    headers = {"User-Agent": "ScientificPaperBot/7.4"}
    if not S2_API_KEY:
        return headers
    headers["x-api-key"] = S2_API_KEY
    return headers
def cr_headers():
    """Return the HTTP headers sent with every CrossRef request."""
    user_agent = "ScientificPaperBot/7.4 (mailto:researcher@example.com)"
    return {"User-Agent": user_agent}
# ================================================================
# CrossRef date parser — rejects garbage years
# ================================================================
def parse_crossref_date(item):
    """Extract a ``YYYY-MM-DD`` date string from a CrossRef work item.

    The fields "issued", "published", "published-print", "published-online"
    and "created" are tried in that order; the first one whose
    ``date-parts[0]`` yields a year between 1900 and ``CURRENT_YEAR + 1``
    wins. A missing month or day defaults to 1. The month is clamped to
    1..12 and the day to the real length of that month, so the result is
    always an existing calendar date (a plain 1..31 clamp could produce e.g.
    "2023-02-31", which callers then fail to parse).

    Returns:
        The formatted date, or "N/A" when no field holds a usable date.
    """
    import calendar  # local import: only this function needs it

    for field in ("issued", "published", "published-print", "published-online", "created"):
        date_parts = (item.get(field) or {}).get("date-parts") or [[]]
        parts = date_parts[0]
        if not parts:
            continue
        try:
            year = int(parts[0])
            if not (1900 <= year <= CURRENT_YEAR + 1):
                continue
            month = max(1, min(12, int(parts[1]) if len(parts) >= 2 else 1))
            last_day = calendar.monthrange(year, month)[1]
            day = max(1, min(last_day, int(parts[2]) if len(parts) >= 3 else 1))
            return "{:04d}-{:02d}-{:02d}".format(year, month, day)
        except (ValueError, TypeError, IndexError):
            # Malformed parts (None, non-numeric, ...): try the next field.
            continue
    return "N/A"
# ================================================================
# SEEN / FAVORITES
# ================================================================
def load_seen_ids():
    """Load the set of already-seen paper ids from ``SEEN_IDS_PATH``.

    Returns an empty set when the file is missing or unreadable, is not valid
    JSON, or does not contain something that can be turned into a set.
    """
    try:
        with open(SEEN_IDS_PATH) as f:
            return set(json.load(f))
    except (OSError, ValueError, TypeError):
        # OSError: missing/unreadable file. ValueError: invalid JSON
        # (JSONDecodeError is a ValueError). TypeError: content that is not
        # iterable or holds unhashable items.
        return set()
def save_seen_ids(ids):
    """Overwrite ``SEEN_IDS_PATH`` with *ids* serialised as a JSON array."""
    with open(SEEN_IDS_PATH, mode="w") as handle:
        json.dump([*ids], handle)
def load_favorites():
    """Load the saved favorites list from ``FAVORITES_PATH``.

    Returns ``[]`` when the file is missing or cannot be unpickled.
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the
    # file. This is only acceptable while FAVORITES_PATH is written solely by
    # save_favorite(); prefer JSON if the file may ever come from elsewhere.
    try:
        with open(FAVORITES_PATH, "rb") as f:
            return pickle.load(f)
    except Exception:
        # Unpickling can raise many exception types; ``Exception`` keeps the
        # best-effort fallback without swallowing KeyboardInterrupt or
        # SystemExit, which the previous bare ``except`` did.
        return []
def save_favorite(paper):
    """Add *paper* to the pickled favorites unless its id is already stored.

    Returns a status message: "Saved: <title>" after writing the updated
    list, or "Already saved." when a favorite with the same id exists.
    """
    favorites = load_favorites()
    known_ids = {fav["id"] for fav in favorites}
    if paper["id"] in known_ids:
        return "Already saved."
    favorites.append(paper)
    with open(FAVORITES_PATH, "wb") as handle:
        pickle.dump(favorites, handle)
    return "Saved: " + paper["title"]
def export_favorites_csv():
    """Write all favorites to ``PERSIST_DIR/favorites.csv``.

    Each row holds title, up to three authors (comma-joined), date,
    citations ("N/A" when absent), URL and source ("arXiv" when absent).
    The file is written as UTF-8 with a BOM and without an index column.

    Returns the CSV path, or None when there are no favorites.
    """
    favorites = load_favorites()
    if not favorites:
        return None
    records = []
    for fav in favorites:
        records.append({
            "Title": fav["title"],
            "Authors": ", ".join(fav["authors"][:3]),
            "Date": fav["published"],
            "Citations": fav.get("citations", "N/A"),
            "URL": fav["url"],
            "Source": fav.get("source", "arXiv"),
        })
    frame = pd.DataFrame(records)
    csv_path = PERSIST_DIR + "/favorites.csv"
    frame.to_csv(csv_path, index=False, encoding="utf-8-sig")
    return csv_path
def gr_export_fav():
    """Gradio callback: export favorites to CSV and return the path (or None)."""
    csv_path = export_favorites_csv()
    return csv_path
# ================================================================
# PDF EXPORT
# ================================================================
def export_explanation_pdf(explanation_text, paper_title="paper"):
    """Render a Markdown-style explanation into a PDF under ``PERSIST_DIR``.

    The text is processed line by line: blank lines become small spacers,
    lines starting with "# " or "## " become headings preceded by a rule,
    lines starting with ">" become indented quotes, and everything else is
    body text. Bold/italic/code markers, leading "#" runs, a fixed set of
    decorative emoji and the characters ``# * _ ~`` are removed first.

    Args:
        explanation_text: text to render.
        paper_title: used to build the file name (non-word characters
            removed, cut to 50 characters, spaces turned into "_").

    Returns:
        The path of the written PDF, or None when the text is missing or
        shorter than 30 characters, or when building the PDF fails.
    """
    if not explanation_text or len(explanation_text) < 30:
        return None
    from xml.sax.saxutils import escape  # local import: only needed here

    safe = re.sub(r"[^\w\s-]", "", paper_title)[:50].strip().replace(" ", "_")
    path = PERSIST_DIR + "/explanation_" + safe + ".pdf"
    # Decorative emoji plus leftover Markdown punctuation. The original
    # character class was mis-decoded (mojibake) and therefore also stripped
    # curly quotes, dashes and ellipses from ordinary text.
    decoration = re.compile(r"[🎯❓🔧📊🌟🔗📄👥📅📡🤖#*_~]")
    try:
        doc = SimpleDocTemplate(path, pagesize=A4,
                                rightMargin=2*cm, leftMargin=2*cm,
                                topMargin=2*cm, bottomMargin=2*cm)
        styles = getSampleStyleSheet()
        h2_style = ParagraphStyle("H2", parent=styles["Heading2"],
                                  fontSize=11, textColor=colors.HexColor("#2563eb"),
                                  spaceBefore=14, spaceAfter=6)
        bd_style = ParagraphStyle("BD", parent=styles["Normal"],
                                  fontSize=10, leading=16, spaceAfter=8)
        mt_style = ParagraphStyle("MT", parent=styles["Normal"],
                                  fontSize=9, textColor=colors.HexColor("#64748b"))
        # Built once instead of once per quoted line.
        q_style = ParagraphStyle("Q", parent=styles["Normal"],
                                 fontSize=9, leftIndent=20,
                                 textColor=colors.HexColor("#475569"), leading=14)
        story = []
        for raw in explanation_text.split("\n"):
            line = raw.strip()
            if not line:
                story.append(Spacer(1, 6))
                continue
            clean = re.sub(r"\*\*(.+?)\*\*", r"\1", line)
            clean = re.sub(r"\*(.+?)\*", r"\1", clean)
            clean = re.sub(r"`(.+?)`", r"\1", clean)
            clean = re.sub(r"^#{1,6}\s*", "", clean)
            clean = decoration.sub("", clean).strip()
            if not clean:
                continue
            # Paragraph parses its text as XML-like markup, so "&", "<" and
            # ">" from the LLM output must be escaped before rendering.
            if line.startswith(("## ", "# ")):
                story.append(HRFlowable(width="100%", thickness=0.5,
                                        color=colors.HexColor("#e2e8f0"), spaceAfter=4))
                story.append(Paragraph(escape(clean), h2_style))
            elif line.startswith(">"):
                quote = decoration.sub("", line.lstrip(">").strip())
                story.append(Paragraph(escape(quote), q_style))
            else:
                story.append(Paragraph(escape(clean), bd_style))
        story += [
            Spacer(1, 20),
            HRFlowable(width="100%", thickness=0.5, color=colors.HexColor("#e2e8f0")),
            Paragraph("Generated by Paper Discovery v7.4 — " +
                      datetime.now().strftime("%Y-%m-%d %H:%M"), mt_style),
        ]
        doc.build(story)
        return path
    except Exception as e:
        # Covers paragraph construction as well as the final build.
        print("PDF error: " + str(e))
        return None
def gr_export_pdf(explanation_text, choice):
    """Gradio callback: export the current explanation as a PDF.

    Args:
        explanation_text: the explanation shown in the UI.
        choice: dropdown label of the form "<n>. <title>"; the part after the
            first ". " becomes the PDF title ("paper" when no choice is set).

    Returns:
        ``(path, status)``; path is None when there is no explanation of at
        least 50 characters or when the export fails.
    """
    if not explanation_text or len(explanation_text) < 50:
        return None, "Explain a paper first."
    if choice:
        title = choice.split(". ", 1)[-1]
    else:
        title = "paper"
    pdf_path = export_explanation_pdf(explanation_text, title)
    if pdf_path:
        return pdf_path, "PDF ready!"
    return None, "PDF failed."
# ================================================================
# SOURCE 1 — arXiv
# KEY FIX: sort_by parameter
# Browse → "submittedDate" latest papers
# Global → "relevance" exact title match
# ================================================================
def fetch_arxiv_papers(query, category, max_results=20, days_back=365,
                       sort_by="submittedDate"):
    """Query the arXiv Atom API and return the results as paper dicts.

    Args:
        query: free-text query. With ``sort_by == "relevance"`` and at least
            three words it is sent as an exact title phrase (``ti:"..."``);
            otherwise as ``all:<query>``.
        category: arXiv category code; "" disables the category filter.
        max_results: maximum number of entries requested.
        days_back: papers published within this many days get ``recent=True``
            (older papers are still returned).
        sort_by: value for the API's ``sortBy`` parameter.

    Returns:
        A list of dicts with the keys id, title, authors (max 6), abstract
        (max 1200 chars), published (YYYY-MM-DD), categories (max 4),
        citations (always None here), url, pdf_url, recent and source.
        An empty list is returned when the request fails or the response is
        not well-formed XML; entries that fail to parse are skipped.
    """
    text = query.strip()
    parts = []
    if len(text.split()) >= 3 and sort_by == "relevance":
        parts.append('ti:"' + text + '"')
    elif text:
        parts.append("all:" + text)
    if category.strip():
        parts.append("cat:" + category.strip())
    sq = " AND ".join(parts) if parts else "all:machine learning"
    params = {
        "search_query": sq,
        "start": 0,
        "max_results": max_results,
        "sortBy": sort_by,
        "sortOrder": "descending",
    }
    try:
        resp = requests.get("https://export.arxiv.org/api/query", params=params, timeout=30)
        resp.raise_for_status()
        # Parsing is inside the guard: a truncated or non-XML body must not
        # crash the caller.
        root = ET.fromstring(resp.content)
    except (requests.RequestException, ET.ParseError) as e:
        print("arXiv error: " + str(e))
        return []
    ns_a = "http://www.w3.org/2005/Atom"
    ns_x = "http://arxiv.org/schemas/atom"
    cutoff = datetime.now() - timedelta(days=days_back)
    papers = []
    for entry in root.findall("{" + ns_a + "}entry"):
        try:
            pid = entry.find("{" + ns_a + "}id").text.split("/abs/")[-1].strip()
            # Collapse all whitespace so wrapped titles/abstracts do not keep
            # runs of spaces where the line breaks were.
            title = " ".join(entry.find("{" + ns_a + "}title").text.split())
            abstract = " ".join(entry.find("{" + ns_a + "}summary").text.split())
            published = entry.find("{" + ns_a + "}published").text[:10]
            authors = [a.find("{" + ns_a + "}name").text
                       for a in entry.findall("{" + ns_a + "}author")]
            cats = set()
            pc = entry.find("{" + ns_x + "}primary_category")
            if pc is not None:
                cats.add(pc.get("term", ""))
            for c in entry.findall("{" + ns_x + "}category"):
                cats.add(c.get("term", ""))
            cats.discard("")
            papers.append({
                "id": pid,
                "title": title,
                "authors": authors[:6],
                "abstract": abstract[:1200],
                "published": published,
                "categories": list(cats)[:4],
                "citations": None,
                "url": "https://arxiv.org/abs/" + pid,
                "pdf_url": "https://arxiv.org/pdf/" + pid,
                "recent": datetime.strptime(published, "%Y-%m-%d") >= cutoff,
                "source": "arXiv",
            })
        except Exception as e:
            # One malformed entry should not discard the rest of the feed.
            print("arXiv parse: " + str(e))
    return papers
# ================================================================
# SOURCE 2 — CrossRef
# ================================================================
def fetch_crossref_papers(query, category_label="", max_results=20,
                          days_back=365, use_title=False):
    """Query the CrossRef works API and return the results as paper dicts.

    Args:
        query: free-text query.
        category_label: UI label; its ``CROSSREF_SUBJECTS`` entry (if any) is
            appended to the query text.
        max_results: maximum number of papers returned; three times as many
            rows (capped at 200) are requested because items without a usable
            title or date are dropped.
        days_back: papers published within this many days get ``recent=True``.
        use_title: search titles only (``query.title``) instead of ``query``.

    Returns:
        A list of paper dicts sorted by citation count, highest first. An
        empty list is returned on a non-200/429 status or when every attempt
        fails.
    """
    subject = CROSSREF_SUBJECTS.get(category_label, "")
    full_query = (query + " " + subject).strip() if subject else query
    key = "query.title" if use_title else "query"
    params = {
        key: full_query,
        "rows": min(max_results * 3, 200),
        "sort": "relevance",
        "select": ("title,author,abstract,published,published-print,"
                   "published-online,issued,created,DOI,"
                   "is-referenced-by-count,link,subject"),
    }
    items = []
    for attempt in range(3):
        try:
            r = requests.get("https://api.crossref.org/works",
                             params=params, headers=cr_headers(), timeout=30)
            if r.status_code == 200:
                items = r.json().get("message", {}).get("items", [])
                break
            if r.status_code == 429:
                # Rate limited: back off 1s, 2s, 4s.
                time.sleep(2 ** attempt)
                continue
            print("CrossRef " + str(r.status_code))
            return []
        except Exception as e:
            print("CrossRef attempt " + str(attempt) + ": " + str(e))
            time.sleep(1)
    cutoff = datetime.now() - timedelta(days=days_back)
    papers, seen_ids = [], set()
    for item in items:
        if len(papers) >= max_results:
            break
        title_list = item.get("title", [])
        if not title_list:
            continue
        title = title_list[0].strip()
        if not title or title.lower().startswith("title pending"):
            continue
        pub = parse_crossref_date(item)
        if pub == "N/A":
            continue
        cit = int(item.get("is-referenced-by-count", 0) or 0)
        authors = [
            (a.get("given", "") + " " + a.get("family", "")).strip()
            for a in item.get("author", [])[:6]
        ]
        authors = [a for a in authors if a.strip()] or ["Unknown"]
        # ``or`` also covers an explicit null abstract, which would otherwise
        # make re.sub raise TypeError.
        raw_abstract = item.get("abstract") or "No abstract."
        abstract = re.sub(r"<[^>]+>", "", raw_abstract).strip()[:1200]
        doi = item.get("DOI", "")
        url = "https://doi.org/" + doi if doi else "#"
        pid = doi or re.sub(r"\W", "", title)[:40]
        if pid in seen_ids:
            continue
        seen_ids.add(pid)
        pdf_url = next((l.get("URL", "") for l in item.get("link", [])
                        if "pdf" in l.get("content-type", "").lower()), "")
        try:
            recent = datetime.strptime(pub[:10], "%Y-%m-%d") >= cutoff
        except ValueError:
            # The date string is not a real calendar date.
            recent = False
        papers.append({
            "id": pid,
            "title": title,
            "authors": authors,
            "abstract": abstract,
            "published": pub[:10],
            "categories": item.get("subject", [])[:3],
            "citations": cit,
            "url": url,
            "pdf_url": pdf_url,
            "recent": recent,
            "source": "CrossRef",
        })
    papers.sort(key=lambda x: x["citations"], reverse=True)
    return papers
# ================================================================
# GLOBAL PAPER SEARCH — relevance sorted
# ================================================================
def global_paper_search(query, source_choice, max_results=10):
    """Search arXiv and/or CrossRef by title/keywords and return Markdown.

    Args:
        query: title or keywords; blank input yields a prompt message.
        source_choice: "arXiv", "CrossRef" or "Both".
        max_results: per-source result limit (converted with ``int``).

    Returns:
        A Markdown report. Results are de-duplicated on the first 60 word
        characters of the lower-cased title (first occurrence wins) and
        ordered by citation count, highest first.
    """
    if not query or not query.strip():
        return "Enter a title or keywords."
    q = query.strip()
    found = []
    if source_choice in ("arXiv", "Both"):
        found.extend(fetch_arxiv_papers(q, "", int(max_results), 3650,
                                        sort_by="relevance"))
    if source_choice in ("CrossRef", "Both"):
        found.extend(fetch_crossref_papers(q, "", int(max_results), 3650,
                                           use_title=True))
    if not found:
        return "No results for: " + q
    # dicts keep insertion order and setdefault keeps the first paper per key.
    first_by_key = {}
    for paper in found:
        first_by_key.setdefault(re.sub(r"\W","",paper["title"].lower())[:60], paper)
    unique = sorted(first_by_key.values(),
                    key=lambda x: x.get("citations") or 0, reverse=True)
    chunks = [
        "## Search Results: " + q + "\n\n",
        "**" + str(len(unique)) + " papers found**" + "\n\n" + "---" + "\n\n",
    ]
    for rank, paper in enumerate(unique, 1):
        citations = paper.get("citations")
        cit = (" | " + cit_badge(citations)) if citations else ""
        cats = " | ".join(paper.get("categories",[])[:2])
        meta = [", ".join(paper["authors"][:3]),
                paper["published"] + cit,
                paper.get("source","")]
        if cats:
            meta.append(cats)
        link = "[View](" + paper["url"] + ")"
        pdf = (" [PDF](" + paper["pdf_url"] + ")") if paper.get("pdf_url") else ""
        chunks.append("### " + str(rank) + ". " + paper["title"] + "\n\n")
        chunks.append(" | ".join(meta) + "\n\n")
        chunks.append("> " + paper["abstract"][:450] + "..." + "\n\n")
        chunks.append(link + pdf + "\n\n" + "---" + "\n\n")
    return "".join(chunks)
# ================================================================
# CITATION ENGINE — 3-layer
# ================================================================
def _arxiv_base_id(paper_id):
    """Drop any path prefix and a trailing version suffix ("v2") from an arXiv id."""
    return re.sub(r"v\d+$", "", paper_id.split("/")[-1].strip())


def _s2_batch_lookup(arxiv_papers):
    """Layer 1: fill citation counts via the Semantic Scholar batch endpoint."""
    id_map, batch_ids = {}, []
    for p in arxiv_papers:
        clean = _arxiv_base_id(p["id"])
        id_map[clean] = p
        batch_ids.append("arXiv:" + clean)
    for start in range(0, len(batch_ids), 500):
        try:
            r = requests.post(
                "https://api.semanticscholar.org/graph/v1/paper/batch",
                json={"ids": batch_ids[start:start + 500]},
                params={"fields": "citationCount,externalIds"},
                headers=s2_headers(), timeout=30)
            if r.status_code == 200:
                for item in r.json():
                    if not item:
                        continue
                    ext = item.get("externalIds") or {}
                    # ``or ""`` guards against an explicit null ArXiv id,
                    # which would otherwise abort the rest of the batch.
                    clean = _arxiv_base_id(ext.get("ArXiv") or "")
                    if clean and clean in id_map:
                        c = item.get("citationCount")
                        if c is not None:
                            id_map[clean]["citations"] = int(c)
            elif r.status_code == 429:
                time.sleep(4)
        except Exception as e:
            print("S2 batch: " + str(e))


def _s2_single_lookup(candidates):
    """Layer 2: per-paper Semantic Scholar lookup, two attempts each."""
    for p in candidates:
        clean = _arxiv_base_id(p["id"])
        for attempt in range(2):
            try:
                r = requests.get(
                    "https://api.semanticscholar.org/graph/v1/paper/arXiv:" + clean,
                    params={"fields": "citationCount"},
                    headers=s2_headers(), timeout=10)
                if r.status_code == 200:
                    c = r.json().get("citationCount")
                    p["citations"] = int(c) if c else 0
                    break
                if r.status_code == 429:
                    time.sleep(2 ** attempt)
                    continue
                p["citations"] = 0
                break
            except Exception:
                # Best-effort: a failed lookup counts as zero citations.
                p["citations"] = 0
                break
        time.sleep(0.12)  # small pause between papers


def _crossref_title_lookup(candidates):
    """Layer 3: CrossRef title search, accepted only on a loose title match."""
    for p in candidates:
        try:
            r = requests.get("https://api.crossref.org/works",
                             params={"query.title": p["title"], "rows": 1,
                                     "select": "is-referenced-by-count,title"},
                             headers=cr_headers(), timeout=8)
            if r.status_code == 200:
                items = r.json().get("message", {}).get("items", [])
                if items:
                    found = (items[0].get("title") or [""])[0].lower()
                    # At least two of the first five query-title words must
                    # appear among the first ten words of the found title.
                    qw = set(p["title"].lower().split()[:5])
                    fw = set(found.split()[:10])
                    p["citations"] = (
                        int(items[0].get("is-referenced-by-count", 0) or 0)
                        if len(qw & fw) >= 2 else 0)
                else:
                    p["citations"] = 0
            else:
                p["citations"] = 0
            time.sleep(0.12)
        except Exception:
            # Best-effort: a failed lookup counts as zero citations.
            p["citations"] = 0


def enrich_citations(papers):
    """Fill in missing citation counts for arXiv papers, in place.

    arXiv papers whose "citations" is None or 0 go through up to three
    layers: a Semantic Scholar batch request, a per-paper Semantic Scholar
    request (first 15 papers still at zero), and a CrossRef title search
    (all papers still at zero). Afterwards every paper whose "citations" is
    still None is set to 0.

    The previous bare ``except:`` clauses are now ``except Exception`` so
    KeyboardInterrupt / SystemExit are no longer swallowed.

    Returns:
        The same ``papers`` list object that was passed in.
    """
    arxiv_papers = [p for p in papers
                    if p.get("source") == "arXiv" and
                    (p.get("citations") is None or p.get("citations") == 0)]
    if arxiv_papers:
        _s2_batch_lookup(arxiv_papers)
        _s2_single_lookup(
            [x for x in arxiv_papers if (x.get("citations") or 0) == 0][:15])
        _crossref_title_lookup(
            [x for x in arxiv_papers if (x.get("citations") or 0) == 0])
    for p in papers:
        if p.get("citations") is None:
            p["citations"] = 0
    return papers
# ================================================================
# FAISS
# ================================================================
def build_papers_index(papers):
    """Store *papers* in ``PAPERS`` and rebuild the FAISS index over them.

    Each paper is embedded as "<title> <abstract>" with normalised float32
    vectors and added to an inner-product index. With an empty list the
    index is reset to None. Returns None.
    """
    global FAISS_INDEX, PAPERS
    PAPERS = papers
    if not papers:
        FAISS_INDEX = None
        return
    corpus = []
    for paper in papers:
        corpus.append(paper["title"] + " " + paper["abstract"])
    vectors = embedder.encode(corpus, convert_to_numpy=True,
                              normalize_embeddings=True)
    vectors = vectors.astype("float32")
    index = faiss.IndexFlatIP(vectors.shape[1])
    index.add(vectors)
    FAISS_INDEX = index
def search_papers(query, top_k=5):
    """Return the indexed papers most similar to *query*.

    Returns a list of ``{"paper": <paper dict>, "score": <float>}`` in the
    order FAISS returns them, keeping only hits with a non-negative id and a
    score above 0.1. The list is empty when nothing has been indexed.
    """
    if FAISS_INDEX is None or not PAPERS:
        return []
    query_vec = embedder.encode([query], convert_to_numpy=True,
                                normalize_embeddings=True).astype("float32")
    limit = min(top_k, len(PAPERS))
    scores, ids = FAISS_INDEX.search(query_vec, limit)
    hits = []
    for score, idx in zip(scores[0], ids[0]):
        if idx >= 0 and float(score) > 0.1:
            hits.append({"paper": PAPERS[idx], "score": float(score)})
    return hits
# ================================================================
# AUTO-FETCH
# ================================================================
def auto_fetch_worker(query, category, interval):
    """Background loop that polls arXiv for new papers while AUTO_RUNNING.

    Every *interval* seconds it fetches up to 30 papers for *query* /
    *category* (with a one-day "recent" window), compares their ids with the
    persisted seen-ids set and, when new ones exist, persists the union and
    appends a line to ``AUTO_LOG``. The log is capped at 20 entries.

    A failing iteration is logged and the loop continues. Previously an
    exception ended the thread while ``AUTO_RUNNING`` stayed True, so
    start_auto_fetch() kept answering "Already running." with no worker alive.
    """
    global AUTO_RUNNING
    while AUTO_RUNNING:
        time.sleep(interval)
        if not AUTO_RUNNING:
            break
        stamp = "[" + datetime.now().strftime("%H:%M") + "] "
        try:
            papers = fetch_arxiv_papers(query, category, 30, 1)
            seen = load_seen_ids()
            new_ps = [p for p in papers if p["id"] not in seen]
            if new_ps:
                save_seen_ids(seen | {p["id"] for p in papers})
                # The dash was mis-decoded (mojibake) in the original literal.
                AUTO_LOG.append(stamp + "NEW " + str(len(new_ps)) + " — " + query)
        except Exception as e:
            AUTO_LOG.append(stamp + "ERROR " + str(e))
        if len(AUTO_LOG) > 20:
            del AUTO_LOG[:-20]
def start_auto_fetch(query, cat_label, interval_min):
    """Start the background auto-fetch thread unless one is already flagged.

    Args:
        query: query passed on to the worker.
        cat_label: UI category label, mapped through ``CATEGORIES``.
        interval_min: polling interval in minutes.

    Returns a status message for the UI.
    """
    global AUTO_RUNNING
    if AUTO_RUNNING:
        return "Already running."
    AUTO_RUNNING = True
    category = CATEGORIES.get(cat_label,"")
    seconds = int(interval_min)*60
    worker = threading.Thread(
        target=auto_fetch_worker,
        args=(query, category, seconds),
        daemon=True)
    worker.start()
    return "Auto-fetch started every " + str(interval_min) + " min for: " + query
def stop_auto_fetch():
    """Clear the AUTO_RUNNING flag so the worker loop exits; returns a status."""
    global AUTO_RUNNING
    AUTO_RUNNING = False
    return "Stopped."
def get_auto_log():
    """Return the last ten auto-fetch log lines, newest first, or "No log."."""
    if not AUTO_LOG:
        return "No log."
    newest_first = AUTO_LOG[-10:][::-1]
    return "\n\n".join(newest_first)
# ================================================================
# TRENDS
# ================================================================
def analyze_trends(papers):
    """Plot a 2x3 dashboard of trends for *papers* and build a stats summary.

    Panels: papers per month (with a linear trend line when there are more
    than two months), top title keywords, source distribution, top-10 cited
    papers, citation histogram and top authors. The figure is saved to
    ``PERSIST_DIR/trends.png``.

    Changes from the previous version:
      * the last histogram bucket ("500+") is open-ended; it used to stop at
        10000, so papers with 10000 or more citations were not counted;
      * the figure is saved and closed through its own handle and is closed
        even when plotting raises, instead of relying on pyplot's "current
        figure";
      * a mis-decoded dash in the "Top Cited" list is restored.

    Returns:
        ``(png_path, stats_markdown)``, or ``(None, "No papers.")`` for an
        empty input.
    """
    if not papers:
        return None, "No papers."
    date_counts = Counter(p["published"][:7] for p in papers if p["published"] != "N/A")
    stopwords = {"the","a","an","of","in","for","on","with","and","or","to","using",
                 "based","via","from","by","is","are","our","we","this","that","which",
                 "towards","approach","method","new","into","over","learning","deep",
                 "model","models","data","neural","large","language","paper","study",
                 "analysis","results","show","also","can","used","two","its","their"}
    all_words = [w.lower() for p in papers
                 for w in re.findall(r"[a-zA-Z]{4,}", p["title"])
                 if w.lower() not in stopwords]
    top_words = Counter(all_words).most_common(15)
    sources = Counter(p.get("source", "arXiv") for p in papers)
    # Citation statistics only consider papers with at least one citation.
    cit_papers = [p for p in papers if (p.get("citations") or 0) > 0]
    ranked = sorted(cit_papers, key=lambda x: x["citations"], reverse=True)
    top_cited = ranked[:10]
    top5 = ranked[:5]
    all_auth = [a for p in papers for a in p["authors"][:3]]
    top_authors = Counter(all_auth).most_common(10)
    cvals = [p["citations"] for p in cit_papers]
    buckets = [0, 1, 5, 10, 50, 100, 500, float("inf")]
    blabels = ["0", "1-4", "5-9", "10-49", "50-99", "100-499", "500+"]
    bcounts = [sum(1 for c in cvals if lo <= c < hi)
               for lo, hi in zip(buckets, buckets[1:])]
    avg_cit = round(sum(cvals) / len(cvals), 1) if cvals else 0
    total_cit = sum(p.get("citations") or 0 for p in papers)
    C = ["#3b82f6","#8b5cf6","#10b981","#f59e0b","#ef4444","#06b6d4",
         "#ec4899","#14b8a6","#f97316","#a855f7","#22d3ee","#84cc16",
         "#fbbf24","#34d399","#f87171"]
    BG, PNL, BR, W = "#0f172a", "#1e293b", "#334155", "white"
    path = PERSIST_DIR + "/trends.png"

    def style(ax):
        ax.set_facecolor(PNL)
        for sp in ax.spines.values():
            sp.set_edgecolor(BR)
        ax.tick_params(colors=W, labelsize=8)

    fig, axes = plt.subplots(2, 3, figsize=(20, 12))
    try:
        fig.patch.set_facecolor(BG)
        fig.suptitle("Research Trends", color=W, fontsize=16, fontweight="bold", y=1.01)

        # Panel 1: papers per month.
        ax = axes[0, 0]
        style(ax)
        if date_counts:
            ms, cs = zip(*sorted(date_counts.items()))
            ms, cs = list(ms), list(cs)
            bars = ax.bar(ms, cs, color=C[0], edgecolor="#60a5fa", lw=0.8)
            for b, c in zip(bars, cs):
                ax.text(b.get_x() + b.get_width() / 2, b.get_height() + .05, str(c),
                        ha="center", va="bottom", color=W, fontsize=8)
            if len(cs) > 2:
                z = np.polyfit(range(len(cs)), cs, 1)
                ax.plot(ms, np.poly1d(z)(range(len(cs))), "--",
                        color="#f59e0b", lw=1.5, alpha=.8, label="Trend")
                ax.legend(fontsize=8, facecolor=PNL, labelcolor=W)
        ax.set_title("Papers per Month", color=W, fontsize=12, fontweight="bold", pad=10)
        ax.set_ylabel("Count", color=W, fontsize=9)
        ax.tick_params(rotation=45)

        # Panel 2: most frequent title keywords.
        ax = axes[0, 1]
        style(ax)
        if top_words:
            wds, wcts = zip(*top_words)
            ax.barh(list(wds), list(wcts), color=C[:len(wds)], edgecolor="#475569", lw=.6)
            for b, c in zip(ax.patches, wcts):
                ax.text(b.get_width() + .1, b.get_y() + b.get_height() / 2, str(c),
                        va="center", color=W, fontsize=8)
        ax.set_title("Top Keywords", color=W, fontsize=12, fontweight="bold", pad=10)
        ax.set_xlabel("Frequency", color=W, fontsize=9)

        # Panel 3: share of papers per source.
        ax = axes[0, 2]
        ax.set_facecolor(PNL)
        if sources:
            sl, sv = zip(*sources.items())
            _, txts, ats = ax.pie(sv, labels=sl, autopct="%1.0f%%",
                                  colors=C[:len(sl)], startangle=90,
                                  textprops={"color": W, "fontsize": 10},
                                  wedgeprops={"edgecolor": BR, "linewidth": 1.5})
            for at in ats:
                at.set_color(W)
                at.set_fontsize(9)
        ax.set_title("Source Distribution", color=W, fontsize=12, fontweight="bold", pad=10)

        # Panel 4: ten most cited papers.
        ax = axes[1, 0]
        style(ax)
        if top_cited:
            lbls = [(p["title"][:35] + "..." if len(p["title"]) > 35 else p["title"])
                    for p in top_cited]
            cv = [p["citations"] for p in top_cited]
            ax.barh(lbls[::-1], cv[::-1], color=C[1], edgecolor="#475569", lw=.6)
            mx = max(cv) if cv else 1
            for b, c in zip(ax.patches, cv[::-1]):
                ax.text(b.get_width() + mx * .01, b.get_y() + b.get_height() / 2,
                        "{:,}".format(c), va="center", color=W, fontsize=8)
            ax.set_xlabel("Citations", color=W, fontsize=9)
        else:
            ax.text(.5, .5, "No citation data", ha="center", va="center",
                    color="#94a3b8", fontsize=11, transform=ax.transAxes)
        ax.set_title("Top 10 Cited", color=W, fontsize=12, fontweight="bold", pad=10)

        # Panel 5: citation histogram.
        ax = axes[1, 1]
        style(ax)
        if any(bcounts):
            ax.bar(blabels, bcounts, color=C[2], edgecolor="#475569", lw=.8)
            for b, c in zip(ax.patches, bcounts):
                if c > 0:
                    ax.text(b.get_x() + b.get_width() / 2, b.get_height() + .1, str(c),
                            ha="center", va="bottom", color=W, fontsize=9)
            ax.set_xlabel("Citation Range", color=W, fontsize=9)
            ax.set_ylabel("Papers", color=W, fontsize=9)
            ax.annotate("Avg " + str(avg_cit) + " | Total " + "{:,}".format(total_cit),
                        xy=(.98, .96), xycoords="axes fraction",
                        ha="right", va="top", color="#94a3b8", fontsize=8)
        else:
            ax.text(.5, .5, "No citation data", ha="center", va="center",
                    color="#94a3b8", fontsize=11, transform=ax.transAxes)
        ax.set_title("Citation Distribution", color=W, fontsize=12, fontweight="bold", pad=10)

        # Panel 6: most frequent authors (first three authors of each paper).
        ax = axes[1, 2]
        style(ax)
        if top_authors:
            an, ac = zip(*top_authors)
            ax.barh(list(an)[::-1], list(ac)[::-1], color=C[3], edgecolor="#475569", lw=.6)
            for b, c in zip(ax.patches, list(ac)[::-1]):
                ax.text(b.get_width() + .05, b.get_y() + b.get_height() / 2, str(c),
                        va="center", color=W, fontsize=8)
            ax.set_xlabel("Papers", color=W, fontsize=9)
        ax.set_title("Top Authors", color=W, fontsize=12, fontweight="bold", pad=10)

        fig.tight_layout(pad=3)
        fig.savefig(path, bbox_inches="tight", dpi=150, facecolor=BG)
    finally:
        plt.close(fig)

    stats = ("### Stats\n\n| Metric | Value |\n|---|---|\n" +
             "| Total | **" + str(len(papers)) + "** |\n" +
             "| New | **" + str(sum(1 for p in papers if p.get("recent"))) + "** |\n" +
             "| Citations | **" + "{:,}".format(total_cit) + "** |\n" +
             "| Average | **" + str(avg_cit) + "** |\n\n")
    if top5:
        stats += "### Top Cited\n\n"
        for i, p in enumerate(top5, 1):
            stats += (str(i) + ". [" + p["title"] + "](" + p["url"] + ")" +
                      " — **" + "{:,}".format(p["citations"]) + "**\n\n")
    return path, stats
# ================================================================
# LLM
# ================================================================
def _llm(messages, max_tokens=1200):
try:
r = groq_client.chat.completions.create(
model="llama-3.3-70b-versatile",
messages=messages, temperature=0.3, max_tokens=max_tokens)
return r.choices[0].message.content.strip()
except Exception as e: return "LLM Error: " + str(e)
def explain_paper(paper, lang="ar"):
    """Ask the LLM for a sectioned explanation of *paper*.

    With ``lang == "ar"`` a system prompt plus an Arabic user prompt (with
    ``AR_RULES`` appended to the system text) is sent and the answer is
    passed through ``fix_ar_format``; any other value sends a single English
    prompt. Both prompts include title, up to three authors, date, citations
    ("N/A" when absent) and the abstract, followed by section headings.
    """
    cit = paper.get("citations","N/A")
    authors = ", ".join(paper["authors"][:3])
    if lang != "ar":
        english_prompt = (
            "Explain:\nTitle: " + paper["title"] + "\nAuthors: " +
            authors + "\nDate: " + paper["published"] +
            " | Citations: " + str(cit) + "\nAbstract: " + paper["abstract"] + "\n\n" +
            "## Topic\n## Problem\n## Methodology\n## Findings\n## Contribution\n## Applications")
        return _llm([{"role":"user","content": english_prompt}])
    system_message = {
        "role":"system",
        "content": "ุฃู†ุช ุฎุจูŠุฑ ุฃูƒุงุฏูŠู…ูŠ ูŠุดุฑุญ ุงู„ุฃุจุญุงุซ ุจุงู„ุนุฑุจูŠุฉ ุงู„ูุตุญู‰.\n" + AR_RULES,
    }
    arabic_prompt = (
        "ุงุดุฑุญ ุงู„ูˆุฑู‚ุฉ:\nุงู„ุนู†ูˆุงู†: " + paper["title"] + "\n" +
        "ุงู„ู…ุคู„ููˆู†: " + authors + "\n" +
        "ุงู„ุชุงุฑูŠุฎ: " + paper["published"] + " | ุงู„ุงู‚ุชุจุงุณุงุช: " + str(cit) + "\n" +
        "ุงู„ู…ู„ุฎุต: " + paper["abstract"] + "\n\n" +
        "## ู…ูˆุถูˆุน ุงู„ูˆุฑู‚ุฉ\n\n## ุงู„ู…ุดูƒู„ุฉ\n\n## ุงู„ู…ู†ู‡ุฌูŠุฉ\n\n" +
        "## ุงู„ู†ุชุงุฆุฌ\n\n## ุงู„ุฃู‡ู…ูŠุฉ\n\n## ุงู„ุชุทุจูŠู‚ุงุช")
    user_message = {"role":"user","content": arabic_prompt}
    answer = _llm([system_message, user_message])
    return fix_ar_format(answer)
def compare_papers(pa, pb, lang="ar"):
    """Ask the LLM to compare two papers section by section.

    Both prompts embed each paper's title, citation count ("N/A" when
    absent) and the first 500 characters of its abstract. The Arabic variant
    also embeds ``AR_RULES`` and post-processes the answer with
    ``fix_ar_format``. Replies are limited to 1400 tokens.
    """
    def describe(label, paper):
        return (label + paper["title"] + " | Citations: " +
                str(paper.get("citations","N/A")) + "\n" + paper["abstract"][:500])

    body = describe("Paper A: ", pa) + "\n\n" + describe("Paper B: ", pb)
    if lang != "ar":
        english_prompt = (
            "Compare:\n" + body + "\n\n" +
            "## Topic\n## Methodology\n## Results\n## Strengths\n## Limits\n## Verdict")
        return _llm([{"role":"user","content": english_prompt}], 1400)
    arabic_prompt = (
        "ู‚ุงุฑู† ุจูŠู† ุงู„ูˆุฑู‚ุชูŠู†.\n" + AR_RULES + "\n\n" + body + "\n\n" +
        "## ุงู„ู‡ุฏู\n\n## ุงู„ู…ู†ู‡ุฌูŠุฉ\n\n## ุงู„ู†ุชุงุฆุฌ\n\n" +
        "## ุงู„ู‚ูˆุฉ\n\n## ุงู„ู‚ูŠูˆุฏ\n\n## ุงู„ุฎู„ุงุตุฉ")
    answer = _llm([{"role":"user","content": arabic_prompt}], 1400)
    return fix_ar_format(answer)
def summarize_papers(papers, topic, lang="ar"):
    """Ask the LLM for an academic overview of *topic* from up to 8 papers.

    Each of the first eight papers contributes a numbered line with title,
    date and the first 300 characters of its abstract. The Arabic variant
    embeds ``AR_RULES`` and post-processes the answer with ``fix_ar_format``.
    Replies are limited to 900 tokens.
    """
    numbered = []
    for position, paper in enumerate(papers[:8], 1):
        numbered.append(
            str(position) + ". " + paper["title"] + " (" + paper["published"] + "): " +
            paper["abstract"][:300] + "...\n\n")
    text = "".join(numbered)
    if lang != "ar":
        english_prompt = (
            "Academic overview of \"" + topic + "\":\n" + text + "\n\n" +
            "## Trends\n## Key Papers\n## Themes\n## Gaps")
        return _llm([{"role":"user","content": english_prompt}], 900)
    arabic_prompt = (
        "ู†ุธุฑุฉ ุนุงู…ุฉ ุฃูƒุงุฏูŠู…ูŠุฉ ุญูˆู„ \"" + topic + "\".\n" + AR_RULES +
        "\n\n" + text + "\n\n" +
        "## ุงู„ุงุชุฌุงู‡ุงุช\n\n## ุฃุจุฑุฒ ุงู„ุฃูˆุฑุงู‚\n\n" +
        "## ุงู„ู…ูˆุงุถูŠุน ุงู„ู…ุดุชุฑูƒุฉ\n\n## ุงู„ูุฌูˆุงุช")
    answer = _llm([{"role":"user","content": arabic_prompt}], 900)
    return fix_ar_format(answer)
def generate_bibliography(papers, style="APA"):
    """Format *papers* as a bibliography and write it to a text file.

    Args:
        papers: paper dicts with "authors", "published", "title" and "url".
        style: "APA", "IEEE" or "Chicago"; any other value produces BibTeX
            ``@article`` entries.

    Returns:
        ``(text, path)`` where *path* is
        ``PERSIST_DIR/bibliography_<style>.txt`` and *text* holds the entries
        separated by blank lines.

    BibTeX entries join author names with " and " (and mark truncation with
    "and others"): BibTeX treats a comma-joined ``author`` value as a single
    name, which is what the previous ", " join produced.
    """
    entries = []
    for i, p in enumerate(papers, 1):
        authors = p["authors"]
        auth = ", ".join(authors[:6]) + (" et al." if len(authors) > 6 else "")
        year = p["published"][:4] if p["published"] not in ("N/A", "") else "n.d."
        t, u = p["title"], p["url"]
        if style == "APA":
            entries.append(str(i) + ". " + auth + " (" + year + "). *" + t + "*. " + u)
        elif style == "IEEE":
            ae = " and ".join(authors[:3]) + (" et al." if len(authors) > 3 else "")
            entries.append("[" + str(i) + "] " + ae + ', "' + t + '," ' + year + ". [Online]: " + u)
        elif style == "Chicago":
            entries.append(str(i) + ". " + auth + '. "' + t + '." (' + year + "). " + u)
        else:
            # Citation key: last word of the first author's name + year + index.
            key = re.sub(r"\W", "", (authors[0].split()[-1]
                                     if authors else "Auth")) + year
            bib_auth = " and ".join(authors[:6]) + (" and others" if len(authors) > 6 else "")
            entries.append("@article{" + key + str(i) + ",\n title={" + t +
                           "},\n author={" + bib_auth + "},\n year={" + year +
                           "},\n url={" + u + "}\n}")
    bib = "\n\n".join(entries)
    path = PERSIST_DIR + "/bibliography_" + style + ".txt"
    with open(path, "w", encoding="utf-8") as f:
        f.write(bib)
    return bib, path
def chat_about_papers(question, history):
    """Answer *question* with the LLM, grounded in the fetched papers.

    The question's language is detected once. Up to four semantically
    closest papers are put in front of the question as context, and the last
    four history turns are replayed before it. Arabic answers are passed
    through ``fix_ar_format``.

    Args:
        question: the user's question.
        history: prior turns as dicts with "role" and "content"; None is
            treated as empty (it used to raise TypeError).

    Returns:
        The reply text, or a "fetch papers first" message (in the question's
        language) when nothing has been fetched.

    English questions are now introduced with "Question:"; before, the
    Arabic label was used for both languages even though the English system
    prompt asks for an English answer. The Arabic literals are reproduced
    exactly as they appear in the file.
    """
    lang = detect_lang(question)
    is_ar = lang == "ar"
    if not PAPERS:
        return ("ูŠุฑุฌู‰ ุฌู„ุจ ุงู„ุฃูˆุฑุงู‚ ุฃูˆู„ุงู‹." if is_ar
                else "Fetch papers first.")
    relevant = search_papers(question, top_k=4)
    context = ""
    if relevant:
        context = ("ุงู„ุฃูˆุฑุงู‚ ุฐุงุช ุงู„ุตู„ุฉ:\n\n" if is_ar else "Relevant papers:\n\n")
        for r in relevant:
            p = r["paper"]
            cit = (" | " + str(p["citations"]) + " citations") if p.get("citations") else ""
            context += ("**" + p["title"] + "** (" + p["published"] + ")" + cit +
                        "\n" + p["abstract"][:400] + "\n๐Ÿ”— " + p["url"] + "\n\n")
    sys_msg = (("ุฃู†ุช ู…ุณุงุนุฏ ุจุญุซูŠ. ุฃุฌุจ ุจุงู„ุนุฑุจูŠุฉ ุงู„ูุตุญู‰.\n" + AR_RULES) if is_ar
               else "You are an academic assistant. Answer in English.")
    msgs = [{"role": "system", "content": sys_msg}]
    for t in (history or [])[-4:]:
        msgs.append({"role": t["role"], "content": t["content"]})
    question_label = "\nุณุคุงู„: " if is_ar else "\nQuestion: "
    msgs.append({"role": "user", "content":
                 (context + question_label + question) if context else question})
    out = _llm(msgs, 800)
    return fix_ar_format(out) if is_ar else out
def text_to_audio(text, lang="ar"):
    """Convert *text* to speech with gTTS and return the MP3 path.

    The text is first reduced with ``clean_md``. The file is written to
    ``PERSIST_DIR/audio_<lang>.mp3``. Returns None when the cleaned text is
    empty or when synthesis/saving fails (the error is printed).
    """
    speech_text = clean_md(text)
    if not speech_text:
        return None
    try:
        speaker = gTTS(text=speech_text, lang=lang, slow=False)
        audio_path = PERSIST_DIR + "/audio_" + lang + ".mp3"
        speaker.save(audio_path)
        return audio_path
    except Exception as err:
        print("TTS: " + str(err))
        return None
# ================================================================
# GRADIO HANDLERS
# ================================================================
def gr_fetch(query, category_label, max_results, days_back, source_choice,
             progress=gr.Progress()):
    """Gradio callback: fetch, de-duplicate, enrich, index and tabulate papers.

    Papers come from arXiv and/or CrossRef depending on *source_choice*
    ("arXiv", "CrossRef" or "Both"). They are de-duplicated on the first 60
    word characters of the lower-cased title, enriched with citation counts
    and indexed for semantic search; ``ACTIVE_PAPERS`` is replaced with the
    result.

    Returns a 6-tuple: the Markdown report, four dropdown updates (the same
    choices for each) and a short status string.
    """
    global ACTIVE_PAPERS
    progress(0.05, desc="Connecting...")
    fetched = []
    warn = ""
    if source_choice in ("arXiv", "Both"):
        progress(0.15, desc="Fetching arXiv...")
        arxiv_category = CATEGORIES.get(category_label,"")
        fetched.extend(fetch_arxiv_papers(query, arxiv_category,
                                          int(max_results), int(days_back),
                                          sort_by="submittedDate"))
    if source_choice in ("CrossRef", "Both"):
        progress(0.35, desc="Fetching CrossRef...")
        cr = fetch_crossref_papers(query, category_label, int(max_results), int(days_back))
        if not cr:
            warn = "\n\n> CrossRef: no results."
        fetched.extend(cr)
    # dicts keep insertion order and setdefault keeps the first paper per key.
    first_by_key = {}
    for paper in fetched:
        first_by_key.setdefault(re.sub(r"\W","",paper["title"].lower())[:60], paper)
    papers = list(first_by_key.values())
    if not papers:
        return ("No results." + warn,
                gr.update(choices=[], value=None), gr.update(choices=[], value=None),
                gr.update(choices=[], value=None), gr.update(choices=[], value=None),
                "0 papers")
    progress(0.60, desc="Fetching citations...")
    papers = enrich_citations(papers)
    progress(0.85, desc="FAISS indexing...")
    build_papers_index(papers)
    ACTIVE_PAPERS = list(papers)
    tbl, choices = build_table(papers)
    recent = 0
    tot_cit = 0
    zero_cit = 0
    for paper in papers:
        count = paper.get("citations") or 0
        tot_cit += count
        if count == 0:
            zero_cit += 1
        if paper.get("recent"):
            recent += 1
    if zero_cit:
        note = "\n\n> " + str(zero_cit) + " papers with 0 citations (new/unindexed)."
    else:
        note = ""
    md = ("## Fetched **" + str(len(papers)) + "** papers\n\n" +
          "New: **" + str(recent) + "** | Citations: **" +
          "{:,}".format(tot_cit) + "**" + warn + note +
          "\n\n---\n\n" + tbl)
    upd = gr.update(choices=choices, value=choices[0] if choices else None)
    progress(1.0)
    status = str(len(papers)) + " papers | " + "{:,}".format(tot_cit) + " cit."
    return md, upd, upd, upd, upd, status
def gr_filter_papers(year_from, year_to, cit_min, cit_max, sort_by):
    """Filter the loaded papers by year and citation range, then sort them.

    Reads the module-level ``PAPERS`` list and stores the surviving papers
    in ``ACTIVE_PAPERS``.  A paper whose ``published`` value cannot be
    parsed into a year is kept rather than dropped; the citation bounds
    still apply to it.

    Args:
        year_from, year_to: inclusive publication-year bounds.
        cit_min, cit_max: inclusive citation-count bounds.
        sort_by: "Newest", "Oldest", "Most Cited" or "Least Cited"; any
            other value leaves the papers in ``PAPERS`` order.

    Returns:
        A 3-tuple: results markdown, a dropdown update, and a status string.
    """
    global ACTIVE_PAPERS
    if not PAPERS:
        return "Fetch papers first.", gr.update(), "0"

    # Convert the bounds once instead of on every loop iteration.
    first_year, last_year = int(year_from), int(year_to)
    min_cit, max_cit = int(cit_min), int(cit_max)

    filtered = []
    for p in PAPERS:
        try:
            year = int(p["published"][:4])
        except (KeyError, TypeError, ValueError):
            # Missing or malformed date: deliberately keep the paper
            # (best effort) instead of hiding every other error type.
            year = None
        if year is not None and not first_year <= year <= last_year:
            continue
        cit = int(p.get("citations") or 0)
        if not min_cit <= cit <= max_cit:
            continue
        filtered.append(p)

    def published_key(paper):
        # Papers kept despite a missing date sort as "" instead of raising.
        return paper.get("published") or ""

    def citation_key(paper):
        return paper.get("citations") or 0

    sorters = {
        "Newest": (published_key, True),
        "Oldest": (published_key, False),
        "Most Cited": (citation_key, True),
        "Least Cited": (citation_key, False),
    }
    if sort_by in sorters:
        key_fn, descending = sorters[sort_by]
        filtered.sort(key=key_fn, reverse=descending)

    if not filtered:
        ACTIVE_PAPERS = []
        return "No matching papers.", gr.update(choices=[], value=None), "0"

    ACTIVE_PAPERS = list(filtered)
    tbl, choices = build_table(filtered)
    tot = sum(citation_key(p) for p in filtered)
    ratio = str(len(filtered)) + "/" + str(len(PAPERS))
    md = ("## " + ratio + " papers" +
          " | " + str(year_from) + "-" + str(year_to) +
          " | cit " + str(cit_min) + "-" + str(cit_max) +
          " | total " + "{:,}".format(tot) + "\n\n---\n\n" + tbl)
    selector_update = gr.update(choices=choices,
                                value=choices[0] if choices else None)
    return md, selector_update, ratio
def gr_search_fetched(query):
    """Search the loaded papers and render the hits as markdown.

    Calls ``search_papers`` with the stripped query (``top_k=8``) and emits
    one section per hit: score as a percentage, title, up to two authors,
    date, optional citation badge, source, the first 350 characters of the
    abstract, and the paper/PDF links.

    Returns a short message instead when the query is blank, no papers are
    loaded, or nothing matches.
    """
    if not query or not query.strip():
        return "Enter a query."
    if not PAPERS:
        return "Fetch papers first."
    results = search_papers(query.strip(), top_k=8)
    if not results:
        return "No results for: " + query

    NL = "\n"
    # Collect the pieces and join once rather than growing a string in a loop.
    sections = ["## Search: " + query + " โ€” " + str(len(results)) + " results" + NL + NL]
    for r in results:
        p, s = r["paper"], r["score"]
        # The badge is only shown for a truthy (non-zero) citation count.
        cit = (" | " + cit_badge(p.get("citations"))) if p.get("citations") else ""
        link = "[View](" + p["url"] + ")"
        pdf = (" [PDF](" + p["pdf_url"] + ")") if p.get("pdf_url") else ""
        sections.append(
            "### " + "{:.0f}".format(s * 100) + "% โ€” " + p["title"] + NL + NL +
            ", ".join(p["authors"][:2]) + " | " + p["published"] + cit +
            " | " + p.get("source", "") + NL + NL +
            "> " + p["abstract"][:350] + "..." + NL + NL +
            link + pdf + NL + NL + "---" + NL + NL
        )
    return "".join(sections)
def _get_paper(choice):
    """Resolve a dropdown label to its paper, or ``None`` if it cannot be.

    The text before the first "." in ``choice`` is read as a 1-based
    position into ``ACTIVE_PAPERS`` (falling back to ``PAPERS`` when the
    active list is empty).
    """
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    try:
        position = int(choice.split(".")[0])
    except (AttributeError, TypeError, ValueError):
        # Not a string, or no leading integer.
        return None
    # Reject 0 and negatives explicitly: pool[-1] would otherwise silently
    # return a paper from the end of the list.
    if not 1 <= position <= len(pool):
        return None
    return pool[position - 1]
def gr_explain(choice, lang_choice):
    """Build the explanation page for the selected paper.

    The page is a markdown header (title, authors, date, citation badge,
    source, links, quoted abstract) followed by the text returned by
    ``explain_paper``.  The language code is "ar" when ``lang_choice``
    contains "Arabic", otherwise "en".
    """
    if not choice:
        return "Fetch papers and select one."
    paper = _get_paper(choice)
    if not paper:
        return "Selection error."

    if "Arabic" in lang_choice:
        lang = "ar"
    else:
        lang = "en"

    NL = "\n"
    BREAK = NL + NL

    # The PDF link is optional; plain concatenation keeps backslashes out of
    # any f-string expression.
    pdf_link = ""
    if paper.get("pdf_url"):
        pdf_link = " [PDF](" + paper["pdf_url"] + ")"

    header_parts = [
        "# ", paper["title"], BREAK,
        "**Authors:** ", ", ".join(paper["authors"]), BREAK,
        "**Date:** ", paper["published"],
        " | **Citations:** ", cit_badge(paper.get("citations")),
        " | **Source:** ", paper.get("source", "arXiv"), BREAK,
        "[View Paper](", paper["url"], ")", pdf_link, BREAK,
        "---", BREAK,
        "> ", paper["abstract"], BREAK,
        "---", BREAK,
        "## Explanation (Llama 3.3 70B)", BREAK,
    ]
    header = "".join(header_parts)
    return header + explain_paper(paper, lang)
def gr_audio(txt, lang_choice):
    """Turn explanation text into audio via ``text_to_audio``.

    Returns ``None`` without synthesizing anything when ``txt`` is empty or
    shorter than 50 characters.  The language code is "ar" when
    ``lang_choice`` contains "Arabic", otherwise "en".
    """
    has_enough_text = bool(txt) and len(txt) >= 50
    if not has_enough_text:
        return None
    if "Arabic" in lang_choice:
        lang_code = "ar"
    else:
        lang_code = "en"
    return text_to_audio(txt, lang_code)
def gr_save_fav(choice):
    """Save the selected paper as a favorite and return a status message.

    Returns the result of ``save_favorite`` for the resolved paper, or a
    fixed message when nothing is selected or the selection cannot be
    resolved.
    """
    if not choice:
        return "Select a paper first."
    selected = _get_paper(choice)
    if not selected:
        return "Error."
    return save_favorite(selected)
def gr_show_favs():
    """Render the saved favorites as a markdown list.

    Each entry shows the bold title on one line and, on the next, the first
    author (or "N/A"), date, source, citation badge and a link.  Entries
    are separated by horizontal rules.
    """
    saved = load_favorites()
    if not saved:
        return "No saved papers."

    NL = "\n"
    entries = []
    for fav in saved:
        title_line = "**" + fav["title"] + "**" + NL
        lead_author = fav["authors"][0] if fav["authors"] else "N/A"
        details = " | ".join([
            lead_author,
            fav["published"],
            fav.get("source", ""),
            cit_badge(fav.get("citations")),
            "[Link](" + fav["url"] + ")",
        ])
        entries.append(title_line + details)

    separator = NL + NL + "---" + NL + NL
    heading = "### Favorites โ€” " + str(len(saved)) + " papers" + NL + NL
    return heading + separator.join(entries)
def gr_compare(ca, cb, lc):
    """Compare two selected papers via ``compare_papers``.

    ``ca`` and ``cb`` are dropdown labels; ``lc`` is the language choice
    ("ar" is used when it contains "Arabic", otherwise "en").  Returns a
    short message instead when a selection is missing, cannot be resolved,
    or both labels point at the same paper id.
    """
    if not (ca and cb):
        return "Select two papers."
    first = _get_paper(ca)
    second = _get_paper(cb)
    if not (first and second):
        return "Selection error."
    if first["id"] == second["id"]:
        return "Select two different papers."
    if "Arabic" in lc:
        lang_code = "ar"
    else:
        lang_code = "en"
    return compare_papers(first, second, lang_code)
def gr_overview(query, lc):
    """Return a markdown overview of the current papers.

    Summarizes ``ACTIVE_PAPERS`` (or ``PAPERS`` when the active list is
    empty) through ``summarize_papers``, using "research" as the topic when
    ``query`` is empty.
    """
    if not PAPERS:
        return "Fetch papers first."
    pool = ACTIVE_PAPERS or PAPERS
    topic = query or "research"
    if "Arabic" in lc:
        lang_code = "ar"
    else:
        lang_code = "en"
    report = summarize_papers(pool, topic, lang_code)
    return "## Overview\n\n" + report
def gr_trends():
    """Run ``analyze_trends`` on the current papers.

    Uses ``ACTIVE_PAPERS`` when it is non-empty, otherwise ``PAPERS``.
    Returns ``(None, message)`` when nothing has been fetched yet.
    """
    if not PAPERS:
        return None, "Fetch papers first."
    pool = ACTIVE_PAPERS or PAPERS
    return analyze_trends(pool)
def gr_bib(style, progress=gr.Progress()):
    """Generate a bibliography for the current papers in the given style.

    Returns a fenced-code markdown preview (truncated to 3000 characters,
    with "..." appended when truncated) and the path returned by
    ``generate_bibliography``.
    """
    if not PAPERS:
        return "Fetch papers first.", None
    progress(0.5, desc="Generating...")
    pool = ACTIVE_PAPERS or PAPERS
    text, path = generate_bibliography(pool, style)
    progress(1.0)
    preview = text[:3000]
    if len(text) > 3000:
        preview += "..."
    return "```\n" + preview + "\n```", path
def gr_chat_fn(message, history):
    """Answer a chat message and return the updated history.

    Args:
        message: the user's question; ``None`` or blank text is ignored.
        history: sequence of ``(user, assistant)`` pairs, or ``None`` for
            an empty conversation.

    Returns:
        ``(history, "")`` -- the second element clears the input box.  The
        incoming ``history`` object is not mutated.
    """
    history = [] if history is None else history
    if not message or not message.strip():
        return history, ""

    # Flatten the (user, assistant) pairs into role/content dicts for
    # chat_about_papers, skipping empty slots.
    turns = []
    for pair in history:
        if pair[0]:
            turns.append({"role": "user", "content": pair[0]})
        if pair[1]:
            turns.append({"role": "assistant", "content": pair[1]})

    reply = chat_about_papers(message, turns)
    updated = list(history)
    updated.append((message, reply))
    return updated, ""
# ================================================================
# UI
# ================================================================
# Custom stylesheet handed to gr.Blocks(css=CSS): hides the page footer,
# centers h1 headings and styles the "status-bar", "legend", "filter-box"
# and "gs-box" element classes used by the UI below.
CSS = """
footer{display:none!important}
h1{text-align:center}
.status-bar{font-size:.85rem;color:#94a3b8;padding:2px 0}
.legend{font-size:.8rem;color:#cbd5e1;background:#1e293b;
border-radius:8px;padding:6px 14px;margin-bottom:6px}
.filter-box{background:#1e293b;border-radius:10px;
padding:12px 16px;margin-top:8px}
.gs-box{background:#1e293b;border-radius:10px;padding:14px 18px;
margin-bottom:10px;border:1px solid #334155}
"""
# Upper bound of the year filter sliders.  It follows the current year so the
# filter never excludes papers newer than a hard-coded limit; 2026 is kept as
# the floor so the range never shrinks below the previous one.
FILTER_YEAR_MAX = max(2026, CURRENT_YEAR)

# Build the Gradio interface: one tab per feature, followed by the event
# wiring that connects widgets to the handler functions.
# NOTE(review): the container nesting (Row/Column/Group membership) was
# reconstructed from widget order and `scale` hints -- confirm against the
# intended layout.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
    title="Scientific Paper Discovery v7.4", css=CSS
) as demo:
    gr.Markdown("# Scientific Paper Discovery v7.4\narXiv ยท CrossRef ยท Llama-3.3-70B ยท FAISS")
    gr.Markdown("Citations: ๐Ÿฅ‡ >=1000 | ๐Ÿ† >=100 | โญ >=10 | ๐Ÿ“„ <10 | ยท = 0",
                elem_classes="legend")
    status_bar = gr.Markdown("No papers loaded yet.", elem_classes="status-bar")
    with gr.Tabs():
        # -- TAB 1: BROWSE ------------------------------------------------
        with gr.Tab("Browse / Search"):
            with gr.Row():
                with gr.Column(scale=3):
                    t_query = gr.Textbox(label="Topic",
                                         placeholder="ARIMA, inflation, LLM...",
                                         value="economic forecasting")
                    t_category = gr.Dropdown(label="Category",
                                             choices=list(CATEGORIES.keys()),
                                             value="๐Ÿ“Š Economics")
                    t_source = gr.Radio(label="Source",
                                        choices=["arXiv","CrossRef","Both"],
                                        value="arXiv")
                with gr.Column(scale=1):
                    t_max = gr.Slider(5, 50, value=15, step=5, label="Max papers")
                    t_days = gr.Slider(1, 1500, value=365, step=30, label="Last N days")
                    btn_fetch = gr.Button("Fetch Papers", variant="primary", size="lg")
            papers_table_md = gr.Markdown("Results appear here.")
            paper_selector = gr.Dropdown(label="Select paper", choices=[], interactive=True)
            with gr.Group(elem_classes="filter-box"):
                gr.Markdown("### Filter & Sort")
                with gr.Row():
                    f_year_from = gr.Slider(2000,FILTER_YEAR_MAX,value=2020,step=1,label="Year from")
                    f_year_to = gr.Slider(2000,FILTER_YEAR_MAX,value=FILTER_YEAR_MAX,step=1,label="Year to")
                with gr.Row():
                    f_cit_min = gr.Slider(0,5000,value=0, step=5,label="Citations min")
                    f_cit_max = gr.Slider(0,5000,value=5000,step=5,label="Citations max")
                with gr.Row():
                    f_sort = gr.Dropdown(choices=SORT_CHOICES,
                                         value="Most Cited",label="Sort",scale=3)
                    btn_filter = gr.Button("Apply",variant="primary",scale=1)
            gr.Markdown("---\n### Semantic Search (FAISS โ€” in loaded papers)")
            with gr.Row():
                search_in_box = gr.Textbox(label="Search in loaded papers",
                                           placeholder="ARIMA, transformer...",scale=5)
                btn_search_in = gr.Button("Search",scale=1)
            search_in_out = gr.Markdown()
        # -- TAB 2: GLOBAL SEARCH -----------------------------------------
        with gr.Tab("Global Search"):
            gr.Markdown(
                "### Search any paper by title or keywords\n\n"
                "> Uses arXiv **relevance** sort + CrossRef **title** search.\n"
                "> Example: `Attention is All You Need`"
            )
            with gr.Group(elem_classes="gs-box"):
                with gr.Row():
                    gs_query = gr.Textbox(
                        label="Title or keywords",
                        placeholder="Attention is All You Need | ARIMA forecasting ...",
                        scale=4)
                    gs_source = gr.Radio(label="Source",
                                         choices=["arXiv","CrossRef","Both"],
                                         value="Both", scale=2)
                    gs_max = gr.Slider(5,30,value=10,step=5,label="Max results",scale=1)
                btn_gs = gr.Button("Search Now", variant="primary", size="lg")
            gs_out = gr.Markdown("Enter a title or keywords...")
        # -- TAB 3: EXPLAIN -----------------------------------------------
        with gr.Tab("Explain"):
            with gr.Row():
                paper_sel2 = gr.Dropdown(label="Select paper",
                                         choices=[], interactive=True, scale=4)
                lang_exp = gr.Radio(LANG_CHOICES, value="Arabic",
                                    label="Language", scale=1)
            with gr.Row():
                btn_explain = gr.Button("Explain", variant="primary")
                btn_fav = gr.Button("Save Fav")
                btn_audio = gr.Button("Listen")
                btn_export_pdf = gr.Button("Export PDF", variant="secondary")
            with gr.Row():
                fav_status = gr.Markdown()
                pdf_status = gr.Markdown()
            explanation_out = gr.Markdown("Fetch papers and select one.")
            audio_out = gr.Audio(label="Audio", type="filepath")
            pdf_out = gr.File(label="Download PDF")
        # -- TAB 4: COMPARE -----------------------------------------------
        with gr.Tab("Compare"):
            with gr.Row():
                cmp_a = gr.Dropdown(label="Paper A", choices=[], interactive=True)
                cmp_b = gr.Dropdown(label="Paper B", choices=[], interactive=True)
                lang_cmp = gr.Radio(LANG_CHOICES, value="Arabic",
                                    label="Language", scale=1)
            btn_compare = gr.Button("Compare", variant="primary")
            compare_out = gr.Markdown("Select two papers.")
        # -- TAB 5: CHAT --------------------------------------------------
        with gr.Tab("Chat"):
            chatbot_ui = gr.Chatbot(label="Research Assistant",
                                    height=480, bubble_full_width=False)
            with gr.Row():
                chat_in = gr.Textbox(label="Question", scale=5,
                                     placeholder="Key findings? | ู…ุง ุฃุจุฑุฒ ุงู„ู†ุชุงุฆุฌุŸ")
                btn_send = gr.Button("Send", variant="primary", scale=1)
            btn_clear = gr.Button("Clear", size="sm")
        # -- TAB 6: OVERVIEW ----------------------------------------------
        with gr.Tab("Overview"):
            with gr.Row():
                lang_ov = gr.Radio(LANG_CHOICES, value="Arabic",
                                   label="Language", scale=1)
                btn_overview = gr.Button("Generate Report", variant="primary", scale=3)
            overview_out = gr.Markdown("Fetch papers first.")
        # -- TAB 7: TRENDS ------------------------------------------------
        with gr.Tab("Trends"):
            btn_trends = gr.Button("Analyze Trends", variant="primary", size="lg")
            trend_chart = gr.Image(label="Trends Dashboard", type="filepath")
            trend_stats = gr.Markdown("Fetch papers first.")
        # -- TAB 8: BIBLIOGRAPHY ------------------------------------------
        with gr.Tab("Bibliography"):
            bib_style = gr.Radio(["APA","IEEE","Chicago","BibTeX"],
                                 value="APA", label="Style")
            btn_bib = gr.Button("Generate Bibliography", variant="primary")
            bib_out = gr.Markdown()
            bib_file = gr.File(label="Download")
        # -- TAB 9: FAVORITES ---------------------------------------------
        with gr.Tab("Favorites"):
            btn_show_fav = gr.Button("Show Favorites")
            favs_md = gr.Markdown("Press to show.")
            btn_export_fav = gr.Button("Export CSV", variant="secondary")
            fav_csv_file = gr.File(label="CSV File")
        # -- TAB 10: AUTO-FETCH -------------------------------------------
        with gr.Tab("Auto-Fetch"):
            with gr.Row():
                auto_q = gr.Textbox(label="Topic",
                                    value="economic forecasting", scale=3)
                auto_cat = gr.Dropdown(label="Category",
                                       choices=list(CATEGORIES.keys()),
                                       value="๐Ÿ“Š Economics", scale=2)
                auto_interval = gr.Slider(5,120,value=60,step=5,
                                          label="Every (min)",scale=1)
            with gr.Row():
                btn_start_auto = gr.Button("Start", variant="primary")
                btn_stop_auto = gr.Button("Stop", variant="stop")
                btn_refresh_log = gr.Button("Refresh Log")
            auto_status = gr.Markdown()
            auto_log_md = gr.Markdown("No log.")
        # -- TAB 11: ABOUT ------------------------------------------------
        with gr.Tab("About"):
            gr.Markdown("""
# ๐Ÿ”ฌ Scientific Paper Discovery
### Version 7.4 โ€” Intelligent Research Assistant
---
## ๐Ÿง  About This Tool
**Scientific Paper Discovery** is an AI-powered academic research assistant that enables researchers, students, and scientists to **discover, understand, and organize** scientific literature with unprecedented ease. It combines state-of-the-art language models with multi-source academic APIs to deliver a seamless research experience.
---
## โš™๏ธ Core Technologies
| Component | Technology | Role |
|---|---|---|
| ๐Ÿค– Language Model | **Llama 3.3 70B** via Groq API | Paper explanation, comparison & chat |
| ๐Ÿ” Semantic Search | **FAISS** + MiniLM-L12-v2 | Vector similarity search |
| ๐Ÿ“ก Source 1 | **arXiv API** | Preprints across all sciences |
| ๐Ÿ“š Source 2 | **CrossRef API** | Peer-reviewed journal articles |
| ๐Ÿ“Š Citations | **Semantic Scholar** (3-layer) | Real citation counts |
| ๐ŸŽ™๏ธ Text-to-Speech | **gTTS** | Audio playback of explanations |
| ๐Ÿ“„ PDF Export | **ReportLab** | Professional PDF generation |
---
## ๐Ÿ—‚๏ธ Feature Overview
| Tab | Feature | Description |
|---|---|---|
| ๐Ÿ” Browse | Paper Fetching | Fetch latest papers by topic & category |
| ๐ŸŒ Global Search | Title Search | Find any paper by exact title (relevance-sorted) |
| ๐Ÿ“– Explain | AI Explanation | Full structured explanation in Arabic or English |
| โš–๏ธ Compare | Paper Comparison | Side-by-side AI comparison of two papers |
| ๐Ÿ’ฌ Chat | Research Chat | Ask questions about loaded papers |
| ๐ŸŒ Overview | Batch Summary | Academic overview of all loaded papers |
| ๐Ÿ“Š Trends | Analytics | Citation, keyword & author trend charts |
| ๐Ÿ“š Bibliography | Citation Export | APA, IEEE, Chicago, BibTeX formats |
| โญ Favorites | Saved Papers | Bookmark & export favorite papers |
| ๐Ÿ”” Auto-Fetch | Monitoring | Automatic periodic paper discovery |
---
## ๐Ÿ”Ž Search Mode Guide
| Mode | Algorithm | Best For |
|---|---|---|
| Browse | `sortBy=submittedDate` | Discovering latest papers on a topic |
| ๐ŸŒ Global Search | `sortBy=relevance` + `ti:"..."` | Finding a specific paper by title |
| FAISS (internal) | Cosine similarity | Semantic search within loaded papers |
---
## ๐Ÿ“Œ Citation Badges
| Badge | Meaning |
|---|---|
| ๐Ÿฅ‡ | โ‰ฅ 1,000 citations โ€” Highly influential |
| ๐Ÿ† | โ‰ฅ 100 citations โ€” Well-cited |
| โญ | โ‰ฅ 10 citations โ€” Notable |
| ๐Ÿ“„ | < 10 citations โ€” Recent or niche |
| ยท | 0 citations โ€” New or unindexed |
---
*Built with โค๏ธ for the research community โ€” v7.4*
""")

    # -- WIRING -----------------------------------------------------------
    # Outputs refreshed whenever the set of visible papers changes: the
    # results table, all four paper selectors and the status bar.
    FETCH_OUT = [papers_table_md, paper_selector, paper_sel2, cmp_a, cmp_b, status_bar]

    def _filter_and_sync(year_from, year_to, cit_min, cit_max, sort_by):
        """Run gr_filter_papers and fan its dropdown update out to every selector.

        The dropdown labels are numbered positions into ACTIVE_PAPERS, which
        the filter rewrites; updating only one selector would leave the
        others holding labels that resolve to the wrong paper.
        """
        table_md, selector_update, status = gr_filter_papers(
            year_from, year_to, cit_min, cit_max, sort_by)
        return (table_md, selector_update, selector_update,
                selector_update, selector_update, status)

    btn_fetch.click(gr_fetch,
                    inputs=[t_query, t_category, t_max, t_days, t_source],
                    outputs=FETCH_OUT)
    btn_filter.click(_filter_and_sync,
                     inputs=[f_year_from, f_year_to, f_cit_min, f_cit_max, f_sort],
                     outputs=FETCH_OUT)
    # Mirror the Browse-tab selection into the Explain and Compare selectors.
    paper_selector.change(lambda x: [gr.update(value=x)]*3,
                          inputs=[paper_selector],
                          outputs=[paper_sel2, cmp_a, cmp_b])
    btn_search_in.click(gr_search_fetched, inputs=[search_in_box], outputs=[search_in_out])
    search_in_box.submit(gr_search_fetched, inputs=[search_in_box], outputs=[search_in_out])
    btn_gs.click(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
    gs_query.submit(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
    btn_explain.click(gr_explain, inputs=[paper_sel2, lang_exp], outputs=[explanation_out])
    btn_fav.click(gr_save_fav, inputs=[paper_sel2], outputs=[fav_status])
    btn_audio.click(gr_audio, inputs=[explanation_out, lang_exp], outputs=[audio_out])
    btn_export_pdf.click(gr_export_pdf,
                         inputs=[explanation_out, paper_sel2],
                         outputs=[pdf_out, pdf_status])
    btn_compare.click(gr_compare, inputs=[cmp_a, cmp_b, lang_cmp], outputs=[compare_out])
    btn_overview.click(gr_overview, inputs=[t_query, lang_ov], outputs=[overview_out])
    btn_trends.click(gr_trends, outputs=[trend_chart, trend_stats])
    btn_bib.click(gr_bib, inputs=[bib_style], outputs=[bib_out, bib_file])
    btn_show_fav.click(gr_show_favs, outputs=[favs_md])
    btn_export_fav.click(gr_export_fav, outputs=[fav_csv_file])
    btn_start_auto.click(start_auto_fetch,
                         inputs=[auto_q, auto_cat, auto_interval],
                         outputs=[auto_status])
    btn_stop_auto.click(stop_auto_fetch, outputs=[auto_status])
    btn_refresh_log.click(get_auto_log, outputs=[auto_log_md])
    btn_send.click(gr_chat_fn, inputs=[chat_in, chatbot_ui], outputs=[chatbot_ui, chat_in])
    chat_in.submit(gr_chat_fn, inputs=[chat_in, chatbot_ui], outputs=[chatbot_ui, chat_in])
    # Clearing resets both the conversation and the input box.
    btn_clear.click(lambda: ([], ""), outputs=[chatbot_ui, chat_in])

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()