Spaces:

sitayeb
/

Scientific_Paper_Discovery_Bot

Sleeping

File size: 59,929 Bytes

48e63cf
566a5de
48e63cf
 
7589acc
48e63cf
 
7589acc
48e63cf
7589acc
fc8ac6d
48e63cf
e6dadb5
 
 
48e63cf
 
fc8ac6d
48e63cf
 
 
 
 
 
 
ef047c0
7589acc
 
 
48e63cf
7589acc
 
566a5de
48e63cf
 
566a5de
48e63cf
 
 
 
 
 
 
 
 
566a5de
 
48e63cf
fc8ac6d
48e63cf
b54c8f9
 
 
 
 
 
 
 
 
 
 
48e63cf
 
b54c8f9
 
 
 
 
 
 
 
 
 
 
48e63cf
566a5de
 
 
 
 
48e63cf
 
 
 
 
 
 
566a5de
 
 
 
48e63cf
 
 
 
 
 
 
 
 
 
 
 
 
566a5de
 
 
 
48e63cf
 
 
566a5de
48e63cf
 
 
566a5de
 
 
 
 
 
 
48e63cf
 
 
 
566a5de
 
48e63cf
 
 
 
 
 
566a5de
48e63cf
566a5de
48e63cf
 
 
 
 
 
 
 
 
566a5de
48e63cf
 
 
 
 
 
 
 
 
 
 
fc8ac6d
48e63cf
 
fc8ac6d
48e63cf
fc8ac6d
48e63cf
 
 
 
 
 
 
 
566a5de
 
48e63cf
 
 
 
566a5de
 
 
 
 
 
 
 
 
48e63cf
 
fc8ac6d
48e63cf
 
 
566a5de
48e63cf
 
 
 
566a5de
48e63cf
 
566a5de
 
 
 
 
 
 
 
 
48e63cf
 
 
 
 
566a5de
 
 
48e63cf
 
 
 
 
 
 
566a5de
 
 
 
 
 
48e63cf
566a5de
 
 
 
 
 
 
fc8ac6d
48e63cf
fc8ac6d
566a5de
48e63cf
 
 
566a5de
48e63cf
 
566a5de
48e63cf
 
 
566a5de
 
 
48e63cf
 
 
 
 
 
566a5de
48e63cf
566a5de
48e63cf
566a5de
 
 
 
 
 
 
 
 
1dbfacc
48e63cf
 
566a5de
 
48e63cf
 
 
 
 
 
566a5de
e6dadb5
566a5de
 
 
 
 
 
48e63cf
566a5de
48e63cf
566a5de
48e63cf
 
566a5de
 
 
 
 
 
 
 
 
 
 
48e63cf
566a5de
 
48e63cf
 
 
566a5de
48e63cf
566a5de
 
48e63cf
566a5de
 
48e63cf
566a5de
 
 
48e63cf
 
 
 
 
 
e6dadb5
48e63cf
 
 
 
 
566a5de
 
 
48e63cf
 
 
 
 
 
 
 
 
 
 
 
566a5de
 
 
 
48e63cf
566a5de
 
 
 
 
48e63cf
 
566a5de
 
 
48e63cf
 
566a5de
 
 
 
 
48e63cf
566a5de
 
 
 
 
48e63cf
 
 
 
 
566a5de
48e63cf
 
 
566a5de
 
 
48e63cf
 
566a5de
48e63cf
 
 
566a5de
48e63cf
 
 
 
 
 
 
566a5de
 
 
48e63cf
566a5de
 
 
 
 
 
 
 
 
 
 
 
48e63cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566a5de
 
48e63cf
 
 
 
 
 
 
 
 
 
 
566a5de
 
48e63cf
 
 
 
566a5de
48e63cf
 
 
fc8ac6d
48e63cf
566a5de
 
 
 
 
 
 
 
 
48e63cf
 
 
 
566a5de
 
48e63cf
566a5de
 
48e63cf
566a5de
 
 
 
 
 
 
 
48e63cf
566a5de
48e63cf
 
 
 
 
 
 
 
 
 
 
566a5de
48e63cf
 
566a5de
 
48e63cf
 
 
 
 
 
 
566a5de
 
48e63cf
 
 
 
 
 
 
 
 
 
 
 
 
 
566a5de
 
 
 
48e63cf
 
 
566a5de
48e63cf
566a5de
 
 
 
 
48e63cf
 
566a5de
48e63cf
 
566a5de
48e63cf
 
 
 
 
566a5de
48e63cf
 
 
 
 
 
566a5de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48e63cf
 
 
 
566a5de
48e63cf
566a5de
48e63cf
 
 
 
566a5de
48e63cf
566a5de
 
 
48e63cf
566a5de
 
 
 
 
 
 
 
 
 
 
48e63cf
566a5de
 
 
 
 
 
 
 
48e63cf
566a5de
 
 
 
 
48e63cf
566a5de
 
48e63cf
566a5de
 
 
 
 
 
 
 
 
9de17c8
566a5de
 
 
 
48e63cf
566a5de
 
 
 
 
 
 
 
 
 
48e63cf
566a5de
 
 
 
48e63cf
566a5de
 
 
 
 
 
 
48e63cf
566a5de
 
 
 
 
 
 
 
 
48e63cf
566a5de
48e63cf
566a5de
 
48e63cf
 
 
 
 
 
7589acc
9de17c8
 
48e63cf
7589acc
566a5de
48e63cf
 
 
566a5de
48e63cf
566a5de
48e63cf
566a5de
 
 
 
 
 
48e63cf
566a5de
 
 
 
48e63cf
 
566a5de
 
 
 
 
48e63cf
566a5de
 
 
48e63cf
566a5de
 
48e63cf
 
566a5de
 
 
 
 
48e63cf
566a5de
 
 
 
48e63cf
566a5de
48e63cf
 
 
 
 
 
 
 
566a5de
 
 
48e63cf
566a5de
 
 
48e63cf
 
566a5de
 
 
 
48e63cf
566a5de
 
48e63cf
 
 
 
566a5de
 
 
 
 
48e63cf
566a5de
48e63cf
566a5de
 
 
 
 
 
48e63cf
 
 
566a5de
48e63cf
 
 
 
 
 
fc8ac6d
566a5de
 
 
 
48e63cf
 
 
 
 
 
 
566a5de
48e63cf
566a5de
 
48e63cf
 
 
566a5de
 
48e63cf
566a5de
48e63cf
 
 
 
 
 
 
566a5de
 
 
 
 
48e63cf
566a5de
48e63cf
 
 
 
 
 
566a5de
48e63cf
566a5de
 
 
 
48e63cf
 
566a5de
48e63cf
 
 
566a5de
48e63cf
 
 
 
 
 
 
 
 
566a5de
 
 
 
48e63cf
566a5de
 
48e63cf
 
 
566a5de
 
 
 
 
48e63cf
 
566a5de
 
48e63cf
566a5de
 
 
48e63cf
566a5de
 
 
 
 
 
 
 
 
 
48e63cf
 
 
 
566a5de
48e63cf
 
 
566a5de
48e63cf
566a5de
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48e63cf
 
 
566a5de
 
48e63cf
 
566a5de
48e63cf
566a5de
48e63cf
 
 
566a5de
 
 
 
 
 
 
48e63cf
566a5de
 
48e63cf
 
566a5de
48e63cf
566a5de
 
 
48e63cf
 
566a5de
48e63cf
566a5de
 
 
48e63cf
 
566a5de
48e63cf
 
 
566a5de
 
48e63cf
 
 
566a5de
 
48e63cf
 
 
 
 
566a5de
48e63cf
 
 
 
 
 
 
 
 
 
 
566a5de
 
 
 
 
 
48e63cf
fc8ac6d
 
48e63cf
566a5de
48e63cf
fc8ac6d
566a5de
 
 
 
48e63cf
 
 
566a5de
 
48e63cf
 
566a5de
48e63cf
 
566a5de
48e63cf
b54c8f9
566a5de
 
48e63cf
 
566a5de
 
 
 
 
48e63cf
566a5de
48e63cf
566a5de
 
48e63cf
566a5de
 
48e63cf
 
566a5de
 
 
48e63cf
566a5de
48e63cf
566a5de
48e63cf
 
566a5de
 
48e63cf
566a5de
 
 
fc8ac6d
48e63cf
 
 
566a5de
 
48e63cf
566a5de
 
 
 
 
 
48e63cf
 
566a5de
48e63cf
566a5de
48e63cf
566a5de
 
48e63cf
566a5de
 
 
 
48e63cf
 
 
566a5de
 
 
48e63cf
 
566a5de
48e63cf
566a5de
 
 
 
 
 
48e63cf
b54c8f9
 
 
 
 
 
 
 
 
 
 
566a5de
48e63cf
566a5de
 
 
 
48e63cf
b54c8f9
566a5de
 
 
 
48e63cf
b54c8f9
566a5de
 
 
 
48e63cf
566a5de
48e63cf
b54c8f9
566a5de
 
 
 
 
48e63cf
b54c8f9
566a5de
48e63cf
566a5de
48e63cf
566a5de
48e63cf
b54c8f9
48e63cf
566a5de
48e63cf
566a5de
 
 
48e63cf
566a5de
48e63cf
 
566a5de
48e63cf
b54c8f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48e63cf
b54c8f9
 
 
 
 
48e63cf
b54c8f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48e63cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566a5de
 
 
48e63cf
 
 
 
566a5de
 
48e63cf
566a5de
48e63cf

# ================================================================
# Scientific Paper Discovery Bot v7.4 — SyntaxError FIXED
# ================================================================
import os, re, time, json, pickle, threading
import requests
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from collections import Counter

import numpy as np
import faiss
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import gradio as gr
from sentence_transformers import SentenceTransformer
from groq import Groq
from gtts import gTTS
from langdetect import detect, DetectorFactory
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable

DetectorFactory.seed = 0

GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
S2_API_KEY   = os.environ.get("S2_API_KEY", "")
groq_client  = Groq(api_key=GROQ_API_KEY)

print("Loading embedder...")
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
_ = embedder.encode(["warmup"])
print("Embedder ready!")

PAPERS        = []
ACTIVE_PAPERS = []
FAISS_INDEX   = None
AUTO_RUNNING  = False
AUTO_LOG      = []
CURRENT_YEAR  = datetime.now().year

PERSIST_DIR    = "/tmp"
FAVORITES_PATH = PERSIST_DIR + "/favorites.pkl"
SEEN_IDS_PATH  = PERSIST_DIR + "/seen_ids.json"
os.makedirs(PERSIST_DIR, exist_ok=True)

CATEGORIES = {
    "🌐 All":             "",
    "📊 Economics":       "econ",
    "💰 Quant Finance":   "q-fin",
    "🤖 AI":              "cs.AI",
    "🧠 Machine Learning":"cs.LG",
    "💬 NLP":             "cs.CL",
    "📈 Statistics":      "stat",
    "🔬 Biology":         "q-bio",
    "⚛️ Physics":         "physics",
    "📐 Mathematics":     "math",
    "💻 Computer Science":"cs",
}
CROSSREF_SUBJECTS = {
    "🌐 All":             "",
    "📊 Economics":       "economics",
    "💰 Quant Finance":   "finance",
    "🤖 AI":              "artificial intelligence",
    "🧠 Machine Learning":"machine learning",
    "💬 NLP":             "natural language processing",
    "📈 Statistics":      "statistics",
    "🔬 Biology":         "biology",
    "⚛️ Physics":         "physics",
    "📐 Mathematics":     "mathematics",
    "💻 Computer Science":"computer science",
}
LANG_CHOICES = ["Arabic", "English"]
SORT_CHOICES = ["Newest", "Oldest", "Most Cited", "Least Cited"]
AR_RULES = """
- ابدأ كل قسم بـ ## مع سطر فارغ قبله وبعده
- اكتب كل قسم في فقرة 3-4 جمل بالعربية الفصحى
- لا تكرر عنوان القسم داخل النص
"""

# ================================================================
# HELPERS
# ================================================================
def detect_lang(text):
    try:
        return "ar" if detect(str(text)[:300]).startswith("ar") else "en"
    except:
        return "en"

def clean_md(text):
    text = re.sub(r"[#*`>\[\]!_~]", "", text)
    return re.sub(r"\n+", " ", text).strip()[:2500]

def fix_ar_format(text):
    text = re.sub(r"\n(##)", r"\n\n\1", text)
    text = re.sub(r"(## [^\n]+)\n([^\n#])", r"\1\n\n\2", text)
    return re.sub(r"\n{3,}", "\n\n", text).strip()

def cit_badge(n):
    if n is None or n == "": return "—"
    n = int(n)
    if n >= 1000: return "🥇 " + "{:,}".format(n)
    if n >= 100:  return "🏆 " + "{:,}".format(n)
    if n >= 10:   return "⭐ " + "{:,}".format(n)
    if n > 0:     return "📄 " + str(n)
    return "·"

def build_table(papers_list):
    rows  = "| # | Title | Author | Date | Citations | Source |\n"
    rows += "|---|---|---|---|---|---|\n"
    choices = []
    for i, p in enumerate(papers_list):
        first = p["authors"][0] if p["authors"] else "N/A"
        badge = "NEW" if p.get("recent") else "📄"
        rows += "| {} | {} {} | {} | {} | {} | {} |\n".format(
            i+1, badge, p["title"], first,
            p["published"], cit_badge(p.get("citations")),
            p.get("source","arXiv"))
        choices.append("{}. {}".format(i+1, p["title"]))
    return rows, choices

def s2_headers():
    h = {"User-Agent": "ScientificPaperBot/7.4"}
    if S2_API_KEY:
        h["x-api-key"] = S2_API_KEY
    return h

def cr_headers():
    return {"User-Agent": "ScientificPaperBot/7.4 (mailto:researcher@example.com)"}

# ================================================================
# CrossRef date parser — rejects garbage years
# ================================================================
def parse_crossref_date(item):
    for field in ["issued", "published", "published-print", "published-online", "created"]:
        dp = (item.get(field) or {}).get("date-parts", [[]])
        if not dp or not dp[0]: continue
        pts = dp[0]
        try:
            year = int(pts[0])
            if not (1900 <= year <= CURRENT_YEAR + 1): continue
            month = max(1, min(12, int(pts[1]) if len(pts) >= 2 else 1))
            day   = max(1, min(31, int(pts[2]) if len(pts) >= 3 else 1))
            return "{:04d}-{:02d}-{:02d}".format(year, month, day)
        except (ValueError, TypeError, IndexError):
            continue
    return "N/A"

# ================================================================
# SEEN / FAVORITES
# ================================================================
def load_seen_ids():
    try:
        with open(SEEN_IDS_PATH) as f: return set(json.load(f))
    except: return set()

def save_seen_ids(ids):
    with open(SEEN_IDS_PATH, "w") as f: json.dump(list(ids), f)

def load_favorites():
    try:
        with open(FAVORITES_PATH, "rb") as f: return pickle.load(f)
    except: return []

def save_favorite(paper):
    favs = load_favorites()
    if paper["id"] not in {p["id"] for p in favs}:
        favs.append(paper)
        with open(FAVORITES_PATH, "wb") as f: pickle.dump(favs, f)
        return "Saved: " + paper["title"]
    return "Already saved."

def export_favorites_csv():
    favs = load_favorites()
    if not favs: return None
    df = pd.DataFrame([{
        "Title":     p["title"],
        "Authors":   ", ".join(p["authors"][:3]),
        "Date":      p["published"],
        "Citations": p.get("citations","N/A"),
        "URL":       p["url"],
        "Source":    p.get("source","arXiv")
    } for p in favs])
    path = PERSIST_DIR + "/favorites.csv"
    df.to_csv(path, index=False, encoding="utf-8-sig")
    return path

def gr_export_fav(): return export_favorites_csv()

# ================================================================
# PDF EXPORT
# ================================================================
def export_explanation_pdf(explanation_text, paper_title="paper"):
    if not explanation_text or len(explanation_text) < 30: return None
    safe  = re.sub(r"[^\w\s-]", "", paper_title)[:50].strip().replace(" ", "_")
    path  = PERSIST_DIR + "/explanation_" + safe + ".pdf"
    doc   = SimpleDocTemplate(path, pagesize=A4,
                              rightMargin=2*cm, leftMargin=2*cm,
                              topMargin=2*cm,  bottomMargin=2*cm)
    styles   = getSampleStyleSheet()
    h2_style = ParagraphStyle("H2", parent=styles["Heading2"],
                               fontSize=11, textColor=colors.HexColor("#2563eb"),
                               spaceBefore=14, spaceAfter=6)
    bd_style = ParagraphStyle("BD", parent=styles["Normal"],
                               fontSize=10, leading=16, spaceAfter=8)
    mt_style = ParagraphStyle("MT", parent=styles["Normal"],
                               fontSize=9, textColor=colors.HexColor("#64748b"))
    story = []
    for line in explanation_text.split("\n"):
        line  = line.strip()
        if not line: story.append(Spacer(1, 6)); continue
        clean = re.sub(r"\*\*(.+?)\*\*", r"\1", line)
        clean = re.sub(r"\*(.+?)\*",       r"\1", clean)
        clean = re.sub(r"`(.+?)`",           r"\1", clean)
        clean = re.sub(r"^#{1,6}\s*",       "",     clean)
        clean = re.sub(r"[🎯❓🔧📊🌟🔗📄👥📅📡🤖#*_~]", "", clean).strip()
        if not clean: continue
        if line.startswith("## ") or line.startswith("# "):
            story.append(HRFlowable(width="100%", thickness=0.5,
                                    color=colors.HexColor("#e2e8f0"), spaceAfter=4))
            story.append(Paragraph(clean, h2_style))
        elif line.startswith(">"):
            q_st = ParagraphStyle("Q", parent=styles["Normal"],
                                  fontSize=9, leftIndent=20,
                                  textColor=colors.HexColor("#475569"), leading=14)
            story.append(Paragraph(
                re.sub(r"[🎯❓🔧📊🌟🔗📄👥📅📡🤖#*_~]","",line.lstrip(">").strip()),
                q_st))
        else:
            story.append(Paragraph(clean, bd_style))
    story += [
        Spacer(1, 20),
        HRFlowable(width="100%", thickness=0.5, color=colors.HexColor("#e2e8f0")),
        Paragraph("Generated by Paper Discovery v7.4 — " +
                  datetime.now().strftime("%Y-%m-%d %H:%M"), mt_style)
    ]
    try:
        doc.build(story); return path
    except Exception as e:
        print("PDF error: " + str(e)); return None

def gr_export_pdf(explanation_text, choice):
    if not explanation_text or len(explanation_text) < 50:
        return None, "Explain a paper first."
    title = choice.split(". ", 1)[-1] if choice else "paper"
    path  = export_explanation_pdf(explanation_text, title)
    return (path, "PDF ready!") if path else (None, "PDF failed.")

# ================================================================
# SOURCE 1 — arXiv
# KEY FIX: sort_by parameter
#   Browse  → "submittedDate"   latest papers
#   Global  → "relevance"       exact title match
# ================================================================
def fetch_arxiv_papers(query, category, max_results=20, days_back=365,
                       sort_by="submittedDate"):
    parts = []
    words = query.strip().split()
    if len(words) >= 3 and sort_by == "relevance":
        parts.append('ti:"' + query.strip() + '"')
    elif query.strip():
        parts.append("all:" + query.strip())
    if category.strip():
        parts.append("cat:" + category.strip())
    sq = " AND ".join(parts) if parts else "all:machine learning"
    params = {
        "search_query": sq,
        "start":        0,
        "max_results":  max_results,
        "sortBy":       sort_by,
        "sortOrder":    "descending",
    }
    try:
        resp = requests.get("http://export.arxiv.org/api/query", params=params, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        print("arXiv error: " + str(e)); return []

    ns_a   = "http://www.w3.org/2005/Atom"
    ns_x   = "http://arxiv.org/schemas/atom"
    root   = ET.fromstring(resp.content)
    cutoff = datetime.now() - timedelta(days=days_back)
    papers = []
    for entry in root.findall("{" + ns_a + "}entry"):
        try:
            pid       = entry.find("{" + ns_a + "}id").text.split("/abs/")[-1].strip()
            title     = entry.find("{" + ns_a + "}title").text.strip().replace("\n"," ")
            abstract  = entry.find("{" + ns_a + "}summary").text.strip().replace("\n"," ")
            published = entry.find("{" + ns_a + "}published").text[:10]
            authors   = [a.find("{" + ns_a + "}name").text
                         for a in entry.findall("{" + ns_a + "}author")]
            cats = set()
            pc = entry.find("{" + ns_x + "}primary_category")
            if pc is not None: cats.add(pc.get("term",""))
            for c in entry.findall("{" + ns_x + "}category"): cats.add(c.get("term",""))
            cats.discard("")
            papers.append({
                "id":         pid,
                "title":      title,
                "authors":    authors[:6],
                "abstract":   abstract[:1200],
                "published":  published,
                "categories": list(cats)[:4],
                "citations":  None,
                "url":        "https://arxiv.org/abs/" + pid,
                "pdf_url":    "https://arxiv.org/pdf/" + pid,
                "recent":     datetime.strptime(published, "%Y-%m-%d") >= cutoff,
                "source":     "arXiv",
            })
        except Exception as e:
            print("arXiv parse: " + str(e))
    return papers

# ================================================================
# SOURCE 2 — CrossRef
# ================================================================
def fetch_crossref_papers(query, category_label="", max_results=20,
                          days_back=365, use_title=False):
    subject    = CROSSREF_SUBJECTS.get(category_label, "")
    full_query = (query + " " + subject).strip() if subject else query
    key        = "query.title" if use_title else "query"
    params = {
        key:    full_query,
        "rows": min(max_results * 3, 200),
        "sort": "relevance",
        "select": ("title,author,abstract,published,published-print,"
                   "published-online,issued,created,DOI,"
                   "is-referenced-by-count,link,subject"),
    }
    items = []
    for attempt in range(3):
        try:
            r = requests.get("https://api.crossref.org/works",
                             params=params, headers=cr_headers(), timeout=30)
            if r.status_code == 200:
                items = r.json().get("message",{}).get("items",[]); break
            if r.status_code == 429: time.sleep(2**attempt); continue
            print("CrossRef " + str(r.status_code)); return []
        except Exception as e:
            print("CrossRef attempt " + str(attempt) + ": " + str(e)); time.sleep(1)

    cutoff = datetime.now() - timedelta(days=days_back)
    papers, seen_ids = [], set()
    for item in items:
        if len(papers) >= max_results: break
        title_list = item.get("title", [])
        if not title_list: continue
        title = title_list[0].strip()
        if not title or title.lower().startswith("title pending"): continue
        pub = parse_crossref_date(item)
        if pub == "N/A": continue
        cit     = int(item.get("is-referenced-by-count", 0) or 0)
        authors = [
            (a.get("given","") + " " + a.get("family","")).strip()
            for a in item.get("author",[])[:6]
        ]
        authors = [a for a in authors if a.strip()] or ["Unknown"]
        abstract = re.sub(r"<[^>]+>","",
                          item.get("abstract","No abstract.")).strip()[:1200]
        doi     = item.get("DOI","")
        url     = "https://doi.org/" + doi if doi else "#"
        pid     = doi or re.sub(r"\W","",title)[:40]
        if pid in seen_ids: continue
        seen_ids.add(pid)
        pdf_url = next((l.get("URL","") for l in item.get("link",[])
                        if "pdf" in l.get("content-type","").lower()), "")
        try:    recent = datetime.strptime(pub[:10], "%Y-%m-%d") >= cutoff
        except: recent = False
        papers.append({
            "id":         pid,
            "title":      title,
            "authors":    authors,
            "abstract":   abstract,
            "published":  pub[:10],
            "categories": item.get("subject",[])[:3],
            "citations":  cit,
            "url":        url,
            "pdf_url":    pdf_url,
            "recent":     recent,
            "source":     "CrossRef",
        })
    papers.sort(key=lambda x: x["citations"], reverse=True)
    return papers

# ================================================================
# GLOBAL PAPER SEARCH — relevance sorted
# ================================================================
def global_paper_search(query, source_choice, max_results=10):
    if not query or not query.strip():
        return "Enter a title or keywords."
    q = query.strip(); papers = []
    if source_choice in ("arXiv", "Both"):
        papers += fetch_arxiv_papers(q, "", int(max_results), 3650,
                                     sort_by="relevance")
    if source_choice in ("CrossRef", "Both"):
        papers += fetch_crossref_papers(q, "", int(max_results), 3650,
                                        use_title=True)
    if not papers:
        return "No results for: " + q

    seen, unique = set(), []
    for p in papers:
        key = re.sub(r"\W","",p["title"].lower())[:60]
        if key not in seen: seen.add(key); unique.append(p)
    unique.sort(key=lambda x: x.get("citations") or 0, reverse=True)

    NL = "\n"
    md = "## Search Results: " + q + NL + NL
    md += "**" + str(len(unique)) + " papers found**" + NL + NL + "---" + NL + NL
    for i, p in enumerate(unique, 1):
        cit  = (" | " + cit_badge(p.get("citations"))) if p.get("citations") else ""
        cats = " | ".join(p.get("categories",[])[:2])
        auth = ", ".join(p["authors"][:3])
        abst = p["abstract"][:450]
        link = "[View](" + p["url"] + ")"
        pdf  = ("  [PDF](" + p["pdf_url"] + ")") if p.get("pdf_url") else ""
        src  = p.get("source","")
        md  += ("### " + str(i) + ". " + p["title"] + NL + NL +
                auth + " | " + p["published"] + cit + " | " + src +
                (" | " + cats if cats else "") + NL + NL +
                "> " + abst + "..." + NL + NL +
                link + pdf + NL + NL + "---" + NL + NL)
    return md

# ================================================================
# CITATION ENGINE — 3-layer
# ================================================================
def enrich_citations(papers):
    arxiv_papers = [p for p in papers
                    if p.get("source")=="arXiv" and
                    (p.get("citations") is None or p.get("citations")==0)]
    if not arxiv_papers:
        for p in papers:
            if p.get("citations") is None: p["citations"] = 0
        return papers
    id_map, batch_ids = {}, []
    for p in arxiv_papers:
        clean = re.sub(r"v\d+$","", p["id"].split("/")[-1].strip())
        id_map[clean] = p
        batch_ids.append("arXiv:" + clean)
    for i in range(0, len(batch_ids), 500):
        try:
            r = requests.post(
                "https://api.semanticscholar.org/graph/v1/paper/batch",
                json={"ids": batch_ids[i:i+500]},
                params={"fields":"citationCount,externalIds"},
                headers=s2_headers(), timeout=30)
            if r.status_code == 200:
                for item in r.json():
                    if not item: continue
                    ext   = item.get("externalIds") or {}
                    clean = re.sub(r"v\d+$","",
                                   ext.get("ArXiv","").split("/")[-1].strip())
                    if clean and clean in id_map:
                        c = item.get("citationCount")
                        if c is not None: id_map[clean]["citations"] = int(c)
            elif r.status_code == 429: time.sleep(4)
        except Exception as e: print("S2 batch: " + str(e))
    for p in [x for x in arxiv_papers if (x.get("citations") or 0)==0][:15]:
        clean = re.sub(r"v\d+$","", p["id"].split("/")[-1].strip())
        for attempt in range(2):
            try:
                r = requests.get(
                    "https://api.semanticscholar.org/graph/v1/paper/arXiv:" + clean,
                    params={"fields":"citationCount"},
                    headers=s2_headers(), timeout=10)
                if r.status_code == 200:
                    c = r.json().get("citationCount")
                    p["citations"] = int(c) if c else 0; break
                if r.status_code == 429: time.sleep(2**attempt); continue
                p["citations"] = 0; break
            except: p["citations"] = 0; break
        time.sleep(0.12)
    for p in [x for x in arxiv_papers if (x.get("citations") or 0)==0]:
        try:
            r = requests.get("https://api.crossref.org/works",
                params={"query.title": p["title"], "rows": 1,
                        "select": "is-referenced-by-count,title"},
                headers=cr_headers(), timeout=8)
            if r.status_code == 200:
                items = r.json().get("message",{}).get("items",[])
                if items:
                    found = (items[0].get("title") or [""])[0].lower()
                    qw    = set(p["title"].lower().split()[:5])
                    fw    = set(found.split()[:10])
                    p["citations"] = (
                        int(items[0].get("is-referenced-by-count",0) or 0)
                        if len(qw & fw) >= 2 else 0)
                else: p["citations"] = 0
            else: p["citations"] = 0
            time.sleep(0.12)
        except: p["citations"] = 0
    for p in papers:
        if p.get("citations") is None: p["citations"] = 0
    return papers

# ================================================================
# FAISS
# ================================================================
def build_papers_index(papers):
    global FAISS_INDEX, PAPERS
    PAPERS = papers
    if not papers: FAISS_INDEX = None; return
    texts = [p["title"] + " " + p["abstract"] for p in papers]
    embs  = embedder.encode(texts, convert_to_numpy=True,
                            normalize_embeddings=True).astype("float32")
    idx   = faiss.IndexFlatIP(embs.shape[1])
    idx.add(embs)
    FAISS_INDEX = idx

def search_papers(query, top_k=5):
    if FAISS_INDEX is None or not PAPERS: return []
    qe = embedder.encode([query], convert_to_numpy=True,
                         normalize_embeddings=True).astype("float32")
    scores, ids = FAISS_INDEX.search(qe, min(top_k, len(PAPERS)))
    return [{"paper": PAPERS[i], "score": float(s)}
            for s, i in zip(scores[0], ids[0]) if i >= 0 and float(s) > 0.1]

# ================================================================
# AUTO-FETCH
# ================================================================
def auto_fetch_worker(query, category, interval):
    global AUTO_RUNNING
    while AUTO_RUNNING:
        time.sleep(interval)
        if not AUTO_RUNNING: break
        papers = fetch_arxiv_papers(query, category, 30, 1)
        seen   = load_seen_ids()
        new_ps = [p for p in papers if p["id"] not in seen]
        if new_ps:
            save_seen_ids(seen | {p["id"] for p in papers})
            AUTO_LOG.append(
                "[" + datetime.now().strftime("%H:%M") + "] NEW " +
                str(len(new_ps)) + " — " + query)
            if len(AUTO_LOG) > 20: AUTO_LOG.pop(0)

def start_auto_fetch(query, cat_label, interval_min):
    global AUTO_RUNNING
    if AUTO_RUNNING: return "Already running."
    AUTO_RUNNING = True
    threading.Thread(
        target=auto_fetch_worker,
        args=(query, CATEGORIES.get(cat_label,""), int(interval_min)*60),
        daemon=True).start()
    return "Auto-fetch started every " + str(interval_min) + " min for: " + query

def stop_auto_fetch():
    global AUTO_RUNNING; AUTO_RUNNING = False; return "Stopped."

def get_auto_log():
    return "\n\n".join(reversed(AUTO_LOG[-10:])) if AUTO_LOG else "No log."

# ================================================================
# TRENDS
# ================================================================
def analyze_trends(papers):
    if not papers: return None, "No papers."
    date_counts = Counter(p["published"][:7] for p in papers if p["published"]!="N/A")
    stopwords   = {"the","a","an","of","in","for","on","with","and","or","to","using",
                   "based","via","from","by","is","are","our","we","this","that","which",
                   "towards","approach","method","new","into","over","learning","deep",
                   "model","models","data","neural","large","language","paper","study",
                   "analysis","results","show","also","can","used","two","its","their"}
    all_words   = [w.lower() for p in papers
                   for w in re.findall(r"[a-zA-Z]{4,}", p["title"])
                   if w.lower() not in stopwords]
    top_words   = Counter(all_words).most_common(15)
    sources     = Counter(p.get("source","arXiv") for p in papers)
    cit_papers  = [p for p in papers if (p.get("citations") or 0)>0]
    top_cited   = sorted(cit_papers, key=lambda x:x["citations"], reverse=True)[:10]
    all_auth    = [a for p in papers for a in p["authors"][:3]]
    top_authors = Counter(all_auth).most_common(10)
    cvals       = [p["citations"] for p in cit_papers]
    buckets     = [0,1,5,10,50,100,500,10000]
    blabels     = ["0","1-4","5-9","10-49","50-99","100-499","500+"]
    bcounts     = ([sum(1 for c in cvals if buckets[i]<=c<buckets[i+1])
                    for i in range(len(buckets)-1)] if cvals else [0]*7)
    avg_cit     = round(sum(cvals)/max(len(cvals),1),1) if cvals else 0
    total_cit   = sum(p.get("citations") or 0 for p in papers)
    C  = ["#3b82f6","#8b5cf6","#10b981","#f59e0b","#ef4444","#06b6d4",
          "#ec4899","#14b8a6","#f97316","#a855f7","#22d3ee","#84cc16",
          "#fbbf24","#34d399","#f87171"]
    BG,PNL,BR,W = "#0f172a","#1e293b","#334155","white"
    fig, axes   = plt.subplots(2, 3, figsize=(20,12))
    fig.patch.set_facecolor(BG)
    fig.suptitle("Research Trends", color=W, fontsize=16, fontweight="bold", y=1.01)
    def style(ax):
        ax.set_facecolor(PNL)
        for sp in ax.spines.values(): sp.set_edgecolor(BR)
        ax.tick_params(colors=W, labelsize=8)
    ax = axes[0,0]; style(ax)
    if date_counts:
        ms,cs = zip(*sorted(date_counts.items()))
        ms,cs = list(ms), list(cs)
        bars  = ax.bar(ms, cs, color=C[0], edgecolor="#60a5fa", lw=0.8)
        for b,c in zip(bars,cs):
            ax.text(b.get_x()+b.get_width()/2, b.get_height()+.05, str(c),
                    ha="center", va="bottom", color=W, fontsize=8)
        if len(cs) > 2:
            z = np.polyfit(range(len(cs)), cs, 1)
            ax.plot(ms, np.poly1d(z)(range(len(cs))), "--",
                    color="#f59e0b", lw=1.5, alpha=.8, label="Trend")
            ax.legend(fontsize=8, facecolor=PNL, labelcolor=W)
    ax.set_title("Papers per Month",  color=W, fontsize=12, fontweight="bold", pad=10)
    ax.set_ylabel("Count",            color=W, fontsize=9)
    ax.tick_params(rotation=45)
    ax = axes[0,1]; style(ax)
    if top_words:
        wds,wcts = zip(*top_words)
        ax.barh(list(wds), list(wcts), color=C[:len(wds)], edgecolor="#475569", lw=.6)
        for b,c in zip(ax.patches, wcts):
            ax.text(b.get_width()+.1, b.get_y()+b.get_height()/2, str(c),
                    va="center", color=W, fontsize=8)
    ax.set_title("Top Keywords", color=W, fontsize=12, fontweight="bold", pad=10)
    ax.set_xlabel("Frequency",   color=W, fontsize=9)
    ax = axes[0,2]; ax.set_facecolor(PNL)
    if sources:
        sl,sv = zip(*sources.items())
        _,txts,ats = ax.pie(sv, labels=sl, autopct="%1.0f%%",
                            colors=C[:len(sl)], startangle=90,
                            textprops={"color":W,"fontsize":10},
                            wedgeprops={"edgecolor":BR,"linewidth":1.5})
        for at in ats: at.set_color(W); at.set_fontsize(9)
    ax.set_title("Source Distribution", color=W, fontsize=12, fontweight="bold", pad=10)
    ax = axes[1,0]; style(ax)
    if top_cited:
        lbls = [(p["title"][:35]+"..." if len(p["title"])>35 else p["title"])
                for p in top_cited]
        cv   = [p["citations"] for p in top_cited]
        ax.barh(lbls[::-1], cv[::-1], color=C[1], edgecolor="#475569", lw=.6)
        mx = max(cv) if cv else 1
        for b,c in zip(ax.patches, cv[::-1]):
            ax.text(b.get_width()+mx*.01, b.get_y()+b.get_height()/2,
                    "{:,}".format(c), va="center", color=W, fontsize=8)
        ax.set_xlabel("Citations", color=W, fontsize=9)
    else:
        ax.text(.5,.5,"No citation data", ha="center", va="center",
                color="#94a3b8", fontsize=11, transform=ax.transAxes)
    ax.set_title("Top 10 Cited", color=W, fontsize=12, fontweight="bold", pad=10)
    ax = axes[1,1]; style(ax)
    if any(bcounts):
        ax.bar(blabels, bcounts, color=C[2], edgecolor="#475569", lw=.8)
        for b,c in zip(ax.patches, bcounts):
            if c > 0:
                ax.text(b.get_x()+b.get_width()/2, b.get_height()+.1, str(c),
                        ha="center", va="bottom", color=W, fontsize=9)
        ax.set_xlabel("Citation Range", color=W, fontsize=9)
        ax.set_ylabel("Papers",         color=W, fontsize=9)
        ax.annotate("Avg " + str(avg_cit) + " | Total " + "{:,}".format(total_cit),
                    xy=(.98,.96), xycoords="axes fraction",
                    ha="right", va="top", color="#94a3b8", fontsize=8)
    else:
        ax.text(.5,.5,"No citation data", ha="center", va="center",
                color="#94a3b8", fontsize=11, transform=ax.transAxes)
    ax.set_title("Citation Distribution", color=W, fontsize=12, fontweight="bold", pad=10)
    ax = axes[1,2]; style(ax)
    if top_authors:
        an,ac = zip(*top_authors)
        ax.barh(list(an)[::-1], list(ac)[::-1], color=C[3], edgecolor="#475569", lw=.6)
        for b,c in zip(ax.patches, list(ac)[::-1]):
            ax.text(b.get_width()+.05, b.get_y()+b.get_height()/2, str(c),
                    va="center", color=W, fontsize=8)
        ax.set_xlabel("Papers", color=W, fontsize=9)
    ax.set_title("Top Authors", color=W, fontsize=12, fontweight="bold", pad=10)
    plt.tight_layout(pad=3)
    path = PERSIST_DIR + "/trends.png"
    plt.savefig(path, bbox_inches="tight", dpi=150, facecolor=BG)
    plt.close()
    top5  = sorted(cit_papers, key=lambda x:x["citations"], reverse=True)[:5]
    stats = ("### Stats\n\n| Metric | Value |\n|---|---|\n" +
             "| Total | **" + str(len(papers)) + "** |\n" +
             "| New | **" + str(sum(1 for p in papers if p.get("recent"))) + "** |\n" +
             "| Citations | **" + "{:,}".format(total_cit) + "** |\n" +
             "| Average | **" + str(avg_cit) + "** |\n\n")
    if top5:
        stats += "### Top Cited\n\n"
        for i,p in enumerate(top5,1):
            stats += (str(i) + ". [" + p["title"] + "](" + p["url"] + ")" +
                      " — **" + "{:,}".format(p["citations"]) + "**\n\n")
    return path, stats

# ================================================================
# LLM
# ================================================================
def _llm(messages, max_tokens=1200):
    try:
        r = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages, temperature=0.3, max_tokens=max_tokens)
        return r.choices[0].message.content.strip()
    except Exception as e: return "LLM Error: " + str(e)

def explain_paper(paper, lang="ar"):
    cit = paper.get("citations","N/A")
    if lang == "ar":
        return fix_ar_format(_llm([
            {"role":"system","content": "أنت خبير أكاديمي يشرح الأبحاث بالعربية الفصحى.\n" + AR_RULES},
            {"role":"user","content":
             "اشرح الورقة:\nالعنوان: " + paper["title"] + "\n" +
             "المؤلفون: " + ", ".join(paper["authors"][:3]) + "\n" +
             "التاريخ: " + paper["published"] + " | الاقتباسات: " + str(cit) + "\n" +
             "الملخص: " + paper["abstract"] + "\n\n" +
             "## موضوع الورقة\n\n## المشكلة\n\n## المنهجية\n\n" +
             "## النتائج\n\n## الأهمية\n\n## التطبيقات"}]))
    return _llm([{"role":"user","content":
        "Explain:\nTitle: " + paper["title"] + "\nAuthors: " +
        ", ".join(paper["authors"][:3]) + "\nDate: " + paper["published"] +
        " | Citations: " + str(cit) + "\nAbstract: " + paper["abstract"] + "\n\n" +
        "## Topic\n## Problem\n## Methodology\n## Findings\n## Contribution\n## Applications"}])

def compare_papers(pa, pb, lang="ar"):
    body = ("Paper A: " + pa["title"] + " | Citations: " + str(pa.get("citations","N/A")) +
            "\n" + pa["abstract"][:500] + "\n\nPaper B: " +
            pb["title"] + " | Citations: " + str(pb.get("citations","N/A")) +
            "\n" + pb["abstract"][:500])
    if lang == "ar":
        return fix_ar_format(_llm([{"role":"user","content":
            "قارن بين الورقتين.\n" + AR_RULES + "\n\n" + body + "\n\n" +
            "## الهدف\n\n## المنهجية\n\n## النتائج\n\n" +
            "## القوة\n\n## القيود\n\n## الخلاصة"}], 1400))
    return _llm([{"role":"user","content":
        "Compare:\n" + body + "\n\n" +
        "## Topic\n## Methodology\n## Results\n## Strengths\n## Limits\n## Verdict"}], 1400)

def summarize_papers(papers, topic, lang="ar"):
    text = "".join(
        str(i) + ". " + p["title"] + " (" + p["published"] + "): " +
        p["abstract"][:300] + "...\n\n"
        for i,p in enumerate(papers[:8],1))
    if lang == "ar":
        return fix_ar_format(_llm([{"role":"user","content":
            "نظرة عامة أكاديمية حول \"" + topic + "\".\n" + AR_RULES +
            "\n\n" + text + "\n\n" +
            "## الاتجاهات\n\n## أبرز الأوراق\n\n" +
            "## المواضيع المشتركة\n\n## الفجوات"}], 900))
    return _llm([{"role":"user","content":
        "Academic overview of \"" + topic + "\":\n" + text + "\n\n" +
        "## Trends\n## Key Papers\n## Themes\n## Gaps"}], 900)

def generate_bibliography(papers, style="APA"):
    entries = []
    for i,p in enumerate(papers,1):
        auth = ", ".join(p["authors"][:6]) + (" et al." if len(p["authors"])>6 else "")
        year = p["published"][:4] if p["published"] not in ("N/A","") else "n.d."
        t,u  = p["title"], p["url"]
        if style == "APA":
            entries.append(str(i) + ". " + auth + " (" + year + "). *" + t + "*. " + u)
        elif style == "IEEE":
            ae = " and ".join(p["authors"][:3]) + (" et al." if len(p["authors"])>3 else "")
            entries.append("[" + str(i) + "] " + ae + ', "' + t + '," ' + year + ". [Online]: " + u)
        elif style == "Chicago":
            entries.append(str(i) + ". " + auth + '. "' + t + '." (' + year + "). " + u)
        else:
            key = re.sub(r"\W","", (p["authors"][0].split()[-1]
                                     if p["authors"] else "Auth")) + year
            entries.append("@article{" + key + str(i) + ",\n  title={" + t +
                           "},\n  author={" + auth + "},\n  year={" + year +
                           "},\n  url={" + u + "}\n}")
    bib  = "\n\n".join(entries)
    path = PERSIST_DIR + "/bibliography_" + style + ".txt"
    with open(path, "w", encoding="utf-8") as f: f.write(bib)
    return bib, path

def chat_about_papers(question, history):
    if not PAPERS:
        return ("يرجى جلب الأوراق أولاً." if detect_lang(question)=="ar"
                else "Fetch papers first.")
    lang     = detect_lang(question)
    relevant = search_papers(question, top_k=4)
    context  = ""
    if relevant:
        context = ("الأوراق ذات الصلة:\n\n" if lang=="ar" else "Relevant papers:\n\n")
        for r in relevant:
            p   = r["paper"]
            cit = (" | " + str(p["citations"]) + " citations") if p.get("citations") else ""
            context += ("**" + p["title"] + "** (" + p["published"] + ")" + cit +
                        "\n" + p["abstract"][:400] + "\n🔗 " + p["url"] + "\n\n")
    sys_msg = (("أنت مساعد بحثي. أجب بالعربية الفصحى.\n" + AR_RULES) if lang=="ar"
               else "You are an academic assistant. Answer in English.")
    msgs = [{"role":"system","content":sys_msg}]
    for t in history[-4:]: msgs.append({"role":t["role"],"content":t["content"]})
    msgs.append({"role":"user","content":
                 (context + "\nسؤال: " + question) if context else question})
    out = _llm(msgs, 800)
    return fix_ar_format(out) if lang=="ar" else out

def text_to_audio(text, lang="ar"):
    clean = clean_md(text)
    if not clean: return None
    try:
        tts  = gTTS(text=clean, lang=lang, slow=False)
        path = PERSIST_DIR + "/audio_" + lang + ".mp3"
        tts.save(path); return path
    except Exception as e: print("TTS: " + str(e)); return None

# ================================================================
# GRADIO HANDLERS
# ================================================================
def gr_fetch(query, category_label, max_results, days_back, source_choice,
             progress=gr.Progress()):
    global ACTIVE_PAPERS
    progress(0.05, desc="Connecting...")
    papers, warn = [], ""
    if source_choice in ("arXiv", "Both"):
        progress(0.15, desc="Fetching arXiv...")
        papers += fetch_arxiv_papers(query, CATEGORIES.get(category_label,""),
                                     int(max_results), int(days_back),
                                     sort_by="submittedDate")
    if source_choice in ("CrossRef", "Both"):
        progress(0.35, desc="Fetching CrossRef...")
        cr = fetch_crossref_papers(query, category_label, int(max_results), int(days_back))
        if not cr: warn = "\n\n> CrossRef: no results."
        papers += cr
    seen, unique = set(), []
    for p in papers:
        key = re.sub(r"\W","",p["title"].lower())[:60]
        if key not in seen: seen.add(key); unique.append(p)
    papers = unique
    if not papers:
        return ("No results." + warn,
                gr.update(choices=[], value=None), gr.update(choices=[], value=None),
                gr.update(choices=[], value=None), gr.update(choices=[], value=None),
                "0 papers")
    progress(0.60, desc="Fetching citations...")
    papers = enrich_citations(papers)
    progress(0.85, desc="FAISS indexing...")
    build_papers_index(papers)
    ACTIVE_PAPERS = list(papers)
    tbl, choices = build_table(papers)
    recent   = sum(1 for p in papers if p.get("recent"))
    tot_cit  = sum(p.get("citations") or 0 for p in papers)
    zero_cit = sum(1 for p in papers if (p.get("citations") or 0)==0)
    note     = ("\n\n> " + str(zero_cit) + " papers with 0 citations (new/unindexed)."
                if zero_cit else "")
    md = ("## Fetched **" + str(len(papers)) + "** papers\n\n" +
          "New: **" + str(recent) + "** | Citations: **" +
          "{:,}".format(tot_cit) + "**" + warn + note +
          "\n\n---\n\n" + tbl)
    upd = gr.update(choices=choices, value=choices[0] if choices else None)
    progress(1.0)
    return md, upd, upd, upd, upd, str(len(papers)) + " papers | " + "{:,}".format(tot_cit) + " cit."

def gr_filter_papers(year_from, year_to, cit_min, cit_max, sort_by):
    global ACTIVE_PAPERS
    if not PAPERS: return "Fetch papers first.", gr.update(), "0"
    filtered = []
    for p in PAPERS:
        try:
            y = int(p["published"][:4])
            if y < int(year_from) or y > int(year_to): continue
        except: pass
        cit = int(p.get("citations") or 0)
        if cit < int(cit_min) or cit > int(cit_max): continue
        filtered.append(p)
    if   sort_by == "Newest":     filtered.sort(key=lambda x: x["published"], reverse=True)
    elif sort_by == "Oldest":     filtered.sort(key=lambda x: x["published"])
    elif sort_by == "Most Cited": filtered.sort(key=lambda x: x.get("citations") or 0, reverse=True)
    elif sort_by == "Least Cited":filtered.sort(key=lambda x: x.get("citations") or 0)
    if not filtered:
        ACTIVE_PAPERS = []
        return "No matching papers.", gr.update(choices=[], value=None), "0"
    ACTIVE_PAPERS = list(filtered)
    tbl, choices = build_table(filtered)
    tot = sum(p.get("citations") or 0 for p in filtered)
    md  = ("## " + str(len(filtered)) + "/" + str(len(PAPERS)) + " papers" +
           " | " + str(year_from) + "-" + str(year_to) +
           " | cit " + str(cit_min) + "-" + str(cit_max) +
           " | total " + "{:,}".format(tot) + "\n\n---\n\n" + tbl)
    return md, gr.update(choices=choices, value=choices[0] if choices else None),            str(len(filtered)) + "/" + str(len(PAPERS))

def gr_search_fetched(query):
    if not query or not query.strip(): return "Enter a query."
    if not PAPERS: return "Fetch papers first."
    results = search_papers(query.strip(), top_k=8)
    if not results: return "No results for: " + query
    NL = "\n"
    md = "## Search: " + query + " — " + str(len(results)) + " results" + NL + NL
    for r in results:
        p,s  = r["paper"], r["score"]
        bar  = "green " * round(s*10)
        cit  = (" | " + cit_badge(p.get("citations"))) if p.get("citations") else ""
        link = "[View](" + p["url"] + ")"
        pdf  = ("  [PDF](" + p["pdf_url"] + ")") if p.get("pdf_url") else ""
        md  += ("### " + "{:.0f}".format(s*100) + "% — " + p["title"] + NL + NL +
                ", ".join(p["authors"][:2]) + " | " + p["published"] + cit +
                " | " + p.get("source","") + NL + NL +
                "> " + p["abstract"][:350] + "..." + NL + NL +
                link + pdf + NL + NL + "---" + NL + NL)
    return md

def _get_paper(choice):
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    try: return pool[int(choice.split(".")[0]) - 1]
    except: return None

def gr_explain(choice, lang_choice):
    if not choice: return "Fetch papers and select one."
    paper = _get_paper(choice)
    if not paper: return "Selection error."
    lang = "ar" if "Arabic" in lang_choice else "en"
    NL   = "\n"
    # ✅ FIX: No backslash inside f-string — use concatenation
    pdf_link = ("  [PDF](" + paper["pdf_url"] + ")") if paper.get("pdf_url") else ""
    header = ("# " + paper["title"] + NL + NL +
              "**Authors:** " + ", ".join(paper["authors"]) + NL + NL +
              "**Date:** " + paper["published"] +
              "  |  **Citations:** " + cit_badge(paper.get("citations")) +
              "  |  **Source:** " + paper.get("source","arXiv") + NL + NL +
              "[View Paper](" + paper["url"] + ")" + pdf_link + NL + NL +
              "---" + NL + NL +
              "> " + paper["abstract"] + NL + NL +
              "---" + NL + NL +
              "## Explanation (Llama 3.3 70B)" + NL + NL)
    return header + explain_paper(paper, lang)

def gr_audio(txt, lang_choice):
    if not txt or len(txt) < 50: return None
    return text_to_audio(txt, "ar" if "Arabic" in lang_choice else "en")

def gr_save_fav(choice):
    if not choice: return "Select a paper first."
    paper = _get_paper(choice)
    return save_favorite(paper) if paper else "Error."

def gr_show_favs():
    favs = load_favorites()
    if not favs: return "No saved papers."
    NL    = "\n"
    lines = [("**" + p["title"] + "**" + NL +
              (p["authors"][0] if p["authors"] else "N/A") +
              " | " + p["published"] + " | " + p.get("source","") +
              " | " + cit_badge(p.get("citations")) +
              " | [Link](" + p["url"] + ")")
             for p in favs]
    return ("### Favorites — " + str(len(favs)) + " papers" + NL + NL +
            (NL + NL + "---" + NL + NL).join(lines))

def gr_compare(ca, cb, lc):
    if not ca or not cb: return "Select two papers."
    pa = _get_paper(ca); pb = _get_paper(cb)
    if not pa or not pb: return "Selection error."
    if pa["id"] == pb["id"]: return "Select two different papers."
    return compare_papers(pa, pb, "ar" if "Arabic" in lc else "en")

def gr_overview(query, lc):
    if not PAPERS: return "Fetch papers first."
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    return ("## Overview\n\n" +
            summarize_papers(pool, query or "research",
                             "ar" if "Arabic" in lc else "en"))

def gr_trends():
    if not PAPERS: return None, "Fetch papers first."
    return analyze_trends(ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS)

def gr_bib(style, progress=gr.Progress()):
    if not PAPERS: return "Fetch papers first.", None
    progress(0.5, desc="Generating...")
    pool = ACTIVE_PAPERS if ACTIVE_PAPERS else PAPERS
    text, path = generate_bibliography(pool, style)
    progress(1.0)
    short = text[:3000] + ("..." if len(text)>3000 else "")
    return "```\n" + short + "\n```", path

def gr_chat_fn(message, history):
    if not message.strip(): return history, ""
    hd = []
    for pair in history:
        if pair[0]: hd.append({"role":"user",     "content":pair[0]})
        if pair[1]: hd.append({"role":"assistant","content":pair[1]})
    history.append((message, chat_about_papers(message, hd)))
    return history, ""

# ================================================================
# UI
# ================================================================
CSS = """
footer{display:none!important}
h1{text-align:center}
.status-bar{font-size:.85rem;color:#94a3b8;padding:2px 0}
.legend{font-size:.8rem;color:#cbd5e1;background:#1e293b;
        border-radius:8px;padding:6px 14px;margin-bottom:6px}
.filter-box{background:#1e293b;border-radius:10px;
            padding:12px 16px;margin-top:8px}
.gs-box{background:#1e293b;border-radius:10px;padding:14px 18px;
        margin-bottom:10px;border:1px solid #334155}
"""

with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="purple"),
    title="Scientific Paper Discovery v7.4", css=CSS
) as demo:

    gr.Markdown("# Scientific Paper Discovery v7.4\narXiv · CrossRef · Llama-3.3-70B · FAISS")
    gr.Markdown("Citations: 🥇 >=1000 | 🏆 >=100 | ⭐ >=10 | 📄 <10 | · = 0",
                elem_classes="legend")
    status_bar = gr.Markdown("No papers loaded yet.", elem_classes="status-bar")

    with gr.Tabs():

        # ── TAB 1: BROWSE ──────────────────────────────────
        with gr.Tab("Browse / Search"):
            with gr.Row():
                with gr.Column(scale=3):
                    t_query    = gr.Textbox(label="Topic",
                                           placeholder="ARIMA, inflation, LLM...",
                                           value="economic forecasting")
                    t_category = gr.Dropdown(label="Category",
                                             choices=list(CATEGORIES.keys()),
                                             value="📊 Economics")
                    t_source   = gr.Radio(label="Source",
                                         choices=["arXiv","CrossRef","Both"],
                                         value="arXiv")
                with gr.Column(scale=1):
                    t_max  = gr.Slider(5,  50,   value=15, step=5,  label="Max papers")
                    t_days = gr.Slider(1, 1500, value=365, step=30, label="Last N days")
                    btn_fetch = gr.Button("Fetch Papers", variant="primary", size="lg")
            papers_table_md = gr.Markdown("Results appear here.")
            paper_selector  = gr.Dropdown(label="Select paper", choices=[], interactive=True)
            with gr.Group(elem_classes="filter-box"):
                gr.Markdown("### Filter & Sort")
                with gr.Row():
                    f_year_from = gr.Slider(2000,2026,value=2020,step=1,label="Year from")
                    f_year_to   = gr.Slider(2000,2026,value=2026,step=1,label="Year to")
                with gr.Row():
                    f_cit_min = gr.Slider(0,5000,value=0,   step=5,label="Citations min")
                    f_cit_max = gr.Slider(0,5000,value=5000,step=5,label="Citations max")
                with gr.Row():
                    f_sort     = gr.Dropdown(choices=SORT_CHOICES,
                                            value="Most Cited",label="Sort",scale=3)
                    btn_filter = gr.Button("Apply",variant="primary",scale=1)
            gr.Markdown("---\n### Semantic Search (FAISS — in loaded papers)")
            with gr.Row():
                search_in_box = gr.Textbox(label="Search in loaded papers",
                                           placeholder="ARIMA, transformer...",scale=5)
                btn_search_in = gr.Button("Search",scale=1)
            search_in_out = gr.Markdown()

        # ── TAB 2: GLOBAL SEARCH ───────────────────────────
        with gr.Tab("Global Search"):
            gr.Markdown(
                "### Search any paper by title or keywords\n\n"
                "> Uses arXiv **relevance** sort + CrossRef **title** search.\n"
                "> Example: `Attention is All You Need`"
            )
            with gr.Group(elem_classes="gs-box"):
                with gr.Row():
                    gs_query  = gr.Textbox(
                        label="Title or keywords",
                        placeholder="Attention is All You Need | ARIMA forecasting ...",
                        scale=4)
                    gs_source = gr.Radio(label="Source",
                                        choices=["arXiv","CrossRef","Both"],
                                        value="Both", scale=2)
                    gs_max    = gr.Slider(5,30,value=10,step=5,label="Max results",scale=1)
                btn_gs = gr.Button("Search Now", variant="primary", size="lg")
            gs_out = gr.Markdown("Enter a title or keywords...")

        # ── TAB 3: EXPLAIN ─────────────────────────────────
        with gr.Tab("Explain"):
            with gr.Row():
                paper_sel2 = gr.Dropdown(label="Select paper",
                                         choices=[], interactive=True, scale=4)
                lang_exp   = gr.Radio(LANG_CHOICES, value="Arabic",
                                      label="Language", scale=1)
            with gr.Row():
                btn_explain    = gr.Button("Explain",     variant="primary")
                btn_fav        = gr.Button("Save Fav")
                btn_audio      = gr.Button("Listen")
                btn_export_pdf = gr.Button("Export PDF",  variant="secondary")
            with gr.Row():
                fav_status = gr.Markdown()
                pdf_status = gr.Markdown()
            explanation_out = gr.Markdown("Fetch papers and select one.")
            audio_out       = gr.Audio(label="Audio", type="filepath")
            pdf_out         = gr.File(label="Download PDF")

        # ── TAB 4: COMPARE ─────────────────────────────────
        with gr.Tab("Compare"):
            with gr.Row():
                cmp_a    = gr.Dropdown(label="Paper A", choices=[], interactive=True)
                cmp_b    = gr.Dropdown(label="Paper B", choices=[], interactive=True)
                lang_cmp = gr.Radio(LANG_CHOICES, value="Arabic",
                                    label="Language", scale=1)
            btn_compare = gr.Button("Compare", variant="primary")
            compare_out = gr.Markdown("Select two papers.")

        # ── TAB 5: CHAT ────────────────────────────────────
        with gr.Tab("Chat"):
            chatbot_ui = gr.Chatbot(label="Research Assistant",
                                    height=480, bubble_full_width=False)
            with gr.Row():
                chat_in  = gr.Textbox(label="Question", scale=5,
                                      placeholder="Key findings? | ما أبرز النتائج؟")
                btn_send = gr.Button("Send", variant="primary", scale=1)
            btn_clear = gr.Button("Clear", size="sm")

        # ── TAB 6: OVERVIEW ────────────────────────────────
        with gr.Tab("Overview"):
            with gr.Row():
                lang_ov      = gr.Radio(LANG_CHOICES, value="Arabic",
                                        label="Language", scale=1)
                btn_overview = gr.Button("Generate Report", variant="primary", scale=3)
            overview_out = gr.Markdown("Fetch papers first.")

        # ── TAB 7: TRENDS ──────────────────────────────────
        with gr.Tab("Trends"):
            btn_trends  = gr.Button("Analyze Trends", variant="primary", size="lg")
            trend_chart = gr.Image(label="Trends Dashboard", type="filepath")
            trend_stats = gr.Markdown("Fetch papers first.")

        # ── TAB 8: BIBLIOGRAPHY ────────────────────────────
        with gr.Tab("Bibliography"):
            bib_style = gr.Radio(["APA","IEEE","Chicago","BibTeX"],
                                 value="APA", label="Style")
            btn_bib   = gr.Button("Generate Bibliography", variant="primary")
            bib_out   = gr.Markdown()
            bib_file  = gr.File(label="Download")

        # ── TAB 9: FAVORITES ───────────────────────────────
        with gr.Tab("Favorites"):
            btn_show_fav   = gr.Button("Show Favorites")
            favs_md        = gr.Markdown("Press to show.")
            btn_export_fav = gr.Button("Export CSV", variant="secondary")
            fav_csv_file   = gr.File(label="CSV File")

        # ── TAB 10: AUTO-FETCH ─────────────────────────────
        with gr.Tab("Auto-Fetch"):
            with gr.Row():
                auto_q        = gr.Textbox(label="Topic",
                                           value="economic forecasting", scale=3)
                auto_cat      = gr.Dropdown(label="Category",
                                            choices=list(CATEGORIES.keys()),
                                            value="📊 Economics", scale=2)
                auto_interval = gr.Slider(5,120,value=60,step=5,
                                         label="Every (min)",scale=1)
            with gr.Row():
                btn_start_auto  = gr.Button("Start", variant="primary")
                btn_stop_auto   = gr.Button("Stop",  variant="stop")
                btn_refresh_log = gr.Button("Refresh Log")
            auto_status = gr.Markdown()
            auto_log_md = gr.Markdown("No log.")

        # ── TAB 11: ABOUT ──────────────────────────────────
        with gr.Tab("About"):
            gr.Markdown("""
# 🔬 Scientific Paper Discovery
### Version 7.4 — Intelligent Research Assistant

---

## 🧠 About This Tool

**Scientific Paper Discovery** is an AI-powered academic research assistant that enables researchers, students, and scientists to **discover, understand, and organize** scientific literature with unprecedented ease. It combines state-of-the-art language models with multi-source academic APIs to deliver a seamless research experience.

---

## ⚙️ Core Technologies

| Component | Technology | Role |
|---|---|---|
| 🤖 Language Model | **Llama 3.3 70B** via Groq API | Paper explanation, comparison & chat |
| 🔍 Semantic Search | **FAISS** + MiniLM-L12-v2 | Vector similarity search |
| 📡 Source 1 | **arXiv API** | Preprints across all sciences |
| 📚 Source 2 | **CrossRef API** | Peer-reviewed journal articles |
| 📊 Citations | **Semantic Scholar** (3-layer) | Real citation counts |
| 🎙️ Text-to-Speech | **gTTS** | Audio playback of explanations |
| 📄 PDF Export | **ReportLab** | Professional PDF generation |

---

## 🗂️ Feature Overview

| Tab | Feature | Description |
|---|---|---|
| 🔍 Browse | Paper Fetching | Fetch latest papers by topic & category |
| 🌐 Global Search | Title Search | Find any paper by exact title (relevance-sorted) |
| 📖 Explain | AI Explanation | Full structured explanation in Arabic or English |
| ⚖️ Compare | Paper Comparison | Side-by-side AI comparison of two papers |
| 💬 Chat | Research Chat | Ask questions about loaded papers |
| 🌐 Overview | Batch Summary | Academic overview of all loaded papers |
| 📊 Trends | Analytics | Citation, keyword & author trend charts |
| 📚 Bibliography | Citation Export | APA, IEEE, Chicago, BibTeX formats |
| ⭐ Favorites | Saved Papers | Bookmark & export favorite papers |
| 🔔 Auto-Fetch | Monitoring | Automatic periodic paper discovery |

---

## 🔎 Search Mode Guide

| Mode | Algorithm | Best For |
|---|---|---|
| Browse | `sortBy=submittedDate` | Discovering latest papers on a topic |
| 🌐 Global Search | `sortBy=relevance` + `ti:"..."` | Finding a specific paper by title |
| FAISS (internal) | Cosine similarity | Semantic search within loaded papers |

---

## 📌 Citation Badges

| Badge | Meaning |
|---|---|
| 🥇 | ≥ 1,000 citations — Highly influential |
| 🏆 | ≥ 100 citations — Well-cited |
| ⭐ | ≥ 10 citations — Notable |
| 📄 | < 10 citations — Recent or niche |
| · | 0 citations — New or unindexed |

---

*Built with ❤️ for the research community — v7.4*
""")

    # ── WIRING ──────────────────────────────────────────────
    FETCH_OUT = [papers_table_md, paper_selector, paper_sel2, cmp_a, cmp_b, status_bar]

    btn_fetch.click(gr_fetch,
                    inputs=[t_query, t_category, t_max, t_days, t_source],
                    outputs=FETCH_OUT)
    btn_filter.click(gr_filter_papers,
                     inputs=[f_year_from, f_year_to, f_cit_min, f_cit_max, f_sort],
                     outputs=[papers_table_md, paper_selector, status_bar])
    paper_selector.change(lambda x: [gr.update(value=x)]*3,
                          inputs=[paper_selector],
                          outputs=[paper_sel2, cmp_a, cmp_b])

    btn_search_in.click(gr_search_fetched, inputs=[search_in_box], outputs=[search_in_out])
    search_in_box.submit(gr_search_fetched, inputs=[search_in_box], outputs=[search_in_out])

    btn_gs.click(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])
    gs_query.submit(global_paper_search, inputs=[gs_query, gs_source, gs_max], outputs=[gs_out])

    btn_explain.click(gr_explain,   inputs=[paper_sel2, lang_exp],      outputs=[explanation_out])
    btn_fav.click(gr_save_fav,      inputs=[paper_sel2],                outputs=[fav_status])
    btn_audio.click(gr_audio,       inputs=[explanation_out, lang_exp], outputs=[audio_out])
    btn_export_pdf.click(gr_export_pdf,
                         inputs=[explanation_out, paper_sel2],
                         outputs=[pdf_out, pdf_status])

    btn_compare.click(gr_compare,   inputs=[cmp_a, cmp_b, lang_cmp],   outputs=[compare_out])
    btn_overview.click(gr_overview, inputs=[t_query, lang_ov],         outputs=[overview_out])
    btn_trends.click(gr_trends,     outputs=[trend_chart, trend_stats])
    btn_bib.click(gr_bib,           inputs=[bib_style],                outputs=[bib_out, bib_file])

    btn_show_fav.click(gr_show_favs,    outputs=[favs_md])
    btn_export_fav.click(gr_export_fav, outputs=[fav_csv_file])

    btn_start_auto.click(start_auto_fetch,
                         inputs=[auto_q, auto_cat, auto_interval],
                         outputs=[auto_status])
    btn_stop_auto.click(stop_auto_fetch,   outputs=[auto_status])
    btn_refresh_log.click(get_auto_log,    outputs=[auto_log_md])

    btn_send.click(gr_chat_fn,  inputs=[chat_in, chatbot_ui], outputs=[chatbot_ui, chat_in])
    chat_in.submit(gr_chat_fn,  inputs=[chat_in, chatbot_ui], outputs=[chatbot_ui, chat_in])
    btn_clear.click(lambda: ([], ""), outputs=[chatbot_ui, chat_in])

if __name__ == "__main__":
    demo.launch()