Spaces:

RohanB67
/

epirag

Running

App Files Files Community

RohanB67 committed 25 days ago

Commit

210908f

1 Parent(s): 9c58b2a

Rewind

Browse files

Files changed (3) hide show

ingest.py +10 -70
query.py +11 -33
static/index.html +0 -5

ingest.py CHANGED Viewed

@@ -31,68 +31,11 @@ CHROMA_BATCH    = 5000
 HF_DATASET_ID   = "RohanB67/papers"
-def extract_text_and_title(pdf_path: str) -> tuple[str, str]:
-    """
-    Extract full text AND attempt to extract the real paper title from the PDF.
-    Strategy:
-      1. PyMuPDF metadata (most reliable when present)
-      2. Largest font text on page 1 (works for most academic PDFs)
-      3. First non-empty line of text on page 1
-      4. Fall back to cleaned filename
-    """
-    doc = fitz.open(pdf_path)
-    # Strategy 1 - PDF metadata title
-    meta_title = (doc.metadata or {}).get("title", "").strip()
-    if meta_title and len(meta_title) > 10 and not meta_title.lower().startswith("untitled"):
-        title = meta_title
-    else:
-        title = None
-    # Strategy 2 - Largest font text block on page 1
-    if not title and len(doc) > 0:
-        try:
-            blocks = doc[0].get_text("dict")["blocks"]
-            best_size, best_text = 0, ""
-            for block in blocks:
-                for line in block.get("lines", []):
-                    for span in line.get("spans", []):
-                        txt  = span["text"].strip()
-                        size = span["size"]
-                        if size > best_size and len(txt) > 10 and txt.isascii():
-                            best_size = size
-                            best_text = txt
-            if best_text:
-                title = best_text
-        except Exception:
-            pass
-    # Strategy 3 - First substantial line of text on page 1
-    if not title:
-        try:
-            first_page_text = doc[0].get_text()
-            for line in first_page_text.split("\n"):
-                line = line.strip()
-                if len(line) > 20 and not line.startswith("http"):
-                    title = line
-                    break
-        except Exception:
-            pass
-    # Strategy 4 — Fall back to cleaned filename
-    if not title:
-        title = os.path.basename(pdf_path).replace(".pdf", "").replace("_", " ").replace("-", " ")
-    # Clean up title - remove newlines, excessive whitespace
-    title = re.sub(r"\s+", " ", title).strip()
-    # Truncate very long titles
-    if len(title) > 150:
-        title = title[:150].rsplit(" ", 1)[0] + "..."
-    full_text = "".join(page.get_text() for page in doc)
     doc.close()
-    return full_text, title
 def chunk_text(text: str) -> list[str]:
@@ -127,24 +70,17 @@ def _load_pdfs(papers_dir: str):
     docs, ids, metas, chunk_index = [], [], [], 0
     for pdf_file in pdf_files:
-        pdf_path = os.path.join(papers_dir, pdf_file)
         print(f"Processing: {pdf_file}", flush=True)
-        full_text, real_title = extract_text_and_title(pdf_path)
-        chunks = chunk_text(full_text)
-        print(f"  Title: {real_title}", flush=True)
         print(f"  -> {len(chunks)} chunks", flush=True)
-        # Clean paper_name from filename (kept for backward compat)
-        paper_name = pdf_file.replace(".pdf", "").replace("_", " ")
         for i, chunk in enumerate(chunks):
             docs.append(chunk)
             ids.append(f"{pdf_file}_chunk_{chunk_index}")
             metas.append({
                 "source":      pdf_file,
                 "chunk_index": i,
-                "paper_name":  paper_name,
-                "paper_title": real_title,
             })
             chunk_index += 1
@@ -152,6 +88,10 @@ def _load_pdfs(papers_dir: str):
 def _download_papers_from_hf(dest_dir: str = PAPERS_DIR):
     from huggingface_hub import list_repo_files, hf_hub_download
     os.makedirs(dest_dir, exist_ok=True)
     pdf_files = [

 HF_DATASET_ID   = "RohanB67/papers"
+def extract_text(pdf_path: str) -> tuple[str, str]:
+    doc  = fitz.open(pdf_path)
+    text = "".join(page.get_text() for page in doc)
     doc.close()
+    return text
 def chunk_text(text: str) -> list[str]:
     docs, ids, metas, chunk_index = [], [], [], 0
     for pdf_file in pdf_files:
         print(f"Processing: {pdf_file}", flush=True)
+        chunks = chunk_text(extract_text(os.path.join(papers_dir, pdf_file)))
         print(f"  -> {len(chunks)} chunks", flush=True)
         for i, chunk in enumerate(chunks):
             docs.append(chunk)
             ids.append(f"{pdf_file}_chunk_{chunk_index}")
             metas.append({
                 "source":      pdf_file,
                 "chunk_index": i,
+                "paper_name":  pdf_file.replace(".pdf", "").replace("_", " ")
             })
             chunk_index += 1
 def _download_papers_from_hf(dest_dir: str = PAPERS_DIR):
+    """
+    Pull all PDF files from HF dataset RohanB67/papers into dest_dir.
+    Uses huggingface_hub already available in HF Spaces environment.
+    """
     from huggingface_hub import list_repo_files, hf_hub_download
     os.makedirs(dest_dir, exist_ok=True)
     pdf_files = [

query.py CHANGED Viewed

@@ -28,28 +28,12 @@ from tavily import TavilyClient
 _paper_link_cache = {}
-def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
-    """
-    Enrich a local paper with links from multiple free research databases.
-    Uses real paper title for searching when available.
-    Sources tried:
-      - Semantic Scholar API  (DOI, arXiv ID, open-access PDF)
-      - arXiv API             (abs page + PDF)
-      - OpenAlex API          (open research graph, DOI)
-      - NCBI/PubMed E-utils   (PMID, PubMed page)
-      - Generated search URLs: Google, Google Scholar, Semantic Scholar,
-                               arXiv, PubMed, NCBI, OpenAlex
-    """
     global _paper_link_cache
-    cache_key = paper_title or paper_name
-    if cache_key in _paper_link_cache:
-        return _paper_link_cache[cache_key]
-    # Use real title if available, else cleaned filename
-    search_term = paper_title if paper_title and len(paper_title) > 10 else paper_name
-    q = urllib.parse.quote(search_term)
     # Always-available search links (never fail)
     links = {
         "google":                  f"https://www.google.com/search?q={q}+research+paper",
@@ -65,7 +49,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
     try:
         r = requests.get(
             "https://api.semanticscholar.org/graph/v1/paper/search",
-            params={"query": search_term, "limit": 1,
                     "fields": "title,url,externalIds,openAccessPdf"},
             timeout=5
         )
@@ -78,9 +62,6 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
                     links["semantic_scholar"] = p["url"]
                 if ext.get("ArXiv"):
                     links["arxiv"]     = f"https://arxiv.org/abs/{ext['ArXiv']}"
-                    links["arxiv_pdf"] = f"https://arxiv.org/pdf/{ext['ArXiv']}"
-                if ext.get("DOI"):
-                    links["doi"] = f"https://doi.org/{ext['DOI']}"
                 if ext.get("PubMed"):
                     links["pubmed"] = f"https://pubmed.ncbi.nlm.nih.gov/{ext['PubMed']}/"
                 pdf = p.get("openAccessPdf")
@@ -93,7 +74,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
     try:
         r = requests.get(
             "https://api.openalex.org/works",
-            params={"search": search_term, "per_page": 1,
                     "select": "id,doi,open_access,primary_location"},
             headers={"User-Agent": "EpiRAG/1.0 (rohanbiswas031@gmail.com)"},
             timeout=5
@@ -118,7 +99,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
         if "pubmed" not in links:
             r = requests.get(
                 "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
-                params={"db": "pubmed", "term": search_term,
                         "retmax": 1, "retmode": "json"},
                 timeout=5
             )
@@ -129,7 +110,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
     except Exception:
         pass
-    _paper_link_cache[cache_key] = links
     return links
 # -- Config -----------------------------------------------
@@ -195,14 +176,11 @@ def retrieve_local(query: str, embedder, collection) -> list[dict]:
         results["metadatas"][0],
         results["distances"][0]
     ):
-        paper_name  = meta.get("paper_name", meta.get("source", "Unknown"))
-        paper_title = meta.get("paper_title", paper_name)
-        links       = _get_paper_links(paper_name, paper_title)
-        # Display the real title if available, else fall back to filename-based name
-        display_name = paper_title if paper_title and paper_title != paper_name else paper_name
         chunks.append({
             "text":       doc,
-            "source":     display_name,
             "similarity": round(1 - dist, 4),
             "url":        links.get("semantic_scholar") or links.get("arxiv") or links.get("doi") or links.get("pubmed"),
             "links":      links,

 _paper_link_cache = {}
+def _get_paper_links(paper_name: str) -> dict:
     global _paper_link_cache
+    if paper_name in _paper_link_cache:
+        return _paper_link_cache[paper_name]
+    q = urllib.parse.quote_plus(paper_name)
     # Always-available search links (never fail)
     links = {
         "google":                  f"https://www.google.com/search?q={q}+research+paper",
     try:
         r = requests.get(
             "https://api.semanticscholar.org/graph/v1/paper/search",
+            params={"query": paper_name, "limit": 1,
                     "fields": "title,url,externalIds,openAccessPdf"},
             timeout=5
         )
                     links["semantic_scholar"] = p["url"]
                 if ext.get("ArXiv"):
                     links["arxiv"]     = f"https://arxiv.org/abs/{ext['ArXiv']}"
                 if ext.get("PubMed"):
                     links["pubmed"] = f"https://pubmed.ncbi.nlm.nih.gov/{ext['PubMed']}/"
                 pdf = p.get("openAccessPdf")
     try:
         r = requests.get(
             "https://api.openalex.org/works",
+            params={"search": paper_name, "per_page": 1,
                     "select": "id,doi,open_access,primary_location"},
             headers={"User-Agent": "EpiRAG/1.0 (rohanbiswas031@gmail.com)"},
             timeout=5
         if "pubmed" not in links:
             r = requests.get(
                 "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
+                params={"db": "pubmed", "term": paper_name,
                         "retmax": 1, "retmode": "json"},
                 timeout=5
             )
     except Exception:
         pass
+    _paper_link_cache[paper_name] = links
     return links
 # -- Config -----------------------------------------------
         results["metadatas"][0],
         results["distances"][0]
     ):
+        paper_name = meta.get("paper_name", meta.get("source", "Unknown"))
+        links      = _get_paper_links(paper_name)
         chunks.append({
             "text":       doc,
+            "source":     paper_name,
             "similarity": round(1 - dist, 4),
             "url":        links.get("semantic_scholar") or links.get("arxiv") or links.get("doi") or links.get("pubmed"),
             "links":      links,

static/index.html CHANGED Viewed

@@ -525,13 +525,8 @@ function renderResults(data) {
                         return `<a class="text-[10px] text-tertiary/80 pl-8 font-mono hover:underline flex items-center gap-1 truncate" href="${src.url}" target="_blank">${src.url.slice(0,60)}${src.url.length>60?'…':''}<span class="material-symbols-outlined text-[10px] flex-shrink-0">open_in_new</span></a>`;
                     }
                     let btns = '<div class="pl-8 flex flex-wrap gap-1.5 mt-1.5">';
-                    // PDF first — highest value
-                    const pdfUrl = links.pdf || links.arxiv_pdf;
-                    if (pdfUrl) btns += `<a class="${btnCls} text-green-400 border-green-800 hover:border-green-400 hover:text-green-300" href="${pdfUrl}" target="_blank">📄 PDF <span class="material-symbols-outlined text-[9px]">download</span></a>`;
-                    // Exact matches
                     if (links.semantic_scholar) btns += `<a class="${btnCls}" href="${links.semantic_scholar}" target="_blank">Semantic Scholar <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
                     if (links.arxiv)            btns += `<a class="${btnCls}" href="${links.arxiv}" target="_blank">arXiv <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
-                    if (links.doi)              btns += `<a class="${btnCls}" href="${links.doi}" target="_blank">DOI <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
                     if (links.pubmed)           btns += `<a class="${btnCls}" href="${links.pubmed}" target="_blank">PubMed <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
                     if (links.openalex)         btns += `<a class="${btnCls}" href="${links.openalex}" target="_blank">OpenAlex <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
                     // Search fallbacks — always present

                         return `<a class="text-[10px] text-tertiary/80 pl-8 font-mono hover:underline flex items-center gap-1 truncate" href="${src.url}" target="_blank">${src.url.slice(0,60)}${src.url.length>60?'…':''}<span class="material-symbols-outlined text-[10px] flex-shrink-0">open_in_new</span></a>`;
                     }
                     let btns = '<div class="pl-8 flex flex-wrap gap-1.5 mt-1.5">';
                     if (links.semantic_scholar) btns += `<a class="${btnCls}" href="${links.semantic_scholar}" target="_blank">Semantic Scholar <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
                     if (links.arxiv)            btns += `<a class="${btnCls}" href="${links.arxiv}" target="_blank">arXiv <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
                     if (links.pubmed)           btns += `<a class="${btnCls}" href="${links.pubmed}" target="_blank">PubMed <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
                     if (links.openalex)         btns += `<a class="${btnCls}" href="${links.openalex}" target="_blank">OpenAlex <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
                     // Search fallbacks — always present