Rewind
Browse files- ingest.py +10 -70
- query.py +11 -33
- static/index.html +0 -5
ingest.py
CHANGED
|
@@ -31,68 +31,11 @@ CHROMA_BATCH = 5000
|
|
| 31 |
HF_DATASET_ID = "RohanB67/papers"
|
| 32 |
|
| 33 |
|
| 34 |
-
def
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
Strategy:
|
| 39 |
-
1. PyMuPDF metadata (most reliable when present)
|
| 40 |
-
2. Largest font text on page 1 (works for most academic PDFs)
|
| 41 |
-
3. First non-empty line of text on page 1
|
| 42 |
-
4. Fall back to cleaned filename
|
| 43 |
-
"""
|
| 44 |
-
doc = fitz.open(pdf_path)
|
| 45 |
-
|
| 46 |
-
# Strategy 1 - PDF metadata title
|
| 47 |
-
meta_title = (doc.metadata or {}).get("title", "").strip()
|
| 48 |
-
if meta_title and len(meta_title) > 10 and not meta_title.lower().startswith("untitled"):
|
| 49 |
-
title = meta_title
|
| 50 |
-
else:
|
| 51 |
-
title = None
|
| 52 |
-
|
| 53 |
-
# Strategy 2 - Largest font text block on page 1
|
| 54 |
-
if not title and len(doc) > 0:
|
| 55 |
-
try:
|
| 56 |
-
blocks = doc[0].get_text("dict")["blocks"]
|
| 57 |
-
best_size, best_text = 0, ""
|
| 58 |
-
for block in blocks:
|
| 59 |
-
for line in block.get("lines", []):
|
| 60 |
-
for span in line.get("spans", []):
|
| 61 |
-
txt = span["text"].strip()
|
| 62 |
-
size = span["size"]
|
| 63 |
-
if size > best_size and len(txt) > 10 and txt.isascii():
|
| 64 |
-
best_size = size
|
| 65 |
-
best_text = txt
|
| 66 |
-
if best_text:
|
| 67 |
-
title = best_text
|
| 68 |
-
except Exception:
|
| 69 |
-
pass
|
| 70 |
-
|
| 71 |
-
# Strategy 3 - First substantial line of text on page 1
|
| 72 |
-
if not title:
|
| 73 |
-
try:
|
| 74 |
-
first_page_text = doc[0].get_text()
|
| 75 |
-
for line in first_page_text.split("\n"):
|
| 76 |
-
line = line.strip()
|
| 77 |
-
if len(line) > 20 and not line.startswith("http"):
|
| 78 |
-
title = line
|
| 79 |
-
break
|
| 80 |
-
except Exception:
|
| 81 |
-
pass
|
| 82 |
-
|
| 83 |
-
# Strategy 4 — Fall back to cleaned filename
|
| 84 |
-
if not title:
|
| 85 |
-
title = os.path.basename(pdf_path).replace(".pdf", "").replace("_", " ").replace("-", " ")
|
| 86 |
-
|
| 87 |
-
# Clean up title - remove newlines, excessive whitespace
|
| 88 |
-
title = re.sub(r"\s+", " ", title).strip()
|
| 89 |
-
# Truncate very long titles
|
| 90 |
-
if len(title) > 150:
|
| 91 |
-
title = title[:150].rsplit(" ", 1)[0] + "..."
|
| 92 |
-
|
| 93 |
-
full_text = "".join(page.get_text() for page in doc)
|
| 94 |
doc.close()
|
| 95 |
-
return
|
| 96 |
|
| 97 |
|
| 98 |
def chunk_text(text: str) -> list[str]:
|
|
@@ -127,24 +70,17 @@ def _load_pdfs(papers_dir: str):
|
|
| 127 |
|
| 128 |
docs, ids, metas, chunk_index = [], [], [], 0
|
| 129 |
for pdf_file in pdf_files:
|
| 130 |
-
pdf_path = os.path.join(papers_dir, pdf_file)
|
| 131 |
print(f"Processing: {pdf_file}", flush=True)
|
| 132 |
-
|
| 133 |
-
chunks = chunk_text(full_text)
|
| 134 |
-
print(f" Title: {real_title}", flush=True)
|
| 135 |
print(f" -> {len(chunks)} chunks", flush=True)
|
| 136 |
|
| 137 |
-
# Clean paper_name from filename (kept for backward compat)
|
| 138 |
-
paper_name = pdf_file.replace(".pdf", "").replace("_", " ")
|
| 139 |
-
|
| 140 |
for i, chunk in enumerate(chunks):
|
| 141 |
docs.append(chunk)
|
| 142 |
ids.append(f"{pdf_file}_chunk_{chunk_index}")
|
| 143 |
metas.append({
|
| 144 |
"source": pdf_file,
|
| 145 |
"chunk_index": i,
|
| 146 |
-
"paper_name":
|
| 147 |
-
"paper_title": real_title,
|
| 148 |
})
|
| 149 |
chunk_index += 1
|
| 150 |
|
|
@@ -152,6 +88,10 @@ def _load_pdfs(papers_dir: str):
|
|
| 152 |
|
| 153 |
|
| 154 |
def _download_papers_from_hf(dest_dir: str = PAPERS_DIR):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
from huggingface_hub import list_repo_files, hf_hub_download
|
| 156 |
os.makedirs(dest_dir, exist_ok=True)
|
| 157 |
pdf_files = [
|
|
|
|
| 31 |
HF_DATASET_ID = "RohanB67/papers"
|
| 32 |
|
| 33 |
|
| 34 |
+
def extract_text(pdf_path: str) -> tuple[str, str]:
|
| 35 |
+
doc = fitz.open(pdf_path)
|
| 36 |
+
text = "".join(page.get_text() for page in doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
doc.close()
|
| 38 |
+
return text
|
| 39 |
|
| 40 |
|
| 41 |
def chunk_text(text: str) -> list[str]:
|
|
|
|
| 70 |
|
| 71 |
docs, ids, metas, chunk_index = [], [], [], 0
|
| 72 |
for pdf_file in pdf_files:
|
|
|
|
| 73 |
print(f"Processing: {pdf_file}", flush=True)
|
| 74 |
+
chunks = chunk_text(extract_text(os.path.join(papers_dir, pdf_file)))
|
|
|
|
|
|
|
| 75 |
print(f" -> {len(chunks)} chunks", flush=True)
|
| 76 |
|
|
|
|
|
|
|
|
|
|
| 77 |
for i, chunk in enumerate(chunks):
|
| 78 |
docs.append(chunk)
|
| 79 |
ids.append(f"{pdf_file}_chunk_{chunk_index}")
|
| 80 |
metas.append({
|
| 81 |
"source": pdf_file,
|
| 82 |
"chunk_index": i,
|
| 83 |
+
"paper_name": pdf_file.replace(".pdf", "").replace("_", " ")
|
|
|
|
| 84 |
})
|
| 85 |
chunk_index += 1
|
| 86 |
|
|
|
|
| 88 |
|
| 89 |
|
| 90 |
def _download_papers_from_hf(dest_dir: str = PAPERS_DIR):
|
| 91 |
+
"""
|
| 92 |
+
Pull all PDF files from HF dataset RohanB67/papers into dest_dir.
|
| 93 |
+
Uses huggingface_hub already available in HF Spaces environment.
|
| 94 |
+
"""
|
| 95 |
from huggingface_hub import list_repo_files, hf_hub_download
|
| 96 |
os.makedirs(dest_dir, exist_ok=True)
|
| 97 |
pdf_files = [
|
query.py
CHANGED
|
@@ -28,28 +28,12 @@ from tavily import TavilyClient
|
|
| 28 |
_paper_link_cache = {}
|
| 29 |
|
| 30 |
|
| 31 |
-
def _get_paper_links(paper_name: str
|
| 32 |
-
"""
|
| 33 |
-
Enrich a local paper with links from multiple free research databases.
|
| 34 |
-
Uses real paper title for searching when available.
|
| 35 |
-
|
| 36 |
-
Sources tried:
|
| 37 |
-
- Semantic Scholar API (DOI, arXiv ID, open-access PDF)
|
| 38 |
-
- arXiv API (abs page + PDF)
|
| 39 |
-
- OpenAlex API (open research graph, DOI)
|
| 40 |
-
- NCBI/PubMed E-utils (PMID, PubMed page)
|
| 41 |
-
- Generated search URLs: Google, Google Scholar, Semantic Scholar,
|
| 42 |
-
arXiv, PubMed, NCBI, OpenAlex
|
| 43 |
-
"""
|
| 44 |
global _paper_link_cache
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
return _paper_link_cache[cache_key]
|
| 48 |
-
|
| 49 |
-
# Use real title if available, else cleaned filename
|
| 50 |
-
search_term = paper_title if paper_title and len(paper_title) > 10 else paper_name
|
| 51 |
-
q = urllib.parse.quote(search_term)
|
| 52 |
|
|
|
|
| 53 |
# Always-available search links (never fail)
|
| 54 |
links = {
|
| 55 |
"google": f"https://www.google.com/search?q={q}+research+paper",
|
|
@@ -65,7 +49,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
|
|
| 65 |
try:
|
| 66 |
r = requests.get(
|
| 67 |
"https://api.semanticscholar.org/graph/v1/paper/search",
|
| 68 |
-
params={"query":
|
| 69 |
"fields": "title,url,externalIds,openAccessPdf"},
|
| 70 |
timeout=5
|
| 71 |
)
|
|
@@ -78,9 +62,6 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
|
|
| 78 |
links["semantic_scholar"] = p["url"]
|
| 79 |
if ext.get("ArXiv"):
|
| 80 |
links["arxiv"] = f"https://arxiv.org/abs/{ext['ArXiv']}"
|
| 81 |
-
links["arxiv_pdf"] = f"https://arxiv.org/pdf/{ext['ArXiv']}"
|
| 82 |
-
if ext.get("DOI"):
|
| 83 |
-
links["doi"] = f"https://doi.org/{ext['DOI']}"
|
| 84 |
if ext.get("PubMed"):
|
| 85 |
links["pubmed"] = f"https://pubmed.ncbi.nlm.nih.gov/{ext['PubMed']}/"
|
| 86 |
pdf = p.get("openAccessPdf")
|
|
@@ -93,7 +74,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
|
|
| 93 |
try:
|
| 94 |
r = requests.get(
|
| 95 |
"https://api.openalex.org/works",
|
| 96 |
-
params={"search":
|
| 97 |
"select": "id,doi,open_access,primary_location"},
|
| 98 |
headers={"User-Agent": "EpiRAG/1.0 (rohanbiswas031@gmail.com)"},
|
| 99 |
timeout=5
|
|
@@ -118,7 +99,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
|
|
| 118 |
if "pubmed" not in links:
|
| 119 |
r = requests.get(
|
| 120 |
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
|
| 121 |
-
params={"db": "pubmed", "term":
|
| 122 |
"retmax": 1, "retmode": "json"},
|
| 123 |
timeout=5
|
| 124 |
)
|
|
@@ -129,7 +110,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
|
|
| 129 |
except Exception:
|
| 130 |
pass
|
| 131 |
|
| 132 |
-
_paper_link_cache[
|
| 133 |
return links
|
| 134 |
|
| 135 |
# -- Config -----------------------------------------------
|
|
@@ -195,14 +176,11 @@ def retrieve_local(query: str, embedder, collection) -> list[dict]:
|
|
| 195 |
results["metadatas"][0],
|
| 196 |
results["distances"][0]
|
| 197 |
):
|
| 198 |
-
paper_name
|
| 199 |
-
|
| 200 |
-
links = _get_paper_links(paper_name, paper_title)
|
| 201 |
-
# Display the real title if available, else fall back to filename-based name
|
| 202 |
-
display_name = paper_title if paper_title and paper_title != paper_name else paper_name
|
| 203 |
chunks.append({
|
| 204 |
"text": doc,
|
| 205 |
-
"source":
|
| 206 |
"similarity": round(1 - dist, 4),
|
| 207 |
"url": links.get("semantic_scholar") or links.get("arxiv") or links.get("doi") or links.get("pubmed"),
|
| 208 |
"links": links,
|
|
|
|
| 28 |
_paper_link_cache = {}
|
| 29 |
|
| 30 |
|
| 31 |
+
def _get_paper_links(paper_name: str) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
global _paper_link_cache
|
| 33 |
+
if paper_name in _paper_link_cache:
|
| 34 |
+
return _paper_link_cache[paper_name]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
q = urllib.parse.quote_plus(paper_name)
|
| 37 |
# Always-available search links (never fail)
|
| 38 |
links = {
|
| 39 |
"google": f"https://www.google.com/search?q={q}+research+paper",
|
|
|
|
| 49 |
try:
|
| 50 |
r = requests.get(
|
| 51 |
"https://api.semanticscholar.org/graph/v1/paper/search",
|
| 52 |
+
params={"query": paper_name, "limit": 1,
|
| 53 |
"fields": "title,url,externalIds,openAccessPdf"},
|
| 54 |
timeout=5
|
| 55 |
)
|
|
|
|
| 62 |
links["semantic_scholar"] = p["url"]
|
| 63 |
if ext.get("ArXiv"):
|
| 64 |
links["arxiv"] = f"https://arxiv.org/abs/{ext['ArXiv']}"
|
|
|
|
|
|
|
|
|
|
| 65 |
if ext.get("PubMed"):
|
| 66 |
links["pubmed"] = f"https://pubmed.ncbi.nlm.nih.gov/{ext['PubMed']}/"
|
| 67 |
pdf = p.get("openAccessPdf")
|
|
|
|
| 74 |
try:
|
| 75 |
r = requests.get(
|
| 76 |
"https://api.openalex.org/works",
|
| 77 |
+
params={"search": paper_name, "per_page": 1,
|
| 78 |
"select": "id,doi,open_access,primary_location"},
|
| 79 |
headers={"User-Agent": "EpiRAG/1.0 (rohanbiswas031@gmail.com)"},
|
| 80 |
timeout=5
|
|
|
|
| 99 |
if "pubmed" not in links:
|
| 100 |
r = requests.get(
|
| 101 |
"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
|
| 102 |
+
params={"db": "pubmed", "term": paper_name,
|
| 103 |
"retmax": 1, "retmode": "json"},
|
| 104 |
timeout=5
|
| 105 |
)
|
|
|
|
| 110 |
except Exception:
|
| 111 |
pass
|
| 112 |
|
| 113 |
+
_paper_link_cache[paper_name] = links
|
| 114 |
return links
|
| 115 |
|
| 116 |
# -- Config -----------------------------------------------
|
|
|
|
| 176 |
results["metadatas"][0],
|
| 177 |
results["distances"][0]
|
| 178 |
):
|
| 179 |
+
paper_name = meta.get("paper_name", meta.get("source", "Unknown"))
|
| 180 |
+
links = _get_paper_links(paper_name)
|
|
|
|
|
|
|
|
|
|
| 181 |
chunks.append({
|
| 182 |
"text": doc,
|
| 183 |
+
"source": paper_name,
|
| 184 |
"similarity": round(1 - dist, 4),
|
| 185 |
"url": links.get("semantic_scholar") or links.get("arxiv") or links.get("doi") or links.get("pubmed"),
|
| 186 |
"links": links,
|
static/index.html
CHANGED
|
@@ -525,13 +525,8 @@ function renderResults(data) {
|
|
| 525 |
return `<a class="text-[10px] text-tertiary/80 pl-8 font-mono hover:underline flex items-center gap-1 truncate" href="${src.url}" target="_blank">${src.url.slice(0,60)}${src.url.length>60?'…':''}<span class="material-symbols-outlined text-[10px] flex-shrink-0">open_in_new</span></a>`;
|
| 526 |
}
|
| 527 |
let btns = '<div class="pl-8 flex flex-wrap gap-1.5 mt-1.5">';
|
| 528 |
-
// PDF first — highest value
|
| 529 |
-
const pdfUrl = links.pdf || links.arxiv_pdf;
|
| 530 |
-
if (pdfUrl) btns += `<a class="${btnCls} text-green-400 border-green-800 hover:border-green-400 hover:text-green-300" href="${pdfUrl}" target="_blank">📄 PDF <span class="material-symbols-outlined text-[9px]">download</span></a>`;
|
| 531 |
-
// Exact matches
|
| 532 |
if (links.semantic_scholar) btns += `<a class="${btnCls}" href="${links.semantic_scholar}" target="_blank">Semantic Scholar <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
| 533 |
if (links.arxiv) btns += `<a class="${btnCls}" href="${links.arxiv}" target="_blank">arXiv <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
| 534 |
-
if (links.doi) btns += `<a class="${btnCls}" href="${links.doi}" target="_blank">DOI <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
| 535 |
if (links.pubmed) btns += `<a class="${btnCls}" href="${links.pubmed}" target="_blank">PubMed <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
| 536 |
if (links.openalex) btns += `<a class="${btnCls}" href="${links.openalex}" target="_blank">OpenAlex <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
| 537 |
// Search fallbacks — always present
|
|
|
|
| 525 |
return `<a class="text-[10px] text-tertiary/80 pl-8 font-mono hover:underline flex items-center gap-1 truncate" href="${src.url}" target="_blank">${src.url.slice(0,60)}${src.url.length>60?'…':''}<span class="material-symbols-outlined text-[10px] flex-shrink-0">open_in_new</span></a>`;
|
| 526 |
}
|
| 527 |
let btns = '<div class="pl-8 flex flex-wrap gap-1.5 mt-1.5">';
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
if (links.semantic_scholar) btns += `<a class="${btnCls}" href="${links.semantic_scholar}" target="_blank">Semantic Scholar <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
| 529 |
if (links.arxiv) btns += `<a class="${btnCls}" href="${links.arxiv}" target="_blank">arXiv <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
|
|
|
| 530 |
if (links.pubmed) btns += `<a class="${btnCls}" href="${links.pubmed}" target="_blank">PubMed <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
| 531 |
if (links.openalex) btns += `<a class="${btnCls}" href="${links.openalex}" target="_blank">OpenAlex <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
|
| 532 |
// Search fallbacks — always present
|