RohanB67 committed on
Commit
210908f
·
1 Parent(s): 9c58b2a
Files changed (3) hide show
  1. ingest.py +10 -70
  2. query.py +11 -33
  3. static/index.html +0 -5
ingest.py CHANGED
@@ -31,68 +31,11 @@ CHROMA_BATCH = 5000
31
  HF_DATASET_ID = "RohanB67/papers"
32
 
33
 
34
- def extract_text_and_title(pdf_path: str) -> tuple[str, str]:
35
- """
36
- Extract full text AND attempt to extract the real paper title from the PDF.
37
-
38
- Strategy:
39
- 1. PyMuPDF metadata (most reliable when present)
40
- 2. Largest font text on page 1 (works for most academic PDFs)
41
- 3. First non-empty line of text on page 1
42
- 4. Fall back to cleaned filename
43
- """
44
- doc = fitz.open(pdf_path)
45
-
46
- # Strategy 1 - PDF metadata title
47
- meta_title = (doc.metadata or {}).get("title", "").strip()
48
- if meta_title and len(meta_title) > 10 and not meta_title.lower().startswith("untitled"):
49
- title = meta_title
50
- else:
51
- title = None
52
-
53
- # Strategy 2 - Largest font text block on page 1
54
- if not title and len(doc) > 0:
55
- try:
56
- blocks = doc[0].get_text("dict")["blocks"]
57
- best_size, best_text = 0, ""
58
- for block in blocks:
59
- for line in block.get("lines", []):
60
- for span in line.get("spans", []):
61
- txt = span["text"].strip()
62
- size = span["size"]
63
- if size > best_size and len(txt) > 10 and txt.isascii():
64
- best_size = size
65
- best_text = txt
66
- if best_text:
67
- title = best_text
68
- except Exception:
69
- pass
70
-
71
- # Strategy 3 - First substantial line of text on page 1
72
- if not title:
73
- try:
74
- first_page_text = doc[0].get_text()
75
- for line in first_page_text.split("\n"):
76
- line = line.strip()
77
- if len(line) > 20 and not line.startswith("http"):
78
- title = line
79
- break
80
- except Exception:
81
- pass
82
-
83
- # Strategy 4 — Fall back to cleaned filename
84
- if not title:
85
- title = os.path.basename(pdf_path).replace(".pdf", "").replace("_", " ").replace("-", " ")
86
-
87
- # Clean up title - remove newlines, excessive whitespace
88
- title = re.sub(r"\s+", " ", title).strip()
89
- # Truncate very long titles
90
- if len(title) > 150:
91
- title = title[:150].rsplit(" ", 1)[0] + "..."
92
-
93
- full_text = "".join(page.get_text() for page in doc)
94
  doc.close()
95
- return full_text, title
96
 
97
 
98
  def chunk_text(text: str) -> list[str]:
@@ -127,24 +70,17 @@ def _load_pdfs(papers_dir: str):
127
 
128
  docs, ids, metas, chunk_index = [], [], [], 0
129
  for pdf_file in pdf_files:
130
- pdf_path = os.path.join(papers_dir, pdf_file)
131
  print(f"Processing: {pdf_file}", flush=True)
132
- full_text, real_title = extract_text_and_title(pdf_path)
133
- chunks = chunk_text(full_text)
134
- print(f" Title: {real_title}", flush=True)
135
  print(f" -> {len(chunks)} chunks", flush=True)
136
 
137
- # Clean paper_name from filename (kept for backward compat)
138
- paper_name = pdf_file.replace(".pdf", "").replace("_", " ")
139
-
140
  for i, chunk in enumerate(chunks):
141
  docs.append(chunk)
142
  ids.append(f"{pdf_file}_chunk_{chunk_index}")
143
  metas.append({
144
  "source": pdf_file,
145
  "chunk_index": i,
146
- "paper_name": paper_name,
147
- "paper_title": real_title,
148
  })
149
  chunk_index += 1
150
 
@@ -152,6 +88,10 @@ def _load_pdfs(papers_dir: str):
152
 
153
 
154
  def _download_papers_from_hf(dest_dir: str = PAPERS_DIR):
 
 
 
 
155
  from huggingface_hub import list_repo_files, hf_hub_download
156
  os.makedirs(dest_dir, exist_ok=True)
157
  pdf_files = [
 
31
  HF_DATASET_ID = "RohanB67/papers"
32
 
33
 
34
+ def extract_text(pdf_path: str) -> tuple[str, str]:
35
+ doc = fitz.open(pdf_path)
36
+ text = "".join(page.get_text() for page in doc)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  doc.close()
38
+ return text
39
 
40
 
41
  def chunk_text(text: str) -> list[str]:
 
70
 
71
  docs, ids, metas, chunk_index = [], [], [], 0
72
  for pdf_file in pdf_files:
 
73
  print(f"Processing: {pdf_file}", flush=True)
74
+ chunks = chunk_text(extract_text(os.path.join(papers_dir, pdf_file)))
 
 
75
  print(f" -> {len(chunks)} chunks", flush=True)
76
 
 
 
 
77
  for i, chunk in enumerate(chunks):
78
  docs.append(chunk)
79
  ids.append(f"{pdf_file}_chunk_{chunk_index}")
80
  metas.append({
81
  "source": pdf_file,
82
  "chunk_index": i,
83
+ "paper_name": pdf_file.replace(".pdf", "").replace("_", " ")
 
84
  })
85
  chunk_index += 1
86
 
 
88
 
89
 
90
  def _download_papers_from_hf(dest_dir: str = PAPERS_DIR):
91
+ """
92
+ Pull all PDF files from HF dataset RohanB67/papers into dest_dir.
93
+ Uses huggingface_hub already available in HF Spaces environment.
94
+ """
95
  from huggingface_hub import list_repo_files, hf_hub_download
96
  os.makedirs(dest_dir, exist_ok=True)
97
  pdf_files = [
query.py CHANGED
@@ -28,28 +28,12 @@ from tavily import TavilyClient
28
  _paper_link_cache = {}
29
 
30
 
31
- def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
32
- """
33
- Enrich a local paper with links from multiple free research databases.
34
- Uses real paper title for searching when available.
35
-
36
- Sources tried:
37
- - Semantic Scholar API (DOI, arXiv ID, open-access PDF)
38
- - arXiv API (abs page + PDF)
39
- - OpenAlex API (open research graph, DOI)
40
- - NCBI/PubMed E-utils (PMID, PubMed page)
41
- - Generated search URLs: Google, Google Scholar, Semantic Scholar,
42
- arXiv, PubMed, NCBI, OpenAlex
43
- """
44
  global _paper_link_cache
45
- cache_key = paper_title or paper_name
46
- if cache_key in _paper_link_cache:
47
- return _paper_link_cache[cache_key]
48
-
49
- # Use real title if available, else cleaned filename
50
- search_term = paper_title if paper_title and len(paper_title) > 10 else paper_name
51
- q = urllib.parse.quote(search_term)
52
 
 
53
  # Always-available search links (never fail)
54
  links = {
55
  "google": f"https://www.google.com/search?q={q}+research+paper",
@@ -65,7 +49,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
65
  try:
66
  r = requests.get(
67
  "https://api.semanticscholar.org/graph/v1/paper/search",
68
- params={"query": search_term, "limit": 1,
69
  "fields": "title,url,externalIds,openAccessPdf"},
70
  timeout=5
71
  )
@@ -78,9 +62,6 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
78
  links["semantic_scholar"] = p["url"]
79
  if ext.get("ArXiv"):
80
  links["arxiv"] = f"https://arxiv.org/abs/{ext['ArXiv']}"
81
- links["arxiv_pdf"] = f"https://arxiv.org/pdf/{ext['ArXiv']}"
82
- if ext.get("DOI"):
83
- links["doi"] = f"https://doi.org/{ext['DOI']}"
84
  if ext.get("PubMed"):
85
  links["pubmed"] = f"https://pubmed.ncbi.nlm.nih.gov/{ext['PubMed']}/"
86
  pdf = p.get("openAccessPdf")
@@ -93,7 +74,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
93
  try:
94
  r = requests.get(
95
  "https://api.openalex.org/works",
96
- params={"search": search_term, "per_page": 1,
97
  "select": "id,doi,open_access,primary_location"},
98
  headers={"User-Agent": "EpiRAG/1.0 (rohanbiswas031@gmail.com)"},
99
  timeout=5
@@ -118,7 +99,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
118
  if "pubmed" not in links:
119
  r = requests.get(
120
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
121
- params={"db": "pubmed", "term": search_term,
122
  "retmax": 1, "retmode": "json"},
123
  timeout=5
124
  )
@@ -129,7 +110,7 @@ def _get_paper_links(paper_name: str, paper_title: str = None) -> dict:
129
  except Exception:
130
  pass
131
 
132
- _paper_link_cache[cache_key] = links
133
  return links
134
 
135
  # -- Config -----------------------------------------------
@@ -195,14 +176,11 @@ def retrieve_local(query: str, embedder, collection) -> list[dict]:
195
  results["metadatas"][0],
196
  results["distances"][0]
197
  ):
198
- paper_name = meta.get("paper_name", meta.get("source", "Unknown"))
199
- paper_title = meta.get("paper_title", paper_name)
200
- links = _get_paper_links(paper_name, paper_title)
201
- # Display the real title if available, else fall back to filename-based name
202
- display_name = paper_title if paper_title and paper_title != paper_name else paper_name
203
  chunks.append({
204
  "text": doc,
205
- "source": display_name,
206
  "similarity": round(1 - dist, 4),
207
  "url": links.get("semantic_scholar") or links.get("arxiv") or links.get("doi") or links.get("pubmed"),
208
  "links": links,
 
28
  _paper_link_cache = {}
29
 
30
 
31
+ def _get_paper_links(paper_name: str) -> dict:
 
 
 
 
 
 
 
 
 
 
 
 
32
  global _paper_link_cache
33
+ if paper_name in _paper_link_cache:
34
+ return _paper_link_cache[paper_name]
 
 
 
 
 
35
 
36
+ q = urllib.parse.quote_plus(paper_name)
37
  # Always-available search links (never fail)
38
  links = {
39
  "google": f"https://www.google.com/search?q={q}+research+paper",
 
49
  try:
50
  r = requests.get(
51
  "https://api.semanticscholar.org/graph/v1/paper/search",
52
+ params={"query": paper_name, "limit": 1,
53
  "fields": "title,url,externalIds,openAccessPdf"},
54
  timeout=5
55
  )
 
62
  links["semantic_scholar"] = p["url"]
63
  if ext.get("ArXiv"):
64
  links["arxiv"] = f"https://arxiv.org/abs/{ext['ArXiv']}"
 
 
 
65
  if ext.get("PubMed"):
66
  links["pubmed"] = f"https://pubmed.ncbi.nlm.nih.gov/{ext['PubMed']}/"
67
  pdf = p.get("openAccessPdf")
 
74
  try:
75
  r = requests.get(
76
  "https://api.openalex.org/works",
77
+ params={"search": paper_name, "per_page": 1,
78
  "select": "id,doi,open_access,primary_location"},
79
  headers={"User-Agent": "EpiRAG/1.0 (rohanbiswas031@gmail.com)"},
80
  timeout=5
 
99
  if "pubmed" not in links:
100
  r = requests.get(
101
  "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
102
+ params={"db": "pubmed", "term": paper_name,
103
  "retmax": 1, "retmode": "json"},
104
  timeout=5
105
  )
 
110
  except Exception:
111
  pass
112
 
113
+ _paper_link_cache[paper_name] = links
114
  return links
115
 
116
  # -- Config -----------------------------------------------
 
176
  results["metadatas"][0],
177
  results["distances"][0]
178
  ):
179
+ paper_name = meta.get("paper_name", meta.get("source", "Unknown"))
180
+ links = _get_paper_links(paper_name)
 
 
 
181
  chunks.append({
182
  "text": doc,
183
+ "source": paper_name,
184
  "similarity": round(1 - dist, 4),
185
  "url": links.get("semantic_scholar") or links.get("arxiv") or links.get("doi") or links.get("pubmed"),
186
  "links": links,
static/index.html CHANGED
@@ -525,13 +525,8 @@ function renderResults(data) {
525
  return `<a class="text-[10px] text-tertiary/80 pl-8 font-mono hover:underline flex items-center gap-1 truncate" href="${src.url}" target="_blank">${src.url.slice(0,60)}${src.url.length>60?'…':''}<span class="material-symbols-outlined text-[10px] flex-shrink-0">open_in_new</span></a>`;
526
  }
527
  let btns = '<div class="pl-8 flex flex-wrap gap-1.5 mt-1.5">';
528
- // PDF first — highest value
529
- const pdfUrl = links.pdf || links.arxiv_pdf;
530
- if (pdfUrl) btns += `<a class="${btnCls} text-green-400 border-green-800 hover:border-green-400 hover:text-green-300" href="${pdfUrl}" target="_blank">📄 PDF <span class="material-symbols-outlined text-[9px]">download</span></a>`;
531
- // Exact matches
532
  if (links.semantic_scholar) btns += `<a class="${btnCls}" href="${links.semantic_scholar}" target="_blank">Semantic Scholar <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
533
  if (links.arxiv) btns += `<a class="${btnCls}" href="${links.arxiv}" target="_blank">arXiv <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
534
- if (links.doi) btns += `<a class="${btnCls}" href="${links.doi}" target="_blank">DOI <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
535
  if (links.pubmed) btns += `<a class="${btnCls}" href="${links.pubmed}" target="_blank">PubMed <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
536
  if (links.openalex) btns += `<a class="${btnCls}" href="${links.openalex}" target="_blank">OpenAlex <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
537
  // Search fallbacks — always present
 
525
  return `<a class="text-[10px] text-tertiary/80 pl-8 font-mono hover:underline flex items-center gap-1 truncate" href="${src.url}" target="_blank">${src.url.slice(0,60)}${src.url.length>60?'…':''}<span class="material-symbols-outlined text-[10px] flex-shrink-0">open_in_new</span></a>`;
526
  }
527
  let btns = '<div class="pl-8 flex flex-wrap gap-1.5 mt-1.5">';
 
 
 
 
528
  if (links.semantic_scholar) btns += `<a class="${btnCls}" href="${links.semantic_scholar}" target="_blank">Semantic Scholar <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
529
  if (links.arxiv) btns += `<a class="${btnCls}" href="${links.arxiv}" target="_blank">arXiv <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
 
530
  if (links.pubmed) btns += `<a class="${btnCls}" href="${links.pubmed}" target="_blank">PubMed <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
531
  if (links.openalex) btns += `<a class="${btnCls}" href="${links.openalex}" target="_blank">OpenAlex <span class="material-symbols-outlined text-[9px]">open_in_new</span></a>`;
532
  // Search fallbacks — always present