# asplos-chatbot / ingest.py
# Uploaded by tanvisharma via huggingface_hub (revision a696ce5, verified)
#!/usr/bin/env python3
"""
ASPLOS Proceedings Ingestion Pipeline
======================================
Run this LOCALLY (not on HF Spaces) to build the search index.
Steps:
1. Fetches paper metadata from Semantic Scholar API
2. Downloads open-access PDFs
3. Extracts + chunks text
4. Creates embeddings with sentence-transformers
5. Saves data/ folder β†’ commit to your HF Space repo
Usage:
pip install -r requirements-ingest.txt
python ingest.py
"""
import os
import json
import time
import hashlib
import requests
import numpy as np
from pathlib import Path
from tqdm import tqdm
import fitz # PyMuPDF
from sentence_transformers import SentenceTransformer
# ── Config ─────────────────────────────────────────────────────────────────────
# DOI prefixes of the two ACM proceedings volumes to ingest.
PROCEEDINGS_DOIS = [
    "10.1145/3760250",  # ASPLOS '26 Vol 1
    "10.1145/3779212",  # ASPLOS '26 Vol 2
]
# Semantic Scholar fields to fetch
S2_FIELDS = "title,authors,abstract,year,venue,openAccessPdf,externalIds,url"
S2_API = "https://api.semanticscholar.org/graph/v1"
EMBED_MODEL = "BAAI/bge-small-en-v1.5"  # 33M params, ~130 MB, fast + good quality
CHUNK_WORDS = 400   # words per body chunk
OVERLAP_WORDS = 60  # words shared between consecutive chunks
DATA_DIR = Path("data")       # output index directory (committed to the HF Space)
PDFS_DIR = DATA_DIR / "pdfs"  # local cache of downloaded PDFs
DATA_DIR.mkdir(exist_ok=True)
PDFS_DIR.mkdir(exist_ok=True)
# Identify ourselves to the APIs we hit (polite-crawling convention).
HEADERS = {
    "User-Agent": "ASPLOS-chatbot/1.0 (academic research; contact: research@nvidia.com)"
}
# ── Semantic Scholar helpers ────────────────────────────────────────────────────
def s2_search_by_venue(venue_query: str, year: int, limit: int = 100) -> list[dict]:
    """Search Semantic Scholar for papers by venue/keyword + year.

    Pages through the /paper/search endpoint, sleeping and retrying on
    HTTP 429 rate limits.

    Args:
        venue_query: Free-text query (venue name, DOI, keywords).
        year: Publication year filter (sent as "year-year").
        limit: Maximum TOTAL number of papers to return.

    Returns:
        Up to ``limit`` raw paper dicts as returned by the API.

    Raises:
        requests.HTTPError: On non-429 HTTP errors.
    """
    papers: list[dict] = []
    offset = 0
    # Fix: the old loop paged until the API was exhausted, so `limit` only
    # capped the page size, not the total number of results returned.
    while len(papers) < limit:
        resp = requests.get(
            f"{S2_API}/paper/search",
            params={
                "query": venue_query,
                "year": f"{year}-{year}",
                "fields": S2_FIELDS,
                # The API caps page size at 100; don't ask for more than we still need.
                "limit": min(limit - len(papers), 100),
                "offset": offset,
            },
            headers=HEADERS,
            timeout=30,
        )
        if resp.status_code == 429:
            print(" Rate limited, sleeping 60s…")
            time.sleep(60)
            continue  # retry the same page
        resp.raise_for_status()
        data = resp.json()
        batch = data.get("data", [])
        papers.extend(batch)
        total = data.get("total", 0)
        offset += len(batch)
        if offset >= total or not batch:
            break
        time.sleep(0.5)  # stay under the API rate limit between pages
    return papers[:limit]
def s2_papers_by_doi_prefix(doi_prefix: str) -> list[dict]:
    """
    Fetch all papers whose DOI belongs to a given proceedings prefix
    (e.g. '10.1145/3760250').

    S2 doesn't support DOI-prefix search directly, so we search for the
    proceedings DOI as a keyword and filter candidates by DOI afterwards.
    """
    results = []
    for query in [doi_prefix, f"ASPLOS 2025 2026 site:dl.acm.org/{doi_prefix}"]:
        results.extend(s2_search_by_venue(query, year=2025, limit=200))
        results.extend(s2_search_by_venue(query, year=2026, limit=200))
    # Deduplicate by paperId, keeping only papers from this proceedings.
    seen, unique = set(), []
    for p in results:
        pid = p.get("paperId", "")
        if pid and pid not in seen:
            seen.add(pid)
            doi = (p.get("externalIds") or {}).get("DOI", "")
            # Fix: require an exact DOI or a '.'-delimited continuation.
            # Plain startswith() would also accept e.g. '10.1145/37602501'
            # for prefix '10.1145/3760250'.
            if doi == doi_prefix or doi.startswith(doi_prefix + "."):
                unique.append(p)
    return unique
def fetch_asplos_papers() -> list[dict]:
    """
    Collect all papers from both proceedings.

    Runs the per-proceedings DOI search, plus a broad ASPLOS 2025/2026
    keyword search as a safety net, then deduplicates by paperId.

    Returns:
        Unique raw paper dicts belonging to one of PROCEEDINGS_DOIS.
    """
    def _in_proceedings(paper: dict) -> bool:
        # Exact DOI or '.'-delimited continuation; plain startswith() would
        # also accept e.g. '10.1145/37602501' for prefix '10.1145/3760250'.
        doi = (paper.get("externalIds") or {}).get("DOI", "")
        return any(doi == pre or doi.startswith(pre + ".")
                   for pre in PROCEEDINGS_DOIS)

    all_papers: list[dict] = []
    print("── Fetching papers from Semantic Scholar ──")
    for doi_prefix in PROCEEDINGS_DOIS:
        print(f"\n Proceedings {doi_prefix}…")
        papers = s2_papers_by_doi_prefix(doi_prefix)
        print(f" → {len(papers)} papers found")
        all_papers.extend(papers)

    # Broad fallback search to catch anything the DOI search missed.
    for query in ["ASPLOS 2025 architectural support programming languages",
                  "ASPLOS 2026 architectural support programming languages"]:
        print(f"\n Broad search: '{query[:50]}…'")
        papers = s2_search_by_venue(query, year=2025, limit=200)
        papers += s2_search_by_venue(query, year=2026, limit=200)
        all_papers.extend(p for p in papers if _in_proceedings(p))
        time.sleep(1)

    # Final dedup by paperId, preserving first-seen order.
    seen: set[str] = set()
    unique: list[dict] = []
    for p in all_papers:
        pid = p.get("paperId", "")
        if pid and pid not in seen:
            seen.add(pid)
            unique.append(p)
    print(f"\n Total unique papers: {len(unique)}")
    return unique
# ── PDF helpers ─────────────────────────────────────────────────────────────────
def download_pdf(paper: dict, session: requests.Session) -> Path | None:
    """Try to download the open-access PDF for a paper.

    Candidate sources, in priority order: the Semantic Scholar
    openAccessPdf URL, the ACM DL, then arXiv. Downloads are cached in
    PDFS_DIR under an MD5 of the DOI (or title when the DOI is missing).

    Returns:
        Path to the cached PDF, or None when every candidate failed.
    """
    doi = (paper.get("externalIds") or {}).get("DOI", "")
    title = paper.get("title", "untitled")
    # Clearer than the old `doi.encode() or title.encode()` (same behavior:
    # fall back to the title when the DOI is empty).
    fname = hashlib.md5((doi or title).encode()).hexdigest() + ".pdf"
    dest = PDFS_DIR / fname
    if dest.exists() and dest.stat().st_size > 1024:
        return dest  # cached by a previous run
    # Candidate URLs (open-access sources in priority order)
    candidates: list[str] = []
    oa = paper.get("openAccessPdf") or {}
    if oa.get("url"):
        candidates.append(oa["url"])
    if doi:
        candidates.append(f"https://dl.acm.org/doi/pdf/{doi}")
    arxiv_id = (paper.get("externalIds") or {}).get("ArXiv", "")
    if arxiv_id:
        candidates.append(f"https://arxiv.org/pdf/{arxiv_id}.pdf")
    for url in candidates:
        try:
            r = session.get(url, timeout=30, allow_redirects=True)
            # Fix: also accept PDFs served with a wrong/missing content-type
            # by sniffing the PDF magic bytes.
            looks_like_pdf = (
                "application/pdf" in r.headers.get("content-type", "")
                or r.content[:5] == b"%PDF-"
            )
            # Require a plausible size so we never cache an error page
            # (the reuse check above already demands > 1024 bytes).
            if r.status_code == 200 and looks_like_pdf and len(r.content) > 1024:
                dest.write_bytes(r.content)
                return dest
        except Exception:
            pass  # best-effort: fall through to the next candidate
        time.sleep(0.3)
    return None
def extract_text(pdf_path: Path) -> str:
    """Extract plain text from a PDF using PyMuPDF.

    Returns:
        The concatenated page text, or "" when the PDF cannot be parsed.
    """
    try:
        # Fix: use the context manager so the document is closed even when a
        # page fails to parse (the old explicit close() leaked on exceptions).
        with fitz.open(str(pdf_path)) as doc:
            pages = [page.get_text("text") for page in doc]
        return "\n".join(pages).strip()
    except Exception as e:
        print(f" PDF parse error ({pdf_path.name}): {e}")
        return ""
# ── Chunking ────────────────────────────────────────────────────────────────────
def chunk_text(text: str, chunk_words: int = CHUNK_WORDS,
               overlap: int = OVERLAP_WORDS) -> list[str]:
    """Split *text* into overlapping fixed-size word windows.

    Args:
        text: Source text; split on whitespace.
        chunk_words: Words per chunk.
        overlap: Words shared between consecutive chunks.

    Returns:
        List of chunk strings (empty for empty/whitespace-only text).
    """
    # Fix: overlap >= chunk_words made the old loop advance by <= 0 words
    # per iteration and spin forever; clamp the stride to at least 1.
    step = max(chunk_words - overlap, 1)
    words = text.split()
    return [" ".join(words[i : i + chunk_words])
            for i in range(0, len(words), step)]
def build_chunks(paper: dict, body_text: str) -> list[dict]:
    """Return a list of chunk dicts for one paper.

    Emits a single header chunk (title + authors + abstract) followed by
    the body chunks; when no body text is available, the abstract is
    chunked instead so the paper is still searchable.
    """
    title = paper.get("title", "")
    abstract = paper.get("abstract") or ""
    doi = (paper.get("externalIds") or {}).get("DOI", "")
    author_names = [a.get("name", "") for a in (paper.get("authors") or [])]
    authors = ", ".join(author_names[:5])

    # Header chunk: title + abstract (important for title-based queries)
    header_chunk = {
        "text": (
            f"Title: {title}\n"
            f"Authors: {authors}\n"
            f"Abstract: {abstract}"
        ),
        "is_header": True,
    }

    # Body chunks, each prefixed with the paper title for retrieval context.
    source_text = body_text or abstract
    body_chunks = [
        {"text": f"[Paper: {title}]\n{piece}", "is_header": False, "chunk_idx": idx}
        for idx, piece in enumerate(chunk_text(source_text))
    ]
    return [header_chunk] + body_chunks
# ── Main pipeline ───────────────────────────────────────────────────────────────
def build_index():
    """End-to-end pipeline: metadata → PDFs → text chunks → embeddings → data/.

    Writes three artifacts under DATA_DIR: embeddings.npy (float32,
    L2-normalized), chunks.json, and papers.json.
    """
    # 1. Fetch metadata
    raw_papers = fetch_asplos_papers()
    if not raw_papers:
        print("ERROR: No papers found. Check Semantic Scholar connectivity.")
        return
    # 2. Download PDFs + extract text
    session = requests.Session()
    session.headers.update(HEADERS)
    papers_out: list[dict] = []
    all_chunks: list[dict] = []
    print("\n── Downloading PDFs & extracting text ──")
    for paper in tqdm(raw_papers):
        doi = (paper.get("externalIds") or {}).get("DOI", "")
        title = paper.get("title", "")
        authors = [a.get("name", "") for a in (paper.get("authors") or [])]
        pdf_path = download_pdf(paper, session)
        body_text = extract_text(pdf_path) if pdf_path else ""
        has_full = bool(body_text)
        papers_out.append({
            "title": title,
            "authors": authors,
            "abstract": paper.get("abstract") or "",
            "doi": doi,
            "url": f"https://dl.acm.org/doi/{doi}" if doi else paper.get("url", ""),
            "year": paper.get("year"),
            "has_full_text": has_full,
        })
        # Each chunk points back to its paper via an index into papers_out.
        paper_idx = len(papers_out) - 1
        for chunk in build_chunks(paper, body_text):
            chunk["paper_idx"] = paper_idx
            all_chunks.append(chunk)
        time.sleep(0.2)  # be polite to the PDF hosts
    print(f"\n Papers processed : {len(papers_out)}")
    print(f" Total chunks : {len(all_chunks)}")
    print(f" With full text : {sum(1 for p in papers_out if p['has_full_text'])}")
    # 3. Embed (batched to bound memory; normalized vectors → cosine search)
    print("\n── Creating embeddings ──")
    model = SentenceTransformer(EMBED_MODEL)
    texts = [c["text"] for c in all_chunks]
    BATCH = 128
    embeds = []
    for i in tqdm(range(0, len(texts), BATCH)):
        embs = model.encode(texts[i : i + BATCH], normalize_embeddings=True,
                            show_progress_bar=False)
        embeds.append(embs)
    embeddings = np.vstack(embeds).astype(np.float32)
    # 4. Save
    print("\n── Saving index ──")
    np.save(DATA_DIR / "embeddings.npy", embeddings)
    # Fix: explicit UTF-8 — the platform default encoding (e.g. cp1252 on
    # Windows) can fail on non-ASCII author names and titles.
    with open(DATA_DIR / "chunks.json", "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, ensure_ascii=False)
    with open(DATA_DIR / "papers.json", "w", encoding="utf-8") as f:
        json.dump(papers_out, f, indent=2, ensure_ascii=False)
    size_mb = embeddings.nbytes / 1024 / 1024
    print("\n✓ Done!")
    print(f" Embeddings : {embeddings.shape} ({size_mb:.1f} MB)")
    print(f" Papers : {len(papers_out)}")
    print(f" Chunks : {len(all_chunks)}")
    print("\nNext step: commit the data/ folder to your HF Space repo.")
# Entry point: build the full search index when run as a script.
if __name__ == "__main__":
    build_index()