| #!/usr/bin/env python3 | |
| """ | |
| ASPLOS Proceedings Ingestion Pipeline | |
| ====================================== | |
| Run this LOCALLY (not on HF Spaces) to build the search index. | |
| Steps: | |
| 1. Fetches paper metadata from Semantic Scholar API | |
| 2. Downloads open-access PDFs | |
| 3. Extracts + chunks text | |
| 4. Creates embeddings with sentence-transformers | |
5. Saves data/ folder → commit to your HF Space repo
| Usage: | |
| pip install -r requirements-ingest.txt | |
| python ingest.py | |
| """ | |
| import os | |
| import json | |
| import time | |
| import hashlib | |
| import requests | |
| import numpy as np | |
| from pathlib import Path | |
| from tqdm import tqdm | |
| import fitz # PyMuPDF | |
| from sentence_transformers import SentenceTransformer | |
# --- Config -------------------------------------------------------------------
# DOIs of the two ASPLOS proceedings volumes to ingest.
PROCEEDINGS_DOIS = [
    "10.1145/3760250",  # ASPLOS '26 Vol 1
    "10.1145/3779212",  # ASPLOS '26 Vol 2
]
# Semantic Scholar fields requested for every paper record.
S2_FIELDS = "title,authors,abstract,year,venue,openAccessPdf,externalIds,url"
# Base URL of the Semantic Scholar Graph API.
S2_API = "https://api.semanticscholar.org/graph/v1"
EMBED_MODEL = "BAAI/bge-small-en-v1.5"  # 33M params, ~130 MB, fast + good quality
# Chunking parameters, in whitespace-separated words: window size and the
# overlap shared between consecutive windows.
CHUNK_WORDS = 400
OVERLAP_WORDS = 60
# Output layout: data/ holds the index artifacts, data/pdfs/ the raw PDFs.
# Both directories are created eagerly at import time.
DATA_DIR = Path("data")
PDFS_DIR = DATA_DIR / "pdfs"
DATA_DIR.mkdir(exist_ok=True)
PDFS_DIR.mkdir(exist_ok=True)
# Identify ourselves to the APIs we hit (polite-crawler convention).
HEADERS = {
    "User-Agent": "ASPLOS-chatbot/1.0 (academic research; contact: research@nvidia.com)"
}
# --- Semantic Scholar helpers ---------------------------------------------------
def s2_search_by_venue(venue_query: str, year: int, limit: int = 100) -> list[dict]:
    """Search Semantic Scholar for papers matching *venue_query* in *year*.

    Paginates through every result page (the API caps each page at 100),
    sleeping briefly between requests and backing off on HTTP 429.
    """
    collected: list[dict] = []
    cursor = 0
    while True:
        response = requests.get(
            f"{S2_API}/paper/search",
            params={
                "query": venue_query,
                "year": f"{year}-{year}",
                "fields": S2_FIELDS,
                "limit": min(limit, 100),
                "offset": cursor,
            },
            headers=HEADERS,
            timeout=30,
        )
        # Back off and retry the same page when rate limited.
        if response.status_code == 429:
            print(" Rate limited, sleeping 60sβ¦")
            time.sleep(60)
            continue
        response.raise_for_status()
        payload = response.json()
        page = payload.get("data", [])
        collected.extend(page)
        cursor += len(page)
        # Stop once we've walked past the reported total or got an empty page.
        if not page or cursor >= payload.get("total", 0):
            break
        time.sleep(0.5)
    return collected
def s2_papers_by_doi_prefix(doi_prefix: str) -> list[dict]:
    """Fetch all papers whose DOI starts with *doi_prefix* (e.g. '10.1145/3760250').

    Semantic Scholar has no DOI-prefix search, so we run keyword searches
    (the prefix itself, plus a site-scoped variant) across both candidate
    years, then keep only hits whose DOI actually matches the prefix.
    """
    candidates: list[dict] = []
    queries = (doi_prefix, f"ASPLOS 2025 2026 site:dl.acm.org/{doi_prefix}")
    for query in queries:
        for search_year in (2025, 2026):
            candidates.extend(s2_search_by_venue(query, year=search_year, limit=200))
    # Deduplicate by paperId, then filter on the DOI prefix.
    seen: set[str] = set()
    matching: list[dict] = []
    for paper in candidates:
        pid = paper.get("paperId", "")
        if not pid or pid in seen:
            continue
        seen.add(pid)
        doi = (paper.get("externalIds") or {}).get("DOI", "")
        if doi.startswith(doi_prefix):
            matching.append(paper)
    return matching
def fetch_asplos_papers() -> list[dict]:
    """Collect all papers from both proceedings.

    Runs one targeted search per proceedings DOI prefix, plus broad ASPLOS
    2025/2026 keyword searches as a safety net, then deduplicates everything
    by Semantic Scholar paperId (first-seen order preserved).
    """
    collected: list[dict] = []
    print("ββ Fetching papers from Semantic Scholar ββ")
    # Primary pass: one targeted search per proceedings volume.
    for doi_prefix in PROCEEDINGS_DOIS:
        print(f"\n Proceedings {doi_prefix}β¦")
        papers = s2_papers_by_doi_prefix(doi_prefix)
        print(f" β {len(papers)} papers found")
        collected.extend(papers)
    # Safety net: broad keyword searches, filtered back to our DOI prefixes.
    broad_queries = (
        "ASPLOS 2025 architectural support programming languages",
        "ASPLOS 2026 architectural support programming languages",
    )
    for query in broad_queries:
        print(f"\n Broad search: '{query[:50]}β¦'")
        hits = s2_search_by_venue(query, year=2025, limit=200)
        hits += s2_search_by_venue(query, year=2026, limit=200)
        for hit in hits:
            doi = (hit.get("externalIds") or {}).get("DOI", "")
            if any(doi.startswith(prefix) for prefix in PROCEEDINGS_DOIS):
                collected.append(hit)
        time.sleep(1)
    # Final dedup by paperId.
    seen: set[str] = set()
    unique: list[dict] = []
    for paper in collected:
        pid = paper.get("paperId", "")
        if pid and pid not in seen:
            seen.add(pid)
            unique.append(paper)
    print(f"\n Total unique papers: {len(unique)}")
    return unique
# --- PDF helpers ----------------------------------------------------------------
def download_pdf(paper: dict, session: requests.Session) -> Path | None:
    """Try to download the open-access PDF for a paper.

    Returns the local path on success (or when a cached copy already exists),
    otherwise None. Failures are silent by design: this is best-effort.
    """
    ids = paper.get("externalIds") or {}
    doi = ids.get("DOI", "")
    title = paper.get("title", "untitled")
    # Stable cache filename: hash of the DOI, falling back to the title.
    dest = PDFS_DIR / (hashlib.md5((doi or title).encode()).hexdigest() + ".pdf")
    # Reuse a previous download (> 1 KiB filters out tiny error stubs).
    if dest.exists() and dest.stat().st_size > 1024:
        return dest
    # Candidate URLs in priority order: S2 open-access link, ACM DL, arXiv.
    urls: list[str] = []
    oa_info = paper.get("openAccessPdf") or {}
    if oa_info.get("url"):
        urls.append(oa_info["url"])
    if doi:
        urls.append(f"https://dl.acm.org/doi/pdf/{doi}")
    arxiv_id = ids.get("ArXiv", "")
    if arxiv_id:
        urls.append(f"https://arxiv.org/pdf/{arxiv_id}.pdf")
    for url in urls:
        try:
            reply = session.get(url, timeout=30, allow_redirects=True)
            is_pdf = (reply.status_code == 200
                      and "application/pdf" in reply.headers.get("content-type", ""))
            if is_pdf:
                dest.write_bytes(reply.content)
                return dest
        except Exception:
            # Best-effort: move on to the next candidate source.
            pass
        time.sleep(0.3)
    return None
def extract_text(pdf_path: Path) -> str:
    """Extract plain text from a PDF using PyMuPDF.

    Args:
        pdf_path: Path to a local PDF file.

    Returns:
        The page texts joined with newlines and stripped, or "" when the
        file cannot be opened or parsed (best-effort; error is printed).
    """
    try:
        doc = fitz.open(str(pdf_path))
        try:
            # "text" mode yields plain reading-order text per page.
            pages = [page.get_text("text") for page in doc]
        finally:
            # Always release the document handle — the original leaked it
            # when get_text() raised partway through a document.
            doc.close()
        return "\n".join(pages).strip()
    except Exception as e:
        print(f" PDF parse error ({pdf_path.name}): {e}")
        return ""
# --- Chunking -------------------------------------------------------------------
def chunk_text(text: str, chunk_words: int = CHUNK_WORDS,
               overlap: int = OVERLAP_WORDS) -> list[str]:
    """Split *text* into overlapping fixed-size word windows.

    Args:
        text: Source text; tokenized by whitespace.
        chunk_words: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        List of chunk strings; [] for empty/whitespace-only input.
    """
    words = text.split()
    # Guard: if overlap >= chunk_words the stride would be <= 0 and the
    # original while-loop never terminated. Clamp to a minimum stride of 1.
    stride = max(1, chunk_words - overlap)
    return [" ".join(words[i : i + chunk_words])
            for i in range(0, len(words), stride)]
def build_chunks(paper: dict, body_text: str) -> list[dict]:
    """Return a list of chunk dicts for one paper.

    The first chunk is a "header" (title + first 5 authors + abstract) so
    title-based queries rank well; the rest are body-text windows, each
    prefixed with the paper title for context. When no full text was
    extracted, the abstract is chunked instead.

    Args:
        paper: Semantic Scholar paper record.
        body_text: Extracted full text ("" when unavailable).
    """
    # NOTE(review): the original also computed the DOI here but never used
    # it — dead local removed.
    title = paper.get("title", "")
    authors = ", ".join(a.get("name", "") for a in (paper.get("authors") or [])[:5])
    abstract = paper.get("abstract") or ""
    # Header chunk: title + abstract (important for title-based queries).
    header = (
        f"Title: {title}\n"
        f"Authors: {authors}\n"
        f"Abstract: {abstract}"
    )
    chunks: list[dict] = [{"text": header, "is_header": True}]
    # Body chunks; fall back to the abstract when there is no full text.
    text_to_chunk = body_text if body_text else abstract
    for i, chunk in enumerate(chunk_text(text_to_chunk)):
        chunks.append({
            "text": f"[Paper: {title}]\n{chunk}",
            "is_header": False,
            "chunk_idx": i,
        })
    return chunks
# --- Main pipeline --------------------------------------------------------------
def _process_papers(raw_papers: list[dict],
                    session: requests.Session) -> tuple[list[dict], list[dict]]:
    """Download PDFs and extract text for every paper.

    Returns (papers_out, all_chunks), where every chunk dict carries a
    "paper_idx" back-reference into papers_out.
    """
    papers_out: list[dict] = []
    all_chunks: list[dict] = []
    for paper in tqdm(raw_papers):
        doi = (paper.get("externalIds") or {}).get("DOI", "")
        title = paper.get("title", "")
        authors = [a.get("name", "") for a in (paper.get("authors") or [])]
        pdf_path = download_pdf(paper, session)
        body_text = extract_text(pdf_path) if pdf_path else ""
        papers_out.append({
            "title": title,
            "authors": authors,
            "abstract": paper.get("abstract") or "",
            "doi": doi,
            "url": f"https://dl.acm.org/doi/{doi}" if doi else paper.get("url", ""),
            "year": paper.get("year"),
            "has_full_text": bool(body_text),
        })
        paper_idx = len(papers_out) - 1
        for chunk in build_chunks(paper, body_text):
            chunk["paper_idx"] = paper_idx
            all_chunks.append(chunk)
        time.sleep(0.2)  # be polite to the PDF hosts
    return papers_out, all_chunks


def _embed_chunks(all_chunks: list[dict]) -> np.ndarray:
    """Encode every chunk text into a float32 matrix of normalized embeddings."""
    model = SentenceTransformer(EMBED_MODEL)
    texts = [c["text"] for c in all_chunks]
    BATCH = 128
    batches = []
    for i in tqdm(range(0, len(texts), BATCH)):
        batches.append(model.encode(texts[i : i + BATCH],
                                    normalize_embeddings=True,
                                    show_progress_bar=False))
    return np.vstack(batches).astype(np.float32)


def _save_index(embeddings: np.ndarray, all_chunks: list[dict],
                papers_out: list[dict]) -> None:
    """Persist the index artifacts under DATA_DIR."""
    np.save(DATA_DIR / "embeddings.npy", embeddings)
    # Explicit UTF-8 so output does not depend on the platform default
    # encoding (the original used the locale default).
    with open(DATA_DIR / "chunks.json", "w", encoding="utf-8") as f:
        json.dump(all_chunks, f)
    with open(DATA_DIR / "papers.json", "w", encoding="utf-8") as f:
        json.dump(papers_out, f, indent=2)


def build_index() -> None:
    """End-to-end pipeline: fetch metadata, download/extract, embed, save."""
    # 1. Fetch metadata
    raw_papers = fetch_asplos_papers()
    if not raw_papers:
        print("ERROR: No papers found. Check Semantic Scholar connectivity.")
        return
    # 2. Download PDFs + extract text
    session = requests.Session()
    session.headers.update(HEADERS)
    print("\nββ Downloading PDFs & extracting text ββ")
    papers_out, all_chunks = _process_papers(raw_papers, session)
    print(f"\n Papers processed : {len(papers_out)}")
    print(f" Total chunks : {len(all_chunks)}")
    print(f" With full text : {sum(1 for p in papers_out if p['has_full_text'])}")
    # 3. Embed
    print("\nββ Creating embeddings ββ")
    embeddings = _embed_chunks(all_chunks)
    # 4. Save
    print("\nββ Saving index ββ")
    _save_index(embeddings, all_chunks, papers_out)
    size_mb = embeddings.nbytes / 1024 / 1024
    print("\nβ Done!")
    print(f" Embeddings : {embeddings.shape} ({size_mb:.1f} MB)")
    print(f" Papers : {len(papers_out)}")
    print(f" Chunks : {len(all_chunks)}")
    print("\nNext step: commit the data/ folder to your HF Space repo.")
# Entry point: run the full ingestion pipeline when executed as a script.
if __name__ == "__main__":
    build_index()