"""Scientific paper parser: extract full text + structured metadata from PDFs. Extracts: - title, authors, year, abstract - section titles + bodies - references """ import re import json from pathlib import Path from typing import Dict, List, Tuple, Optional # --------------------------------------------------------------------------- # Section patterns (covers most LaTeX-compiled arXiv PDFs) # --------------------------------------------------------------------------- SECTION_PATTERNS = [ # Numbered: "1 Introduction", "2. Related Work", "1.1 Background" re.compile(r"^\s*(\d+(?:\.\d+)*)\s{1,4}([A-Z][A-Za-z &,\-:]{2,60})\s*$", re.MULTILINE), # Unnumbered all-caps: "INTRODUCTION", "RELATED WORK" re.compile(r"^\s*([A-Z][A-Z ]{3,40})\s*$", re.MULTILINE), # Named: "Abstract", "Introduction", "Conclusion", "References" re.compile( r"^\s*(Abstract|Introduction|Related Work|Background|Methodology|Methods|" r"Experiments?|Results?|Discussion|Conclusion|Limitations?|" r"Future Work|Acknowledgements?|References?)\s*$", re.MULTILINE | re.IGNORECASE, ), ] ABSTRACT_RE = re.compile( r"(?:Abstract|ABSTRACT)[.\s—–-]*\n(.*?)(?=\n\s*\n|\n\s*(?:1\.|Introduction|Keywords))", re.DOTALL | re.IGNORECASE, ) REFERENCE_RE = re.compile( r"\n\s*(?:References?|Bibliography)\s*\n(.*?)$", re.DOTALL | re.IGNORECASE, ) AUTHORS_RE = re.compile( r"(?:^|\n)((?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))(?:,?\s+(?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))*", ) def extract_text_from_pdf(pdf_path: str) -> str: """Extract raw text from PDF, preserving page structure.""" from pypdf import PdfReader reader = PdfReader(pdf_path) pages = [] for page in reader.pages: text = page.extract_text() or "" pages.append(text) return "\n\n[PAGE_BREAK]\n\n".join(pages) def extract_abstract(text: str) -> str: """Extract abstract from paper text.""" m = ABSTRACT_RE.search(text) if m: abstract = m.group(1).strip() # Clean up hyphenated line breaks and extra spaces abstract = re.sub(r"-\n", "", abstract) abstract = re.sub(r"\s+", " ", abstract) return abstract[:2000] # Fallback: first 500 chars after abstract keyword idx = text.lower().find("abstract") if idx >= 0: return text[idx + 8:idx + 800].strip() return "" def extract_sections(text: str) -> List[Dict]: """ Split paper into sections. Returns list of: {"title": str, "body": str, "type": str} where type is one of: abstract, introduction, methodology, results, conclusion, other. """ # Find all section header positions positions = [] for pat in SECTION_PATTERNS: for m in pat.finditer(text): positions.append((m.start(), m.end(), m.group(0).strip())) # Sort by position, deduplicate overlapping matches positions.sort(key=lambda x: x[0]) deduped = [] last_end = -1 for start, end, title in positions: if start >= last_end: deduped.append((start, end, title)) last_end = end if not deduped: # No sections found: return whole text as one section return [{"title": "full_text", "body": text, "type": "other"}] sections = [] for i, (start, end, title) in enumerate(deduped): body_start = end body_end = deduped[i + 1][0] if i + 1 < len(deduped) else len(text) body = text[body_start:body_end].strip() section_type = _classify_section(title) sections.append({"title": title, "body": body, "type": section_type}) return sections def _classify_section(title: str) -> str: t = title.lower() if any(k in t for k in ["abstract"]): return "abstract" if any(k in t for k in ["introduction", "background", "overview", "motivation"]): return "introduction" if any(k in t for k in ["method", "approach", "model", "architecture", "framework", "system"]): return "methodology" if any(k in t for k in ["experiment", "result", "evaluat", "benchmark", "performance", "ablat"]): return "results" if any(k in t for k in ["conclusion", "future", "discussion", "limitation", "summary"]): return "conclusion" if any(k in t for k in ["reference", "bibliograph"]): return "references" if any(k in t for k in ["related work", "prior work", "literature"]): return "related_work" return "other" def extract_references(text: str) -> List[str]: """Extract references section as list of reference strings.""" m = REFERENCE_RE.search(text) if not m: return [] ref_block = m.group(1) # Split on numbered references [1], [2] or numbered lines refs = re.split(r"\n\s*\[\d+\]|\n\s*\d+\.", ref_block) return [r.strip() for r in refs if len(r.strip()) > 20][:100] def extract_metadata( pdf_path: str, prefetched_meta: Optional[Dict] = None, ) -> Dict: """ Full extraction pipeline for one PDF. Args: pdf_path: path to PDF file prefetched_meta: dict with title/authors/year/abstract from arXiv API (optional) Returns metadata dict with: pdf_path, title, authors, year, abstract, sections (list of {title, body, type}), references (list of strings), full_text """ path = Path(pdf_path) try: full_text = extract_text_from_pdf(str(path)) except Exception as e: print(f" [warn] could not parse {path.name}: {e}") full_text = "" if prefetched_meta: title = prefetched_meta.get("title", path.stem) authors = prefetched_meta.get("authors", []) year = prefetched_meta.get("year", "") abstract = prefetched_meta.get("abstract", "") or extract_abstract(full_text) else: title = path.stem authors = [] year = "" abstract = extract_abstract(full_text) sections = extract_sections(full_text) references = extract_references(full_text) return { "pdf_path": str(path), "arxiv_id": path.stem, "title": title, "authors": authors, "year": year, "abstract": abstract, "sections": sections, "references": references, "full_text": full_text, "num_sections": len(sections), "num_references": len(references), "text_length": len(full_text), } def parse_pdf(pdf_path: str, prefetched_meta: Optional[Dict] = None) -> Dict: """Alias for extract_metadata — use in pipeline code.""" return extract_metadata(pdf_path, prefetched_meta) # --------------------------------------------------------------------------- # Batch parsing with metadata JSONL # --------------------------------------------------------------------------- def parse_corpus( papers_dir: str = "data/papers", metadata_file: str = "data/metadata.jsonl", output_file: str = "data/parsed_corpus.jsonl", ) -> List[Dict]: """Parse all PDFs in papers_dir. Returns list of parsed metadata dicts.""" papers_path = Path(papers_dir) output_path = Path(output_file) # Load arXiv metadata if available meta_lookup = {} if Path(metadata_file).exists(): with open(metadata_file) as f: for line in f: rec = json.loads(line) meta_lookup[rec.get("arxiv_id", "")] = rec pdfs = sorted(papers_path.glob("*.pdf")) print(f"Parsing {len(pdfs)} PDFs...") parsed = [] with output_path.open("w") as out_f: for i, pdf in enumerate(pdfs): arxiv_id = pdf.stem pre = meta_lookup.get(arxiv_id) print(f" [{i+1}/{len(pdfs)}] {pdf.name[:50]}") doc = extract_metadata(str(pdf), prefetched_meta=pre) out_f.write(json.dumps(doc, ensure_ascii=False) + "\n") out_f.flush() parsed.append(doc) print(f"Parsed corpus saved to {output_path}") return parsed if __name__ == "__main__": parse_corpus()