Spaces:
Sleeping
Sleeping
"""Scientific paper parser: extract full text + structured metadata from PDFs.

Extracts:
- title, authors, year, abstract
- section titles + bodies
- references
"""
| import re | |
| import json | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple, Optional | |
# ---------------------------------------------------------------------------
# Section patterns (covers most LaTeX-compiled arXiv PDFs)
# ---------------------------------------------------------------------------
# Each pattern matches one whole heading line; all are compiled with
# re.MULTILINE so that ^ and $ anchor at line boundaries.
SECTION_PATTERNS = [
    # Numbered: "1 Introduction", "2. Related Work", "1.1 Background".
    # The optional "\.?" after the number is what lets the dotted form
    # ("2. Related Work") match; group 1 is still the bare section number
    # and group 2 the heading text.
    re.compile(
        r"^\s*(\d+(?:\.\d+)*)\.?\s{1,4}([A-Z][A-Za-z &,\-:]{2,60})\s*$",
        re.MULTILINE,
    ),
    # Unnumbered all-caps: "INTRODUCTION", "RELATED WORK"
    re.compile(r"^\s*([A-Z][A-Z ]{3,40})\s*$", re.MULTILINE),
    # Named: "Abstract", "Introduction", "Conclusion", "References"
    # (case-insensitive, must be the only text on the line)
    re.compile(
        r"^\s*(Abstract|Introduction|Related Work|Background|Methodology|Methods|"
        r"Experiments?|Results?|Discussion|Conclusion|Limitations?|"
        r"Future Work|Acknowledgements?|References?)\s*$",
        re.MULTILINE | re.IGNORECASE,
    ),
]

# Captures the text after an "Abstract" heading line (group 1). The capture
# stops at the first blank line, at a line starting with "1.", at an
# "Introduction" heading with or without a leading section number
# ("Introduction", "1 Introduction", "1. Introduction"), or at "Keywords".
ABSTRACT_RE = re.compile(
    r"(?:Abstract|ABSTRACT)[.\s—–-]*\n(.*?)"
    r"(?=\n\s*\n|\n\s*(?:1\.|(?:\d+\.?\s+)?Introduction|Keywords))",
    re.DOTALL | re.IGNORECASE,
)

# Captures everything from the line after a "References"/"Bibliography"
# heading to the end of the text (group 1).
REFERENCE_RE = re.compile(
    r"\n\s*(?:References?|Bibliography)\s*\n(.*?)$",
    re.DOTALL | re.IGNORECASE,
)

# Matches a run of capitalised names ("First Last", "First Middle Last", ...)
# starting at the beginning of a line, optionally followed by further names.
# NOTE(review): no function in this part of the file references this pattern.
AUTHORS_RE = re.compile(
    r"(?:^|\n)((?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))(?:,?\s+(?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))*",
)
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of a PDF, joined by a page-break marker.

    Args:
        pdf_path: path of the PDF file handed to ``pypdf.PdfReader``.

    Returns:
        The per-page texts joined with ``"\\n\\n[PAGE_BREAK]\\n\\n"``. A page
        for which ``extract_text()`` yields nothing contributes an empty
        string, so the marker count still reflects the page count.
    """
    # Imported lazily so the module can be loaded without pypdf installed.
    from pypdf import PdfReader

    page_texts = [(page.extract_text() or "") for page in PdfReader(pdf_path).pages]
    return "\n\n[PAGE_BREAK]\n\n".join(page_texts)
def extract_abstract(text: str) -> str:
    """Extract the abstract from raw paper text.

    Args:
        text: full text of the paper.

    Returns:
        The abstract matched by ``ABSTRACT_RE`` with hyphenated line breaks
        re-joined and whitespace collapsed, capped at 2000 characters. If
        the pattern does not match (or captures nothing), the raw text that
        follows the first occurrence of the word "abstract" is returned
        instead (at most 792 characters, stripped). Returns "" when the
        word does not occur at all.
    """
    m = ABSTRACT_RE.search(text)
    if m:
        # Clean up hyphenated line breaks and extra spaces.
        abstract = re.sub(r"-\n", "", m.group(1).strip())
        abstract = re.sub(r"\s+", " ", abstract)
        # An empty capture is not a usable abstract; fall through to the
        # keyword-based fallback instead of returning "".
        if abstract:
            return abstract[:2000]

    # Fallback: take the text right after the "abstract" keyword, up to 800
    # characters from the start of the keyword. The keyword is located on
    # the original string (not on text.lower()): lowercasing can change the
    # string length for some characters (e.g. "İ"), which would make an
    # index found in the lowered copy point at the wrong place in `text`.
    # re.ASCII keeps the case-folding limited to A-Z/a-z.
    keyword = re.search(r"abstract", text, re.IGNORECASE | re.ASCII)
    if keyword:
        return text[keyword.end():keyword.start() + 800].strip()
    return ""
def extract_sections(text: str) -> List[Dict]:
    """
    Split paper text into sections at every detected heading.

    Returns a list of:
        {"title": str, "body": str, "type": str}
    where "title" is the stripped heading match, "body" is the stripped
    text between that heading and the next one (or the end of the text),
    and "type" is the label given by _classify_section: abstract,
    introduction, methodology, results, conclusion, references,
    related_work or other.

    Text before the first heading is not part of any section. If no
    heading is found, the whole text is returned as a single section
    titled "full_text" of type "other".
    """
    # Collect (start, end, title) for every heading match of every pattern,
    # ordered by start offset (stable, so pattern order breaks ties).
    headers = sorted(
        (
            (hit.start(), hit.end(), hit.group(0).strip())
            for pattern in SECTION_PATTERNS
            for hit in pattern.finditer(text)
        ),
        key=lambda header: header[0],
    )

    # Drop any match that begins inside the previously kept match.
    kept = []
    cursor = -1
    for header in headers:
        if header[0] < cursor:
            continue
        kept.append(header)
        cursor = header[1]

    if not kept:
        # No sections found: return whole text as one section
        return [{"title": "full_text", "body": text, "type": "other"}]

    # Each body runs up to the start of the following heading; the last one
    # runs to the end of the text.
    body_ends = [header[0] for header in kept[1:]] + [len(text)]
    return [
        {
            "title": title,
            "body": text[end:stop].strip(),
            "type": _classify_section(title),
        }
        for (_, end, title), stop in zip(kept, body_ends)
    ]
| def _classify_section(title: str) -> str: | |
| t = title.lower() | |
| if any(k in t for k in ["abstract"]): | |
| return "abstract" | |
| if any(k in t for k in ["introduction", "background", "overview", "motivation"]): | |
| return "introduction" | |
| if any(k in t for k in ["method", "approach", "model", "architecture", "framework", "system"]): | |
| return "methodology" | |
| if any(k in t for k in ["experiment", "result", "evaluat", "benchmark", "performance", "ablat"]): | |
| return "results" | |
| if any(k in t for k in ["conclusion", "future", "discussion", "limitation", "summary"]): | |
| return "conclusion" | |
| if any(k in t for k in ["reference", "bibliograph"]): | |
| return "references" | |
| if any(k in t for k in ["related work", "prior work", "literature"]): | |
| return "related_work" | |
| return "other" | |
def extract_references(text: str) -> List[str]:
    """Extract the references section as a list of reference strings.

    Args:
        text: full text of the paper.

    Returns:
        Up to 100 stripped reference entries, obtained by splitting the
        block captured by ``REFERENCE_RE`` on "[n]" or "n." markers at the
        start of a line. Fragments of 20 characters or fewer are dropped.
        Returns [] when no references heading is found.
    """
    m = REFERENCE_RE.search(text)
    if not m:
        return []
    # The captured block begins directly at the first entry, i.e. without
    # the newline the split pattern expects before a marker. Prepending one
    # makes the first "[1]" / "1." marker get removed like all the others.
    ref_block = "\n" + m.group(1)
    # Split on "[n]" markers or on 1-3 digit "n." markers at line start.
    # Limiting the digits and refusing a following digit keeps wrapped
    # lines that begin with a year ("2019.") or a decimal ("1.5") from
    # being treated as the start of a new reference.
    pieces = re.split(r"\n\s*\[\d+\]|\n\s*\d{1,3}\.(?!\d)", ref_block)
    stripped = (piece.strip() for piece in pieces)
    return [ref for ref in stripped if len(ref) > 20][:100]
def extract_metadata(
    pdf_path: str,
    prefetched_meta: Optional[Dict] = None,
) -> Dict:
    """
    Run the full extraction pipeline on one PDF.

    Args:
        pdf_path: path to PDF file
        prefetched_meta: optional dict providing "title", "authors", "year"
            and "abstract" (e.g. from the arXiv API); missing keys fall
            back to the defaults described below

    Returns a dict with:
        pdf_path, arxiv_id (the file stem), title (defaults to the file
        stem), authors (defaults to []), year (defaults to ""), abstract
        (prefetched value if non-empty, otherwise extracted from the text),
        sections (list of {title, body, type}),
        references (list of strings),
        full_text, num_sections, num_references, text_length

    A PDF that cannot be read is reported with a warning and processed as
    if its text were empty.
    """
    path = Path(pdf_path)

    # Best effort: an unreadable PDF still yields a record.
    try:
        full_text = extract_text_from_pdf(str(path))
    except Exception as e:
        print(f" [warn] could not parse {path.name}: {e}")
        full_text = ""

    # A missing or empty prefetched dict behaves like one with no keys.
    meta = prefetched_meta or {}
    title = meta.get("title", path.stem)
    authors = meta.get("authors", [])
    year = meta.get("year", "")
    abstract = meta.get("abstract", "") or extract_abstract(full_text)

    sections = extract_sections(full_text)
    references = extract_references(full_text)

    record = {
        "pdf_path": str(path),
        "arxiv_id": path.stem,
        "title": title,
        "authors": authors,
        "year": year,
        "abstract": abstract,
        "sections": sections,
        "references": references,
        "full_text": full_text,
        "num_sections": len(sections),
        "num_references": len(references),
        "text_length": len(full_text),
    }
    return record
def parse_pdf(pdf_path: str, prefetched_meta: Optional[Dict] = None) -> Dict:
    """Pipeline-facing alias: forwards both arguments to extract_metadata."""
    return extract_metadata(pdf_path, prefetched_meta=prefetched_meta)
# ---------------------------------------------------------------------------
# Batch parsing with metadata JSONL
# ---------------------------------------------------------------------------
def parse_corpus(
    papers_dir: str = "data/papers",
    metadata_file: str = "data/metadata.jsonl",
    output_file: str = "data/parsed_corpus.jsonl",
) -> List[Dict]:
    """Parse all PDFs in papers_dir and write one JSON record per line.

    Args:
        papers_dir: directory searched (non-recursively) for ``*.pdf``.
        metadata_file: optional JSONL file of prefetched metadata; each
            record is looked up by its "arxiv_id" against the PDF file
            stem. Ignored if the file does not exist.
        output_file: JSONL file to (over)write with the parsed records.
            Its parent directory is created if needed.

    Returns:
        The list of parsed metadata dicts, in sorted file-name order.

    Raises:
        json.JSONDecodeError: if a non-blank metadata line is not valid JSON.
    """
    papers_path = Path(papers_dir)
    output_path = Path(output_file)
    metadata_path = Path(metadata_file)

    # Load arXiv metadata if available
    meta_lookup = {}
    if metadata_path.exists():
        # Explicit UTF-8: titles/authors are routinely non-ASCII and the
        # platform default encoding is not guaranteed to handle them.
        with metadata_path.open(encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                rec = json.loads(line)
                meta_lookup[rec.get("arxiv_id", "")] = rec

    pdfs = sorted(papers_path.glob("*.pdf"))
    print(f"Parsing {len(pdfs)} PDFs...")

    # Make sure the destination directory exists before opening the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    parsed = []
    # Records are dumped with ensure_ascii=False, so the file must be
    # opened with an encoding that can represent any character.
    with output_path.open("w", encoding="utf-8") as out_f:
        for i, pdf in enumerate(pdfs, start=1):
            pre = meta_lookup.get(pdf.stem)
            print(f" [{i}/{len(pdfs)}] {pdf.name[:50]}")
            doc = extract_metadata(str(pdf), prefetched_meta=pre)
            out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")
            # Flush per record so finished work survives an interrupted run.
            out_f.flush()
            parsed.append(doc)
    print(f"Parsed corpus saved to {output_path}")
    return parsed
# Script entry point: parse the corpus using parse_corpus's default paths.
if __name__ == "__main__":
    parse_corpus()