genAI-Project / src /ingestion /parser.py
OGB2000's picture
Initial clean deployment
bf77be6
Raw
History Blame Contribute Delete
8.04 kB
"""Scientific paper parser: extract full text + structured metadata from PDFs.
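Extracts:
- title, authors, year (taken from prefetched arXiv metadata when supplied;
  otherwise the title falls back to the PDF file stem and authors/year stay empty)
- abstract (prefetched value if present, else parsed from the PDF text)
- section titles + bodies
- references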
"""
import re
import json
from pathlib import Path
from typing import Dict, List, Tuple, Optional
# ---------------------------------------------------------------------------
# Section patterns (covers most LaTeX-compiled arXiv PDFs)
# ---------------------------------------------------------------------------
# Header-detection regexes consumed by extract_sections. Every pattern uses
# re.MULTILINE so that ^ and $ anchor to individual lines of the extracted text.
SECTION_PATTERNS = [
    # Numbered: "1 Introduction", "2. Related Work", "1.1 Background".
    # The optional "\.?" after the number is what lets the "2. Related Work"
    # form match; group(1) is the number without that trailing dot and
    # group(2) is the heading text.
    re.compile(r"^\s*(\d+(?:\.\d+)*)\.?\s{1,4}([A-Z][A-Za-z &,\-:]{2,60})\s*$", re.MULTILINE),
    # Unnumbered all-caps: "INTRODUCTION", "RELATED WORK"
    re.compile(r"^\s*([A-Z][A-Z ]{3,40})\s*$", re.MULTILINE),
    # Named: "Abstract", "Introduction", "Conclusion", "References"
    # (case-insensitive, must be the only text on the line)
    re.compile(
        r"^\s*(Abstract|Introduction|Related Work|Background|Methodology|Methods|"
        r"Experiments?|Results?|Discussion|Conclusion|Limitations?|"
        r"Future Work|Acknowledgements?|References?)\s*$",
        re.MULTILINE | re.IGNORECASE,
    ),
]

# Captures the text after an "Abstract" heading line, stopping at the first
# blank line or at a line starting with "1.", "Introduction" or "Keywords".
ABSTRACT_RE = re.compile(
    r"(?:Abstract|ABSTRACT)[.\s—–-]*\n(.*?)(?=\n\s*\n|\n\s*(?:1\.|Introduction|Keywords))",
    re.DOTALL | re.IGNORECASE,
)

# Captures everything from the line after a "References"/"Bibliography"
# heading to the end of the text ($ is not in MULTILINE mode here).
REFERENCE_RE = re.compile(
    r"\n\s*(?:References?|Bibliography)\s*\n(.*?)$",
    re.DOTALL | re.IGNORECASE,
)

# Matches a line starting with 2-4 capitalised words, optionally followed by
# more such names. NOTE(review): not referenced by any function visible in
# this file; confirm whether it is still needed.
AUTHORS_RE = re.compile(
    r"(?:^|\n)((?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))(?:,?\s+(?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))*",
)
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page in the PDF joined by a page-break marker.

    Args:
        pdf_path: location of the PDF to read.

    Returns:
        The per-page texts concatenated with "\\n\\n[PAGE_BREAK]\\n\\n" between
        consecutive pages. A page whose extraction yields nothing contributes
        an empty string.
    """
    # Function-local import: pypdf is only needed when a PDF is actually read.
    from pypdf import PdfReader

    page_separator = "\n\n[PAGE_BREAK]\n\n"
    document = PdfReader(pdf_path)
    return page_separator.join(
        [(page.extract_text() or "") for page in document.pages]
    )
def extract_abstract(text: str) -> str:
    """Extract the abstract from raw paper text.

    Strategy:
      1. If ABSTRACT_RE matches, return its captured text with hyphenated
         line breaks joined, whitespace runs collapsed to single spaces, and
         the result capped at 2000 characters.
      2. Otherwise fall back to the raw text that follows the first
         case-insensitive occurrence of the word "abstract", ending 800
         characters after the start of that word (so at most 792 characters).
      3. Return "" when the word does not occur at all.

    Args:
        text: full text of the paper.

    Returns:
        The abstract string, or "" if none could be located.
    """
    m = ABSTRACT_RE.search(text)
    if m:
        abstract = m.group(1).strip()
        # Clean up hyphenated line breaks and extra spaces
        abstract = re.sub(r"-\n", "", abstract)
        abstract = re.sub(r"\s+", " ", abstract)
        return abstract[:2000]

    # Fallback: take the text right after the "abstract" keyword.
    # The keyword is located on the original string (case-insensitively)
    # instead of on text.lower(): lower() can change the string's length for
    # some Unicode characters, which would shift the index and misalign the
    # slice taken from the original text.
    keyword = re.search(r"abstract", text, re.IGNORECASE)
    if keyword:
        return text[keyword.end():keyword.start() + 800].strip()
    return ""
def extract_sections(text: str) -> List[Dict]:
    """
    Break the paper text into sections at detected header lines.

    Every regex in SECTION_PATTERNS is run over the text. Candidate headers
    are ordered by start offset and any candidate that begins before the
    previously accepted header has ended is discarded. A section's body is
    the stripped text between the end of its header and the start of the next
    accepted header (or the end of the text); text preceding the first header
    is not part of any section.

    Returns a list of {"title": str, "body": str, "type": str} where "type"
    is the value _classify_section gives for the title (abstract,
    introduction, methodology, results, conclusion, references, related_work
    or other). When no header is found, the whole text comes back as a single
    section titled "full_text" with type "other".
    """
    # Gather (start, end, stripped header text) for every pattern hit.
    candidates = sorted(
        (
            (hit.start(), hit.end(), hit.group(0).strip())
            for pattern in SECTION_PATTERNS
            for hit in pattern.finditer(text)
        ),
        key=lambda span: span[0],
    )

    # Drop candidates that overlap an already accepted header.
    headers = []
    cursor = -1
    for begin, finish, heading in candidates:
        if begin < cursor:
            continue
        headers.append((begin, finish, heading))
        cursor = finish

    if not headers:
        # No sections found: return whole text as one section
        return [{"title": "full_text", "body": text, "type": "other"}]

    # Each body stops where the following header starts; the last one runs
    # to the end of the text.
    stops = [begin for begin, _, _ in headers[1:]] + [len(text)]
    return [
        {
            "title": heading,
            "body": text[finish:stop].strip(),
            "type": _classify_section(heading),
        }
        for (_, finish, heading), stop in zip(headers, stops)
    ]
def _classify_section(title: str) -> str:
    """Map a section title to a coarse section type by keyword lookup.

    The title is lowercased and checked against the rules below from top to
    bottom; the first rule with any keyword occurring as a substring of the
    title decides the result. Titles matching no rule are typed "other".
    """
    # Order matters: earlier rules take precedence over later ones.
    rules = (
        ("abstract", ("abstract",)),
        ("introduction", ("introduction", "background", "overview", "motivation")),
        ("methodology", ("method", "approach", "model", "architecture", "framework", "system")),
        ("results", ("experiment", "result", "evaluat", "benchmark", "performance", "ablat")),
        ("conclusion", ("conclusion", "future", "discussion", "limitation", "summary")),
        ("references", ("reference", "bibliograph")),
        ("related_work", ("related work", "prior work", "literature")),
    )
    lowered = title.lower()
    for label, keywords in rules:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return "other"
def extract_references(text: str) -> List[str]:
    """Extract the references section as a list of reference strings.

    The block captured by REFERENCE_RE is split at entry markers: a "[n]" or
    "n." that appears at the start of the block or at the start of a line
    (leading whitespace allowed). Markers are removed from the entries.

    Args:
        text: full text of the paper.

    Returns:
        Up to 100 stripped entries, keeping only those longer than 20
        characters; [] when REFERENCE_RE finds no references heading.
    """
    m = REFERENCE_RE.search(text)
    if not m:
        return []
    ref_block = m.group(1)
    # Split on numbered references [1], [2] or numbered lines.
    # "(?:^|\n)" matters: the captured block starts right after the heading's
    # newline, so the first marker has no preceding "\n". Without the "^"
    # alternative the first entry would keep its "[1]" / "1." prefix while
    # all later entries have theirs removed.
    pieces = re.split(r"(?:^|\n)\s*(?:\[\d+\]|\d+\.)", ref_block)
    cleaned = (piece.strip() for piece in pieces)
    return [entry for entry in cleaned if len(entry) > 20][:100]
def extract_metadata(
    pdf_path: str,
    prefetched_meta: Optional[Dict] = None,
) -> Dict:
    """
    Run the whole extraction pipeline on a single PDF.

    Args:
        pdf_path: path to PDF file
        prefetched_meta: optional dict that may carry "title", "authors",
            "year" and "abstract" (e.g. from the arXiv API). Missing keys
            fall back to the file stem for the title, [] for authors and ""
            for year; a missing or empty abstract is extracted from the text.

    Returns a dict with:
        pdf_path, arxiv_id (the file stem), title, authors, year, abstract,
        sections (list of {title, body, type}),
        references (list of strings),
        full_text, num_sections, num_references, text_length

    If the PDF text cannot be extracted, a warning is printed and the
    pipeline continues with an empty full_text.
    """
    path = Path(pdf_path)

    # Best effort: an unreadable PDF still yields a record, with empty text.
    try:
        full_text = extract_text_from_pdf(str(path))
    except Exception as e:
        print(f" [warn] could not parse {path.name}: {e}")
        full_text = ""

    # A missing/empty prefetched dict behaves like one without any keys.
    meta = prefetched_meta or {}
    title = meta.get("title", path.stem)
    authors = meta.get("authors", [])
    year = meta.get("year", "")
    abstract = meta.get("abstract", "") or extract_abstract(full_text)

    sections = extract_sections(full_text)
    references = extract_references(full_text)

    record = {
        "pdf_path": str(path),
        "arxiv_id": path.stem,
        "title": title,
        "authors": authors,
        "year": year,
        "abstract": abstract,
        "sections": sections,
        "references": references,
        "full_text": full_text,
    }
    # Derived counters go last, after the content fields.
    record["num_sections"] = len(sections)
    record["num_references"] = len(references)
    record["text_length"] = len(full_text)
    return record
def parse_pdf(pdf_path: str, prefetched_meta: Optional[Dict] = None) -> Dict:
    """Pipeline-facing alias: forwards both arguments to extract_metadata."""
    result = extract_metadata(pdf_path, prefetched_meta=prefetched_meta)
    return result
# ---------------------------------------------------------------------------
# Batch parsing with metadata JSONL
# ---------------------------------------------------------------------------
def parse_corpus(
    papers_dir: str = "data/papers",
    metadata_file: str = "data/metadata.jsonl",
    output_file: str = "data/parsed_corpus.jsonl",
) -> List[Dict]:
    """Parse all PDFs in papers_dir and write one JSON record per line.

    Args:
        papers_dir: directory searched (non-recursively) for "*.pdf" files.
        metadata_file: optional JSONL file of prefetched records; each record
            is looked up by its "arxiv_id", matched against the PDF file stem.
            Blank lines are ignored. Skipped entirely if the file is absent.
        output_file: JSONL file to (over)write; its parent directory is
            created when missing.

    Returns:
        The list of parsed metadata dicts, in sorted file-name order.

    Raises:
        json.JSONDecodeError: if a non-blank metadata line is not valid JSON.
    """
    papers_path = Path(papers_dir)
    output_path = Path(output_file)
    metadata_path = Path(metadata_file)

    # Load arXiv metadata if available
    meta_lookup = {}
    if metadata_path.exists():
        # Explicit UTF-8: do not depend on the platform's default encoding.
        with metadata_path.open(encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank/trailing lines in the JSONL file.
                    continue
                rec = json.loads(line)
                meta_lookup[rec.get("arxiv_id", "")] = rec

    pdfs = sorted(papers_path.glob("*.pdf"))
    total = len(pdfs)
    print(f"Parsing {total} PDFs...")

    # Make sure the destination directory exists before opening the file.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    parsed = []
    # Records are dumped with ensure_ascii=False, so the file must be opened
    # as UTF-8; a locale-dependent default could fail on non-ASCII text.
    with output_path.open("w", encoding="utf-8") as out_f:
        for i, pdf in enumerate(pdfs, start=1):
            pre = meta_lookup.get(pdf.stem)
            print(f" [{i}/{total}] {pdf.name[:50]}")
            doc = extract_metadata(str(pdf), prefetched_meta=pre)
            out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")
            # Flush per record so finished work survives a later failure.
            out_f.flush()
            parsed.append(doc)
    print(f"Parsed corpus saved to {output_path}")
    return parsed
# Script entry point: when this file is executed directly, parse the corpus
# using parse_corpus's default paths.
if __name__ == "__main__":
    parse_corpus()