Spaces:

OGB2000
/

genAI-Project

Sleeping

App Files Files Community

genAI-Project / src /ingestion /parser.py

OGB2000

Initial clean deployment

bf77be6 24 days ago

Raw

History Blame Contribute Delete

8.04 kB

	"""Scientific paper parser: extract full text + structured metadata from PDFs.

	Extracts:
	- title, authors, year, abstract
	- section titles + bodies
	- references
	"""

	import re
	import json
	from pathlib import Path
	from typing import Dict, List, Tuple, Optional


	# ---------------------------------------------------------------------------
	# Section patterns (covers most LaTeX-compiled arXiv PDFs)
	# ---------------------------------------------------------------------------

	# Header detectors, tried in this order by extract_sections(). Every pattern
	# is anchored to a whole line (re.MULTILINE) so running prose never matches.
	SECTION_PATTERNS = [
	    # Numbered: "1 Introduction", "2. Related Work", "1.1 Background".
	    # Group 1 is the (possibly dotted) number, group 2 the heading text; the
	    # optional "\.?" accepts the trailing dot of "2. Related Work".
	    re.compile(
	        r"^\s*(\d+(?:\.\d+)*)\.?\s{1,4}([A-Z][A-Za-z &,\-:]{2,60})\s*$",
	        re.MULTILINE,
	    ),
	    # Unnumbered all-caps: "INTRODUCTION", "RELATED WORK".
	    re.compile(r"^\s*([A-Z][A-Z ]{3,40})\s*$", re.MULTILINE),
	    # Named: "Abstract", "Introduction", "Conclusion", "References"
	    # (matched case-insensitively, alone on a line).
	    re.compile(
	        r"^\s*(Abstract|Introduction|Related Work|Background|Methodology|Methods|"
	        r"Experiments?|Results?|Discussion|Conclusion|Limitations?|"
	        r"Future Work|Acknowledgements?|References?)\s*$",
	        re.MULTILINE | re.IGNORECASE,
	    ),
	]

	# Captures (group 1) the text after an "Abstract" heading line, stopping at
	# the first blank line or at a line starting with "1.", "Introduction" or
	# "Keywords".
	ABSTRACT_RE = re.compile(
	    r"(?:Abstract|ABSTRACT)[.\s—–-]*\n(.*?)(?=\n\s*\n|\n\s*(?:1\.|Introduction|Keywords))",
	    re.DOTALL | re.IGNORECASE,
	)

	# Captures (group 1) everything from the line after a "References" /
	# "Bibliography" heading up to the end of the text.
	REFERENCE_RE = re.compile(
	    r"\n\s*(?:References?|Bibliography)\s*\n(.*?)$",
	    re.DOTALL | re.IGNORECASE,
	)

	# Matches a run of capitalised multi-word names at the start of a line;
	# group 1 is the first name in the run.
	AUTHORS_RE = re.compile(
	    r"(?:^|\n)((?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))(?:,?\s+(?:[A-Z][a-z]+ ){1,3}(?:[A-Z][a-z]+))*",
	)


	def extract_text_from_pdf(pdf_path: str) -> str:
	    """Return the text of every page of a PDF as one string.

	    Pages are joined with a ``[PAGE_BREAK]`` marker surrounded by blank
	    lines so page boundaries stay visible. A page for which the reader
	    returns no text contributes an empty string.
	    """
	    # Imported lazily, exactly as before: pypdf is only needed when a PDF is read.
	    from pypdf import PdfReader

	    page_texts = [(page.extract_text() or "") for page in PdfReader(pdf_path).pages]
	    return "\n\n[PAGE_BREAK]\n\n".join(page_texts)


	def extract_abstract(text: str) -> str:
	    """Extract the abstract from the paper text.

	    Args:
	        text: full paper text.

	    Returns:
	        The abstract captured by ``ABSTRACT_RE`` with hyphenated line breaks
	        re-joined and whitespace collapsed, truncated to 2000 characters.
	        If the pattern does not match, the stripped slice of ``text`` that
	        follows the first "abstract" keyword (any casing), ending 800
	        characters after the keyword's start. Empty string when the keyword
	        is absent.
	    """
	    m = ABSTRACT_RE.search(text)
	    if m:
	        abstract = m.group(1).strip()
	        # Re-join words split across lines ("exam-\nple" -> "example"), then
	        # collapse every remaining whitespace run to a single space.
	        abstract = re.sub(r"-\n", "", abstract)
	        abstract = re.sub(r"\s+", " ", abstract)
	        return abstract[:2000]

	    # Fallback: raw text right after the keyword. The keyword is located in
	    # the original string rather than in text.lower(), because lowercasing
	    # can change the string length (e.g. "İ") and shift the slice offsets.
	    keyword = re.search(r"abstract", text, re.IGNORECASE | re.ASCII)
	    if keyword:
	        return text[keyword.end():keyword.start() + 800].strip()
	    return ""


	def extract_sections(text: str) -> List[Dict]:
	    """
	    Cut the paper text into sections at detected header lines.

	    Each returned item is {"title": str, "body": str, "type": str}, where
	    "title" is the stripped header match, "body" is the stripped text up to
	    the next header (or the end of the text) and "type" is the label that
	    _classify_section assigns to the title (abstract, introduction,
	    methodology, results, conclusion, references, related_work or other).
	    When no header is found, the whole text comes back as one "full_text"
	    section of type "other".
	    """
	    # Gather (start, end, header) for every hit of every pattern.
	    hits = [
	        (match.start(), match.end(), match.group(0).strip())
	        for pattern in SECTION_PATTERNS
	        for match in pattern.finditer(text)
	    ]
	    hits.sort(key=lambda hit: hit[0])

	    # Walk the hits left to right and drop any that begins inside the
	    # previously accepted header.
	    headers = []
	    cursor = -1
	    for hit in hits:
	        if hit[0] < cursor:
	            continue
	        headers.append(hit)
	        cursor = hit[1]

	    if not headers:
	        return [{"title": "full_text", "body": text, "type": "other"}]

	    # A section body runs from the end of its header to the start of the
	    # following header; the last body runs to the end of the text.
	    next_starts = [header[0] for header in headers[1:]] + [len(text)]
	    return [
	        {
	            "title": title,
	            "body": text[end:next_start].strip(),
	            "type": _classify_section(title),
	        }
	        for (_, end, title), next_start in zip(headers, next_starts)
	    ]


	def _classify_section(title: str) -> str:
	t = title.lower()
	if any(k in t for k in ["abstract"]):
	return "abstract"
	if any(k in t for k in ["introduction", "background", "overview", "motivation"]):
	return "introduction"
	if any(k in t for k in ["method", "approach", "model", "architecture", "framework", "system"]):
	return "methodology"
	if any(k in t for k in ["experiment", "result", "evaluat", "benchmark", "performance", "ablat"]):
	return "results"
	if any(k in t for k in ["conclusion", "future", "discussion", "limitation", "summary"]):
	return "conclusion"
	if any(k in t for k in ["reference", "bibliograph"]):
	return "references"
	if any(k in t for k in ["related work", "prior work", "literature"]):
	return "related_work"
	return "other"


	def extract_references(text: str) -> List[str]:
	    """Extract the references section as a list of reference strings.

	    Args:
	        text: full paper text.

	    Returns:
	        Up to 100 stripped entries, split at "[n]" or "n." markers that start
	        a line. Fragments of 20 characters or fewer are dropped. Empty list
	        when ``REFERENCE_RE`` finds no references heading.
	    """
	    m = REFERENCE_RE.search(text)
	    if not m:
	        return []
	    # The captured block begins directly at the first entry, with no leading
	    # newline. Prepending one lets the first "[1]" / "1." marker be split off
	    # like every later marker instead of staying glued to the first entry.
	    ref_block = "\n" + m.group(1)
	    # Split on numbered references [1], [2] or numbered lines "1.", "2."
	    refs = re.split(r"\n\s*\[\d+\]|\n\s*\d+\.", ref_block)
	    return [r.strip() for r in refs if len(r.strip()) > 20][:100]


	def extract_metadata(
	    pdf_path: str,
	    prefetched_meta: Optional[Dict] = None,
	) -> Dict:
	    """
	    Run the complete extraction pipeline on a single PDF.

	    Args:
	        pdf_path: path to the PDF file.
	        prefetched_meta: optional dict (e.g. from the arXiv API) whose
	            "title", "authors", "year" and "abstract" entries are used in
	            place of values derived from the file.

	    Returns a dict with:
	        pdf_path, arxiv_id (the file stem), title, authors, year, abstract,
	        sections (list of {title, body, type}),
	        references (list of strings),
	        full_text, num_sections, num_references, text_length

	    A PDF that cannot be read is reported with a printed warning and
	    processed as empty text rather than raising.
	    """
	    path = Path(pdf_path)
	    try:
	        full_text = extract_text_from_pdf(str(path))
	    except Exception as e:
	        print(f" [warn] could not parse {path.name}: {e}")
	        full_text = ""

	    # A missing or empty prefetched dict behaves like a dict without any of
	    # the keys: every field falls back to its file-derived default.
	    meta = prefetched_meta or {}
	    title = meta.get("title", path.stem)
	    authors = meta.get("authors", [])
	    year = meta.get("year", "")
	    abstract = meta.get("abstract", "") or extract_abstract(full_text)

	    sections = extract_sections(full_text)
	    references = extract_references(full_text)

	    record = {
	        "pdf_path": str(path),
	        "arxiv_id": path.stem,
	        "title": title,
	        "authors": authors,
	        "year": year,
	        "abstract": abstract,
	        "sections": sections,
	        "references": references,
	        "full_text": full_text,
	    }
	    # Summary counters are appended after the content fields.
	    record["num_sections"] = len(sections)
	    record["num_references"] = len(references)
	    record["text_length"] = len(full_text)
	    return record


	def parse_pdf(pdf_path: str, prefetched_meta: Optional[Dict] = None) -> Dict:
	    """Pipeline-facing alias: forwards both arguments to extract_metadata."""
	    result = extract_metadata(pdf_path, prefetched_meta)
	    return result


	# ---------------------------------------------------------------------------
	# Batch parsing with metadata JSONL
	# ---------------------------------------------------------------------------

	def parse_corpus(
	    papers_dir: str = "data/papers",
	    metadata_file: str = "data/metadata.jsonl",
	    output_file: str = "data/parsed_corpus.jsonl",
	) -> List[Dict]:
	    """Parse all PDFs in papers_dir and write one JSON line per paper.

	    Args:
	        papers_dir: directory scanned (non-recursively) for ``*.pdf`` files.
	        metadata_file: optional JSONL file of prefetched records. Each record
	            is looked up by its "arxiv_id" field, which is matched against
	            the PDF file stem.
	        output_file: JSONL file that receives the parsed documents; it is
	            overwritten.

	    Returns:
	        List of parsed metadata dicts, in sorted file-name order.
	    """
	    papers_path = Path(papers_dir)
	    output_path = Path(output_file)

	    # Load arXiv metadata if available. UTF-8 is requested explicitly so the
	    # result does not depend on the platform's default encoding.
	    meta_lookup = {}
	    if Path(metadata_file).exists():
	        with open(metadata_file, encoding="utf-8") as f:
	            for line in f:
	                if not line.strip():
	                    # Blank lines (e.g. a trailing newline) are not records.
	                    continue
	                rec = json.loads(line)
	                meta_lookup[rec.get("arxiv_id", "")] = rec

	    pdfs = sorted(papers_path.glob("*.pdf"))
	    total = len(pdfs)
	    print(f"Parsing {total} PDFs...")

	    parsed = []
	    # ensure_ascii=False emits raw non-ASCII characters, so the output file
	    # must be opened as UTF-8 or the write can fail on some locales.
	    with output_path.open("w", encoding="utf-8") as out_f:
	        for i, pdf in enumerate(pdfs, start=1):
	            pre = meta_lookup.get(pdf.stem)
	            print(f" [{i}/{total}] {pdf.name[:50]}")
	            doc = extract_metadata(str(pdf), prefetched_meta=pre)
	            out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")
	            # Flush after every document so finished work reaches the file
	            # even if a later PDF aborts the run.
	            out_f.flush()
	            parsed.append(doc)

	    print(f"Parsed corpus saved to {output_path}")
	    return parsed


	# Script entry point: parse the corpus with the default paths.
	if __name__ == "__main__":
	    parse_corpus()