# kiyer's picture
# fix: heading leakage in flush — clear pending_headings unconditionally; strip trailing stars from list-item headings
# ca1fce6
# Raw
# History Blame Contribute Delete
# 19.2 kB
import re
from collections import Counter
from pathlib import Path
# Matches an arXiv identifier stamp such as "arXiv:2402.08696" anywhere in a
# string (case-insensitive): four digits, a dot, then four or five digits.
_ARXIV_STAMP = re.compile(r"arXiv:\d{4}\.\d{4,5}", re.I)
# ---------------------------------------------------------------------------
# Picture-text block filtering (pymupdf4llm 1.27.x emits these)
# ---------------------------------------------------------------------------
# Any picture-text marker line (start, begin or end variant).
_PICTURE_TEXT_MARKER = re.compile(
    r"-{3,}\s*(start|begin|end)\s+of\s+picture\s+text\s*-{3,}", re.I
)
# Opening marker only ("Start of picture text" / "Begin of picture text").
_PICTURE_TEXT_START = re.compile(
    r"-{3,}\s*(start|begin)\s+of\s+picture\s+text\s*-{3,}", re.I
)
# Closing marker only ("End of picture text").
_PICTURE_TEXT_END = re.compile(
    r"-{3,}\s*end\s+of\s+picture\s+text\s*-{3,}", re.I
)


def strip_picture_text(lines: list[str]) -> list[str]:
    """Return *lines* without picture-text blocks (markers included).

    A block opens on any line containing a start/begin marker and closes on
    the first later line containing an end marker; every line from the
    opening to the closing one is dropped.  The end marker may share a line
    with other content (e.g. after ``<br>``), and a line carrying both a
    start and an end marker is dropped without opening a block.  A block
    that is never closed swallows the rest of the input.  An end marker
    seen outside a block is left untouched.
    """
    kept: list[str] = []
    inside = False
    for raw in lines:
        closes = _PICTURE_TEXT_END.search(raw) is not None
        if inside:
            # Still within a block: discard, and leave it if this line closes it.
            inside = not closes
            continue
        if _PICTURE_TEXT_START.search(raw) is None:
            kept.append(raw)
            continue
        # Opening marker line: discard it; stay inside unless it also closes.
        inside = not closes
    return kept
# ---------------------------------------------------------------------------
# De-markdown: strip emphasis/backtick and repair math notation
# ---------------------------------------------------------------------------
# Translation table mapping ASCII digits to their Unicode superscript forms.
_SUPERSCRIPTS = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")


def demarkdown(text: str) -> str:
    """Remove markdown emphasis and tidy the math debris it leaves behind.

    The substitutions run in this order, each in a single pass:

    1. ``**bold**`` → ``bold``
    2. ``_emphasis_`` → ``emphasis`` (any pair of underscores, so
       ``snake_case_name`` also loses its underscores)
    3. backticks removed
    4. ``5 . 7`` → ``5.7`` (spacing left after stripping ``_._``)
    5. ``5 × 7`` → ``5×7`` (spacing left after stripping ``_×_``)
    6. ``10[5]`` → ``10⁵`` (bracketed exponent directly after a digit)
    7. ``10[-5]`` → ``10⁻⁵`` (negative bracketed exponent)
    8. ``(cid:N)`` font-encoding debris removed
    """
    substitutions = (
        (r"\*\*(.+?)\*\*", r"\1"),
        (r"_(.+?)_", r"\1"),
        (r"`", ""),
        (r"(\d) \. (\d)", r"\1.\2"),
        (r"(\d) × (\d)", r"\1×\2"),
        (r"(?<=\d)\[(\d+)\]", lambda hit: hit.group(1).translate(_SUPERSCRIPTS)),
        (
            r"(?<=\d)\[-(\d+)\]",
            lambda hit: "⁻" + hit.group(1).translate(_SUPERSCRIPTS),
        ),
        (r"\(cid:\d+\)", ""),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
# ---------------------------------------------------------------------------
# Open-sentence detection: words that strongly imply an incomplete sentence
# ---------------------------------------------------------------------------
_OPEN_FUNCTION_WORDS = frozenset({
# Coordinating conjunctions
'and', 'or', 'but', 'nor', 'yet', 'so',
# Articles
'the', 'a', 'an',
# Prepositions and subordinating conjunctions
'in', 'on', 'at', 'of', 'to', 'for', 'with', 'by', 'from', 'as', 'into',
'that', 'which', 'who', 'whose', 'where', 'when',
'between', 'among', 'including', 'such', 'both', 'either',
'than', 'then', 'also', 'only', 'even', 'about', 'over', 'under',
})
def _prev_para_is_open(text: str) -> bool:
"""Return True if the paragraph text ends mid-sentence.
Uses a word-level heuristic: the last alphabetic token is a function word
(conjunction, preposition, article) that cannot end a sentence. Also
rejects blocks that end with a digit (line-number artifact) or a hyphen
(word-break artifact), both common in 2-column paper extractions.
"""
stripped = text.rstrip()
# Digit at end → line number artifact, not a real sentence break
if re.search(r'\d\s*$', stripped):
return False
# Hyphen at end → word-break artifact
if stripped.endswith('-'):
return False
last_words = re.findall(r'[a-zA-Z]+', stripped)
last_word = last_words[-1].lower() if last_words else ''
return last_word in _OPEN_FUNCTION_WORDS
# Pages whose raw content stream exceeds this size (bytes) are "pathological":
# pymupdf4llm's layout analysis takes minutes on them (e.g. JWST vector-drawing
# plates with 500k+ path elements), while plain get_text() stays fast.
_MAX_PAGE_STREAM_BYTES = 8_000_000
def _pathological_pages(doc) -> set[int]:
"""Return indices of pages whose raw content stream is enormous.
Such pages (e.g. JWST images drawn as 563 k vector paths) cause
pymupdf4llm's layout/table analysis to run for 60+ seconds regardless of
``graphics_limit``/``table_strategy``. Plain ``page.get_text()`` on the
same pages takes < 1 s and recovers all text. Detected by content-stream
size (threshold: _MAX_PAGE_STREAM_BYTES).
"""
out: set[int] = set()
for i, page in enumerate(doc):
try:
if len(page.read_contents()) > _MAX_PAGE_STREAM_BYTES:
out.add(i)
except Exception: # noqa: BLE001
pass
return out
# Line starts with "doi:" or a doi.org URL (case-insensitive).
_DOI_STAMP = re.compile(r"^(doi:|https?://doi\.org)", re.I)
# Line consists solely of a number, optionally wrapped in dashes/whitespace
# (e.g. "12", "– 12 –").
_PAGE_NUM = re.compile(r"^[–—\-\s]*\d+[–—\-\s]*$")
# Running journal headers: all-uppercase, ≥5 chars, no lowercase letters.
# Catches things like "RENAUD ET AL.", "STARBURSTS HIDING IN THE MAIN SEQUENCE"
# that survive remove_repeating_lines when they vary slightly across pages.
_RUNNING_HEADER = re.compile(r"^[A-Z][A-Z\s\.\,\-\–\—\(\)\[\]\:\;\'\"]{4,}$")
# Minimum length (characters) for a text block to stand as its own paragraph
# in segment_markdown; shorter blocks are merged or buffered.
_MIN_PARA = 200
# Markdown "#" heading with 1–4 hashes; group 1 is the heading text.
_HEADING = re.compile(r"^#{1,4}\s+(.*)")
# Figure/table caption opener: "Figure 3.", "Fig. 2:", "Table 1 |", ...
_CAPTION = re.compile(r"^(figure|fig\.|table)\s*\d+\s*[.:|]", re.I)
# Block starting with "==>" or "**==>" (skipped as a picture placeholder by
# segment_markdown).
_PICTURE = re.compile(r"^==>|^\*\*==>")
# Block starting with a pipe, i.e. a markdown table row.
_TABLE_ROW = re.compile(r"^\|")
# Heading names that open the references / bibliography / acknowledgement part.
_REFERENCES = re.compile(r"^(references|bibliography|acknowledg)", re.I)
# Quoted line that begins with a number, e.g. "> 1 Department of ..."
# (filtered out as an affiliation line by segment_markdown).
_AFFILIATION = re.compile(r"^>\s*\d+\s")
# A single-line bullet item ("-" or "*") whose content is a dotted section
# number plus title (e.g. "- **2.1 Data**"), optionally wrapped in emphasis
# markers; group 1 is the number and title.
_LIST_HEADING = re.compile(
    r"^[-*]\s+(?:[_*]{0,2})(\d+(?:\.\d+)+\.?\s+\S[^\n]{0,100})(?:[_*]{0,2})$"
)
# Heading text that is not a real section title: "<number> ... et al."
# or a line opening with Received/Accepted/Submitted/Published.
_HEADING_JUNK = re.compile(
    r"^\d{1,4}\s+\S.*\bet\s+al\.?\s*$"
    r"|^(Received|Accepted|Submitted|Published)\b",
    re.I,
)
# Author/affiliation block detector thresholds
_AUTHOR_BLOCK_MAX_PROSE = 0.25 # prose fraction below this → not real prose
_AUTHOR_BLOCK_MIN_COMMAS = 3 # must have at least this many commas
_AUTHOR_BLOCK_FRONT_LIMIT = 3 # only apply detector to first N accepted paragraphs
def _norm(line: str) -> str:
return re.sub(r"\d+", "#", line.strip().lower())
def _prose_fraction(text: str) -> float:
"""Return fraction of alphabetic tokens that are lowercase-only, ≥3 chars.
Real prose runs ~0.6–0.8; author name lists ~0.0–0.1; affiliation lists
~0.1–0.2. Strip markdown emphasis, bracket groups, and digits first.
"""
cleaned = re.sub(r"\[[^\]]*\]", "", text) # drop [...]
cleaned = re.sub(r"[_*]", "", cleaned) # drop emphasis chars
cleaned = re.sub(r"\d+", "", cleaned) # drop digits
tokens = re.findall(r"[a-zA-Z]+", cleaned)
if not tokens:
return 0.0
prose_tokens = [t for t in tokens if t == t.lower() and len(t) >= 3]
return len(prose_tokens) / len(tokens)
def _is_author_affiliation_block(text: str) -> bool:
    """Tell whether *text* looks like an author/affiliation listing.

    Both conditions must hold:

    - the text has at least ``_AUTHOR_BLOCK_MIN_COMMAS`` commas, and
    - its prose fraction is below ``_AUTHOR_BLOCK_MAX_PROSE``.

    The caller applies this only while scanning the front matter.
    """
    if text.count(",") < _AUTHOR_BLOCK_MIN_COMMAS:
        return False
    return _prose_fraction(text) < _AUTHOR_BLOCK_MAX_PROSE
def _clean_authors(raw: str) -> str:
"""Return a cleaned, truncated author string.
Strips bracket groups, parenthetical groups, and markdown emphasis;
splits on commas; keeps tokens with ≥2 letters; joins first 3 with ', '
and appends ' et al.' when more than 3 remain.
"""
cleaned = re.sub(r"\[[^\]]*\]", "", raw) # drop [...]
cleaned = re.sub(r"\([^)]*\)", "", cleaned) # drop (...)
cleaned = re.sub(r"[_*]", "", cleaned) # drop emphasis chars
cleaned = re.sub(r"\s+", " ", cleaned).strip()
parts = [p.strip() for p in cleaned.split(",")]
# Keep only tokens that contain at least 2 letters
authors = [p for p in parts if len(re.findall(r"[a-zA-Z]", p)) >= 2]
if len(authors) > 3:
return ", ".join(authors[:3]) + " et al."
return ", ".join(authors)
def remove_repeating_lines(pages: list[list[str]]) -> list[list[str]]:
    """Remove lines that recur across pages (running headers/footers).

    Each non-blank line is digit-normalized with ``_norm`` and counted once
    per page on which it occurs.  A normalized form seen on at least
    ``max(2, len(pages) // 2)`` pages is treated as repeating, and every
    line with that form is dropped from every page.  Blank lines are never
    counted and are always kept.
    """
    page_hits: Counter = Counter()
    for page in pages:
        # A set, so a line repeated within one page still counts once.
        page_hits.update({_norm(line) for line in page if line.strip()})

    cutoff = max(2, len(pages) // 2)
    frequent = {form for form, seen in page_hits.items() if seen >= cutoff}

    cleaned: list[list[str]] = []
    for page in pages:
        cleaned.append([line for line in page if _norm(line) not in frequent])
    return cleaned
def repair_hyphenation(text: str) -> str:
    """Rejoin words split by a hyphen at a line break.

    Every ``-`` immediately followed by a newline is removed together with
    that newline when it sits directly between two word characters, so
    ``"inter-\\nesting"`` becomes ``"interesting"``.  Hyphens not followed
    by a newline, or not flanked by word characters, are left alone.

    Lookarounds are used so the neighbouring characters are not consumed by
    the match; with capturing groups, back-to-back breaks around a single
    character (``"a-\\nb-\\nc"``) were only half repaired.
    """
    return re.sub(r"(?<=\w)-\n(?=\w)", "", text)
def strip_page_artifacts(lines: list[str]) -> list[str]:
    """Drop page furniture from *lines*, keeping blank lines.

    A non-blank line is removed when any of these holds:

    - its stripped form is a bare page number (``_PAGE_NUM``),
    - it contains an arXiv stamp anywhere (``_ARXIV_STAMP``),
    - its stripped form starts with a DOI stamp (``_DOI_STAMP``),
    - its stripped form is an all-caps running header (``_RUNNING_HEADER``).

    NOTE(review): the arXiv test is a ``search`` over the whole line, so any
    line that merely mentions an arXiv id (e.g. a reference entry) is also
    dropped — confirm this is intended.
    """

    def _is_artifact(line: str) -> bool:
        bare = line.strip()
        if not bare:
            # Blank lines are kept as-is.
            return False
        return bool(
            _PAGE_NUM.match(bare)
            or _ARXIV_STAMP.search(line)
            or _DOI_STAMP.match(bare)
            or _RUNNING_HEADER.match(bare)
        )

    return [line for line in lines if not _is_artifact(line)]
def _split_leading_headings(block: str) -> list[str]:
    """Peel leading ``#`` heading lines off a stripped, non-empty block.

    Markdown does not require a blank line after a heading, so a block such
    as ``"## Intro\\nBody text"`` holds a heading *and* body text.  Each
    leading heading line becomes its own entry, followed by the remaining
    text (if any).  A block that does not start with a complete heading
    line, or that is a single line, is returned unchanged as one entry.
    """
    parts: list[str] = []
    while True:
        first, newline, rest = block.partition("\n")
        hit = _HEADING.match(first) if newline else None
        # Only split when the first line is a heading with actual text;
        # otherwise leave the block exactly as the caller would have seen it.
        if hit is None or not hit.group(1).strip():
            break
        parts.append(first.strip())
        block = rest.strip()
    if block:
        parts.append(block)
    return parts


def segment_markdown(md: str) -> tuple[list[dict], str]:
    """Segment markdown into paragraphs and capture raw references text.

    The text is split on blank lines into blocks.  Heading blocks (``#``
    headings, or list items that look like numbered section titles) update
    the current section; all other blocks are filtered (affiliation lines,
    picture text, captions, picture placeholders, table rows, front-matter
    author blocks), de-markdowned and assembled into paragraphs:

    - a block that starts lowercase, is shorter than ``_MIN_PARA``, or
      follows a paragraph that ends mid-sentence is merged into the previous
      paragraph when both belong to the same section;
    - otherwise a block of at least ``_MIN_PARA`` characters becomes a new
      paragraph;
    - otherwise it is buffered and prepended to the next block (or flushed
      at the next heading / end of input, where it is kept only if long
      enough).

    A heading directly followed by text without a blank line is handled as
    a heading plus a body block, so that text is not lost.

    Returns (paras, raw_refs).  Each paragraph is a dict with keys
    ``section``, ``firstOfSection``, ``text``, ``headings`` (heading names
    seen since the previous paragraph) and ``id`` (``"p1"``, ``"p2"``, ...).
    raw_refs is the text of the References/Bibliography section (empty
    string if absent), consumed by the reference parser.  Blocks under an
    acknowledgement heading are skipped entirely.
    """
    paras: list[dict] = []
    section = ""
    first_pending = True
    pending: list[str] = []  # short blocks buffered as section-head (fix 0.1)
    pending_headings: list[str] = []  # heading names since last paragraph (fix 0.3)
    pending_heading_levels: list[int] = []  # parallel list of heading levels (fix 0.3b)
    raw_refs = ""
    _state = "normal"  # "normal" | "refs_capture" | "refs_skip"

    def _flush_pending_as_para():
        """Emit the buffered short blocks as a paragraph if long enough.

        The buffer and the pending heading chain are cleared either way, so
        headings never leak onto a later, unrelated paragraph.
        """
        nonlocal first_pending
        if not pending:
            return
        text = " ".join(pending)
        pending.clear()
        headings_snap = list(pending_headings)
        pending_headings.clear()
        pending_heading_levels.clear()
        if len(text) >= _MIN_PARA:
            paras.append({
                "section": section,
                "firstOfSection": first_pending,
                "text": text,
                "headings": headings_snap,
            })
            first_pending = False

    for raw_block in re.split(r"\n\s*\n", md):
        raw_block = repair_hyphenation(raw_block).strip()
        if not raw_block:
            continue
        # A heading glued to its body text yields several sub-blocks here.
        for block in _split_leading_headings(raw_block):
            m = _HEADING.match(block)
            lm = _LIST_HEADING.match(block) if not m else None
            if m or lm:
                name_raw = m.group(1).strip().strip("*") if m else lm.group(1).strip().strip("*")
                name = demarkdown(name_raw)
                # Heading level: count leading '#' for _HEADING; treat _LIST_HEADING as level 3
                hlevel = len(re.match(r"^(#+)", block).group(1)) if m else 3
                if _HEADING_JUNK.match(name):
                    continue
                _flush_pending_as_para()
                # Prune orphaned sibling/ancestor headings that accumulated with no prose.
                # Keep parent headings (lower level number) so consecutive sub-headings chain.
                # (`pending` is always empty right after the flush above.)
                if not pending:
                    # Remove any pending_headings at same or deeper level than current heading
                    keep_up_to = next(
                        (i for i, lvl in enumerate(pending_heading_levels) if lvl >= hlevel),
                        len(pending_heading_levels),
                    )
                    del pending_headings[keep_up_to:]
                    del pending_heading_levels[keep_up_to:]
                if _REFERENCES.match(name):
                    # References/bibliography text is captured verbatim;
                    # acknowledgements are skipped.
                    _state = (
                        "refs_capture"
                        if re.match(r"^(references|bibliography)", name, re.I)
                        else "refs_skip"
                    )
                    section = name
                    first_pending = True
                    continue
                if _state in ("refs_capture", "refs_skip"):
                    # Any other heading ends the references/acknowledgement part.
                    _state = "normal"
                section = name
                first_pending = True
                pending_headings.append(name)
                pending_heading_levels.append(hlevel)
                continue
            if _state == "refs_capture":
                raw_refs += block + "\n\n"
                continue
            if _state == "refs_skip":
                continue
            filtered_lines = strip_picture_text([
                l for l in block.splitlines() if not _AFFILIATION.match(l)
            ])
            block = "\n".join(filtered_lines).strip()
            if not block:
                continue
            # Fix 0.6: demarkdown first, measure once on cleaned text
            text = demarkdown(re.sub(r"\s+", " ", block))
            if _CAPTION.match(text) or _PICTURE.match(text) or _TABLE_ROW.match(text):
                continue
            if len(paras) < _AUTHOR_BLOCK_FRONT_LIMIT and _is_author_affiliation_block(text):
                continue
            # Prepend pending buffer
            if pending:
                text = " ".join(pending) + " " + text
                pending.clear()
            starts_lower = text[:1].islower()
            prev_open = bool(paras and _prev_para_is_open(paras[-1]["text"]))
            should_merge = starts_lower or len(text) < _MIN_PARA or prev_open
            if paras and should_merge and paras[-1]["section"] == section:
                paras[-1]["text"] += " " + text
            elif len(text) >= _MIN_PARA:
                headings_chain = list(pending_headings)
                pending_headings.clear()
                pending_heading_levels.clear()
                paras.append({
                    "section": section,
                    "firstOfSection": first_pending,
                    "text": text,
                    "headings": headings_chain,
                })
                first_pending = False
            else:
                # Fix 0.1: buffer short non-mergeable block instead of dropping
                pending.append(text)
    _flush_pending_as_para()
    # Fix 0.6: removed final re-filter (length already measured post-demarkdown)
    for i, p in enumerate(paras):
        p["id"] = f"p{i + 1}"
    return paras, raw_refs
def _fallback_chunk(page) -> dict:
"""Build a substitute chunk for a pathological page using plain text extraction.
Uses ``page.get_text("blocks")`` to preserve paragraph boundaries:
text blocks (block type 0) are joined with double newlines so that the
downstream ``segment_markdown`` call sees intact paragraph structure.
"""
blocks = page.get_text("blocks")
text_parts = [b[4] for b in blocks if b[6] == 0 and b[4].strip()]
return {"text": "\n\n".join(text_parts)}
def parse_text(raw: bytes, file_name: str) -> tuple["Paper", str]:
    """Build a ``Paper`` from raw markdown/plain-text bytes.

    The bytes are decoded as UTF-8 (undecodable bytes are replaced).  The
    title is the text of the first ``#`` heading line, with surrounding
    ``*`` removed; when there is no heading, the file name's stem is used.
    The arXiv id is taken from the first ``NNNN.NNNNN``-style number in
    *file_name*, or left empty.  Authors are left empty and the page count
    is 0.

    Returns (paper, raw_refs), where raw_refs is the references text
    returned by ``segment_markdown``.
    """
    from .schemas import Paper, Paragraph

    text = raw.decode("utf-8", errors="replace")

    fallback_title = Path(file_name).stem
    heading_hits = (_HEADING.match(line.strip()) for line in text.splitlines())
    first_heading = next((hit for hit in heading_hits if hit), None)
    if first_heading is None:
        title = fallback_title
    else:
        title = first_heading.group(1).strip().strip("*")

    id_hit = re.search(r"(\d{4}\.\d{4,5})", file_name)
    arxiv = id_hit.group(1) if id_hit else ""

    paras, raw_refs = segment_markdown(text)
    paper = Paper(
        title=title,
        authors="",
        arxivId=arxiv,
        pages=0,
        paragraphs=[Paragraph(**p) for p in paras],
    )
    return paper, raw_refs
def parse_pdf(pdf_bytes: bytes, file_name: str) -> tuple["Paper", str]:
    """Build a ``Paper`` from the bytes of a PDF file.

    Pages are converted to markdown with pymupdf4llm, except pages flagged
    by ``_pathological_pages``, which are extracted with plain text blocks
    instead.  Repeating lines, page artifacts and picture-text blocks are
    then stripped and the result is segmented with ``segment_markdown``.

    The title is the first ``#`` heading on the first page (falling back to
    *file_name*); authors come from the first non-heading line after it.
    The arXiv id is taken from *file_name* when it contains one, otherwise
    from an arXiv stamp in the raw first-page text.

    Returns (paper, raw_refs), where raw_refs is the references text
    returned by ``segment_markdown``.

    Raises RuntimeError if pymupdf4llm returns a different number of chunks
    than the number of pages requested from it.
    """
    import pymupdf4llm
    import pymupdf
    from .schemas import Paper, Paragraph

    doc = pymupdf.open(stream=pdf_bytes, filetype="pdf")
    try:
        n_pages = doc.page_count
        bad = _pathological_pages(doc)
        if bad:
            # Request only the normal pages from pymupdf4llm
            normal_pages = [i for i in range(n_pages) if i not in bad]
            md_chunks: list[dict] = (
                pymupdf4llm.to_markdown(
                    doc, page_chunks=True, pages=normal_pages, force_text=False
                )
                if normal_pages
                else []  # every page is pathological: nothing to request
            )
            if len(md_chunks) != len(normal_pages):
                raise RuntimeError(
                    f"Chunk count mismatch: {len(md_chunks)} != {len(normal_pages)}"
                )
            # pymupdf4llm returns chunks only for requested pages, in order —
            # interleave fallback chunks for bad pages to restore document order.
            normal_iter = iter(md_chunks)
            chunks: list[dict] = []
            for i in range(n_pages):
                if i in bad:
                    chunks.append(_fallback_chunk(doc[i]))
                else:
                    chunks.append(next(normal_iter))
        else:
            chunks = pymupdf4llm.to_markdown(doc, page_chunks=True, force_text=False)
    finally:
        # All page text has been copied into plain dicts; release the document.
        if not doc.is_closed:
            doc.close()

    pages_lines = [c["text"].splitlines() for c in chunks]
    pages_lines = remove_repeating_lines(pages_lines)
    pages_lines = [strip_page_artifacts(p) for p in pages_lines]
    # Belt-and-suspenders: strip any remaining picture-text marker lines
    pages_lines = [strip_picture_text(p) for p in pages_lines]
    md = "\n".join("\n".join(p) for p in pages_lines)
    paras, raw_refs = segment_markdown(md)

    # Title: first markdown heading in the document; authors: first non-heading line after it.
    title, authors = file_name, ""
    first_page = pages_lines[0] if pages_lines else []
    lines = [l for l in first_page if l.strip()]
    for i, l in enumerate(lines):
        m = _HEADING.match(l)
        if m:
            title = m.group(1).strip().strip("*")
            for nxt in lines[i + 1:]:
                if not _HEADING.match(nxt):
                    raw_authors = re.sub(r"[*#]", "", nxt).strip()
                    authors = _clean_authors(raw_authors)
                    break
            break

    # arXiv id: try to extract from the file_name (e.g. "2402.08696" pattern)
    arxiv = ""
    arxiv_from_name = re.search(r"(\d{4}\.\d{4,5})", file_name)
    if arxiv_from_name:
        arxiv = arxiv_from_name.group(1)
    elif chunks:
        # Fall back: search the raw first-page chunk text before cleaning
        m = _ARXIV_STAMP.search(chunks[0]["text"])
        if m:
            arxiv = m.group(0).split(":")[1]

    return Paper(
        title=title, authors=authors, arxivId=arxiv, pages=n_pages,
        paragraphs=[Paragraph(**p) for p in paras],
    ), raw_refs