Spaces:

kiyer
/

beacon

Running

App Files Files Community

beacon / backend /tests /test_parse_segment.py

kiyer

fix: heading leakage in flush — clear pending_headings unconditionally; strip trailing stars from list-item headings

ca1fce6 18 days ago

Raw

History Blame Contribute Delete

18 kB

	import time
	from pathlib import Path
	from astroparse_api.parse import segment_markdown, parse_pdf, _pathological_pages


	# ---------------------------------------------------------------------------
	# Bug A: strip_picture_text
	# ---------------------------------------------------------------------------

	def test_picture_text_blocks_filtered():
	    """Begin/End picture-text markers and the lines between them are dropped."""
	    from astroparse_api.parse import strip_picture_text

	    picture_block = [
	        "----- Begin of picture text -----",
	        "z = 5.0 z = 4.0",
	        "1.5",
	        "R [kpc]",
	        "----- End of picture text -----",
	    ]
	    source_lines = ["Real prose before.", *picture_block, "Real prose after."]
	    expected = ["Real prose before.", "Real prose after."]
	    assert strip_picture_text(source_lines) == expected


	def test_picture_text_start_end_variants():
	    """Handles 'Start of picture text' / 'End of picture text' (pymupdf4llm 1.27.x).

	    Here the end marker shares a single <br>-joined line with the picture content.
	    """
	    from astroparse_api.parse import strip_picture_text

	    start_marker = "----- Start of picture text -----<br>"
	    content_with_end = "axis label stuff<br>more axis<br>----- End of picture text -----<br>"
	    cleaned = strip_picture_text(["Prose A.", start_marker, content_with_end, "Prose B."])
	    assert cleaned == ["Prose A.", "Prose B."]


	def test_picture_text_multiline_block():
	    """A picture-text block spread over several lines is removed in full."""
	    from astroparse_api.parse import strip_picture_text

	    picture_lines = ["z = 5.0 z = 4.0", "1.5", "R [kpc]"]
	    source_lines = (
	        ["Before.", "----- Start of picture text -----<br>"]
	        + picture_lines
	        + ["----- End of picture text -----<br>", "After."]
	    )
	    assert strip_picture_text(source_lines) == ["Before.", "After."]


	# ---------------------------------------------------------------------------
	# Bug B: demarkdown
	# ---------------------------------------------------------------------------

	def test_demarkdown_cleans_math_debris():
	    """Emphasis debris around math is removed and numbers/exponents are re-joined."""
	    from astroparse_api.parse import demarkdown

	    raw = (
	        "with a peak at _z ≈_ 4, masses between 5 _._ 7 _×_ 10[5] M _⊙_ and "
	        "2 _._ 5 _×_ 10[11] M _⊙_ ."
	    )
	    cleaned = demarkdown(raw)
	    # No markdown emphasis markers may survive.
	    for marker in ("_", "**"):
	        assert marker not in cleaned
	    # The numeric content must come out in its compact, readable form.
	    for fragment in ("5.7×10⁵", "2.5×10¹¹", "M ⊙"):
	        assert fragment in cleaned


	def test_demarkdown_bold_stripped():
	    """demarkdown removes **bold** and _emphasis_ markers but keeps the wrapped words."""
	    from astroparse_api.parse import demarkdown

	    # The input must really contain '**'; without it the '"**" not in out'
	    # check below passes vacuously and bold stripping is never exercised.
	    t = "This is **very important** and _emphasized_ text."
	    out = demarkdown(t)
	    assert "**" not in out and "_" not in out
	    assert "very important" in out and "emphasized" in out


	def test_demarkdown_backtick_stripped():
	    """Inline-code backticks are removed while the enclosed word is kept."""
	    from astroparse_api.parse import demarkdown

	    cleaned = demarkdown("Use `code` here.")
	    backtick = "`"
	    assert backtick not in cleaned
	    assert "code" in cleaned


	def test_demarkdown_bracketed_exponents():
	    """Bracketed digits following a number come out as Unicode superscripts."""
	    from astroparse_api.parse import demarkdown

	    cases = (
	        ("10[5]", "10⁵"),
	        ("10[11]", "10¹¹"),
	        ("10[0]", "10⁰"),
	    )
	    for raw, superscripted in cases:
	        assert superscripted in demarkdown(raw)


	# ---------------------------------------------------------------------------
	# Bug B2: merge open-sentence continuations
	# ---------------------------------------------------------------------------

	def test_merge_continuation_after_open_sentence():
	    """A block following a paragraph that ends mid-sentence is merged into it.

	    The continuation starts with a digit and is meant to be long enough that a
	    length heuristic alone would not merge it; only the "previous paragraph has
	    no terminal punctuation" rule should join the two halves.
	    """
	    filler = "Alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu nu xi. " * 3
	    # Ends on "and" — deliberately no terminal punctuation.
	    open_ended = filler + "masses between five and"
	    # Starts with "2" (a digit, not a lowercase letter).
	    continuation = (
	        "2.5 something more here delta epsilon zeta eta theta "
	        "iota kappa lambda mu nu. "
	    ) * 4

	    md = "\n\n".join(["## S", open_ended, continuation])
	    paras, _ = segment_markdown(md)
	    assert len(paras) == 1, (
	        f"Expected 1 merged paragraph, got {len(paras)}: "
	        f"{[p['text'][:80] for p in paras]}"
	    )

	# Real PDF used by test_parse_pdf_fixture; lives in the fixtures/ directory
	# next to this test module.
	FIXTURE = Path(__file__).parent / "fixtures" / "mowla_iyer_2024.pdf"

	# Synthetic markdown document for the segment_markdown tests: a title, two
	# Introduction paragraphs, one Methods paragraph, a figure-caption line and a
	# References section holding a single entry.
	MD = """# The Firefly Sparkle

	## Introduction

	""" + ("First intro paragraph. " * 15) + """

	""" + ("Second intro paragraph. " * 15) + """

	## Methods

	""" + ("Methods paragraph here. " * 15) + """

	Figure 1. A caption that should be dropped.

	## References

	[1] Someone et al. 2020
	"""

	def test_segment_basic():
	    """Segmenting MD yields three paragraphs with section, flag, id and length set."""
	    paras, _ = segment_markdown(MD)
	    assert [p["section"] for p in paras] == ["Introduction", "Introduction", "Methods"]
	    # The assertion above guarantees exactly three paragraphs.
	    intro_first, intro_second, methods = paras
	    assert intro_first["firstOfSection"] and not intro_second["firstOfSection"]
	    assert methods["firstOfSection"]
	    assert intro_first["id"] == "p1"
	    assert all(len(p["text"]) >= 200 for p in paras)

	def test_segment_drops_captions_and_references():
	    """Caption and reference text stay out of paragraphs; references go to raw_refs."""
	    paras, raw_refs = segment_markdown(MD)
	    body_text = " ".join(p["text"] for p in paras)
	    # Neither the figure caption nor the reference entry may leak into paragraphs.
	    for unwanted in ("Figure 1", "Someone et al"):
	        assert unwanted not in body_text
	    # The reference entry is still captured separately.
	    assert "Someone et al" in raw_refs

	def test_parse_pdf_fixture():
	    """Parsing the fixture PDF gives a plausible paragraph count, page count and title."""
	    pdf_bytes = FIXTURE.read_bytes()
	    paper, _ = parse_pdf(pdf_bytes, "mowla_iyer_2024.pdf")
	    paragraph_count = len(paper.paragraphs)
	    assert 10 <= paragraph_count <= 80
	    assert paper.pages >= 10
	    title = paper.title
	    assert "firefly" in title.lower() or len(title) > 10


	def test_segment_drops_author_affiliation_block():
	    """Author and affiliation lines before the abstract are dropped; the abstract is kept."""
	    abstract = (
	        "Star-forming galaxies spend most of their lifetimes on the main sequence of star formation. " * 4
	    )
	    # Title, author list and block-quoted affiliations, then the ABSTRACT heading.
	    front_matter = """# A Great Paper

	F.RENAUD[1] _[,]_[ 2] , K. KRALJIC[1] , J. FREUNDLICH[1] , B. MAGNELLI[3] , M. BÉTHERMIN[1] , C. ACCARD[1] , D. ISMAIL[1] , E. DADDI[3] , D. ELBAZ[3] , L. CIESLA[4] , G. MARTIN[5] , Y. DUBOIS[6] , S. PEIRANI[7]
	> 1 _Observatoire Astronomique de Strasbourg, Université de Strasbourg, CNRS UMR 7550, 11 rue de l'Université, F-67000 Strasbourg, France_
	> 2 _University of Strasbourg Institute for Advanced Study, 5 allée du Général Rouvillois, F-67083 Strasbourg, France_

	## ABSTRACT

	"""
	    paras, _ = segment_markdown(front_matter + abstract)
	    body_text = " ".join(p["text"] for p in paras)
	    assert "KRALJIC" not in body_text and "Observatoire" not in body_text
	    assert any("Star-forming galaxies" in p["text"] for p in paras)


	def test_clean_authors_three_plus_et_al():
	    """A five-author list with affiliation markers becomes the first three names plus 'et al.'."""
	    from astroparse_api.parse import _clean_authors

	    raw_authors = "F.RENAUD[1] _[,]_[ 2] , K. KRALJIC[1] , J. FREUNDLICH[1] , B. MAGNELLI[3] , M. BÉTHERMIN[1]"
	    expected = "F.RENAUD, K. KRALJIC, J. FREUNDLICH et al."
	    assert _clean_authors(raw_authors) == expected


	def test_clean_authors_short_list_unchanged():
	    """A two-author list without affiliation markers comes back unchanged."""
	    from astroparse_api.parse import _clean_authors

	    two_authors = "A. One, B. Two"
	    cleaned = _clean_authors(two_authors)
	    assert cleaned == "A. One, B. Two"


	# ---------------------------------------------------------------------------
	# Pathological-page detection unit tests
	# ---------------------------------------------------------------------------

	def test_pathological_pages_detects_large_stream():
	    """_pathological_pages reports the index of a page whose content stream
	    is larger than _MAX_PAGE_STREAM_BYTES. A minimal duck-typed document is used."""
	    from astroparse_api.parse import _MAX_PAGE_STREAM_BYTES

	    class _StubPage:
	        """Page stand-in whose content stream has a configurable length."""

	        def __init__(self, size: int):
	            self._size = size

	        def read_contents(self) -> bytes:
	            return b"x" * self._size

	    class _StubDoc:
	        """Document stand-in that only supports iteration over its pages."""

	        def __init__(self, sizes):
	            self._pages = [_StubPage(size) for size in sizes]

	        def __iter__(self):
	            return iter(self._pages)

	    # Small page, oversized page (index 1), small page.
	    oversized = _MAX_PAGE_STREAM_BYTES + 1
	    bad = _pathological_pages(_StubDoc([100, oversized, 200]))
	    assert bad == {1}, f"Expected {{1}}, got {bad}"


	def test_pathological_pages_empty_when_all_small():
	    """With every page below the size threshold, _pathological_pages returns an empty set."""
	    from astroparse_api.parse import _MAX_PAGE_STREAM_BYTES

	    class _StubPage:
	        """Page stand-in whose content stream has a configurable length."""

	        def __init__(self, size: int):
	            self._size = size

	        def read_contents(self) -> bytes:
	            return b"x" * self._size

	    class _StubDoc:
	        """Document stand-in that only supports iteration over its pages."""

	        def __init__(self, sizes):
	            self._pages = [_StubPage(size) for size in sizes]

	        def __iter__(self):
	            return iter(self._pages)

	    # The largest page sits one byte under the threshold.
	    just_under = _MAX_PAGE_STREAM_BYTES - 1
	    bad = _pathological_pages(_StubDoc([100, just_under, 200]))
	    assert bad == set(), f"Expected empty set, got {bad}"


	# ---------------------------------------------------------------------------
	# Feature: parse_text — plaintext/markdown input
	# ---------------------------------------------------------------------------

	# Synthetic markdown document for the parse_text tests: a top-level title,
	# an Introduction section with two long paragraphs, and a References
	# section with two entries.
	_MD_DOC = """# Galactic Winds in Starburst Galaxies

	## Introduction

	""" + ("Galaxy-scale winds driven by stellar feedback are a key mechanism for regulating " * 15) + """

	""" + ("The outflow rates in starburst systems can exceed the star formation rate by factors " * 15) + """

	## References

	[1] Veilleux et al. 2005
	[2] Rupke & Veilleux 2011
	"""

	def test_parse_text_title_from_heading():
	    """The document's top-level heading is reported as the title."""
	    from astroparse_api.parse import parse_text

	    parsed, _ = parse_text(_MD_DOC.encode(), "notes.md")
	    expected_title = "Galactic Winds in Starburst Galaxies"
	    assert parsed.title == expected_title


	def test_parse_text_two_paragraphs():
	    """The two Introduction paragraphs of _MD_DOC are the only paragraphs returned."""
	    from astroparse_api.parse import parse_text

	    parsed, _ = parse_text(_MD_DOC.encode(), "notes.md")
	    paragraph_count = len(parsed.paragraphs)
	    assert paragraph_count == 2


	def test_parse_text_sections_correct():
	    """Every returned paragraph is attributed to the Introduction section."""
	    from astroparse_api.parse import parse_text

	    parsed, _ = parse_text(_MD_DOC.encode(), "notes.md")
	    for para in parsed.paragraphs:
	        assert para.section == "Introduction"


	def test_parse_text_references_dropped():
	    """Text from the References section does not appear in any paragraph."""
	    from astroparse_api.parse import parse_text

	    parsed, _ = parse_text(_MD_DOC.encode(), "notes.md")
	    paragraph_texts = [para.text for para in parsed.paragraphs]
	    body_text = " ".join(paragraph_texts)
	    assert "Veilleux" not in body_text


	def test_parse_text_pages_zero_authors_empty():
	    """For text input, the page count is 0 and the authors string is empty."""
	    from astroparse_api.parse import parse_text

	    parsed, _ = parse_text(_MD_DOC.encode(), "notes.md")
	    assert (parsed.pages, parsed.authors) == (0, "")


	def test_parse_text_title_fallback_to_stem():
	    """Without any heading, the title falls back to the filename stem."""
	    from astroparse_api.parse import parse_text

	    filler = "paragraph without any heading content describing galaxies " * 20
	    heading_free = "A " + filler
	    parsed, _ = parse_text(heading_free.encode(), "my_draft.txt")
	    assert parsed.title == "my_draft"


	def test_parse_text_arxiv_from_filename():
	    """An arXiv-style filename stem is reported as the arxivId."""
	    from astroparse_api.parse import parse_text

	    arxiv_id = "2402.08696"
	    parsed, _ = parse_text(_MD_DOC.encode(), "2402.08696.md")
	    assert parsed.arxivId == arxiv_id


	def test_parse_text_utf8_replace_on_bad_bytes():
	    """A stray non-UTF-8 byte in the input does not stop the title being extracted."""
	    from astroparse_api.parse import parse_text

	    # 0xFF can never occur in valid UTF-8; it is placed between valid runs of text.
	    payload = b"".join([
	        b"# My Title\n\n",
	        b"Good text here. " * 20,
	        b"\xff",
	        b" more text. " * 10,
	    ])
	    parsed, _ = parse_text(payload, "test.md")
	    assert parsed.title == "My Title"


	def test_pathological_pages_handles_read_contents_exception():
	    """A page whose read_contents raises is not reported and does not propagate the error."""

	    class _BrokenPage:
	        """Page stand-in whose content stream cannot be read."""

	        def read_contents(self):
	            raise RuntimeError("PDF error")

	    class _SinglePageDoc:
	        """Document stand-in yielding exactly one broken page."""

	        def __iter__(self):
	            return iter([_BrokenPage()])

	    flagged = _pathological_pages(_SinglePageDoc())
	    assert flagged == set()


	# ---------------------------------------------------------------------------
	# Integration test: real pathological PDF
	# ---------------------------------------------------------------------------

	def test_parse_pdf_handles_vector_heavy_pages():
	    """parse_pdf completes within 60s on the JWST paper (page 3 has 65 MB
	    vector drawings that previously caused pymupdf4llm to run for 400+s).

	    The test is skipped when the paper is not present under test_papers/.
	    """
	    pdf = Path(__file__).parent.parent.parent / "test_papers" / "s41586-024-08293-0.pdf"
	    if not pdf.exists():
	        import pytest
	        pytest.skip("test paper not present")
	    # Read the file before starting the clock so only parse_pdf is timed.
	    pdf_bytes = pdf.read_bytes()
	    # perf_counter is monotonic: unlike time.time() differences, the measured
	    # duration cannot be skewed by system clock adjustments during the run.
	    t0 = time.perf_counter()
	    paper, _ = parse_pdf(pdf_bytes, pdf.name)
	    elapsed = time.perf_counter() - t0
	    assert elapsed < 60, f"parse_pdf took {elapsed:.1f}s — expected < 60s"
	    assert len(paper.paragraphs) >= 30, (
	        f"Expected >= 30 paragraphs, got {len(paper.paragraphs)}"
	    )
	    joined = " ".join(p.text for p in paper.paragraphs)
	    # Accept either recognisable paper content or simply a substantial amount of text.
	    assert (
	        "Firefly" in paper.title
	        or "Firefly" in joined
	        or len(joined) > 10000
	    ), "Expected Firefly Sparkle content or substantial text"


	# --- Fix 0.1: pending-head buffer saves section openers ---
	def test_pending_buffer_saves_section_opener():
	    """A short opening block right after a section heading is kept in the output."""
	    opener = "We begin with a key observation here."  # well under 200 characters
	    body = "The stellar mass function at high redshift is uncertain. " * 6
	    md = "\n\n".join(("## Section A", opener, body))
	    paras, _ = segment_markdown(md)
	    all_text = " ".join(p["text"] for p in paras)
	    assert "key observation" in all_text, "short section opener was dropped (pending buffer failed)"

	# --- Fix 0.2: skip-and-resume keeps appendix content ---
	def test_references_captured_and_appendix_preserved():
	    """Appendix prose after References is kept as paragraphs; reference text goes to raw_refs."""
	    intro_text = "Star-forming galaxies remain on the main sequence for extended periods. " * 5
	    reference_entry = "[1] Smith et al. 2020 ApJ 900 1"
	    appendix_text = "This appendix derives the mass-to-light ratio correction. " * 5
	    md = "\n\n".join([
	        "## Introduction", intro_text,
	        "## References", reference_entry,
	        "## Appendix A", appendix_text,
	    ])
	    paras, raw_refs = segment_markdown(md)
	    all_text = " ".join(p["text"] for p in paras)
	    assert "mass-to-light ratio" in all_text, "appendix content was discarded after References"
	    assert "Smith et al." in raw_refs, "raw_refs should contain references section text"

	# --- Fix 0.3: headings chain attached to paragraphs ---
	def test_headings_chain_on_paragraph():
	    """The single paragraph under a heading lists that heading in its 'headings' entry."""
	    prose = "Galaxy evolution proceeds through mergers and secular processes. " * 5
	    paras, _ = segment_markdown("## Methods\n\n" + prose)
	    assert len(paras) == 1
	    only_para = paras[0]
	    assert "Methods" in only_para["headings"]

	def test_headings_chain_consecutive_headings():
	    """Back-to-back headings both end up, in order, in the following paragraph's headings."""
	    prose = "The SFR surface density is measured from UV continuum emission. " * 5
	    md = "\n\n".join(["## Results", "### 3.1 Star Formation", prose])
	    paras, _ = segment_markdown(md)
	    assert len(paras) == 1
	    expected_chain = ["Results", "3.1 Star Formation"]
	    assert paras[0]["headings"] == expected_chain

	# --- Fix 0.4: heading hygiene ---
	def test_heading_hygiene_filters_page_numbers():
	    """A running-header style heading ('456 K. G. Iyer et al.') never becomes a section name."""
	    prose = "Outflow rates exceed the star formation rate in starburst systems. " * 5
	    md = f"## Introduction\n\n{prose}\n\n## 456 K. G. Iyer et al.\n\n## Results\n\n{prose}"
	    paras, _ = segment_markdown(md)
	    section_names = [p["section"] for p in paras]
	    assert "456 K. G. Iyer et al." not in section_names

	# --- Fix 0.5: caption gate allows figure-referencing prose ---
	def test_caption_gate_allows_figure_prose():
	    """Prose that merely begins with 'Figure 4 shows that ...' is kept, not treated as a caption."""
	    figure_prose = (
	        "Figure 4 shows that the fraction of star-forming galaxies decreases "
	        "sharply above a stellar mass of 10^10.5 solar masses, consistent with "
	        "quenching operating preferentially at the high-mass end of the distribution "
	        "and in dense environments where AGN feedback is more effective."
	    )
	    paras, _ = segment_markdown(f"## Results\n\n{figure_prose}")
	    survivors = [p for p in paras if "fraction of star-forming galaxies" in p["text"]]
	    assert survivors, "caption gate dropped figure-referencing prose"

	def test_caption_gate_still_drops_real_captions():
	    """A true caption line ('Figure 4. The fraction ...') is excluded from the paragraphs."""
	    caption_line = "Figure 4. The fraction of star-forming galaxies as a function of stellar mass."
	    prose = "Galaxy stellar mass functions have been measured across cosmic time. " * 5
	    md = "\n\n".join(["## Results", prose, caption_line])
	    paras, _ = segment_markdown(md)
	    all_text = " ".join(p["text"] for p in paras)
	    assert "The fraction of star-forming galaxies as a function" not in all_text

	def test_heading_not_leaked_from_empty_section():
	    """A heading with no prose under it must not show up in the next section's paragraph."""
	    prose = "The stellar mass function evolves significantly from z=2 to the present day. " * 5
	    # "Ghost Section" is immediately followed by another heading, so it has no content.
	    md = "\n\n".join(["## Ghost Section", "## Real Section", prose])
	    paras, _ = segment_markdown(md)
	    assert len(paras) == 1
	    assert "Ghost Section" not in paras[0]["headings"], (
	        f"Ghost Section leaked into headings: {paras[0]['headings']}"
	    )
	    assert "Real Section" in paras[0]["headings"]


	# --- Fix 0.6: length measured post-demarkdown ---
	def test_length_measured_post_demarkdown():
	    """A markup-heavy paragraph that is still long after demarkdown passes the length gate."""
	    # Several emphasis markers inflate the raw text; the cleaned text is
	    # intended to remain at or above the 200-character minimum.
	    marked_up = (
	        "_z_ ≈ 3 sources show Lyman-alpha emission tracing the neutral hydrogen distribution "
	        "in high-redshift galaxies at cosmic noon. "
	        "The _flux_ calibration introduces _systematic_ uncertainties of order 10 per cent, "
	        "which must be accounted for in the _stellar mass_ estimates."
	    )
	    paras, _ = segment_markdown(f"## Discussion\n\n{marked_up}")
	    paragraph_count = len(paras)
	    assert paragraph_count == 1, "post-demarkdown paragraph was dropped by length gate"