import time
from pathlib import Path
from astroparse_api.parse import segment_markdown, parse_pdf, _pathological_pages
# ---------------------------------------------------------------------------
# Bug A: strip_picture_text
# ---------------------------------------------------------------------------
def test_picture_text_blocks_filtered():
    """Begin/End picture-text markers and the lines between them are removed."""
    from astroparse_api.parse import strip_picture_text

    picture_block = [
        "**----- Begin of picture text -----**",
        "z = 5.0 z = 4.0",
        "1.5",
        "R [kpc]",
        "**----- End of picture text -----**",
    ]
    source_lines = ["Real prose before.", *picture_block, "Real prose after."]
    expected = ["Real prose before.", "Real prose after."]
    assert strip_picture_text(source_lines) == expected
def test_picture_text_start_end_variants():
    """Handles 'Start of picture text' / 'End of picture text' (pymupdf4llm 1.27.x).

    The marker entries carry trailing newlines, and the end marker arrives
    inside a multi-line entry together with the picture text itself.
    """
    from astroparse_api.parse import strip_picture_text

    # Newlines are spelled as "\n" escapes: a double-quoted literal cannot
    # span physical lines, so raw line breaks inside it are a syntax error.
    lines = [
        "Prose A.",
        "**----- Start of picture text -----**\n",
        "axis label stuff\nmore axis\n**----- End of picture text -----**\n",
        "Prose B.",
    ]
    result = strip_picture_text(lines)
    assert result == ["Prose A.", "Prose B."]
def test_picture_text_multiline_block():
    """Handles multi-line blocks between start/end markers.

    Both markers carry a trailing newline; the picture text sits in separate
    list entries between them.
    """
    from astroparse_api.parse import strip_picture_text

    # Newlines are spelled as "\n" escapes: a double-quoted literal cannot
    # span physical lines, so raw line breaks inside it are a syntax error.
    lines = [
        "Before.",
        "**----- Start of picture text -----**\n",
        "z = 5.0 z = 4.0",
        "1.5",
        "R [kpc]",
        "**----- End of picture text -----**\n",
        "After.",
    ]
    result = strip_picture_text(lines)
    assert result == ["Before.", "After."]
# ---------------------------------------------------------------------------
# Bug B: demarkdown
# ---------------------------------------------------------------------------
def test_demarkdown_cleans_math_debris():
    """demarkdown output has no emphasis markers and contains the compacted numbers."""
    from astroparse_api.parse import demarkdown

    raw = (
        "with a peak at _z ≈_ 4, masses between 5 _._ 7 _×_ 10[5] M _⊙_ and "
        "2 _._ 5 _×_ 10[11] M _⊙_ ."
    )
    cleaned = demarkdown(raw)

    # No markdown emphasis markers may survive.
    for marker in ("_", "**"):
        assert marker not in cleaned
    # Decimals, multiplication signs and bracketed exponents are rejoined.
    for fragment in ("5.7×10⁵", "2.5×10¹¹", "M ⊙"):
        assert fragment in cleaned
def test_demarkdown_bold_stripped():
    """Bold and underscore emphasis markers are removed; the wrapped words remain."""
    from astroparse_api.parse import demarkdown

    cleaned = demarkdown("This is **very important** and _emphasized_ text.")
    assert not any(marker in cleaned for marker in ("**", "_"))
    assert all(phrase in cleaned for phrase in ("very important", "emphasized"))
def test_demarkdown_backtick_stripped():
    """Inline-code backticks are removed while the enclosed word is kept."""
    from astroparse_api.parse import demarkdown

    cleaned = demarkdown("Use `code` here.")
    assert "code" in cleaned and "`" not in cleaned
def test_demarkdown_bracketed_exponents():
    """Bracketed exponents such as ``10[5]`` come out as superscript digits."""
    from astroparse_api.parse import demarkdown

    expectations = {
        "10[5]": "10⁵",
        "10[11]": "10¹¹",
        "10[0]": "10⁰",
    }
    for raw, superscripted in expectations.items():
        assert superscripted in demarkdown(raw)
# ---------------------------------------------------------------------------
# Bug B2: merge open-sentence continuations
# ---------------------------------------------------------------------------
def test_merge_continuation_after_open_sentence():
    """A digit-led block following a paragraph that ends mid-sentence is merged into it."""
    filler = "Alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu nu xi. "
    # The opening block stops on "and": no terminal punctuation, sentence still open.
    opening = filler * 3 + "masses between five and"
    # The continuation starts with a digit (not a lowercase letter) and is long
    # enough (>= 200 chars, the _MIN_PARA threshold per the original note) that
    # a length heuristic alone would not merge it; only the open-sentence rule should.
    continuation = 4 * (
        "2.5 something more here delta epsilon zeta eta theta "
        "iota kappa lambda mu nu. "
    )
    document = "\n\n".join(["## S", opening, continuation])

    paras, _ = segment_markdown(document)

    assert len(paras) == 1, (
        f"Expected 1 merged paragraph, got {len(paras)}: "
        + str([p['text'][:80] for p in paras])
    )
# PDF fixture stored in the "fixtures" directory next to this test module.
FIXTURE = Path(__file__).parent.joinpath("fixtures", "mowla_iyer_2024.pdf")

# Synthetic markdown document shared by the segmentation tests: a title, two
# Introduction blocks, one Methods block, a figure caption and a References
# section. Entries are joined by single newlines; the trailing empty entry
# yields the final newline.
MD = "\n".join(
    [
        "# The Firefly Sparkle",
        "## Introduction",
        "First intro paragraph. " * 15,
        "Second intro paragraph. " * 15,
        "## Methods",
        "Methods paragraph here. " * 15,
        "**Figure 1.** A caption that should be dropped.",
        "## References",
        "[1] Someone et al. 2020",
        "",
    ]
)
def test_segment_basic():
    """Segmenting MD yields two Introduction paragraphs followed by one Methods paragraph."""
    paras, _ = segment_markdown(MD)

    assert [p["section"] for p in paras] == ["Introduction", "Introduction", "Methods"]

    # Exactly three paragraphs are guaranteed by the assertion above.
    intro_first, intro_second, methods = paras
    assert intro_first["firstOfSection"] and not intro_second["firstOfSection"]
    assert methods["firstOfSection"]
    assert intro_first["id"] == "p1"
    assert not any(len(p["text"]) < 200 for p in paras)
def test_segment_drops_captions_and_references():
    """Caption and reference text is absent from paragraphs; references land in raw_refs."""
    paras, raw_refs = segment_markdown(MD)
    body_text = " ".join(p["text"] for p in paras)

    # Neither the figure caption nor the reference entry may appear in paragraphs.
    for dropped in ("Figure 1", "Someone et al"):
        assert dropped not in body_text
    # The reference entry is still captured separately.
    assert "Someone et al" in raw_refs
def test_parse_pdf_fixture():
    """Parsing the bundled PDF fixture gives a plausible paragraph count, page count and title."""
    paper, _ = parse_pdf(FIXTURE.read_bytes(), "mowla_iyer_2024.pdf")

    paragraph_count = len(paper.paragraphs)
    assert 10 <= paragraph_count <= 80
    assert paper.pages >= 10

    title = paper.title
    assert len(title) > 10 or "firefly" in title.lower()
def test_segment_drops_author_affiliation_block():
    """Author and affiliation lines under the title are dropped; the abstract text is kept."""
    author_line = (
        "F.RENAUD[1] _[,]_[ 2] , K. KRALJIC[1] , J. FREUNDLICH[1] , B. MAGNELLI[3] , "
        "M. BÉTHERMIN[1] , C. ACCARD[1] , D. ISMAIL[1] , E. DADDI[3] , D. ELBAZ[3] , "
        "L. CIESLA[4] , G. MARTIN[5] , Y. DUBOIS[6] , S. PEIRANI[7]"
    )
    affiliation_lines = [
        "> 1 _Observatoire Astronomique de Strasbourg, Université de Strasbourg, "
        "CNRS UMR 7550, 11 rue de l'Université, F-67000 Strasbourg, France_",
        "> 2 _University of Strasbourg Institute for Advanced Study, "
        "5 allée du Général Rouvillois, F-67083 Strasbourg, France_",
    ]
    abstract = (
        "Star-forming galaxies spend most of their lifetimes on the main sequence "
        "of star formation. "
    ) * 4
    document = "\n".join(
        ["# A Great Paper", author_line, *affiliation_lines, "## ABSTRACT", abstract]
    )

    paras, _ = segment_markdown(document)
    body_text = " ".join(p["text"] for p in paras)

    for leaked in ("KRALJIC", "Observatoire"):
        assert leaked not in body_text
    assert any("Star-forming galaxies" in p["text"] for p in paras)
def test_clean_authors_three_plus_et_al():
    """A five-author list with affiliation markers collapses to three names plus 'et al.'."""
    from astroparse_api.parse import _clean_authors

    marked_names = [
        "F.RENAUD[1] _[,]_[ 2]",
        "K. KRALJIC[1]",
        "J. FREUNDLICH[1]",
        "B. MAGNELLI[3]",
        "M. BÉTHERMIN[1]",
    ]
    # The extracted author line separates names with " , ".
    raw = " , ".join(marked_names)
    assert _clean_authors(raw) == "F.RENAUD, K. KRALJIC, J. FREUNDLICH et al."
def test_clean_authors_short_list_unchanged():
    """A two-author list with no markers is returned as-is."""
    from astroparse_api.parse import _clean_authors

    two_authors = "A. One, B. Two"
    cleaned = _clean_authors("A. One, B. Two")
    assert cleaned == two_authors
# ---------------------------------------------------------------------------
# Pathological-page detection unit tests
# ---------------------------------------------------------------------------
def test_pathological_pages_detects_large_stream():
    """_pathological_pages returns the index of the one page whose content
    stream is larger than _MAX_PAGE_STREAM_BYTES. A minimal duck-typed
    document stands in for a real PDF."""
    from astroparse_api.parse import _MAX_PAGE_STREAM_BYTES

    class _FakePage:
        """Page stub whose content stream has a chosen length."""

        def __init__(self, size: int):
            self._size = size

        def read_contents(self) -> bytes:
            return self._size * b"x"

    class _FakeDoc:
        """Iterable of _FakePage objects, one per requested size."""

        def __init__(self, sizes):
            self._pages = [_FakePage(n) for n in sizes]

        def __iter__(self):
            return iter(self._pages)

    # Small page, oversized page (index 1), small page.
    oversized = _MAX_PAGE_STREAM_BYTES + 1
    flagged = _pathological_pages(_FakeDoc([100, oversized, 200]))
    assert flagged == {1}, f"Expected {{1}}, got {flagged}"
def test_pathological_pages_empty_when_all_small():
    """_pathological_pages returns an empty set when every page is under the threshold."""
    from astroparse_api.parse import _MAX_PAGE_STREAM_BYTES

    class _FakePage:
        """Page stub whose content stream has a chosen length."""

        def __init__(self, size: int):
            self._size = size

        def read_contents(self) -> bytes:
            return self._size * b"x"

    class _FakeDoc:
        """Iterable of _FakePage objects, one per requested size."""

        def __init__(self, sizes):
            self._pages = [_FakePage(n) for n in sizes]

        def __iter__(self):
            return iter(self._pages)

    # The middle page sits one byte below the limit.
    just_under = _MAX_PAGE_STREAM_BYTES - 1
    flagged = _pathological_pages(_FakeDoc([100, just_under, 200]))
    assert flagged == set(), f"Expected empty set, got {flagged}"
# ---------------------------------------------------------------------------
# Feature: parse_text — plaintext/markdown input
# ---------------------------------------------------------------------------
# Markdown document shared by the parse_text tests: a title, two Introduction
# blocks and a References section. Entries are joined by single newlines; the
# trailing empty entry yields the final newline.
_MD_DOC = "\n".join(
    [
        "# Galactic Winds in Starburst Galaxies",
        "## Introduction",
        "Galaxy-scale winds driven by stellar feedback are a key mechanism for regulating " * 15,
        "The outflow rates in starburst systems can exceed the star formation rate by factors " * 15,
        "## References",
        "[1] Veilleux et al. 2005",
        "[2] Rupke & Veilleux 2011",
        "",
    ]
)
def test_parse_text_title_from_heading():
    """The title comes from the document's top-level heading."""
    from astroparse_api.parse import parse_text

    payload = _MD_DOC.encode()
    parsed, _ = parse_text(payload, "notes.md")
    assert parsed.title == "Galactic Winds in Starburst Galaxies"
def test_parse_text_two_paragraphs():
    """The shared markdown document parses into exactly two paragraphs."""
    from astroparse_api.parse import parse_text

    payload = _MD_DOC.encode()
    parsed, _ = parse_text(payload, "notes.md")
    paragraph_count = len(parsed.paragraphs)
    assert paragraph_count == 2
def test_parse_text_sections_correct():
    """Every parsed paragraph is attributed to the Introduction section."""
    from astroparse_api.parse import parse_text

    payload = _MD_DOC.encode()
    parsed, _ = parse_text(payload, "notes.md")
    assert not any(p.section != "Introduction" for p in parsed.paragraphs)
def test_parse_text_references_dropped():
    """Reference entries do not appear in any parsed paragraph text."""
    from astroparse_api.parse import parse_text

    payload = _MD_DOC.encode()
    parsed, _ = parse_text(payload, "notes.md")
    body_text = " ".join(p.text for p in parsed.paragraphs)
    assert "Veilleux" not in body_text
def test_parse_text_pages_zero_authors_empty():
    """Text input reports zero pages and an empty authors string."""
    from astroparse_api.parse import parse_text

    payload = _MD_DOC.encode()
    parsed, _ = parse_text(payload, "notes.md")
    assert (parsed.pages, parsed.authors) == (0, "")
def test_parse_text_title_fallback_to_stem():
    """With no heading in the input, the title is the filename stem."""
    from astroparse_api.parse import parse_text

    # Heading-free prose, so there is nothing to derive a title from.
    sentence = "paragraph without any heading content describing galaxies "
    plain = "A " + sentence * 20
    parsed, _ = parse_text(plain.encode(), "my_draft.txt")
    assert parsed.title == "my_draft"
def test_parse_text_arxiv_from_filename():
    """An arXiv-style filename populates arxivId."""
    from astroparse_api.parse import parse_text

    payload = _MD_DOC.encode()
    parsed, _ = parse_text(payload, "2402.08696.md")
    assert parsed.arxivId == "2402.08696"
def test_parse_text_utf8_replace_on_bad_bytes():
    """An invalid UTF-8 byte in the body does not stop the title from being parsed."""
    from astroparse_api.parse import parse_text

    # 0xFF is never valid UTF-8; it sits between two runs of valid text.
    heading = b"# My Title\n\n"
    before = b"Good text here. " * 20
    after = b" more text. " * 10
    payload = b"".join([heading, before, b"\xff", after])

    parsed, _ = parse_text(payload, "test.md")
    assert parsed.title == "My Title"
def test_pathological_pages_handles_read_contents_exception():
    """A page whose read_contents raises leaves the returned set empty."""

    class _FaultyPage:
        """Page stub whose content stream cannot be read."""

        def read_contents(self):
            raise RuntimeError("PDF error")

    class _FakeDoc:
        """Document stub that yields a single faulty page."""

        def __iter__(self):
            pages = [_FaultyPage()]
            return iter(pages)

    flagged = _pathological_pages(_FakeDoc())
    assert flagged == set()
# ---------------------------------------------------------------------------
# Integration test: real pathological PDF
# ---------------------------------------------------------------------------
def test_parse_pdf_handles_vector_heavy_pages():
    """parse_pdf completes within 60s on the JWST paper (page 3 has 65 MB
    vector drawings that previously caused pymupdf4llm to run for 400+s).

    The test is skipped when the paper is not present under ``test_papers``.
    """
    pdf = Path(__file__).parent.parent.parent / "test_papers" / "s41586-024-08293-0.pdf"
    if not pdf.exists():
        import pytest
        pytest.skip("test paper not present")

    # perf_counter is monotonic, so a system clock adjustment during the run
    # cannot distort the measured duration the way time.time() can.
    t0 = time.perf_counter()
    paper, _ = parse_pdf(pdf.read_bytes(), pdf.name)
    elapsed = time.perf_counter() - t0

    assert elapsed < 60, f"parse_pdf took {elapsed:.1f}s — expected < 60s"
    assert len(paper.paragraphs) >= 30, (
        f"Expected >= 30 paragraphs, got {len(paper.paragraphs)}"
    )
    joined = " ".join(p.text for p in paper.paragraphs)
    # Accept either recognisable paper content or simply a large amount of text.
    assert (
        "Firefly" in paper.title
        or "Firefly" in joined
        or len(joined) > 10000
    ), "Expected Firefly Sparkle content or substantial text"
# --- Fix 0.1: pending-head buffer saves section openers ---
def test_pending_buffer_saves_section_opener():
    """A short opening block right after a heading is kept, not dropped."""
    opener = "We begin with a key observation here."  # shorter than 200 chars
    follow_up = "The stellar mass function at high redshift is uncertain. " * 6
    document = "\n\n".join(["## Section A", opener, follow_up])

    paras, _ = segment_markdown(document)
    body_text = " ".join(p["text"] for p in paras)

    assert "key observation" in body_text, "short section opener was dropped (pending buffer failed)"
# --- Fix 0.2: skip-and-resume keeps appendix content ---
def test_references_captured_and_appendix_preserved():
    """Appendix text after References stays in paras; reference text goes to raw_refs."""
    intro_text = "Star-forming galaxies remain on the main sequence for extended periods. " * 5
    reference_entry = "[1] Smith et al. 2020 ApJ 900 1"
    appendix_text = "This appendix derives the mass-to-light ratio correction. " * 5
    document = "\n\n".join(
        [
            "## Introduction",
            intro_text,
            "## References",
            reference_entry,
            "## Appendix A",
            appendix_text,
        ]
    )

    paras, raw_refs = segment_markdown(document)
    body_text = " ".join(p["text"] for p in paras)

    assert "mass-to-light ratio" in body_text, "appendix content was discarded after References"
    assert "Smith et al." in raw_refs, "raw_refs should contain references section text"
# --- Fix 0.3: headings chain attached to paragraphs ---
def test_headings_chain_on_paragraph():
    """The single paragraph under a heading lists that heading in its headings."""
    prose = "Galaxy evolution proceeds through mergers and secular processes. " * 5
    paras, _ = segment_markdown("## Methods\n\n" + prose)

    assert len(paras) == 1
    only_paragraph = paras[0]
    assert "Methods" in only_paragraph["headings"]
def test_headings_chain_consecutive_headings():
    """Two back-to-back headings both appear, in order, on the paragraph that follows."""
    prose = "The SFR surface density is measured from UV continuum emission. " * 5
    document = "## Results\n\n### 3.1 Star Formation\n\n" + prose

    paras, _ = segment_markdown(document)

    assert len(paras) == 1
    expected_chain = ["Results", "3.1 Star Formation"]
    assert paras[0]["headings"] == expected_chain
# --- Fix 0.4: heading hygiene ---
def test_heading_hygiene_filters_page_numbers():
    """A heading shaped like '456 K. G. Iyer et al.' never becomes a section name."""
    prose = "Outflow rates exceed the star formation rate in starburst systems. " * 5
    document = (
        f"## Introduction\n\n{prose}"
        "\n\n## 456 K. G. Iyer et al.\n\n"
        f"## Results\n\n{prose}"
    )

    paras, _ = segment_markdown(document)
    section_names = [p["section"] for p in paras]

    assert "456 K. G. Iyer et al." not in section_names
# --- Fix 0.5: caption gate allows figure-referencing prose ---
def test_caption_gate_allows_figure_prose():
    """Prose that starts with 'Figure 4 shows that ...' is kept, not treated as a caption."""
    sentence = (
        "Figure 4 shows that the fraction of star-forming galaxies decreases "
        "sharply above a stellar mass of 10^10.5 solar masses, consistent with "
        "quenching operating preferentially at the high-mass end of the distribution "
        "and in dense environments where AGN feedback is more effective."
    )
    paras, _ = segment_markdown("## Results\n\n" + sentence)

    kept = any("fraction of star-forming galaxies" in p["text"] for p in paras)
    assert kept, "caption gate dropped figure-referencing prose"
def test_caption_gate_still_drops_real_captions():
    """A block of the form 'Figure 4. The fraction ...' is a caption and is dropped."""
    caption_text = "Figure 4. The fraction of star-forming galaxies as a function of stellar mass."
    prose = "Galaxy stellar mass functions have been measured across cosmic time. " * 5
    document = "\n\n".join(["## Results", prose, caption_text])

    paras, _ = segment_markdown(document)
    body_text = " ".join(p["text"] for p in paras)

    assert "The fraction of star-forming galaxies as a function" not in body_text
def test_heading_not_leaked_from_empty_section():
    """A heading with no prose under it does not show up on the next section's paragraph."""
    prose = "The stellar mass function evolves significantly from z=2 to the present day. " * 5
    # "Ghost Section" is followed directly by another heading, so it has no content.
    document = "## Ghost Section\n\n## Real Section\n\n" + prose

    paras, _ = segment_markdown(document)

    assert len(paras) == 1
    headings = paras[0]["headings"]
    assert "Ghost Section" not in headings, \
        f"Ghost Section leaked into headings: {headings}"
    assert "Real Section" in headings
# --- Fix 0.6: length measured post-demarkdown ---
def test_length_measured_post_demarkdown():
    """A markup-heavy paragraph that is still >= 200 chars after demarkdown is kept."""
    # Emphasis markers pad the raw text; the content stays >= 200 chars once they are removed.
    sentences = [
        "_z_ ≈ 3 sources show Lyman-alpha emission tracing the neutral hydrogen distribution "
        "in high-redshift galaxies at cosmic noon. ",
        "The _flux_ calibration introduces _systematic_ uncertainties of order 10 per cent, "
        "which must be accounted for in the _stellar mass_ estimates.",
    ]
    document = "## Discussion\n\n" + "".join(sentences)

    paras, _ = segment_markdown(document)

    assert len(paras) == 1, "post-demarkdown paragraph was dropped by length gate"