"""Tests for ``astroparse_api.parse``.

Covers picture-text stripping, de-markdowning, markdown segmentation,
author clean-up, pathological-page detection, plaintext/markdown parsing
via ``parse_text`` and PDF parsing via ``parse_pdf``.
"""

import time
from pathlib import Path

import pytest

from astroparse_api.parse import segment_markdown, parse_pdf, _pathological_pages

# NOTE(review): several literals below embed a newline character (the
# picture-text markers and a few repeated sentences); confirm these match the
# intended fixture text.


# ---------------------------------------------------------------------------
# Bug A: strip_picture_text
# ---------------------------------------------------------------------------

def test_picture_text_blocks_filtered():
    """Everything between the Begin/End picture-text markers is removed."""
    from astroparse_api.parse import strip_picture_text

    lines = [
        "Real prose before.",
        "**----- Begin of picture text -----**",
        "z = 5.0 z = 4.0",
        "1.5",
        "R [kpc]",
        "**----- End of picture text -----**",
        "Real prose after.",
    ]
    assert strip_picture_text(lines) == ["Real prose before.", "Real prose after."]


def test_picture_text_start_end_variants():
    """Handles 'Start of picture text' / 'End of picture text' (pymupdf4llm 1.27.x)."""
    from astroparse_api.parse import strip_picture_text

    lines = [
        "Prose A.",
        "**----- Start of picture text -----**\n",
        "axis label stuff\nmore axis\n**----- End of picture text -----**\n",
        "Prose B.",
    ]
    result = strip_picture_text(lines)
    assert result == ["Prose A.", "Prose B."]


def test_picture_text_multiline_block():
    """Handles multi-line blocks between start/end markers."""
    from astroparse_api.parse import strip_picture_text

    lines = [
        "Before.",
        "**----- Start of picture text -----**\n",
        "z = 5.0 z = 4.0",
        "1.5",
        "R [kpc]",
        "**----- End of picture text -----**\n",
        "After.",
    ]
    result = strip_picture_text(lines)
    assert result == ["Before.", "After."]


# ---------------------------------------------------------------------------
# Bug B: demarkdown
# ---------------------------------------------------------------------------

def test_demarkdown_cleans_math_debris():
    """Emphasis debris around math is removed and bracketed exponents are raised."""
    from astroparse_api.parse import demarkdown

    t = ("with a peak at _z ≈_ 4, masses between 5 _._ 7 _×_ 10[5] M _⊙_ and "
         "2 _._ 5 _×_ 10[11] M _⊙_ .")
    out = demarkdown(t)
    assert "_" not in out and "**" not in out
    assert "5.7×10⁵" in out and "2.5×10¹¹" in out and "M ⊙" in out


def test_demarkdown_bold_stripped():
    """Bold and emphasis markers are removed while the wrapped words survive."""
    from astroparse_api.parse import demarkdown

    t = "This is **very important** and _emphasized_ text."
    out = demarkdown(t)
    assert "**" not in out and "_" not in out
    assert "very important" in out and "emphasized" in out


def test_demarkdown_backtick_stripped():
    """Inline-code backticks are removed while the code text survives."""
    from astroparse_api.parse import demarkdown

    t = "Use `code` here."
    out = demarkdown(t)
    assert "`" not in out
    assert "code" in out


def test_demarkdown_bracketed_exponents():
    """``10[n]`` is rewritten with superscript digits."""
    from astroparse_api.parse import demarkdown

    assert "10⁵" in demarkdown("10[5]")
    assert "10¹¹" in demarkdown("10[11]")
    assert "10⁰" in demarkdown("10[0]")


# ---------------------------------------------------------------------------
# Bug B2: merge open-sentence continuations
# ---------------------------------------------------------------------------

def test_merge_continuation_after_open_sentence():
    # First half ends without terminal punctuation ("and"), second half starts with digit.
    # Second half is >= _MIN_PARA so len heuristic alone would NOT merge it.
    # Only the open-sentence rule (prev doesn't end with terminal punct) should merge it.
    first_half = "Alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu nu xi. \n" * 3
    first_half += "masses between five and"  # no terminal punctuation — ends open
    # second_half starts with "2" (digit, not lowercase) and is >= 200 chars
    second_half = ("2.5 something more here delta epsilon zeta eta theta "
                   "iota kappa lambda mu nu. " * 4)
    md = "## S\n\n" + first_half + "\n\n" + second_half
    paras, _ = segment_markdown(md)
    # Digit-starting continuation merged because prev para ended mid-sentence
    assert len(paras) == 1, (
        f"Expected 1 merged paragraph, got {len(paras)}: "
        + str([p['text'][:80] for p in paras])
    )


FIXTURE = Path(__file__).parent / "fixtures" / "mowla_iyer_2024.pdf"

# Small markdown document: two long intro paragraphs, one methods paragraph,
# a figure caption and a references section, separated by blank lines.
MD = "\n\n".join([
    "# The Firefly Sparkle",
    "## Introduction",
    "First intro paragraph. " * 15,
    "Second intro paragraph. " * 15,
    "## Methods",
    "Methods paragraph here. " * 15,
    "**Figure 1.** A caption that should be dropped.",
    "## References",
    "[1] Someone et al. 2020\n",
])


def test_segment_basic():
    """Paragraphs carry section name, first-of-section flag and a sequential id."""
    paras, _ = segment_markdown(MD)
    sections = [p["section"] for p in paras]
    assert sections == ["Introduction", "Introduction", "Methods"]
    assert paras[0]["firstOfSection"] and not paras[1]["firstOfSection"]
    assert paras[2]["firstOfSection"]
    assert paras[0]["id"] == "p1"
    assert all(len(p["text"]) >= 200 for p in paras)


def test_segment_drops_captions_and_references():
    """Captions are dropped; reference text is returned separately, not as paragraphs."""
    paras, raw_refs = segment_markdown(MD)
    joined = " ".join(p["text"] for p in paras)
    assert "Figure 1" not in joined
    assert "Someone et al" not in joined  # not in paras
    assert "Someone et al" in raw_refs  # but captured in raw_refs


def test_parse_pdf_fixture():
    """The bundled fixture PDF parses into a plausible paper."""
    paper, _ = parse_pdf(FIXTURE.read_bytes(), "mowla_iyer_2024.pdf")
    assert 10 <= len(paper.paragraphs) <= 80
    assert paper.pages >= 10
    assert "firefly" in paper.title.lower() or len(paper.title) > 10


def test_segment_drops_author_affiliation_block():
    """Author list and affiliation lines must not surface as paragraphs."""
    md = "\n\n".join([
        "# A Great Paper",
        ("F.RENAUD[1] _[,]_[ 2] , K. KRALJIC[1] , J. FREUNDLICH[1] , B. MAGNELLI[3] , "
         "M. BÉTHERMIN[1] , C. ACCARD[1] , D. ISMAIL[1] , E. DADDI[3] , D. ELBAZ[3] , "
         "L. CIESLA[4] , G. MARTIN[5] , Y. \nDUBOIS[6] , S. PEIRANI[7]"),
        ("> 1 _Observatoire Astronomique de Strasbourg, Université de Strasbourg, "
         "CNRS UMR 7550, 11 rue de l'Université, F-67000 Strasbourg, France_"),
        ("> 2 _University of Strasbourg Institute for Advanced Study, "
         "5 allée du Général Rouvillois, F-67083 Strasbourg, France_"),
        "## ABSTRACT",
        ("Star-forming galaxies spend most of their lifetimes on the main sequence "
         "of star formation. ") * 4,
    ])
    paras, _ = segment_markdown(md)
    joined = " ".join(p["text"] for p in paras)
    assert "KRALJIC" not in joined and "Observatoire" not in joined
    assert any("Star-forming galaxies" in p["text"] for p in paras)


def test_clean_authors_three_plus_et_al():
    """Long author lists are cut to three names plus 'et al.', markers removed."""
    from astroparse_api.parse import _clean_authors

    raw = ("F.RENAUD[1] _[,]_[ 2] , K. KRALJIC[1] , J. FREUNDLICH[1] , "
           "B. MAGNELLI[3] , M. BÉTHERMIN[1]")
    assert _clean_authors(raw) == "F.RENAUD, K. KRALJIC, J. FREUNDLICH et al."


def test_clean_authors_short_list_unchanged():
    """A two-author list is returned as-is."""
    from astroparse_api.parse import _clean_authors

    assert _clean_authors("A. One, B. Two") == "A. One, B. Two"


# ---------------------------------------------------------------------------
# Pathological-page detection unit tests
# ---------------------------------------------------------------------------

class _FakePage:
    """Page double whose ``read_contents`` returns ``size`` filler bytes."""

    def __init__(self, size: int):
        self._size = size

    def read_contents(self) -> bytes:
        return b"x" * self._size


class _FaultyPage:
    """Page double whose ``read_contents`` always raises."""

    def read_contents(self):
        raise RuntimeError("PDF error")


class _FakeDoc:
    """Document double: the tests only need it to be iterable over pages."""

    def __init__(self, pages):
        self._pages = list(pages)

    def __iter__(self):
        return iter(self._pages)


def _doc_with_stream_sizes(sizes):
    """Build a ``_FakeDoc`` holding one ``_FakePage`` per requested stream size."""
    return _FakeDoc(_FakePage(size) for size in sizes)


def test_pathological_pages_detects_large_stream():
    """_pathological_pages returns the index of a page whose content stream
    exceeds _MAX_PAGE_STREAM_BYTES (8 MB).

    We duck-type a minimal fake doc.
    """
    from astroparse_api.parse import _MAX_PAGE_STREAM_BYTES

    # One small page, one huge page, one small page
    sizes = [100, _MAX_PAGE_STREAM_BYTES + 1, 200]
    doc = _doc_with_stream_sizes(sizes)
    bad = _pathological_pages(doc)
    assert bad == {1}, f"Expected {{1}}, got {bad}"


def test_pathological_pages_empty_when_all_small():
    """_pathological_pages returns empty set when all pages are below threshold."""
    from astroparse_api.parse import _MAX_PAGE_STREAM_BYTES

    sizes = [100, _MAX_PAGE_STREAM_BYTES - 1, 200]
    doc = _doc_with_stream_sizes(sizes)
    bad = _pathological_pages(doc)
    assert bad == set(), f"Expected empty set, got {bad}"


# ---------------------------------------------------------------------------
# Feature: parse_text — plaintext/markdown input
# ---------------------------------------------------------------------------

# Markdown note with a title, one section of two long paragraphs, and references.
_MD_DOC = "\n\n".join([
    "# Galactic Winds in Starburst Galaxies",
    "## Introduction",
    "Galaxy-scale winds driven by stellar feedback are a key mechanism for regulating " * 15,
    "The outflow rates in starburst systems can exceed the star formation rate by factors " * 15,
    "## References",
    "[1] Veilleux et al. \n2005\n[2] Rupke & Veilleux 2011\n",
])


def test_parse_text_title_from_heading():
    """The first-level heading becomes the title."""
    from astroparse_api.parse import parse_text

    result, _ = parse_text(_MD_DOC.encode(), "notes.md")
    assert result.title == "Galactic Winds in Starburst Galaxies"


def test_parse_text_two_paragraphs():
    """Both body paragraphs are kept."""
    from astroparse_api.parse import parse_text

    result, _ = parse_text(_MD_DOC.encode(), "notes.md")
    assert len(result.paragraphs) == 2


def test_parse_text_sections_correct():
    """Every paragraph is attributed to the Introduction section."""
    from astroparse_api.parse import parse_text

    result, _ = parse_text(_MD_DOC.encode(), "notes.md")
    assert all(p.section == "Introduction" for p in result.paragraphs)


def test_parse_text_references_dropped():
    """Reference entries never appear in paragraph text."""
    from astroparse_api.parse import parse_text

    result, _ = parse_text(_MD_DOC.encode(), "notes.md")
    joined = " ".join(p.text for p in result.paragraphs)
    assert "Veilleux" not in joined


def test_parse_text_pages_zero_authors_empty():
    """Text input reports zero pages and an empty author string."""
    from astroparse_api.parse import parse_text

    result, _ = parse_text(_MD_DOC.encode(), "notes.md")
    assert result.pages == 0
    assert result.authors == ""


def test_parse_text_title_fallback_to_stem():
    """Without a heading the title is the filename stem."""
    from astroparse_api.parse import parse_text

    # No heading — falls back to stem of filename
    plain = ("A " + "paragraph without any heading content describing galaxies " * 20)
    result, _ = parse_text(plain.encode(), "my_draft.txt")
    assert result.title == "my_draft"


def test_parse_text_arxiv_from_filename():
    """An arXiv-style filename populates ``arxivId``."""
    from astroparse_api.parse import parse_text

    result, _ = parse_text(_MD_DOC.encode(), "2402.08696.md")
    assert result.arxivId == "2402.08696"


def test_parse_text_utf8_replace_on_bad_bytes():
    """Invalid UTF-8 bytes do not prevent parsing."""
    from astroparse_api.parse import parse_text

    # Inject a Latin-1 byte that isn't valid UTF-8, surrounded by valid UTF-8
    bad = b"# My Title\n\n" + b"Good text here. " * 20 + b"\xff" + b" more text. \n" * 10
    result, _ = parse_text(bad, "test.md")
    assert result.title == "My Title"


def test_pathological_pages_handles_read_contents_exception():
    """_pathological_pages silently skips pages where read_contents raises."""
    bad = _pathological_pages(_FakeDoc([_FaultyPage()]))
    assert bad == set()


# ---------------------------------------------------------------------------
# Integration test: real pathological PDF
# ---------------------------------------------------------------------------

def test_parse_pdf_handles_vector_heavy_pages():
    """parse_pdf completes within 60s on the JWST paper (page 3 has 65 MB vector
    drawings that previously caused pymupdf4llm to run for 400+s)."""
    pdf = Path(__file__).parent.parent.parent / "test_papers" / "s41586-024-08293-0.pdf"
    if not pdf.exists():
        pytest.skip("test paper not present")

    # perf_counter is monotonic, so a wall-clock adjustment during the run
    # cannot distort the measured duration.
    t0 = time.perf_counter()
    paper, _ = parse_pdf(pdf.read_bytes(), pdf.name)
    elapsed = time.perf_counter() - t0

    assert elapsed < 60, f"parse_pdf took {elapsed:.1f}s — expected < 60s"
    assert len(paper.paragraphs) >= 30, (
        f"Expected >= 30 paragraphs, got {len(paper.paragraphs)}"
    )
    joined = " ".join(p.text for p in paper.paragraphs)
    assert (
        "Firefly" in paper.title or "Firefly" in joined or len(joined) > 10000
    ), "Expected Firefly Sparkle content or substantial text"


# --- Fix 0.1: pending-head buffer saves section openers ---

def test_pending_buffer_saves_section_opener():
    """A short block at a section boundary should be kept, not dropped."""
    short_opener = "We begin with a key observation here."  # < 200 chars
    long_body = "The stellar mass function at high redshift is uncertain. \n" * 6
    md = f"## Section A\n\n{short_opener}\n\n{long_body}"
    paras, _ = segment_markdown(md)
    joined = " ".join(p["text"] for p in paras)
    assert "key observation" in joined, "short section opener was dropped (pending buffer failed)"


# --- Fix 0.2: skip-and-resume keeps appendix content ---

def test_references_captured_and_appendix_preserved():
    """Content after the References heading, in an Appendix, should appear in paras."""
    body = "Star-forming galaxies remain on the main sequence for extended periods. " * 5
    ref_body = "[1] Smith et al. 2020 ApJ 900 1"
    appendix_body = "This appendix derives the mass-to-light ratio correction. " * 5
    md = (
        f"## Introduction\n\n{body}\n\n"
        f"## References\n\n{ref_body}\n\n"
        f"## Appendix A\n\n{appendix_body}"
    )
    paras, raw_refs = segment_markdown(md)
    joined = " ".join(p["text"] for p in paras)
    assert "mass-to-light ratio" in joined, "appendix content was discarded after References"
    assert "Smith et al." in raw_refs, "raw_refs should contain references section text"


# --- Fix 0.3: headings chain attached to paragraphs ---

def test_headings_chain_on_paragraph():
    """First paragraph under a heading should carry the heading in its headings list."""
    body = "Galaxy evolution proceeds through mergers and secular processes. " * 5
    md = f"## Methods\n\n{body}"
    paras, _ = segment_markdown(md)
    assert len(paras) == 1
    assert "Methods" in paras[0]["headings"]


def test_headings_chain_consecutive_headings():
    """Two consecutive headings both appear in the next paragraph's headings list."""
    body = "The SFR surface density is measured from UV continuum emission. " * 5
    md = f"## Results\n\n### 3.1 Star Formation\n\n{body}"
    paras, _ = segment_markdown(md)
    assert len(paras) == 1
    assert paras[0]["headings"] == ["Results", "3.1 Star Formation"]


# --- Fix 0.4: heading hygiene ---

def test_heading_hygiene_filters_page_numbers():
    """A heading shaped like '456 K. G. Iyer et al.' should be discarded."""
    body = "Outflow rates exceed the star formation rate in starburst systems. " * 5
    md = (
        "## Introduction\n\n" + body
        + "\n\n## 456 K. G. Iyer et al.\n\n"
        + "## Results\n\n" + body
    )
    paras, _ = segment_markdown(md)
    sections = [p["section"] for p in paras]
    assert "456 K. G. Iyer et al." not in sections


# --- Fix 0.5: caption gate allows figure-referencing prose ---

def test_caption_gate_allows_figure_prose():
    """'Figure 4 shows that the fraction...' is prose and must not be dropped."""
    prose = ("Figure 4 shows that the fraction of star-forming galaxies decreases "
             "sharply above a stellar mass of 10^10.5 solar masses, consistent with "
             "quenching operating preferentially at the high-mass end of the distribution "
             "and in dense environments where AGN feedback is more effective.")
    md = "## Results\n\n" + prose
    paras, _ = segment_markdown(md)
    assert any(
        "fraction of star-forming galaxies" in p["text"] for p in paras
    ), "caption gate dropped figure-referencing prose"


def test_caption_gate_still_drops_real_captions():
    """'Figure 4. The fraction...' is a caption and should be dropped."""
    caption = "Figure 4. The fraction of star-forming galaxies as a function of stellar mass."
    body = "Galaxy stellar mass functions have been measured across cosmic time. " * 5
    md = f"## Results\n\n{body}\n\n{caption}"
    paras, _ = segment_markdown(md)
    joined = " ".join(p["text"] for p in paras)
    assert "The fraction of star-forming galaxies as a function" not in joined


def test_heading_not_leaked_from_empty_section():
    """Headings from a section with no prose don't appear in the next section's paragraph."""
    body = "The stellar mass function evolves significantly from z=2 to the present day. \n" * 5
    # Ghost section has no content (heading immediately followed by another heading)
    md = "## Ghost Section\n\n## Real Section\n\n" + body
    paras, _ = segment_markdown(md)
    assert len(paras) == 1
    # "Ghost Section" must NOT appear in the paragraph's headings
    assert "Ghost Section" not in paras[0]["headings"], (
        f"Ghost Section leaked into headings: {paras[0]['headings']}"
    )
    assert "Real Section" in paras[0]["headings"]


# --- Fix 0.6: length measured post-demarkdown ---

def test_length_measured_post_demarkdown():
    """A paragraph with heavy markdown markup that is still >= 200 chars after
    demarkdown must survive."""
    # Raw text has lots of emphasis markers; after demarkdown the content is still >= 200 chars.
    core = ("_z_ ≈ 3 sources show Lyman-alpha emission tracing the neutral hydrogen distribution "
            "in high-redshift galaxies at cosmic noon. "
            "The _flux_ calibration introduces _systematic_ uncertainties of order 10 per cent, "
            "which must be accounted for in the _stellar mass_ estimates.")
    md = "## Discussion\n\n" + core
    paras, _ = segment_markdown(md)
    assert len(paras) == 1, "post-demarkdown paragraph was dropped by length gate"