| import time |
| from pathlib import Path |
| from astroparse_api.parse import segment_markdown, parse_pdf, _pathological_pages |
|
|
|
|
| |
| |
| |
|
|
def test_picture_text_blocks_filtered():
    """A 'Begin of picture text' ... 'End of picture text' run is removed; surrounding prose stays."""
    from astroparse_api.parse import strip_picture_text

    picture_block = [
        "**----- Begin of picture text -----**",
        "z = 5.0 z = 4.0",
        "1.5",
        "R [kpc]",
        "**----- End of picture text -----**",
    ]
    source_lines = ["Real prose before.", *picture_block, "Real prose after."]
    expected = ["Real prose before.", "Real prose after."]

    assert strip_picture_text(source_lines) == expected
|
|
|
|
def test_picture_text_start_end_variants():
    """Handles 'Start of picture text' / 'End of picture text' (pymupdf4llm 1.27.x)."""
    from astroparse_api.parse import strip_picture_text

    # The end marker here shares a line with <br>-joined axis text.
    opening_marker = "**----- Start of picture text -----**<br>"
    inline_tail = "axis label stuff<br>more axis<br>**----- End of picture text -----**<br>"
    source_lines = ["Prose A.", opening_marker, inline_tail, "Prose B."]

    cleaned = strip_picture_text(source_lines)

    assert cleaned == ["Prose A.", "Prose B."]
|
|
|
|
def test_picture_text_multiline_block():
    """Handles multi-line blocks between start/end markers."""
    from astroparse_api.parse import strip_picture_text

    figure_lines = ["z = 5.0 z = 4.0", "1.5", "R [kpc]"]
    source_lines = (
        ["Before.", "**----- Start of picture text -----**<br>"]
        + figure_lines
        + ["**----- End of picture text -----**<br>", "After."]
    )

    cleaned = strip_picture_text(source_lines)

    assert cleaned == ["Before.", "After."]
|
|
|
|
| |
| |
| |
|
|
def test_demarkdown_cleans_math_debris():
    """demarkdown removes emphasis markers and rebuilds numbers/exponents from math debris."""
    from astroparse_api.parse import demarkdown

    raw = (
        "with a peak at _z ≈_ 4, masses between 5 _._ 7 _×_ 10[5] M _⊙_ and "
        "2 _._ 5 _×_ 10[11] M _⊙_ ."
    )
    cleaned = demarkdown(raw)

    # No markdown emphasis characters may survive.
    for marker in ("_", "**"):
        assert marker not in cleaned
    # Decimal numbers are rejoined and bracketed exponents become superscripts.
    for fragment in ("5.7×10⁵", "2.5×10¹¹", "M ⊙"):
        assert fragment in cleaned
|
|
|
|
def test_demarkdown_bold_stripped():
    """Bold and italic markers are removed while the wrapped words are kept."""
    from astroparse_api.parse import demarkdown

    cleaned = demarkdown("This is **very important** and _emphasized_ text.")

    for marker in ("**", "_"):
        assert marker not in cleaned
    for phrase in ("very important", "emphasized"):
        assert phrase in cleaned
|
|
|
|
def test_demarkdown_backtick_stripped():
    """Inline-code backticks are removed while the enclosed text is kept."""
    from astroparse_api.parse import demarkdown

    cleaned = demarkdown("Use `code` here.")

    assert "`" not in cleaned
    assert "code" in cleaned
|
|
|
|
def test_demarkdown_bracketed_exponents():
    """Bracketed exponents such as '10[5]' are rendered with Unicode superscripts."""
    from astroparse_api.parse import demarkdown

    cases = (
        ("10[5]", "10⁵"),
        ("10[11]", "10¹¹"),
        ("10[0]", "10⁰"),
    )
    for bracketed, superscripted in cases:
        assert superscripted in demarkdown(bracketed)
|
|
|
|
| |
| |
| |
|
|
def test_merge_continuation_after_open_sentence():
    """A block that ends mid-sentence is merged with the block that follows it."""
    closed_sentence = "Alpha beta gamma delta epsilon zeta eta theta iota kappa lambda mu nu xi. "
    # Three complete sentences, then a fragment with no terminating punctuation.
    opening = closed_sentence * 3 + "masses between five and"
    continuation_unit = (
        "2.5 something more here delta epsilon zeta eta theta "
        "iota kappa lambda mu nu. "
    )
    continuation = continuation_unit * 4

    md = "\n\n".join(["## S", opening, continuation])
    paras, _ = segment_markdown(md)

    previews = [p['text'][:80] for p in paras]
    assert len(paras) == 1, (
        f"Expected 1 merged paragraph, got {len(paras)}: " + str(previews)
    )
|
|
# Sample PDF stored in the "fixtures" directory next to this test module.
FIXTURE = Path(__file__).parent.joinpath("fixtures", "mowla_iyer_2024.pdf")
|
|
# Synthetic markdown paper: two Introduction paragraphs, one Methods paragraph,
# a bold figure caption, and a References section. Blocks are separated by one
# blank line; the document ends with a single trailing newline.
MD = "\n\n".join(
    [
        "# The Firefly Sparkle",
        "## Introduction",
        "First intro paragraph. " * 15,
        "Second intro paragraph. " * 15,
        "## Methods",
        "Methods paragraph here. " * 15,
        "**Figure 1.** A caption that should be dropped.",
        "## References",
        "[1] Someone et al. 2020\n",
    ]
)
|
|
def test_segment_basic():
    """Segmenting MD yields three paragraphs with section names, first-of-section flags and ids."""
    paragraphs, _ = segment_markdown(MD)

    section_names = [para["section"] for para in paragraphs]
    assert section_names == ["Introduction", "Introduction", "Methods"]

    # Only the first paragraph under each heading is flagged as a section opener.
    assert paragraphs[0]["firstOfSection"]
    assert not paragraphs[1]["firstOfSection"]
    assert paragraphs[2]["firstOfSection"]

    assert paragraphs[0]["id"] == "p1"
    for para in paragraphs:
        assert len(para["text"]) >= 200
|
|
def test_segment_drops_captions_and_references():
    """Caption and reference text are absent from paragraphs; references land in raw_refs."""
    paragraphs, raw_refs = segment_markdown(MD)
    body_text = " ".join(para["text"] for para in paragraphs)

    for excluded in ("Figure 1", "Someone et al"):
        assert excluded not in body_text
    assert "Someone et al" in raw_refs
|
|
def test_parse_pdf_fixture():
    """Parsing the bundled fixture PDF gives a plausible paragraph count, page count and title."""
    pdf_bytes = FIXTURE.read_bytes()
    paper, _ = parse_pdf(pdf_bytes, "mowla_iyer_2024.pdf")

    paragraph_count = len(paper.paragraphs)
    assert 10 <= paragraph_count <= 80
    assert paper.pages >= 10

    title = paper.title
    assert "firefly" in title.lower() or len(title) > 10
|
|
|
|
def test_segment_drops_author_affiliation_block():
    """Author and affiliation lines before the abstract are kept out of the paragraphs."""
    abstract_sentence = (
        "Star-forming galaxies spend most of their lifetimes on the main sequence of star formation. "
    )
    md = """# A Great Paper

F.RENAUD[1] _[,]_[ 2] , K. KRALJIC[1] , J. FREUNDLICH[1] , B. MAGNELLI[3] , M. BÉTHERMIN[1] , C. ACCARD[1] , D. ISMAIL[1] , E. DADDI[3] , D. ELBAZ[3] , L. CIESLA[4] , G. MARTIN[5] , Y. DUBOIS[6] , S. PEIRANI[7]
> 1 _Observatoire Astronomique de Strasbourg, Université de Strasbourg, CNRS UMR 7550, 11 rue de l'Université, F-67000 Strasbourg, France_
> 2 _University of Strasbourg Institute for Advanced Study, 5 allée du Général Rouvillois, F-67083 Strasbourg, France_

## ABSTRACT

""" + (abstract_sentence * 4)

    paragraphs, _ = segment_markdown(md)
    body_text = " ".join(para["text"] for para in paragraphs)

    # Neither an author surname nor an affiliation institute may leak into prose.
    assert "KRALJIC" not in body_text
    assert "Observatoire" not in body_text
    # The abstract itself must still be present.
    assert any("Star-forming galaxies" in para["text"] for para in paragraphs)
|
|
|
|
def test_clean_authors_three_plus_et_al():
    """A five-author list with affiliation markers becomes the first three names plus 'et al.'."""
    from astroparse_api.parse import _clean_authors

    author_line = (
        "F.RENAUD[1] _[,]_[ 2] , K. KRALJIC[1] , J. FREUNDLICH[1] , "
        "B. MAGNELLI[3] , M. BÉTHERMIN[1]"
    )
    expected = "F.RENAUD, K. KRALJIC, J. FREUNDLICH et al."

    assert _clean_authors(author_line) == expected
|
|
|
|
def test_clean_authors_short_list_unchanged():
    """A clean two-author list is returned as given."""
    from astroparse_api.parse import _clean_authors

    two_authors = "A. One, B. Two"
    assert _clean_authors(two_authors) == two_authors
|
|
|
|
| |
| |
| |
|
|
def test_pathological_pages_detects_large_stream():
    """_pathological_pages reports the index of a page whose content stream
    is larger than _MAX_PAGE_STREAM_BYTES. A minimal duck-typed document is used."""
    from astroparse_api.parse import _MAX_PAGE_STREAM_BYTES

    class _StubPage:
        """Page stand-in whose content stream has a chosen length."""

        def __init__(self, n_bytes: int):
            self._n_bytes = n_bytes

        def read_contents(self) -> bytes:
            return b"x" * self._n_bytes

    class _StubDoc:
        """Iterable document stand-in built from a list of stream sizes."""

        def __init__(self, page_sizes):
            self._pages = [_StubPage(n) for n in page_sizes]

        def __iter__(self):
            return iter(self._pages)

    # Only the middle page (index 1) is over the limit.
    oversized = _MAX_PAGE_STREAM_BYTES + 1
    bad = _pathological_pages(_StubDoc([100, oversized, 200]))

    assert bad == {1}, f"Expected {{1}}, got {bad}"
|
|
|
|
def test_pathological_pages_empty_when_all_small():
    """_pathological_pages returns an empty set when every page is below the threshold."""
    from astroparse_api.parse import _MAX_PAGE_STREAM_BYTES

    class _StubPage:
        """Page stand-in whose content stream has a chosen length."""

        def __init__(self, n_bytes: int):
            self._n_bytes = n_bytes

        def read_contents(self) -> bytes:
            return b"x" * self._n_bytes

    class _StubDoc:
        """Iterable document stand-in built from a list of stream sizes."""

        def __init__(self, page_sizes):
            self._pages = [_StubPage(n) for n in page_sizes]

        def __iter__(self):
            return iter(self._pages)

    # The largest page sits one byte under the limit.
    just_under = _MAX_PAGE_STREAM_BYTES - 1
    bad = _pathological_pages(_StubDoc([100, just_under, 200]))

    assert bad == set(), f"Expected empty set, got {bad}"
|
|
|
|
| |
| |
| |
|
|
# Synthetic markdown document for the parse_text tests: a title heading, an
# Introduction with two long paragraphs, and a two-entry References section.
_MD_DOC = "\n\n".join(
    [
        "# Galactic Winds in Starburst Galaxies",
        "## Introduction",
        "Galaxy-scale winds driven by stellar feedback are a key mechanism for regulating " * 15,
        "The outflow rates in starburst systems can exceed the star formation rate by factors " * 15,
        "## References",
        "[1] Veilleux et al. 2005\n[2] Rupke & Veilleux 2011\n",
    ]
)
|
|
def test_parse_text_title_from_heading():
    """parse_text uses the document's top-level heading as the title."""
    from astroparse_api.parse import parse_text

    encoded = _MD_DOC.encode()
    parsed, _ = parse_text(encoded, "notes.md")

    assert parsed.title == "Galactic Winds in Starburst Galaxies"
|
|
|
|
def test_parse_text_two_paragraphs():
    """The sample document produces exactly two paragraphs."""
    from astroparse_api.parse import parse_text

    encoded = _MD_DOC.encode()
    parsed, _ = parse_text(encoded, "notes.md")

    paragraph_count = len(parsed.paragraphs)
    assert paragraph_count == 2
|
|
|
|
def test_parse_text_sections_correct():
    """Every paragraph of the sample document is attributed to 'Introduction'."""
    from astroparse_api.parse import parse_text

    encoded = _MD_DOC.encode()
    parsed, _ = parse_text(encoded, "notes.md")

    assert not any(para.section != "Introduction" for para in parsed.paragraphs)
|
|
|
|
def test_parse_text_references_dropped():
    """Reference entries do not appear in the parsed paragraph text."""
    from astroparse_api.parse import parse_text

    encoded = _MD_DOC.encode()
    parsed, _ = parse_text(encoded, "notes.md")

    body_text = " ".join(para.text for para in parsed.paragraphs)
    assert "Veilleux" not in body_text
|
|
|
|
def test_parse_text_pages_zero_authors_empty():
    """For text input, the page count is 0 and the authors string is empty."""
    from astroparse_api.parse import parse_text

    encoded = _MD_DOC.encode()
    parsed, _ = parse_text(encoded, "notes.md")

    assert (parsed.pages, parsed.authors) == (0, "")
|
|
|
|
def test_parse_text_title_fallback_to_stem():
    """With no heading in the text, the title falls back to the filename stem."""
    from astroparse_api.parse import parse_text

    filler = "paragraph without any heading content describing galaxies " * 20
    heading_free = "A " + filler
    parsed, _ = parse_text(heading_free.encode(), "my_draft.txt")

    assert parsed.title == "my_draft"
|
|
|
|
def test_parse_text_arxiv_from_filename():
    """An arXiv-style filename populates the arxivId field."""
    from astroparse_api.parse import parse_text

    encoded = _MD_DOC.encode()
    parsed, _ = parse_text(encoded, "2402.08696.md")

    assert parsed.arxivId == "2402.08696"
|
|
|
|
def test_parse_text_utf8_replace_on_bad_bytes():
    """An invalid UTF-8 byte in the body does not stop the title from being parsed."""
    from astroparse_api.parse import parse_text

    payload = b"".join(
        [
            b"# My Title\n\n",
            b"Good text here. " * 20,
            b"\xff",  # not decodable as UTF-8
            b" more text. " * 10,
        ]
    )
    parsed, _ = parse_text(payload, "test.md")

    assert parsed.title == "My Title"
|
|
|
|
def test_pathological_pages_handles_read_contents_exception():
    """_pathological_pages silently skips pages where read_contents raises."""

    class _BrokenPage:
        """Page stand-in whose content stream cannot be read."""

        def read_contents(self):
            raise RuntimeError("PDF error")

    class _SinglePageDoc:
        """Document stand-in that yields one unreadable page."""

        def __iter__(self):
            return iter([_BrokenPage()])

    flagged = _pathological_pages(_SinglePageDoc())

    assert flagged == set()
|
|
|
|
| |
| |
| |
|
|
def test_parse_pdf_handles_vector_heavy_pages():
    """parse_pdf completes within 60s on the JWST paper (page 3 has 65 MB
    vector drawings that previously caused pymupdf4llm to run for 400+s).

    The test is skipped when the paper is not present on disk. Elapsed time
    is taken from a monotonic clock so a wall-clock adjustment during the run
    cannot distort the measurement.
    """
    pdf = Path(__file__).parent.parent.parent / "test_papers" / "s41586-024-08293-0.pdf"
    if not pdf.exists():
        import pytest
        pytest.skip("test paper not present")

    # Read the file before starting the timer, as the original did, so only
    # parse_pdf itself is measured.
    pdf_bytes = pdf.read_bytes()
    t0 = time.perf_counter()
    paper, _ = parse_pdf(pdf_bytes, pdf.name)
    elapsed = time.perf_counter() - t0

    assert elapsed < 60, f"parse_pdf took {elapsed:.1f}s — expected < 60s"
    assert len(paper.paragraphs) >= 30, (
        f"Expected >= 30 paragraphs, got {len(paper.paragraphs)}"
    )
    joined = " ".join(p.text for p in paper.paragraphs)
    # Accept either recognisable paper content or simply a large amount of text.
    assert (
        "Firefly" in paper.title
        or "Firefly" in joined
        or len(joined) > 10000
    ), "Expected Firefly Sparkle content or substantial text"
|
|
|
|
| |
def test_pending_buffer_saves_section_opener():
    """A short block at a section boundary should be kept, not dropped."""
    opener = "We begin with a key observation here."
    body = "The stellar mass function at high redshift is uncertain. " * 6
    md = "\n\n".join(["## Section A", opener, body])

    paragraphs, _ = segment_markdown(md)
    body_text = " ".join(para["text"] for para in paragraphs)

    assert "key observation" in body_text, "short section opener was dropped (pending buffer failed)"
|
|
| |
def test_references_captured_and_appendix_preserved():
    """Content after the References heading, in an Appendix, should appear in paras."""
    intro_text = "Star-forming galaxies remain on the main sequence for extended periods. " * 5
    reference_entry = "[1] Smith et al. 2020 ApJ 900 1"
    appendix_text = "This appendix derives the mass-to-light ratio correction. " * 5
    md = "\n\n".join(
        [
            "## Introduction",
            intro_text,
            "## References",
            reference_entry,
            "## Appendix A",
            appendix_text,
        ]
    )

    paragraphs, raw_refs = segment_markdown(md)
    body_text = " ".join(para["text"] for para in paragraphs)

    assert "mass-to-light ratio" in body_text, "appendix content was discarded after References"
    assert "Smith et al." in raw_refs, "raw_refs should contain references section text"
|
|
| |
def test_headings_chain_on_paragraph():
    """First paragraph under a heading should carry the heading in its headings list."""
    prose = "Galaxy evolution proceeds through mergers and secular processes. " * 5
    md = "## Methods\n\n" + prose

    paragraphs, _ = segment_markdown(md)

    assert len(paragraphs) == 1
    only_para = paragraphs[0]
    assert "Methods" in only_para["headings"]
|
|
def test_headings_chain_consecutive_headings():
    """Two consecutive headings both appear in the next paragraph's headings list."""
    prose = "The SFR surface density is measured from UV continuum emission. " * 5
    md = "\n\n".join(["## Results", "### 3.1 Star Formation", prose])

    paragraphs, _ = segment_markdown(md)

    assert len(paragraphs) == 1
    only_para = paragraphs[0]
    assert only_para["headings"] == ["Results", "3.1 Star Formation"]
|
|
| |
def test_heading_hygiene_filters_page_numbers():
    """A heading shaped like '456 K. G. Iyer et al.' should be discarded.

    The running-header line sits between two real sections. The test first
    checks that paragraphs were produced, so the "not in" assertion cannot
    pass just because the segmenter returned nothing.
    """
    body = "Outflow rates exceed the star formation rate in starburst systems. " * 5
    md = (
        "## Introduction\n\n" + body +
        "\n\n## 456 K. G. Iyer et al.\n\n" +
        "## Results\n\n" + body
    )
    paras, _ = segment_markdown(md)
    sections = [p["section"] for p in paras]

    # Guard against a vacuous pass when no paragraphs come back at all.
    assert sections, "segment_markdown produced no paragraphs"
    assert "456 K. G. Iyer et al." not in sections
|
|
| |
def test_caption_gate_allows_figure_prose():
    """'Figure 4 shows that the fraction...' is prose and must not be dropped."""
    figure_sentence = (
        "Figure 4 shows that the fraction of star-forming galaxies decreases "
        "sharply above a stellar mass of 10^10.5 solar masses, consistent with "
        "quenching operating preferentially at the high-mass end of the distribution "
        "and in dense environments where AGN feedback is more effective."
    )
    md = "\n\n".join(["## Results", figure_sentence])

    paragraphs, _ = segment_markdown(md)

    kept = any("fraction of star-forming galaxies" in para["text"] for para in paragraphs)
    assert kept, "caption gate dropped figure-referencing prose"
|
|
def test_caption_gate_still_drops_real_captions():
    """'Figure 4. The fraction...' is a caption and should be dropped.

    The test also checks that the body prose before the caption survives, so
    it cannot pass just because the segmenter discarded every block.
    """
    caption = "Figure 4. The fraction of star-forming galaxies as a function of stellar mass."
    body = "Galaxy stellar mass functions have been measured across cosmic time. " * 5
    md = f"## Results\n\n{body}\n\n{caption}"
    paras, _ = segment_markdown(md)
    joined = " ".join(p["text"] for p in paras)

    # Positive control: real prose must still be present.
    assert "Galaxy stellar mass functions" in joined, "body prose was dropped with the caption"
    assert "The fraction of star-forming galaxies as a function" not in joined
|
|
def test_heading_not_leaked_from_empty_section():
    """Headings from a section with no prose don't appear in the next section's paragraph."""
    prose = "The stellar mass function evolves significantly from z=2 to the present day. " * 5
    # "Ghost Section" has no body; "Real Section" follows it immediately.
    md = "\n\n".join(["## Ghost Section", "## Real Section", prose])

    paras, _ = segment_markdown(md)

    assert len(paras) == 1
    assert "Ghost Section" not in paras[0]["headings"], \
        f"Ghost Section leaked into headings: {paras[0]['headings']}"
    assert "Real Section" in paras[0]["headings"]
|
|
|
|
| |
def test_length_measured_post_demarkdown():
    """A paragraph with heavy markdown markup that is still >= 200 chars after demarkdown must survive."""
    marked_up_sentences = [
        "_z_ ≈ 3 sources show Lyman-alpha emission tracing the neutral hydrogen distribution ",
        "in high-redshift galaxies at cosmic noon. ",
        "The _flux_ calibration introduces _systematic_ uncertainties of order 10 per cent, ",
        "which must be accounted for in the _stellar mass_ estimates.",
    ]
    md = "## Discussion\n\n" + "".join(marked_up_sentences)

    paragraphs, _ = segment_markdown(md)

    assert len(paragraphs) == 1, "post-demarkdown paragraph was dropped by length gate"
|
|