# Source: TerraFin/tests/data/test_sec_edgar_parser.py
# Repository-viewer header residue (uploader avatar: sk851).
# Last commit: feat(sec_edgar): 8-K parser parity + EX-99.x exhibit fetcher
# Commit hash: 07d0380
import re
from pathlib import Path
import pytest
from TerraFin.data.providers.corporate.filings.sec_edgar import parser
# Directory holding on-disk fixtures, resolved relative to this test module.
_FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"
# Real EDGAR 8-K filing: Apple Inc., accession 0000320193-26-000011.
# Contains Item 2.02 (Results of Operations) + Item 9.01 (Exhibits) —
# the two-item shape covers both correctly-classified TopSectionTitle
# and mis-classified TitleElement cases observed with Edgar10QParser.
_SAMPLE_8K_PATH = _FIXTURES_DIR / "sample_8k_AAPL_0000320193-26-000011.html"
# Minimal synthetic filing used by the 10-Q/10-K tests below: one PART
# heading, two Item headings, and three <img> tags (a relative URL, a base64
# data URI, and an alt text whose "\n" / "\t" escapes become a real newline
# and tab because the literal is not a raw string).
_FILING_HTML = """<html><body>
<h2>PART I - FINANCIAL INFORMATION</h2>
<h3>Item 1. Financial Statements</h3>
<p>Revenue grew 10% year over year.</p>
<img src="charts/revenue.jpg" alt="Revenue chart" />
<img src="data:image/png;base64,AAAAAAAA" alt="Inline logo" />
<img src="logo.png" alt=" Leading\nand trailing\twhitespace " />
<h3>Item 2. MD&amp;A</h3>
<p>Management commentary.</p>
</body></html>"""
def test_parse_sec_filing_uses_hash_prefix_for_section_titles() -> None:
    """Part and Item titles are rendered as level-two (``## ``) headings."""
    rendered = parser.parse_sec_filing(_FILING_HTML, "10-Q")

    # sec_parser tags both "PART I" and "Item N" as TopSectionTitle, which
    # maps to the "## " prefix.
    for heading in (
        "## PART I - FINANCIAL INFORMATION",
        "## Item 1. Financial Statements",
    ):
        assert heading in rendered

    # The flat "#### " prefix from the prior implementation must be gone.
    assert "#### " not in rendered
def test_parse_filing_maps_title_element_to_h3(monkeypatch) -> None:
    """Cover the TitleElement -> ``### `` branch directly.

    Real SEC filings with explicit ``<strong>``-style sub-titles can yield
    TitleElements next to TopSectionTitles. The sec_parser entry point is
    replaced with a stub so both element kinds are guaranteed to appear.
    """
    from unittest.mock import MagicMock

    from sec_parser.semantic_elements import TitleElement, TopSectionTitle

    # ``spec=`` makes each mock pass ``isinstance`` checks for its class.
    top_title = MagicMock(spec=TopSectionTitle, text="Top")
    sub_title = MagicMock(spec=TitleElement, text="Sub")

    class _StubParser:
        """Stand-in for ``Edgar10QParser`` that ignores its HTML input."""

        def parse(self, _html):
            return [top_title, sub_title]

    monkeypatch.setattr(parser.sp, "Edgar10QParser", _StubParser)

    rendered = parser._parse_filing("<html></html>", include_images=False)

    assert "## Top" in rendered
    assert "### Sub" in rendered
def test_parse_sec_filing_omits_images_by_default() -> None:
    """Without ``include_images`` the markdown carries no image markup."""
    rendered = parser.parse_sec_filing(_FILING_HTML, "10-Q")
    for marker in ("![", "<inline-image"):
        assert marker not in rendered
def test_parse_sec_filing_includes_images_when_requested() -> None:
    """``include_images=True`` emits one markdown image per ``<img>`` tag."""
    rendered = parser.parse_sec_filing(_FILING_HTML, "10-Q", include_images=True)

    # Plain relative URL passes straight through.
    assert "![Revenue chart](charts/revenue.jpg)" in rendered

    # A data URI is swapped for a placeholder, so the raw base64 payload
    # never reaches the output.
    assert "![Inline logo](<inline-image:image/png>)" in rendered
    assert "AAAAAAAA" not in rendered

    # Whitespace inside the alt text is collapsed.
    assert "![Leading and trailing whitespace](logo.png)" in rendered
def test_image_to_md_sanitizes_long_alt() -> None:
    """A 500-character alt text is shortened and ends with an ellipsis."""
    import sec_parser as sp
    from sec_parser.semantic_elements import ImageElement

    oversized_alt = "x" * 500
    html = f'<html><body><h2>S</h2><img src="a.png" alt="{oversized_alt}" /></body></html>'

    first_image = next(
        (el for el in sp.Edgar10QParser().parse(html) if isinstance(el, ImageElement)),
        None,
    )
    assert first_image is not None, "sec_parser should emit an ImageElement"

    rendered = parser._image_to_md(first_image)

    # The alt text is cut short and closed with an ellipsis, leaving room
    # for the "](" link syntax that follows it.
    assert rendered.startswith("![xxx")
    assert rendered.endswith("\u2026](a.png)")
    assert len(rendered) < 500
def test_parse_sec_filing_raises_for_unsupported_form() -> None:
    """A form type the parser does not handle is rejected with ``ValueError``."""
    unsupported_form = "DEF 14A"
    with pytest.raises(ValueError, match="not supported"):
        parser.parse_sec_filing(_FILING_HTML, unsupported_form)
def test_parse_sec_filing_accepts_verbose_form_descriptors() -> None:
    """Decorated form names are accepted just like the bare form code.

    SEC's primaryDocDescription sometimes comes as "FORM 10-Q" or
    "10-K (Annual Report)". Loose matching preserves the caller contract.
    """
    descriptors = ("FORM 10-Q", "10-K (Annual Report)", "10-Q/A")

    # Parse everything first, then assert, so a parser failure on any
    # descriptor surfaces before the content checks.
    rendered_docs = [
        parser.parse_sec_filing(_FILING_HTML, descriptor) for descriptor in descriptors
    ]

    for rendered in rendered_docs:
        assert "PART I" in rendered
def test_parse_sec_filing_handles_none_filing_form() -> None:
    """``None`` as the form type raises the same "not supported" ``ValueError``."""
    missing_form = None
    with pytest.raises(ValueError, match="not supported"):
        parser.parse_sec_filing(_FILING_HTML, missing_form)
# ---------------------------------------------------------------------------
# 8-K parsing parity (real EDGAR fixture)
# ---------------------------------------------------------------------------
# Matches a level-two markdown heading at the start of any line whose text
# begins with an 8-K item code, i.e. "## Item <digits>.<two digits>".
_EIGHT_K_ITEM_HEADING_RE = re.compile(r"^## Item \d+\.\d{2}\b", re.MULTILINE)
def test_parse_sec_filing_8k_returns_nonempty_markdown() -> None:
    """8-K branch should accept the form and produce markdown output."""
    # Decode the fixture explicitly so the test does not depend on the
    # platform's default locale encoding.
    html = _SAMPLE_8K_PATH.read_text(encoding="utf-8")
    md = parser.parse_sec_filing(html, "8-K")
    assert isinstance(md, str)
    # Whitespace-only output counts as empty.
    assert md.strip()
def test_parse_sec_filing_8k_promotes_item_code_headings_to_level_two() -> None:
    """8-K item codes (`Item N.NN`) must surface as ## headings.

    sec_parser classifies the second item as TitleElement on this AAPL
    fixture, so the heading-promotion path in `_emit_heading` is what
    lifts it.
    """
    # Decode the fixture explicitly so the test does not depend on the
    # platform's default locale encoding.
    html = _SAMPLE_8K_PATH.read_text(encoding="utf-8")
    md = parser.parse_sec_filing(html, "8-K")
    matches = _EIGHT_K_ITEM_HEADING_RE.findall(md)
    assert matches, "expected at least one `## Item N.NN` heading in 8-K markdown"
def test_parse_sec_filing_8k_build_toc_emits_item_code_slugs() -> None:
    """`build_toc` is form-agnostic — it just needs ## headings.

    On the AAPL fixture both Item 2.02 and Item 9.01 should land in the
    TOC with item-code-style slugs.
    """
    # Decode the fixture explicitly so the test does not depend on the
    # platform's default locale encoding.
    html = _SAMPLE_8K_PATH.read_text(encoding="utf-8")
    md = parser.parse_sec_filing(html, "8-K")
    toc = parser.build_toc(md)
    slugs = [e["slug"] for e in toc]
    # The slug list doubles as the failure message for quick diagnosis.
    assert any(s.startswith("item-202") for s in slugs), slugs
    assert any(s.startswith("item-901") for s in slugs), slugs
def test_parse_sec_filing_8k_accepts_verbose_form_descriptors() -> None:
    """Decorated 8-K form names parse like the bare `8-K` code.

    EDGAR's primaryDocDescription sometimes comes as `FORM 8-K` or
    `8-K/A`. Loose matching preserves the caller contract.
    """
    # Decode the fixture explicitly so the test does not depend on the
    # platform's default locale encoding.
    html = _SAMPLE_8K_PATH.read_text(encoding="utf-8")
    md_a = parser.parse_sec_filing(html, "FORM 8-K")
    md_b = parser.parse_sec_filing(html, "8-K/A")
    assert _EIGHT_K_ITEM_HEADING_RE.search(md_a)
    assert _EIGHT_K_ITEM_HEADING_RE.search(md_b)
def test_parse_sec_filing_8k_section_body_lookup_roundtrip() -> None:
    """End-to-end: parse → build TOC → slice section body by slug.

    Intended to mirror what `service.sec_filing_section` does — confirms
    the parser output is shaped correctly for downstream consumers.
    """
    # Decode the fixture explicitly so the test does not depend on the
    # platform's default locale encoding.
    html = _SAMPLE_8K_PATH.read_text(encoding="utf-8")
    md = parser.parse_sec_filing(html, "8-K")
    toc = parser.build_toc(md)

    # Use a default so a missing slug fails with the available slugs in the
    # message rather than an opaque StopIteration.
    target = next((e for e in toc if e["slug"].startswith("item-202")), None)
    assert target is not None, [e["slug"] for e in toc]

    lines = md.split("\n")
    # The section ends at the next TOC heading, or at end of document.
    # NOTE(review): `later[0]` assumes `build_toc` returns entries in document
    # order — confirm against the implementation.
    later = [e for e in toc if e["line_index"] > target["line_index"]]
    end_line = later[0]["line_index"] if later else len(lines)
    body = "\n".join(lines[target["line_index"] + 1 : end_line]).strip()

    assert body, "Item 2.02 body should not be empty"
    # AAPL's Item 2.02 mentions a press release.
    assert "press release" in body.lower()
# Small markdown document shared by the `build_toc` tests: two level-2 PART
# headings, two level-3 Item headings under the first PART, and one prose
# line containing a "#" that is not a heading.
_SAMPLE_MD = (
"## PART I - FINANCIAL INFORMATION\n"
"\n"
"### Item 1. Financial Statements\n"
"\n"
"Some prose that mentions #tokens but is not a heading.\n"
"\n"
"### Item 2. MD&A\n"
"\n"
"## PART II - OTHER INFORMATION\n"
)
def test_build_toc_default_keeps_top_level_only_for_compact_agent_context() -> None:
    """By default only the level-2 scaffold is returned, not every sub-title."""
    expected = [
        (2, "PART I - FINANCIAL INFORMATION"),
        (2, "PART II - OTHER INFORMATION"),
    ]
    observed = [(entry["level"], entry["text"]) for entry in parser.build_toc(_SAMPLE_MD)]
    assert observed == expected
def test_build_toc_max_level_none_returns_full_hierarchy() -> None:
    """``max_level=None`` returns every heading, in document order."""
    entries = parser.build_toc(_SAMPLE_MD, max_level=None)

    outline = [(entry["level"], entry["text"]) for entry in entries]
    assert outline == [
        (2, "PART I - FINANCIAL INFORMATION"),
        (3, "Item 1. Financial Statements"),
        (3, "Item 2. MD&A"),
        (2, "PART II - OTHER INFORMATION"),
    ]

    # Every entry exposes exactly the same set of keys.
    expected_keys = {"level", "text", "line_index", "slug", "char_count"}
    assert all(set(entry) == expected_keys for entry in entries)

    assert entries[0]["slug"] == "part-i-financial-information"
def test_build_toc_char_count_aggregates_filtered_subsections() -> None:
    """A parent's ``char_count`` absorbs the sub-headings that were filtered out.

    With the level-3 headings hidden, PART I must span their heading lines
    and bodies as well as its own.
    """
    full = parser.build_toc(_SAMPLE_MD, max_level=None)
    compact = parser.build_toc(_SAMPLE_MD, max_level=2)

    # Full view stops PART I at the next ### heading; compact view runs on
    # to the next ## heading, so it must be strictly larger.
    assert compact[0]["text"] == "PART I - FINANCIAL INFORMATION"
    assert compact[0]["char_count"] > full[0]["char_count"]

    # No span may be negative or exceed the whole document (each line is
    # counted with one extra character for its newline).
    document_chars = sum(len(line) + 1 for line in _SAMPLE_MD.splitlines())
    for entry in [*compact, *full]:
        assert 0 <= entry["char_count"] <= document_chars
def test_build_toc_on_empty_or_heading_less_input() -> None:
    """Input without any heading yields an empty TOC."""
    for heading_free in ("", "No headings here.\nJust prose."):
        assert parser.build_toc(heading_free) == []
def test_heal_broken_titles_splices_mid_word_split() -> None:
    """A heading split mid-word is stitched back into one heading.

    sec_parser occasionally splits a heading mid-word (e.g. ZETA's 10-K
    gives `TopSectionTitle("Item 1. Bus")` + `TitleElement("iness.")`).
    The post-parse healer must merge the two fragments.
    """
    broken = "## Item 1. Bus\n\n### iness.\n\n### Overview\n\nBody prose.\n"

    repaired = parser._heal_broken_titles(broken)

    assert "## Item 1. Business." in repaired
    assert "### iness." not in repaired
    # The genuine sub-heading and the body text pass through untouched.
    for untouched in ("### Overview", "Body prose."):
        assert untouched in repaired
def test_heal_broken_titles_handles_multi_fragment_chain() -> None:
    """A word split across three or more fragments is still reassembled."""
    fragments = "## Item 1. Bus\n\n### iness\n\n### .\n"
    repaired = parser._heal_broken_titles(fragments)
    assert "## Item 1. Business." in repaired
def test_heal_broken_titles_leaves_legit_subheadings_alone() -> None:
    """A real lowercase sub-heading after a complete parent is not merged.

    The signal is that the parent ends with a period or other terminator.
    """
    source = "## Item 2. Properties.\n\n### overview\n\nBody.\n"
    repaired = parser._heal_broken_titles(source)
    for heading in ("## Item 2. Properties.", "### overview"):
        assert heading in repaired
def test_heal_broken_titles_does_not_merge_all_caps_parents() -> None:
    """An all-caps parent is never glued to a lowercase sub-heading.

    All-caps section titles followed by a legitimate lowercase subheading
    are a real 10-K pattern (e.g. `RISKS` + `### related to operations`).
    The old regex merged them into `RISKSrelated`; the tightened regex
    requires the parent-line tail to be title-case (one upper + two lower).
    """
    source = "## RISKS\n\n### related to operations\n\nBody.\n"
    repaired = parser._heal_broken_titles(source)

    assert "RISKSrelated" not in repaired
    assert "## RISKS" in repaired
    assert "### related to operations" in repaired
def test_heal_broken_titles_does_not_merge_possessive_parent() -> None:
    """A possessive parent such as `Company's` is left separate.

    Possessives end in a letter and so look mid-word to a naive regex, but
    the lowercase-led line that follows is a legitimate sub-heading.
    """
    source = "## Item 1. Company's\n\n### own operations\n\nBody.\n"
    repaired = parser._heal_broken_titles(source)

    assert "Company'sown" not in repaired
    assert "## Item 1. Company's" in repaired
    assert "### own operations" in repaired
def test_heal_broken_titles_does_not_merge_complete_short_parent() -> None:
    """A short but complete word like `Note` is not glued to what follows."""
    source = "## Note\n\n### overview of disclosures\n\nBody.\n"
    repaired = parser._heal_broken_titles(source)

    assert "Noteoverview" not in repaired
    assert "## Note" in repaired
    assert "### overview of disclosures" in repaired
def test_build_toc_ignores_inline_hashes() -> None:
    """A ``##`` in the middle of a line is not treated as a heading."""
    document = "This sentence has a ## middle-of-line token.\n## Real Heading"
    entries = parser.build_toc(document)

    assert len(entries) == 1
    only_entry = entries[0]
    assert only_entry["text"] == "Real Heading"
    assert only_entry["line_index"] == 1
def test_looks_like_section_heading_matches_part_and_item_patterns() -> None:
    """Part/Item prefixes are recognised as headings; other text is not.

    This predicate decides whether a sec_parser text blob should be promoted
    to a ## heading. It matches the canonical Part/Item prefixes
    (case-insensitive, with or without period/dash).

    There is no length cap: a genuine Item 7 MD&A heading can be 100+ chars,
    and even when sec_parser fuses the heading with its paragraph into one
    long blob promotion is still wanted — `_split_heading_and_body` in
    `_emit_heading` splits at the first newline so the heading line lands
    cleanly in the TOC.
    """
    heading_like = (
        "Item 7. Management's Discussion",
        "ITEM 7A. Quantitative and Qualitative",
        "Part I",
        "PART II",
        "Item 8",  # no period
        # Run-on (heading fused with body) still matches — caller splits.
        "Item 7. MD&A\nOverview\nNet income was ...",
    )
    for text in heading_like:
        assert parser._looks_like_section_heading(text) is True

    # Ordinary prose and the empty string are rejected.
    for text in ("The Company sells products", ""):
        assert parser._looks_like_section_heading(text) is False
def test_slugify_caps_output_at_80_chars() -> None:
    """Slugs are capped at 80 characters, cut at a word boundary.

    Without the cap, a sec_parser-misclassified 400-char boilerplate
    paragraph becomes a 400-char slug that floods the TOC.
    """
    boilerplate = (
        "Indicates a management contract or compensatory plan. "
        "The certifications attached as Exhibit 32.1 and Exhibit 32.2 "
        "that accompany this Annual Report"
    )

    result = parser._slugify(boilerplate)

    assert len(result) <= 80
    # Cut at a word boundary: no dangling hyphen at the end.
    assert not result.endswith("-")
    # The leading words survive, so the slug is still recognisable.
    assert result.startswith("indicates-a-management-contract")
def test_slugify_preserves_short_slugs_unchanged() -> None:
    """Titles well under the cap slugify without truncation."""
    cases = (
        ("Item 7. MD&A", "item-7-mda"),
        ("Part II", "part-ii"),
    )
    for title, expected_slug in cases:
        assert parser._slugify(title) == expected_slug
def test_emit_heading_splits_run_on_item_paragraph_into_heading_plus_body() -> None:
    """A fused heading-plus-paragraph blob is split into heading and body.

    sec_parser's Edgar10QParser routinely fuses a 10-K Item heading with the
    paragraph that follows it into a single TextElement. Before this split,
    Item 7 / Item 8 never appeared in the TOC for 10-Ks because the fused
    text was too long to recognize as a heading — ZETA's 10-K reproduced
    this.

    The fix: when a text blob starts with an `Item N.` / `Part I` marker and
    contains a newline, treat the first line as the heading and emit the
    rest as body. The length of the heading line itself no longer matters; a
    91-char `Item 7. Management's Discussion and Analysis of Financial
    Condition and Results of Operations` is a legitimate heading, not
    boilerplate.
    """
    heading_line = (
        "Item 7. Management's Discussion and Analysis of Financial Condition "
        "and Results of Operations"
    )
    fused = "\n".join(
        [heading_line, "Overview", "Net income was $X million for the year."]
    )

    output = parser._emit_heading(fused, default_level=3)

    # The heading is promoted to "##" even though the default level is 3.
    assert output.startswith("## Item 7. Management's Discussion")
    # The body lines follow the heading instead of being dropped.
    for body_fragment in ("Overview", "Net income was $X million"):
        assert body_fragment in output
def test_build_toc_keeps_long_real_headings() -> None:
    """Long but genuine headings are not filtered out of the TOC.

    A real `Item 5.` heading is often 100+ chars in a 10-K (`Item 5. Market
    for Registrant's Common Equity, Related Stockholder Matters and Issuer
    Purchases of Equity Securities`) and must still appear.
    """
    item_5_title = (
        "Item 5. Market for Registrant's Common Equity, "
        "Related Stockholder Matters and Issuer Purchases of Equity Securities"
    )
    document = (
        f"## Real Part II\n### {item_5_title}\nBody.\n### Item 7. MD&A\nMD&A body.\n"
    )

    toc_texts = [entry["text"] for entry in parser.build_toc(document, max_level=3)]

    for expected_text in (item_5_title, "Item 7. MD&A"):
        assert expected_text in toc_texts
def test_emit_heading_splits_multi_item_blob_so_every_item_lands_in_toc() -> None:
    """Every Item heading inside one fused blob is promoted, not just the first.

    sec_parser routinely fuses several 10-K Item headings and their bodies
    into one big TextElement. The ZETA 10-K failure mode: Item 7 MD&A,
    Item 7A Market Risk, and Item 8 Financial Statements all live inside a
    single blob, so only the first one used to get promoted. After the
    multi-chunk split they all surface in the TOC.
    """
    blob_lines = (
        "Item 7. Management's Discussion and Analysis of Financial Condition",
        "Overview of results.",
        "Revenue was $500M.",
        "Item 7A. Quantitative and Qualitative Disclosures About Market Risk",
        "Interest rate sensitivity analysis.",
        "Item 8. Financial Statements and Supplementary Data",
        "Net income was $50M for the year ended 2025.",
    )
    # Each line, including the last, is terminated by a newline.
    fused = "".join(f"{line}\n" for line in blob_lines)

    output = parser._emit_heading(fused, default_level=3)

    # All three Item headings come out at level two.
    for heading in (
        "## Item 7. Management's Discussion",
        "## Item 7A. Quantitative and Qualitative",
        "## Item 8. Financial Statements",
    ):
        assert heading in output

    # The body text of each section is still present.
    for body_fragment in (
        "Revenue was $500M",
        "Interest rate sensitivity",
        "Net income was $50M",
    ):
        assert body_fragment in output
def test_emit_heading_promotes_embedded_item_after_preamble() -> None:
    """An Item heading that follows leading prose in a blob is still promoted.

    If a text blob starts with non-heading prose (like a 'Table of Contents'
    breadcrumb) and an Item heading is embedded further down, the embedded
    heading must still promote — an earlier regex anchored at
    start-of-string and would miss this case, leaving the whole blob as
    unclassified body prose.
    """
    fused = "Table of Contents\nItem 7. Management's Discussion\nOverview text."

    output = parser._emit_heading(fused, default_level=3)

    # Heading promoted despite the preamble line in front of it.
    assert "## Item 7. Management's Discussion" in output
    # The preamble and the body text are both retained.
    assert "Table of Contents" in output
    assert "Overview text" in output
def test_build_toc_dedupes_colliding_slugs_with_numeric_suffix() -> None:
    """Identical slugs are disambiguated with ``-2``, ``-3``, … suffixes.

    Two sections whose titles slugify to the same string would otherwise
    alias to a single TOC entry — first-match-wins in `sec_filing_section`
    makes the second section unreachable.
    """
    # The same heading text twice is the simplest collision; it happens in
    # real 10-Ks where, e.g., Part I and Part IV both have an "Exhibits"
    # subsection.
    document = "## Part I\n### Exhibits\nA.\n## Part IV\n### Exhibits\nB.\n"

    toc_slugs = [entry["slug"] for entry in parser.build_toc(document, max_level=3)]

    # No slug is repeated.
    assert len(toc_slugs) == len(set(toc_slugs))
    # One bare `exhibits`, one suffixed `exhibits-2`.
    for expected_slug in ("exhibits", "exhibits-2"):
        assert expected_slug in toc_slugs
def test_build_toc_dedup_respects_existing_numeric_suffix_slug() -> None:
    """Dedup suffixes skip slugs already claimed by a real heading.

    If the source already has a heading whose own slug is `exhibits-2`
    (e.g. literally `## Exhibits 2`) AND two `Exhibits` headings collide,
    the collision resolver must increment past the already-used `-2` rather
    than re-emitting it. A plain per-base-slug counter would produce a
    duplicate; the used-slug set approach avoids that.
    """
    document = "## Exhibits\nA.\n## Exhibits 2\nB.\n## Exhibits\nC.\n## Exhibits\nD.\n"

    toc_slugs = [entry["slug"] for entry in parser.build_toc(document, max_level=3)]

    # Uniqueness across the whole TOC.
    assert len(toc_slugs) == len(set(toc_slugs))

    # The first `Exhibits` takes the bare slug and the literally-named
    # `Exhibits 2` keeps its natural slug.
    first, second, third, fourth = toc_slugs[0], toc_slugs[1], toc_slugs[2], toc_slugs[3]
    assert first == "exhibits"
    assert second == "exhibits-2"

    # The later duplicates must step around the slugs already taken.
    assert third != second
    assert fourth not in (second, third)
def test_build_toc_dedupes_colliding_truncated_slugs() -> None:
    """Titles that only differ past the 80-char slug cap still get unique slugs.

    Two long titles share their first 80 chars and diverge after that, so
    before dedup both truncate to the same slug.
    """
    shared_prefix = (
        "Item 1. Business overview covering segments products go to market strategy and"
    )
    # Same prefix, different tails.
    first_title = shared_prefix + " North America revenue"
    second_title = shared_prefix + " Europe operations headcount"
    document = f"## Part I\n### {first_title}\nA.\n### {second_title}\nB.\n"

    toc_slugs = [entry["slug"] for entry in parser.build_toc(document, max_level=3)]

    # No collision survives.
    assert len(toc_slugs) == len(set(toc_slugs))
def test_build_toc_strips_trailing_carriage_return_from_heading_text() -> None:
    """CRLF line endings must not leak a carriage return into slugs.

    EDGAR HTML often has CRLF line endings. If the markdown is split on
    "\\n" alone, the "\\r" stays at the end of the captured heading text and
    would surface in the slug as a `-r` suffix, breaking lookup.
    """
    crlf_document = "## Part I\r\n### Item 1. Business\r\nBody.\r\n"

    toc_slugs = [
        entry["slug"] for entry in parser.build_toc(crlf_document, max_level=3)
    ]

    assert "item-1-business" in toc_slugs
    # No slug carries the stray `-r` tail.
    assert all(not slug.endswith("-r") for slug in toc_slugs)
def test_promote_item_heading_regex_does_not_overmatch_body_text() -> None:
    """A non-numeric `### item ...` heading is not promoted to level two.

    The promote/merge regexes used to accept `\\w+` after `ITEM`, which
    matched incidental text like `### item foo` (a real H3 sub-section).
    Tightening to `\\d+[A-Z]?` prevents spurious level-2 promotions.

    The property checked here: an `### item foo bar` heading stays at
    level 3 in the healed markdown, so `build_toc(max_level=2)` ignores it.
    """
    source = "## Part II\n### item foo bar\nRegular subsection.\n"

    repaired = parser._heal_broken_titles(source)
    top_level_slugs = [
        entry["slug"] for entry in parser.build_toc(repaired, max_level=2)
    ]

    # Still level 3, so the level-2 TOC leaves it out instead of surfacing a
    # bogus entry.
    assert "item-foo-bar" not in top_level_slugs
    assert "part-ii" in top_level_slugs
def test_parse_sec_filing_8k_item_801_other_events_lands_in_toc() -> None:
    """An `Item 8.01` heading is promoted and reaches the TOC.

    Item 8.01 "Other Events" is a common 8-K item code (used for
    non-2.02/non-5.02 announcements: dividends, buyback authorizations,
    SEC settlements, etc.). The 8-K item-code regex (`\\d+\\.\\d{2}`) must
    match it the same as the more common Item 2.02 / Item 9.01.

    The input is a synthetic minimal 8-K body holding only an Item 8.01
    heading and one paragraph.
    """
    heading_html = "<p>Item 8.01 Other Events</p>"
    paragraph_html = (
        "<p>On May 21, 2026 the Company announced a $1B share repurchase "
        "authorization replacing the prior program.</p>"
    )
    document = "<html><body>" + heading_html + paragraph_html + "</body></html>"

    rendered = parser.parse_sec_filing(document, "8-K")
    # The rendered markdown is the failure message, for easier debugging.
    assert "## Item 8.01" in rendered, rendered

    toc_slugs = [entry["slug"] for entry in parser.build_toc(rendered)]
    assert any(slug.startswith("item-801") for slug in toc_slugs), toc_slugs
def test_table_to_md_returns_sec_parser_markdown_verbatim() -> None:
    """`_table_to_md` hands back the element's own markdown unchanged.

    sec_parser's table markdown is intentionally not post-processed, so the
    agent sees the same rows and columns the user renders. Strip/normalize
    would risk silent data loss on sparse tables, and the user↔agent data
    surface must stay identical.
    """
    canned_markdown = "| Header | Other |\n| cell | value |"

    class _StubTable:
        """Minimal table element exposing only ``table_to_markdown``."""

        def table_to_markdown(self) -> str:
            return "| Header | Other |\n| cell | value |"

    assert parser._table_to_md(_StubTable()) == canned_markdown