Spaces:

sk851
/

TerraFin

Running

App Files Files Community

TerraFin / tests /data /test_sec_edgar_get_sec_data.py

sk851

feat(sec_edgar): 8-K parser parity + EX-99.x exhibit fetcher

07d0380 13 days ago

raw

history blame contribute delete

12.9 kB

	from pathlib import Path

	import pandas as pd
	import pytest

	from TerraFin.data.cache import manager as cache_manager
	from TerraFin.data.providers.corporate.filings import sec_edgar as sec_pkg
	from TerraFin.data.providers.corporate.filings.sec_edgar import filing


	_FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures"
	_NVDA_8K_HTML = _FIXTURES_DIR / "sample_8k_NVDA_0001045810-26-000051.html"
	_NVDA_INDEX_HTML = _FIXTURES_DIR / "sample_8k_index_NVDA_0001045810-26-000051.html"
	_NVDA_EX99_HTML = _FIXTURES_DIR / "sample_ex99_NVDA_2026-05-20.html"


	@pytest.fixture(autouse=True)
	def _isolated_file_cache(tmp_path, monkeypatch):
	    """Run every test in this module against a private, cold cache.

	    Points ``cache_manager._FILE_CACHE_DIR`` at pytest's per-test ``tmp_path``
	    and then calls ``filing.clear_sec_filings_cache()`` before the test body
	    executes.
	    """
	    monkeypatch.setattr(cache_manager, "_FILE_CACHE_DIR", tmp_path)
	    # Start cold: reset the managed CIK/submissions/parsed caches.
	    filing.clear_sec_filings_cache()
	    # Nothing to tear down here; monkeypatch undoes the redirect by itself.


	def _install_fakes(monkeypatch, *, download_calls, parse_calls, html="<html>body</html>", parsed="parsed-md"):
	    """Swap the lookup, download and parse entry points on ``sec_pkg`` for fakes.

	    Every fake is installed through ``monkeypatch``, so it is undone when the
	    test finishes.

	    Args:
	        monkeypatch: pytest's ``monkeypatch`` fixture.
	        download_calls: list; the fake downloader appends one
	            ``(cik, accession, file_name)`` tuple per call.
	        parse_calls: list; the fake parser appends one
	            ``(filing_form, include_images)`` tuple per call.
	        html: value returned by the fake downloader.
	        parsed: prefix of the string returned by the fake parser.
	    """
	    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"AAPL": 320193})

	    def fake_get_company_filings(cik, include_8k=False, include_history=False):
	        # Always answer with the same single 10-Q row, whatever the arguments.
	        return pd.DataFrame(
	            {
	                "form": ["10-Q"],
	                "accessionNumber": ["0000320193-25-000001"],
	                "primaryDocument": ["aapl-10q.htm"],
	                "primaryDocDescription": ["10-Q"],
	            }
	        )

	    def fake_download(cik, accession, file_name):
	        download_calls.append((cik, accession, file_name))
	        return html

	    def fake_parse(html_content, filing_form, *, include_images=False):
	        parse_calls.append((filing_form, include_images))
	        # Echo the flag into the output so tests can tell the variants apart.
	        # The separator is a plain "|": "\|" is not a valid string escape and
	        # triggers a SyntaxWarning on recent Python versions.
	        return f"{parsed}|images={include_images}"

	    monkeypatch.setattr(sec_pkg, "get_company_filings", fake_get_company_filings)
	    monkeypatch.setattr(sec_pkg, "download_filing", fake_download)
	    monkeypatch.setattr(sec_pkg, "parse_sec_filing", fake_parse)


	def test_get_sec_data_caches_parsed_result(monkeypatch) -> None:
	    """A repeated ``get_sec_data`` call for the same ticker is answered from cache."""
	    download_log: list = []
	    parse_log: list = []
	    _install_fakes(monkeypatch, download_calls=download_log, parse_calls=parse_log)

	    cold = sec_pkg.get_sec_data("AAPL")
	    warm = sec_pkg.get_sec_data("AAPL")

	    # Both calls yield the same document ...
	    assert cold.markdown == warm.markdown
	    assert cold.ticker == "AAPL"
	    # ... but only the first one reached the downloader and the parser.
	    assert len(download_log) == 1, "second call must skip download"
	    assert len(parse_log) == 1, "second call must skip parse"


	def test_get_sec_data_caches_per_include_images_flag(monkeypatch) -> None:
	    """Each ``include_images`` value is cached under its own entry."""
	    download_log: list = []
	    parse_log: list = []
	    _install_fakes(monkeypatch, download_calls=download_log, parse_calls=parse_log)

	    plain = sec_pkg.get_sec_data("AAPL", include_images=False)
	    illustrated = sec_pkg.get_sec_data("AAPL", include_images=True)

	    assert "images=False" in plain.markdown
	    assert "images=True" in illustrated.markdown
	    # Separate cache entries per flag, hence one fetch and one parse for each.
	    assert len(download_log) == 2
	    assert len(parse_log) == 2

	    # Asking again with either flag must not add a download or a parse.
	    for flag in (False, True):
	        sec_pkg.get_sec_data("AAPL", include_images=flag)
	    assert len(download_log) == 2
	    assert len(parse_log) == 2


	def test_clear_sec_filings_cache_invalidates_parsed_output(monkeypatch) -> None:
	    """Clearing the SEC cache between two calls forces a second download and parse."""
	    download_log: list = []
	    parse_log: list = []
	    _install_fakes(monkeypatch, download_calls=download_log, parse_calls=parse_log)

	    # Warm the cache, wipe it, then ask for the same filing again.
	    sec_pkg.get_sec_data("AAPL")
	    filing.clear_sec_filings_cache()
	    sec_pkg.get_sec_data("AAPL")

	    assert len(download_log) == 2, "clear must force a re-download on the next call"
	    assert len(parse_log) == 2


	def test_get_sec_data_raises_for_unknown_ticker(monkeypatch) -> None:
	    """A ticker absent from the ticker-to-CIK map is rejected with ``ValueError``."""

	    def fake_ticker_map():
	        # Only AAPL is known, so any other symbol has no CIK.
	        return {"AAPL": 320193}

	    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", fake_ticker_map)

	    with pytest.raises(ValueError, match="CIK not found"):
	        sec_pkg.get_sec_data("BOGUS")


	def test_clear_sec_filings_cache_also_resets_in_memory_ticker_memo(monkeypatch) -> None:
	    """Clearing must drop the in-memory ticker memo together with the file cache.

	    If a stale in-memory dict survived the clear, the second lookup below
	    would be answered from it and never reach ``_fetch_json``.
	    """
	    requested_urls: list[str] = []

	    def fake_fetch_json(url: str, *, host_url: str = "data.sec.gov") -> dict:
	        requested_urls.append(url)
	        return {"data": [["AAPL", 320193]], "fields": ["ticker", "cik"]}

	    monkeypatch.setattr(filing, "_fetch_json", fake_fetch_json)

	    # Cold lookup: exactly one upstream fetch.
	    filing.get_ticker_to_cik_dict_cached()
	    assert len(requested_urls) == 1

	    # After a clear, the next lookup has to fetch again.
	    filing.clear_sec_filings_cache()
	    filing.get_ticker_to_cik_dict_cached()
	    assert len(requested_urls) == 2, "clear must invalidate both file cache and in-memory memo"


	def test_get_sec_toc_default_is_top_level_only(monkeypatch) -> None:
	    """With the default max_level=2 the TOC lists the Part heading, not each Item."""
	    download_log: list = []
	    parse_log: list = []
	    fake_markdown = "## PART I\n\n### Item 1\n\nbody\n\n### Item 2\n"
	    _install_fakes(
	        monkeypatch,
	        download_calls=download_log,
	        parse_calls=parse_log,
	        parsed=fake_markdown,
	    )

	    toc = sec_pkg.get_sec_toc("AAPL")

	    outline = [(entry.level, entry.title) for entry in toc]
	    assert outline == [(2, "PART I")]
	    # Every entry must carry both an id and an anchor.
	    for entry in toc:
	        assert entry.id and entry.anchor

	    # A follow-up get_sec_data with the same flags reuses what get_sec_toc cached.
	    sec_pkg.get_sec_data("AAPL")
	    assert len(download_log) == 1
	    assert len(parse_log) == 1


	def test_get_sec_data_8k_appends_ex99_exhibits(monkeypatch) -> None:
	    """End-to-end-ish: real NVDA 8-K body + real index.html + real EX-99.1 PR
	    (all loaded from fixtures, network mocked at `_fetch_text`). The returned
	    markdown must include both the 8-K item heading(s) AND an exhibit heading."""
	    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"NVDA": 1045810})

	    def fake_get_company_filings(cik, include_8k=False, include_history=False):
	        return pd.DataFrame(
	            {
	                "form": ["8-K"],
	                "accessionNumber": ["0001045810-26-000051"],
	                "primaryDocument": ["nvda-20260520.htm"],
	                "primaryDocDescription": ["8-K"],
	                "filingDate": ["2026-05-20"],
	            }
	        )

	    monkeypatch.setattr(sec_pkg, "get_company_filings", fake_get_company_filings)

	    # Decode explicitly: read_text() would otherwise use the locale's encoding
	    # and could read the fixtures differently from one platform to the next.
	    # NOTE(review): assumes the fixtures are stored as UTF-8 — confirm.
	    body_html = _NVDA_8K_HTML.read_text(encoding="utf-8")
	    index_html = _NVDA_INDEX_HTML.read_text(encoding="utf-8")
	    ex99_html = _NVDA_EX99_HTML.read_text(encoding="utf-8")

	    def fake_fetch_text(url: str, *, host_url: str = "www.sec.gov") -> str:
	        if url.endswith("-index.html"):
	            return index_html
	        if url.endswith("nvda-20260520.htm"):
	            return body_html
	        # Both exhibit documents are served the same EX-99 fixture.
	        if url.endswith(("q1fy27pr.htm", "q1fy27cfocommentary.htm")):
	            return ex99_html
	        # Any other URL means the orchestrator fetched something unplanned.
	        raise AssertionError(f"unexpected URL: {url}")

	    monkeypatch.setattr(filing, "_fetch_text", fake_fetch_text)

	    doc = sec_pkg.get_sec_data("NVDA", filing_type="8-K")

	    assert "## Item " in doc.markdown, "primary 8-K item heading missing"
	    assert "## Exhibit 99.1" in doc.markdown, "EX-99.1 exhibit heading missing"
	    # And the slug surfaces in the TOC.
	    slugs = [e.id for e in doc.toc]
	    # `_slugify` strips the dot in `99.1`, so the slug is `exhibit-991-...`.
	    assert any(s.startswith("exhibit-991") for s in slugs), slugs


	def test_get_sec_data_8k_survives_missing_accession_index(monkeypatch) -> None:
	    """If the accession-index fetch 404s, the orchestrator still returns the
	    parsed primary 8-K body — just without exhibits."""
	    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"NVDA": 1045810})

	    def fake_get_company_filings(*args, **kwargs):
	        # Accept any call shape (positional or keyword flags such as
	        # include_8k / include_history) and always return one 8-K row.
	        return pd.DataFrame(
	            {
	                "form": ["8-K"],
	                "accessionNumber": ["0001045810-26-000051"],
	                "primaryDocument": ["nvda-20260520.htm"],
	                "primaryDocDescription": ["8-K"],
	                "filingDate": ["2026-05-20"],
	            }
	        )

	    monkeypatch.setattr(sec_pkg, "get_company_filings", fake_get_company_filings)

	    # Decode explicitly rather than relying on the locale's default encoding.
	    # NOTE(review): assumes the fixture is stored as UTF-8 — confirm.
	    body_html = _NVDA_8K_HTML.read_text(encoding="utf-8")

	    def fake_fetch_text(url: str, *, host_url: str = "www.sec.gov") -> str:
	        if url.endswith("-index.html"):
	            # Simulate the accession index being unavailable.
	            raise filing.SecEdgarUnavailableError("simulated 404")
	        if url.endswith("nvda-20260520.htm"):
	            return body_html
	        raise AssertionError(f"unexpected URL: {url}")

	    monkeypatch.setattr(filing, "_fetch_text", fake_fetch_text)

	    doc = sec_pkg.get_sec_data("NVDA", filing_type="8-K")
	    assert "## Item " in doc.markdown
	    assert "## Exhibit" not in doc.markdown


	def test_get_sec_data_8k_marks_unreachable_exhibit(monkeypatch) -> None:
	    """If the index resolves but a specific exhibit 404s, the orchestrator
	    emits a ``(fetch failed)`` marker so the caller knows it existed."""
	    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"NVDA": 1045810})

	    def fake_get_company_filings(*args, **kwargs):
	        # Accept any call shape (positional or keyword flags such as
	        # include_8k / include_history) and always return one 8-K row.
	        return pd.DataFrame(
	            {
	                "form": ["8-K"],
	                "accessionNumber": ["0001045810-26-000051"],
	                "primaryDocument": ["nvda-20260520.htm"],
	                "primaryDocDescription": ["8-K"],
	                "filingDate": ["2026-05-20"],
	            }
	        )

	    monkeypatch.setattr(sec_pkg, "get_company_filings", fake_get_company_filings)

	    # Decode explicitly rather than relying on the locale's default encoding.
	    # NOTE(review): assumes the fixtures are stored as UTF-8 — confirm.
	    body_html = _NVDA_8K_HTML.read_text(encoding="utf-8")
	    index_html = _NVDA_INDEX_HTML.read_text(encoding="utf-8")

	    def fake_fetch_text(url: str, *, host_url: str = "www.sec.gov") -> str:
	        if url.endswith("-index.html"):
	            return index_html
	        if url.endswith("nvda-20260520.htm"):
	            return body_html
	        # Every exhibit fetch fails.
	        raise filing.SecEdgarUnavailableError("simulated 404")

	    monkeypatch.setattr(filing, "_fetch_text", fake_fetch_text)

	    doc = sec_pkg.get_sec_data("NVDA", filing_type="8-K")
	    assert "(fetch failed)" in doc.markdown
	    assert "## Exhibit 99.1" in doc.markdown


	def test_get_sec_data_8k_renders_heading_less_exhibit_body(monkeypatch) -> None:
	    """Some issuers ship EX-99.1 press releases as a single ``<p>`` blob
	    with no internal headings (e.g. a one-paragraph dividend notice).
	    The 8-K orchestrator must still wrap it under a ``## Exhibit 99.1
	    — Press Release`` heading and preserve the body text — heading
	    promotion in the orchestrator is what guarantees the exhibit shows
	    up in the TOC regardless of how the issuer structured the body."""
	    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"NVDA": 1045810})

	    def fake_get_company_filings(*args, **kwargs):
	        # Accept any call shape (positional or keyword flags such as
	        # include_8k / include_history) and always return one 8-K row.
	        return pd.DataFrame(
	            {
	                "form": ["8-K"],
	                "accessionNumber": ["0001045810-26-000051"],
	                "primaryDocument": ["nvda-20260520.htm"],
	                "primaryDocDescription": ["8-K"],
	                "filingDate": ["2026-05-20"],
	            }
	        )

	    monkeypatch.setattr(sec_pkg, "get_company_filings", fake_get_company_filings)

	    # Decode explicitly rather than relying on the locale's default encoding.
	    # NOTE(review): assumes the fixture is stored as UTF-8 — confirm.
	    body_html = _NVDA_8K_HTML.read_text(encoding="utf-8")
	    # Minimal accession index containing one EX-99.1 row only — simpler
	    # than the full NVDA fixture so we can assert exactly one Exhibit heading.
	    index_html = (
	        "<html><body>"
	        '<table summary="Document Format Files"><tr>'
	        "<td>1</td><td>8-K</td><td><a>nvda-20260520.htm</a></td><td>8-K</td><td>1</td>"
	        "</tr><tr>"
	        "<td>2</td><td>EX-99.1</td><td><a>plain.htm</a></td><td>EX-99.1</td><td>1</td>"
	        "</tr></table></body></html>"
	    )
	    # Exhibit body with NO <h*> tags whatsoever.
	    plain_ex99 = (
	        "<html><body>"
	        "<p>NVIDIA Corporation today declared a quarterly cash dividend "
	        "of $0.01 per share payable June 27, 2026.</p>"
	        "</body></html>"
	    )

	    def fake_fetch_text(url: str, *, host_url: str = "www.sec.gov") -> str:
	        if url.endswith("-index.html"):
	            return index_html
	        if url.endswith("nvda-20260520.htm"):
	            return body_html
	        if url.endswith("plain.htm"):
	            return plain_ex99
	        # Any other URL means the orchestrator fetched something unplanned.
	        raise AssertionError(f"unexpected URL: {url}")

	    monkeypatch.setattr(filing, "_fetch_text", fake_fetch_text)

	    doc = sec_pkg.get_sec_data("NVDA", filing_type="8-K")
	    # The orchestrator-emitted heading is present even though the body had no <h*>.
	    assert "## Exhibit 99.1 — Press Release" in doc.markdown
	    # Body text survives intact under that heading.
	    assert "quarterly cash dividend" in doc.markdown


	def test_get_sec_toc_full_hierarchy_when_max_level_none(monkeypatch) -> None:
	    """``max_level=None`` lists the Part heading and every Item heading beneath it."""
	    download_log: list = []
	    parse_log: list = []
	    fake_markdown = "## PART I\n\n### Item 1\n\n### Item 2\n"
	    _install_fakes(
	        monkeypatch,
	        download_calls=download_log,
	        parse_calls=parse_log,
	        parsed=fake_markdown,
	    )

	    toc = sec_pkg.get_sec_toc("AAPL", max_level=None)

	    expected_outline = [(2, "PART I"), (3, "Item 1"), (3, "Item 2")]
	    assert [(entry.level, entry.title) for entry in toc] == expected_outline