| from pathlib import Path |
|
|
| import pandas as pd |
| import pytest |
|
|
| from TerraFin.data.cache import manager as cache_manager |
| from TerraFin.data.providers.corporate.filings import sec_edgar as sec_pkg |
| from TerraFin.data.providers.corporate.filings.sec_edgar import filing |
|
|
|
|
# Paths to the HTML fixture files; they are resolved relative to this test
# module (a ``fixtures`` directory next to it), not the working directory.
_FIXTURES_DIR = Path(__file__).resolve().parent.joinpath("fixtures")
_NVDA_8K_HTML = _FIXTURES_DIR.joinpath("sample_8k_NVDA_0001045810-26-000051.html")
_NVDA_INDEX_HTML = _FIXTURES_DIR.joinpath("sample_8k_index_NVDA_0001045810-26-000051.html")
_NVDA_EX99_HTML = _FIXTURES_DIR.joinpath("sample_ex99_NVDA_2026-05-20.html")
|
|
|
|
@pytest.fixture(autouse=True)
def _isolated_file_cache(tmp_path, monkeypatch):
    """Give every test its own cache directory and a cleared SEC filings cache.

    Applied automatically to each test in this module: the cache manager's
    ``_FILE_CACHE_DIR`` is pointed at pytest's per-test ``tmp_path`` and
    ``filing.clear_sec_filings_cache()`` is called before the test body runs.
    ``monkeypatch`` restores the original directory afterwards.
    """
    # Redirect the file cache to the per-test temporary directory.
    monkeypatch.setattr(cache_manager, "_FILE_CACHE_DIR", tmp_path)

    # Clear the SEC filings cache before handing control to the test.
    filing.clear_sec_filings_cache()
    yield
|
|
|
|
def _install_fakes(monkeypatch, *, download_calls, parse_calls, html="<html>body</html>", parsed="parsed-md"):
    """Replace the SEC package's lookup, listing, download and parse entry points with fakes.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture; patches are undone at test teardown.
        download_calls: list that receives one ``(cik, accession, file_name)`` tuple
            per fake download.
        parse_calls: list that receives one ``(filing_form, include_images)`` tuple
            per fake parse.
        html: payload returned by the fake download.
        parsed: prefix of the fake parser's output; ``|images=<flag>`` is appended.
    """

    def fake_ticker_map():
        return {"AAPL": 320193}

    def fake_get_company_filings(cik, include_8k=False, include_history=False):
        # A single 10-Q row, regardless of the arguments.
        columns = {
            "form": ["10-Q"],
            "accessionNumber": ["0000320193-25-000001"],
            "primaryDocument": ["aapl-10q.htm"],
            "primaryDocDescription": ["10-Q"],
        }
        return pd.DataFrame(columns)

    def fake_download(cik, accession, file_name):
        record = (cik, accession, file_name)
        download_calls.append(record)
        return html

    def fake_parse(html_content, filing_form, *, include_images=False):
        record = (filing_form, include_images)
        parse_calls.append(record)
        # Echo the flag into the output so tests can tell the variants apart.
        return f"{parsed}|images={include_images}"

    replacements = {
        "get_ticker_to_cik_dict_cached": fake_ticker_map,
        "get_company_filings": fake_get_company_filings,
        "download_filing": fake_download,
        "parse_sec_filing": fake_parse,
    }
    for attr_name, fake in replacements.items():
        monkeypatch.setattr(sec_pkg, attr_name, fake)
|
|
|
|
def test_get_sec_data_caches_parsed_result(monkeypatch) -> None:
    """Two identical ``get_sec_data`` calls must download and parse only once."""
    download_log: list = []
    parse_log: list = []
    _install_fakes(monkeypatch, download_calls=download_log, parse_calls=parse_log)

    results = [sec_pkg.get_sec_data("AAPL") for _ in range(2)]
    initial, repeat = results

    assert initial.markdown == repeat.markdown
    assert initial.ticker == "AAPL"
    # The fakes record every invocation, so one entry each means the repeat
    # call never reached the download or parse step.
    assert len(download_log) == 1, "second call must skip download"
    assert len(parse_log) == 1, "second call must skip parse"
|
|
|
|
def test_get_sec_data_caches_per_include_images_flag(monkeypatch) -> None:
    """``include_images=False`` and ``include_images=True`` are cached independently."""
    download_log: list = []
    parse_log: list = []
    _install_fakes(monkeypatch, download_calls=download_log, parse_calls=parse_log)

    plain = sec_pkg.get_sec_data("AAPL", include_images=False)
    illustrated = sec_pkg.get_sec_data("AAPL", include_images=True)

    assert "images=False" in plain.markdown
    assert "images=True" in illustrated.markdown
    # Each flag value triggered its own download and parse.
    assert len(download_log) == 2
    assert len(parse_log) == 2

    # Asking for both variants again must not add any download or parse.
    for flag in (False, True):
        sec_pkg.get_sec_data("AAPL", include_images=flag)
    assert len(download_log) == 2
    assert len(parse_log) == 2
|
|
|
|
def test_clear_sec_filings_cache_invalidates_parsed_output(monkeypatch) -> None:
    """After ``clear_sec_filings_cache`` the next ``get_sec_data`` call redoes the work."""
    download_log: list = []
    parse_log: list = []
    _install_fakes(monkeypatch, download_calls=download_log, parse_calls=parse_log)

    # Populate the cache, wipe it, then request the same filing again.
    sec_pkg.get_sec_data("AAPL")
    filing.clear_sec_filings_cache()
    sec_pkg.get_sec_data("AAPL")

    assert len(download_log) == 2, "clear must force a re-download on the next call"
    assert len(parse_log) == 2
|
|
|
|
def test_get_sec_data_raises_for_unknown_ticker(monkeypatch) -> None:
    """A ticker missing from the ticker-to-CIK map raises ``ValueError``."""

    def fake_ticker_map():
        # Only AAPL is known; any other ticker is absent from the map.
        return {"AAPL": 320193}

    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", fake_ticker_map)

    with pytest.raises(ValueError, match="CIK not found"):
        sec_pkg.get_sec_data("BOGUS")
|
|
|
|
def test_clear_sec_filings_cache_also_resets_in_memory_ticker_memo(monkeypatch) -> None:
    """Clearing the cache must force the ticker-to-CIK lookup to fetch again.

    If only the file cache were cleared, a stale in-memory dict could still
    be served; the fetch counter below would then stay at one.
    """
    requested_urls: list[str] = []

    def fake_fetch_json(url: str, *, host_url: str = "data.sec.gov") -> dict:
        requested_urls.append(url)
        return {"data": [["AAPL", 320193]], "fields": ["ticker", "cik"]}

    monkeypatch.setattr(filing, "_fetch_json", fake_fetch_json)

    # First lookup populates the cache through exactly one fetch.
    filing.get_ticker_to_cik_dict_cached()
    assert len(requested_urls) == 1

    # After a clear, the same lookup must fetch again.
    filing.clear_sec_filings_cache()
    filing.get_ticker_to_cik_dict_cached()
    assert len(requested_urls) == 2, "clear must invalidate both file cache and in-memory memo"
|
|
|
|
def test_get_sec_toc_default_is_top_level_only(monkeypatch) -> None:
    """With no ``max_level`` argument, ``get_sec_toc`` lists only level-2 headings.

    The fake parser output contains one ``##`` heading and two ``###``
    headings; only the ``##`` entry is expected in the result.
    """
    download_log: list = []
    parse_log: list = []
    fake_markdown = "## PART I\n\n### Item 1\n\nbody\n\n### Item 2\n"
    _install_fakes(
        monkeypatch,
        download_calls=download_log,
        parse_calls=parse_log,
        parsed=fake_markdown,
    )

    toc = sec_pkg.get_sec_toc("AAPL")

    level_title_pairs = [(entry.level, entry.title) for entry in toc]
    assert level_title_pairs == [(2, "PART I")]
    assert all(entry.id and entry.anchor for entry in toc)

    # A follow-up get_sec_data call must not download or parse again.
    sec_pkg.get_sec_data("AAPL")
    assert len(download_log) == 1
    assert len(parse_log) == 1
|
|
|
|
def test_get_sec_data_8k_appends_ex99_exhibits(monkeypatch) -> None:
    """End-to-end-ish: NVDA 8-K body + index.html + EX-99.1 press release,
    all loaded from fixture files, with the network mocked at ``_fetch_text``.

    The returned markdown must include both an 8-K item heading AND an
    ``Exhibit 99.1`` heading, and the exhibit must show up in the TOC.
    """
    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"NVDA": 1045810})

    def fake_get_company_filings(cik, include_8k=False, include_history=False):
        # A single 8-K row matching the fixture accession number.
        return pd.DataFrame(
            {
                "form": ["8-K"],
                "accessionNumber": ["0001045810-26-000051"],
                "primaryDocument": ["nvda-20260520.htm"],
                "primaryDocDescription": ["8-K"],
                "filingDate": ["2026-05-20"],
            }
        )

    monkeypatch.setattr(sec_pkg, "get_company_filings", fake_get_company_filings)

    # Decode explicitly as UTF-8 so the test does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the fixture files are stored as UTF-8 — confirm.
    body_html = _NVDA_8K_HTML.read_text(encoding="utf-8")
    index_html = _NVDA_INDEX_HTML.read_text(encoding="utf-8")
    ex99_html = _NVDA_EX99_HTML.read_text(encoding="utf-8")

    # Every requested URL is recorded here; this test does not assert on it.
    fetched: list[str] = []

    def fake_fetch_text(url: str, *, host_url: str = "www.sec.gov") -> str:
        fetched.append(url)
        if url.endswith("-index.html"):
            return index_html
        if url.endswith("nvda-20260520.htm"):
            return body_html
        # Both exhibit documents are served from the same EX-99 fixture.
        if url.endswith(("q1fy27pr.htm", "q1fy27cfocommentary.htm")):
            return ex99_html
        raise AssertionError(f"unexpected URL: {url}")

    monkeypatch.setattr(filing, "_fetch_text", fake_fetch_text)

    doc = sec_pkg.get_sec_data("NVDA", filing_type="8-K")

    assert "## Item " in doc.markdown, "primary 8-K item heading missing"
    assert "## Exhibit 99.1" in doc.markdown, "EX-99.1 exhibit heading missing"

    slugs = [e.id for e in doc.toc]

    # The exhibit heading must also be present in the TOC.
    assert any(s.startswith("exhibit-991") for s in slugs), slugs
|
|
|
|
def test_get_sec_data_8k_survives_missing_accession_index(monkeypatch) -> None:
    """If the accession-index fetch fails, the orchestrator still returns the
    parsed primary 8-K body — just without exhibits."""
    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"NVDA": 1045810})
    monkeypatch.setattr(
        sec_pkg,
        "get_company_filings",
        lambda *a, **k: pd.DataFrame(
            {
                "form": ["8-K"],
                "accessionNumber": ["0001045810-26-000051"],
                "primaryDocument": ["nvda-20260520.htm"],
                "primaryDocDescription": ["8-K"],
                "filingDate": ["2026-05-20"],
            }
        ),
    )

    # Decode explicitly as UTF-8 so the test does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the fixture file is stored as UTF-8 — confirm.
    body_html = _NVDA_8K_HTML.read_text(encoding="utf-8")

    def fake_fetch_text(url: str, *, host_url: str = "www.sec.gov") -> str:
        # Simulate an unavailable accession index; only the primary document resolves.
        if url.endswith("-index.html"):
            raise filing.SecEdgarUnavailableError("simulated 404")
        if url.endswith("nvda-20260520.htm"):
            return body_html
        raise AssertionError(f"unexpected URL: {url}")

    monkeypatch.setattr(filing, "_fetch_text", fake_fetch_text)

    doc = sec_pkg.get_sec_data("NVDA", filing_type="8-K")
    assert "## Item " in doc.markdown
    assert "## Exhibit" not in doc.markdown
|
|
|
|
def test_get_sec_data_8k_marks_unreachable_exhibit(monkeypatch) -> None:
    """If the index resolves but a specific exhibit fetch fails, the orchestrator
    emits a ``(fetch failed)`` marker so the caller knows the exhibit existed."""
    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"NVDA": 1045810})
    monkeypatch.setattr(
        sec_pkg,
        "get_company_filings",
        lambda *a, **k: pd.DataFrame(
            {
                "form": ["8-K"],
                "accessionNumber": ["0001045810-26-000051"],
                "primaryDocument": ["nvda-20260520.htm"],
                "primaryDocDescription": ["8-K"],
                "filingDate": ["2026-05-20"],
            }
        ),
    )

    # Decode explicitly as UTF-8 so the test does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the fixture files are stored as UTF-8 — confirm.
    body_html = _NVDA_8K_HTML.read_text(encoding="utf-8")
    index_html = _NVDA_INDEX_HTML.read_text(encoding="utf-8")

    def fake_fetch_text(url: str, *, host_url: str = "www.sec.gov") -> str:
        if url.endswith("-index.html"):
            return index_html
        if url.endswith("nvda-20260520.htm"):
            return body_html
        # Every other URL (i.e. the exhibit documents) is unavailable.
        raise filing.SecEdgarUnavailableError("simulated 404")

    monkeypatch.setattr(filing, "_fetch_text", fake_fetch_text)

    doc = sec_pkg.get_sec_data("NVDA", filing_type="8-K")
    assert "(fetch failed)" in doc.markdown
    assert "## Exhibit 99.1" in doc.markdown
|
|
|
|
def test_get_sec_data_8k_renders_heading_less_exhibit_body(monkeypatch) -> None:
    """Some issuers ship EX-99.1 press releases as a single ``<p>`` blob
    with no internal headings (e.g. a one-paragraph dividend notice).
    The 8-K orchestrator must still wrap it under a ``## Exhibit 99.1
    — Press Release`` heading and preserve the body text — heading
    promotion in the orchestrator is what guarantees the exhibit shows
    up in the TOC regardless of how the issuer structured the body."""
    monkeypatch.setattr(sec_pkg, "get_ticker_to_cik_dict_cached", lambda: {"NVDA": 1045810})
    monkeypatch.setattr(
        sec_pkg,
        "get_company_filings",
        lambda *a, **k: pd.DataFrame(
            {
                "form": ["8-K"],
                "accessionNumber": ["0001045810-26-000051"],
                "primaryDocument": ["nvda-20260520.htm"],
                "primaryDocDescription": ["8-K"],
                "filingDate": ["2026-05-20"],
            }
        ),
    )

    # Decode explicitly as UTF-8 so the test does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the fixture file is stored as UTF-8 — confirm.
    body_html = _NVDA_8K_HTML.read_text(encoding="utf-8")

    # Hand-built accession index: the primary 8-K document plus a single
    # EX-99.1 entry pointing at ``plain.htm``.
    index_html = (
        "<html><body>"
        '<table summary="Document Format Files"><tr>'
        "<td>1</td><td>8-K</td><td><a>nvda-20260520.htm</a></td><td>8-K</td><td>1</td>"
        "</tr><tr>"
        "<td>2</td><td>EX-99.1</td><td><a>plain.htm</a></td><td>EX-99.1</td><td>1</td>"
        "</tr></table></body></html>"
    )

    # Exhibit body consisting of one paragraph and no headings at all.
    plain_ex99 = (
        "<html><body>"
        "<p>NVIDIA Corporation today declared a quarterly cash dividend "
        "of $0.01 per share payable June 27, 2026.</p>"
        "</body></html>"
    )

    def fake_fetch_text(url: str, *, host_url: str = "www.sec.gov") -> str:
        if url.endswith("-index.html"):
            return index_html
        if url.endswith("nvda-20260520.htm"):
            return body_html
        if url.endswith("plain.htm"):
            return plain_ex99
        raise AssertionError(f"unexpected URL: {url}")

    monkeypatch.setattr(filing, "_fetch_text", fake_fetch_text)

    doc = sec_pkg.get_sec_data("NVDA", filing_type="8-K")

    # The exhibit gets its heading even though its body had none.
    assert "## Exhibit 99.1 — Press Release" in doc.markdown

    # The paragraph text itself is preserved under that heading.
    assert "quarterly cash dividend" in doc.markdown
|
|
|
|
def test_get_sec_toc_full_hierarchy_when_max_level_none(monkeypatch) -> None:
    """Passing ``max_level=None`` returns every heading level, in document order."""
    download_log: list = []
    parse_log: list = []
    fake_markdown = "## PART I\n\n### Item 1\n\n### Item 2\n"
    _install_fakes(
        monkeypatch,
        download_calls=download_log,
        parse_calls=parse_log,
        parsed=fake_markdown,
    )

    toc = sec_pkg.get_sec_toc("AAPL", max_level=None)

    expected = [
        (2, "PART I"),
        (3, "Item 1"),
        (3, "Item 2"),
    ]
    level_title_pairs = [(entry.level, entry.title) for entry in toc]
    assert level_title_pairs == expected
|
|