"""Golden-file tests for the rule-based SEC `` → markdown rebuild. Each fixture under ``tests/data/fixtures/sec_edgar_tables/tables/`` has: - ``.html`` — raw ``
`` HTML from the real filing - ``.md`` — the flattened ``sec_parser`` output (the broken state) - ``.expected.md`` — what ``_rebuild_table_markdown`` should produce, committed alongside the input. Regenerate with ``REGEN=1 pytest``. When a parser rule changes, re-run with ``REGEN=1`` and review the diffs before committing. The test is dumb (byte-compare against the expected file) so every change lands as a reviewable snapshot update. """ from __future__ import annotations import json import os from pathlib import Path import pytest from TerraFin.data.providers.corporate.filings.sec_edgar.parser import ( _rebuild_table_markdown, ) FIXTURES_DIR = Path(__file__).resolve().parent / "fixtures" / "sec_edgar_tables" MANIFEST = FIXTURES_DIR / "manifest.jsonl" TABLES_DIR = FIXTURES_DIR / "tables" def _manifest_entries() -> list[dict]: if not MANIFEST.exists(): return [] entries = [] for line in MANIFEST.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue entries.append(json.loads(line)) return entries @pytest.mark.parametrize( "entry", _manifest_entries(), ids=lambda e: Path(e["html_path"]).stem, ) def test_table_rebuild_matches_snapshot(entry: dict) -> None: html_path = FIXTURES_DIR / entry["html_path"] expected_path = html_path.with_suffix(".expected.md") html = html_path.read_text(encoding="utf-8") produced = _rebuild_table_markdown(html) or "" if os.environ.get("REGEN") == "1": expected_path.write_text(produced + ("\n" if produced and not produced.endswith("\n") else ""), encoding="utf-8") return if not expected_path.exists(): pytest.fail( f"missing snapshot: {expected_path.relative_to(FIXTURES_DIR)}. " "Run `REGEN=1 pytest tests/data/test_sec_edgar_table_rules.py` to create it." ) expected = expected_path.read_text(encoding="utf-8").rstrip("\n") actual = produced.rstrip("\n") assert actual == expected, ( f"Rebuild diverged from snapshot for {entry['html_path']}. " "Re-run with REGEN=1 if the rule change is intentional." )