Spaces:

Pushkya
/

Financial_bot

Running

File size: 17,836 Bytes
"""
sec_processor.py
================
Phase 2b – SEC Filing Processor

Processes Apple SEC HTML filings (10-K, 10-Q, 8-K) through Docling and saves:
  - {stem}.json         → structured JSON (sections, tables, metadata)
  - {stem}_docling.json → native DoclingDocument (required for HybridChunker)

Why not reuse pdf_processor.py?
---------------------------------
pdf_processor.py is built around PDFs:
  - Page-based noise filter (cover page, TOC, disclaimer pages)
  - Page numbers tracked throughout
  - Assumes DocLayNet layout detection

SEC HTML filings are structurally different:
  - No pages — HTML has no page layout concept
  - Boilerplate is at the START of the document (cover section), not spread
    across specific pages
  - HTML headings (h1/h2/h3) map to SectionHeaderItem automatically
  - Tables use standard <table> tags — no OCR or TableFormer needed

What stays the same
--------------------
  - Docling converter with do_table_structure=True
  - export_to_dataframe(doc) for tables (markdown is built from the DataFrame, not export_to_markdown)
  - doc.model_dump_json() → _docling.json (for HybridChunker)
  - cleaned_text, parent_header on every section

Output format per chunk (after Phase 3 chunking)
--------------------------------------------------
{
    "chunk_id"  : "10-K_2024_text_0042",
    "doc_id"    : "10-K_2024",
    "chunk_type": "text" | "table",
    "text"      : "...",
    "metadata"  : {
        "source"      : "sec_edgar",
        "doc_type"    : "10-K",
        "ticker"      : "AAPL",
        "company"     : "Apple Inc.",
        "fiscal_year" : "2024",
        "filing_date" : "2024-11-01",
        "accession"   : "0000320193-24-000123",
        "heading_path": "PART I > Item 1. Business",
        ...
    }
}

Usage (as a module)
-------------------
    from src.sec_processor import SECProcessor
    processor = SECProcessor()
    processor.process_all()

Usage (as a script)
-------------------
    python src/sec_processor.py
    python src/sec_processor.py --force
"""

import re
import json
import logging
from pathlib import Path
from datetime import datetime, timezone

# ── Logging ────────────────────────────────────────────────────────────────────
# Root logger configuration: INFO and above, rendered as
# "<timestamp>  <level, padded to 8>  <message>".
_LOG_FORMAT = "%(asctime)s  %(levelname)-8s  %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-scoped logger used by everything in this file.
log = logging.getLogger(__name__)

# ── Paths ──────────────────────────────────────────────────────────────────────
# BASE_DIR is the parent of the directory that contains this file.
BASE_DIR = Path(__file__).parent.parent

# Raw downloads (input) and processed JSON (output) for the AAPL filings.
_SEC_SUBPATH  = ("sec_filings", "AAPL")
RAW_SEC_DIR   = BASE_DIR.joinpath("data", "raw", *_SEC_SUBPATH)
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed", *_SEC_SUBPATH)

# ── SEC boilerplate detection ──────────────────────────────────────────────────
# SEC filings open with a cover section of form labels, legal wording, and
# administrative identifiers. Those fragments are short and carry no
# analytical signal for RAG queries, so they are recognised via the two
# structures below.

# Whole-fragment matches (the caller compares the stripped, lower-cased text).
_BOILERPLATE_EXACT = {
    "☒",
    "☐",
    "or",
    "(mark one)",
    "united states",
    "washington, d.c. 20549",
    "securities and exchange commission",
    "for the transition period from to .",
}

# Alternatives that mark a fragment as boilerplate when it STARTS with them.
_BOILERPLATE_PREFIXES = (
    r"form \d+[\-/][a-z]+",       # FORM 10-K, FORM 10-Q
    r"commission file",           # Commission File Number
    r"irs employer",              # IRS Employer Identification
    r"state or other",            # State or other jurisdiction
    r"jurisdiction",              # of incorporation
    r"\(exact name",              # (Exact name of Registrant...)
    r"\(zip code",                # (Zip Code)
    r"indicate by check",         # Indicate by check mark...
    r"securities registered",     # Securities registered...
    r"aggregate market value",    # Aggregate market value...
    r"number of shares",          # Number of shares outstanding
    r"☒|☐",                       # form checkboxes
)

# One anchored, case-insensitive alternation over all prefixes.
_BOILERPLATE_RE = re.compile(
    "^(" + "|".join(_BOILERPLATE_PREFIXES) + ")",
    re.IGNORECASE,
)


def _df_to_markdown(df) -> str:
    """
    Build a clean markdown table from a pandas DataFrame.

    Why not use table.export_to_markdown(doc)?
    Docling's HTML→markdown export produces blank cells for SEC HTML tables that
    use iXBRL inline tags or complex colspan/rowspan structures.  The DataFrame
    export correctly populates cell values; we build the markdown from that instead.

    SEC HTML tables often expand colspan cells into N identical columns (e.g. a
    cell spanning 3 columns becomes ['Americas','Americas','Americas']).  We
    de-duplicate consecutive identical values in each row before rendering so the
    markdown stays readable.

    Cell rendering rules:
      - missing values (None / NaN / NA) become empty cells;
      - every other value is rendered with str(), so a numeric 0 stays "0";
      - whitespace runs (including newlines) collapse to one space, because a
        markdown table row must stay on a single line;
      - literal "|" is escaped as "\\|" so it cannot split a cell.

    Only the DataFrame's row values are rendered; its column labels are not
    emitted, and the first non-empty row becomes the markdown header.

    Args:
        df : pandas DataFrame holding the table cells.

    Returns:
        The markdown table, or "" when no row has any non-empty cell.
    """
    def _render(value, is_missing) -> str:
        """Render one cell as single-line, pipe-safe text."""
        if is_missing:
            return ""
        # split()/join trims the ends and flattens embedded newlines in one go.
        return " ".join(str(value).split()).replace("|", "\\|")

    def _dedup(cells: list[str]) -> list[str]:
        """Remove consecutive identical tokens (colspan artefacts)."""
        return [c for i, c in enumerate(cells) if i == 0 or c != cells[i - 1]]

    # Work on plain arrays: positional access is unaffected by duplicate
    # column labels, and the isna() mask identifies missing cells explicitly
    # instead of relying on truthiness (which would also blank out 0).
    values  = df.to_numpy(dtype=object)
    missing = df.isna().to_numpy()

    rows_md = []
    for row_values, row_missing in zip(values, missing):
        cells = _dedup([_render(v, m) for v, m in zip(row_values, row_missing)])
        # Drop rows that are entirely empty after dedup
        if any(cells):
            rows_md.append(cells)
    if not rows_md:
        return ""

    # Normalise column count to the widest row
    width   = max(len(r) for r in rows_md)
    rows_md = [r + [""] * (width - len(r)) for r in rows_md]

    # Treat the first non-empty row as the header
    header, *data_rows = rows_md

    lines = [
        "| " + " | ".join(header) + " |",
        "| " + " | ".join(["---"] * width) + " |",
    ]
    lines.extend("| " + " | ".join(r) + " |" for r in data_rows)

    return "\n".join(lines)


def _is_boilerplate(text: str) -> bool:
    """
    Flag known SEC cover-page administrative fragments.

    After trimming surrounding whitespace, a fragment is boilerplate when any
    of these hold:
      - its lower-cased form is listed in _BOILERPLATE_EXACT;
      - its lower-cased form is shorter than five characters;
      - it starts with one of the _BOILERPLATE_RE alternatives.
    """
    stripped = text.strip()
    lowered  = stripped.lower()
    return (
        lowered in _BOILERPLATE_EXACT
        or len(lowered) < 5
        or bool(_BOILERPLATE_RE.match(stripped))
    )


# ── Text cleaning ──────────────────────────────────────────────────────────────

# Characters that are invisible (or look like a plain space) in rendered HTML
# but would otherwise survive into the extracted text.
_CLEAN_TRANSLATION = str.maketrans({
    "\u00ad": None,   # soft hyphen           → removed
    "\u200b": None,   # zero-width space      → removed
    "\ufeff": None,   # zero-width no-break space / BOM → removed
    "\u00a0": " ",    # no-break space (&nbsp;)         → plain space
    "\u202f": " ",    # narrow no-break space           → plain space
})


def clean_text(text: str) -> str:
    """
    Remove invisible characters and collapse whitespace.

    Steps, in order:
      1. drop soft hyphens, zero-width spaces and BOMs; turn no-break spaces
         into ordinary spaces so they take part in the collapsing below;
      2. collapse runs of spaces/tabs to a single space;
      3. trim the single space left next to a newline, so that lines holding
         only whitespace count as blank lines;
      4. collapse three or more consecutive newlines to exactly two;
      5. strip leading/trailing whitespace.

    Args:
        text : raw text; None or "" is accepted.

    Returns:
        The cleaned text, or "" for falsy input.
    """
    if not text:
        return ""
    text = text.translate(_CLEAN_TRANSLATION)
    text = re.sub(r"[ \t]+", " ", text)
    # After step 2 at most one space can sit on either side of a newline.
    text = re.sub(r" ?\n ?", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# ══════════════════════════════════════════════════════════════════════════════
# MAIN PROCESSOR CLASS
# ══════════════════════════════════════════════════════════════════════════════

class SECProcessor:
    """
    Processes Apple SEC HTML filings through Docling.

    Saves two files per filing:
      {stem}.json         — structured JSON for inspection and table extraction
      {stem}_docling.json — native DoclingDocument for HybridChunker (Phase 3)

    All JSON files are read and written as UTF-8. The structured JSON is
    dumped with ensure_ascii=False, so relying on the platform's default
    encoding could fail on characters such as "☒".
    """

    def __init__(self, output_dir: Path = PROCESSED_DIR):
        """
        Args:
            output_dir : directory that receives the output files; created
                         (including parents) if it does not exist.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._converter = None   # built on first access of `converter`

    # ── Lazy-loaded Docling converter ──────────────────────────────────────────

    @property
    def converter(self):
        """Build the Docling converter on first use (slow import)."""
        if self._converter is None:
            from docling.document_converter import DocumentConverter, PdfFormatOption
            from docling.datamodel.pipeline_options import PdfPipelineOptions
            from docling.datamodel.base_models import InputFormat

            # NOTE(review): these options are registered under InputFormat.PDF
            # only, while this class feeds .htm files to the converter.
            # Presumably HTML input is handled by Docling's defaults and these
            # settings have no effect on it — confirm before relying on them.
            opts = PdfPipelineOptions()
            opts.do_table_structure      = True    # reconstruct table rows/cols
            opts.do_ocr                  = False   # HTML — no OCR needed
            opts.generate_picture_images = False   # skip figure images

            self._converter = DocumentConverter(
                format_options={
                    InputFormat.PDF: PdfFormatOption(pipeline_options=opts)
                }
            )
            log.info("Docling converter ready.")
        return self._converter

    # ── Extraction helpers ─────────────────────────────────────────────────────

    @staticmethod
    def _extract_sections(doc) -> list[dict]:
        """Collect every non-empty, non-table text item of `doc` as a section dict."""
        from docling.datamodel.document import SectionHeaderItem, TableItem

        sections       = []
        current_header = ""   # text of the most recent header seen so far

        for item, level in doc.iterate_items():
            text = getattr(item, "text", None)
            if not text or not text.strip():
                continue
            if isinstance(item, TableItem):
                continue   # tables are handled by _extract_tables

            raw    = text.strip()
            is_hdr = isinstance(item, SectionHeaderItem)

            sections.append({
                "type"          : "header" if is_hdr else "text",
                "level"         : level,
                "text"          : raw,
                "cleaned_text"  : clean_text(raw),
                "page_num"      : None,   # HTML has no page numbers
                "parent_header" : current_header,
                "is_boilerplate": _is_boilerplate(raw),
            })

            # Updated AFTER appending: a header's own parent_header is the
            # header that preceded it, not itself.
            if is_hdr:
                current_header = raw

        return sections

    @staticmethod
    def _extract_tables(doc) -> list[dict]:
        """Export each table of `doc` with at least two rows; skip ones that fail."""
        tables = []
        for i, table in enumerate(doc.tables):
            try:
                df = table.export_to_dataframe(doc)

                if df.empty or len(df) < 2:
                    continue

                # Build markdown from the DataFrame values, not from
                # export_to_markdown() which produces blank cells for SEC HTML.
                markdown = _df_to_markdown(df)
                if not markdown:
                    continue

                tables.append({
                    "index"    : i,
                    "page_num" : None,   # HTML has no page numbers
                    "markdown" : markdown,
                    "headers"  : list(df.columns.astype(str)),
                    "rows"     : len(df),
                    "cols"     : len(df.columns),
                    "data"     : df.fillna("").values.tolist(),
                    "is_atomic": True,
                })
            except Exception as e:
                # Deliberate best-effort: one bad table must not lose the filing.
                log.warning(f"  Table {i} skipped: {e}")
        return tables

    @staticmethod
    def _load_filing_metadata(meta_file: Path) -> dict:
        """Return the parsed metadata.json, or {} when the file does not exist."""
        if not meta_file.exists():
            return {}
        with open(meta_file, encoding="utf-8") as f:
            return json.load(f)

    # ── Process one filing ─────────────────────────────────────────────────────

    def process_filing(
        self,
        htm_path : Path,
        metadata : dict,
        force    : bool = False,
    ) -> dict:
        """
        Parse one SEC HTML filing and save JSON + _docling.json.

        Args:
            htm_path : path to filing.htm
            metadata : dict containing doc_stem, source, doc_type, ticker, etc.
            force    : re-process even if output already exists

        Returns:
            parsed document dict

        Raises:
            KeyError : `metadata` has no "doc_stem" entry.
            Any exception raised by the Docling conversion or by writing the
            output files propagates to the caller.
        """
        stem         = metadata["doc_stem"]
        out_path     = self.output_dir / f"{stem}.json"
        docling_path = self.output_dir / f"{stem}_docling.json"

        # Skip if both outputs already exist
        if out_path.exists() and docling_path.exists() and not force:
            try:
                with open(out_path, encoding="utf-8") as f:
                    cached = json.load(f)
            except (UnicodeDecodeError, json.JSONDecodeError) as e:
                # A truncated or mis-encoded cache file would otherwise fail
                # on every run until --force is used; regenerate it instead.
                log.warning(f"  Cached {out_path.name} unreadable ({e}); re-processing")
            else:
                log.info(f"SKIP {stem}  (already processed → {out_path.name})")
                return cached

        log.info(f"Processing: {stem}  ({htm_path.name})")

        # ── Parse with Docling ────────────────────────────────────────────────
        result = self.converter.convert(str(htm_path))
        doc    = result.document

        # ── Extract sections and tables ───────────────────────────────────────
        sections = self._extract_sections(doc)
        tables   = self._extract_tables(doc)

        # ── Build document metadata ───────────────────────────────────────────
        doc_meta = {
            k: v for k, v in metadata.items() if k != "doc_stem"
        }
        doc_meta.update({
            "parsed_at"      : datetime.now(timezone.utc).isoformat(),
            "parser"         : "docling",
            "total_pages"    : 0,
            "total_sections" : len(sections),
            "total_tables"   : len(tables),
            "removed_pages"  : [],   # no pages in HTML — nothing to remove
        })

        parsed = {
            "metadata" : doc_meta,
            "sections" : sections,
            "tables"   : tables,
        }

        # ── Save structured JSON ──────────────────────────────────────────────
        # Explicit UTF-8: ensure_ascii=False emits non-ASCII characters as-is.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(parsed, f, indent=2, ensure_ascii=False, default=str)
        size_kb = out_path.stat().st_size / 1024
        log.info(f"  Saved JSON     : {out_path.name}  ({size_kb:.1f} KB)")

        # ── Save native DoclingDocument (for HybridChunker) ───────────────────
        with open(docling_path, "w", encoding="utf-8") as f:
            f.write(doc.model_dump_json())
        dl_kb = docling_path.stat().st_size / 1024
        log.info(f"  Saved _docling : {docling_path.name}  ({dl_kb:.1f} KB)")

        boilerplate_n = sum(1 for s in sections if s.get("is_boilerplate"))
        log.info(
            f"  Sections: {len(sections)}  "
            f"(boilerplate: {boilerplate_n})  "
            f"Tables: {len(tables)}"
        )

        return parsed

    # ── Batch process all filings ──────────────────────────────────────────────

    def process_all(
        self,
        raw_dir : Path = RAW_SEC_DIR,
        force   : bool = False,
    ) -> list[dict]:
        """
        Process all 10-K, 10-Q, and 8-K filings under raw_dir.

        Expected layout: {raw_dir}/{doc_type}/{period}/filing.htm, with an
        optional metadata.json next to it. Entries without filing.htm are
        ignored. A filing that fails for any reason (including an unreadable
        metadata.json) is logged with its traceback and skipped; the rest of
        the batch continues.

        Args:
            raw_dir : root directory holding one sub-directory per form type
            force   : re-process filings whose outputs already exist

        Returns:
            list of parsed document dicts
        """
        results = []

        for doc_type in ["10-K", "10-Q", "8-K"]:
            type_dir = Path(raw_dir) / doc_type
            if not type_dir.exists():
                continue

            log.info(f"\n── {doc_type} filings ────────────────────────────")

            for period_dir in sorted(type_dir.iterdir()):
                htm = period_dir / "filing.htm"
                if not htm.exists():
                    continue

                period = period_dir.name
                stem   = f"{doc_type}_{period}"

                try:
                    # Load filing metadata (inside the try so that one
                    # malformed metadata.json cannot abort the whole batch)
                    file_meta = self._load_filing_metadata(period_dir / "metadata.json")

                    metadata = {
                        "doc_stem"    : stem,
                        "source"      : "sec_edgar",
                        "doc_type"    : doc_type,
                        "ticker"      : "AAPL",
                        "company"     : "Apple Inc.",
                        "fiscal_year" : file_meta.get("fiscal_year", period[:4]),
                        "filing_date" : file_meta.get("filing_date", ""),
                        "accession"   : file_meta.get("accession", ""),
                        "file_name"   : htm.name,
                        "file_path"   : str(htm),
                        "license"     : "public",
                        "access_level": "public",
                    }

                    parsed = self.process_filing(htm, metadata, force=force)
                except Exception as e:
                    # Batch boundary: record the failure with its traceback
                    # and move on to the next filing.
                    log.exception(f"  FAILED {stem}: {e}")
                else:
                    results.append(parsed)

        return results


# ── Entry point ────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import sys

    # Passing --force anywhere on the command line re-processes every filing,
    # even when its outputs already exist.
    force  = "--force" in sys.argv
    banner = "=" * 60

    log.info(banner)
    log.info("Phase 2b – SEC Filing Processor")
    log.info(banner)

    processor = SECProcessor()
    results   = processor.process_all(force=force)

    # ── Summary ───────────────────────────────────────────────────────────────
    log.info("\n" + banner)
    log.info("Processing complete.")
    log.info(f"  Filings processed : {len(results)}")
    section_total = sum(r["metadata"]["total_sections"] for r in results)
    log.info(f"  Total sections    : {section_total}")
    table_total = sum(r["metadata"]["total_tables"] for r in results)
    log.info(f"  Total tables      : {table_total}")

    # List the structured JSON outputs only; the *_docling.json companions
    # are left out of the listing.
    log.info("\nOutput files:")
    structured_outputs = (
        p for p in sorted(PROCESSED_DIR.rglob("*.json"))
        if not p.name.endswith("_docling.json")
    )
    for json_path in structured_outputs:
        size_kb = json_path.stat().st_size / 1024
        log.info(f"  {json_path.name:40s}  ({size_kb:.1f} KB)")
    log.info(banner)