# Source: Spaces / Pushkya / Financial_bot (status: Running; file size: 10,258 bytes)
"""
Phase 2 – Document Parser
==========================
Parses all raw documents (Morningstar PDFs + SEC filings) using Docling.
Outputs structured JSON per document with:
  - Text sections (with hierarchy / heading level)
  - Tables (as markdown + dataframe-ready dict)
  - Metadata (source, type, page, fiscal year, etc.)

Usage:
    python doc_parser.py

Output:
    data/processed/
    ├── morningstar/
    │   ├── a-wide-moat-focus-provides-differentiation.json
    │   └── ptc01302411420.json
    └── sec_filings/
        └── AAPL/
            ├── 10-K_2023.json
            ├── 10-K_2024.json
            └── ...
"""

import json
import logging
from pathlib import Path
from datetime import datetime, timezone

# ── Paths ──────────────────────────────────────────────────────────────────────
# Base directory: the parent of the directory that contains this file.
BASE_DIR = Path(__file__).parent.parent
RAW_DIR = BASE_DIR.joinpath("data", "raw")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")
LOG_DIR = BASE_DIR.joinpath("logs")

# Input and output locations for each document source.
MORNINGSTAR_RAW = RAW_DIR.joinpath("morningstar")
SEC_RAW = RAW_DIR.joinpath("sec_filings", "AAPL")
MORNINGSTAR_OUT = PROCESSED_DIR.joinpath("morningstar")
SEC_OUT = PROCESSED_DIR.joinpath("sec_filings", "AAPL")

# The log directory has to exist before the file handler below opens its file.
LOG_DIR.mkdir(parents=True, exist_ok=True)

# ── Logging ────────────────────────────────────────────────────────────────────
# Records at INFO and above go to both the log file and the console stream.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    handlers=[
        logging.FileHandler(LOG_DIR / "doc_parser.log"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger(__name__)


# ── Docling setup ──────────────────────────────────────────────────────────────
def build_converter():
    """
    Create the Docling ``DocumentConverter`` used for parsing.

    PDF input is configured with table-structure recognition enabled and
    with OCR and picture-image generation disabled. Docling is imported
    inside the function, so it is only loaded when a converter is built.
    """
    from docling.document_converter import DocumentConverter, PdfFormatOption
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.datamodel.base_models import InputFormat

    pdf_options = PdfPipelineOptions()
    # Keep financial tables intact.
    pdf_options.do_table_structure = True
    # The inputs are digital PDFs, so OCR is skipped.
    pdf_options.do_ocr = False
    # Figure images are not extracted.
    pdf_options.generate_picture_images = False

    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})


# ── Parse one PDF ──────────────────────────────────────────────────────────────
def parse_pdf(pdf_path: Path, metadata: dict, converter) -> dict:
    """
    Parse a single document with Docling and return it as a JSON-ready dict.

    Despite the name, this function is also called with SEC ``filing.htm``
    files; the path is simply handed to ``converter.convert``.

    Args:
        pdf_path: Path of the document to convert.
        metadata: Caller-supplied fields that are copied into the
            ``"metadata"`` entry of the result.
        converter: Object whose ``convert(path_str)`` returns a result with
            a ``document`` attribute (e.g. the one from ``build_converter``).

    Returns:
        A dict with four keys:
          - ``"metadata"``: ``metadata`` plus ``parsed_at`` (UTC ISO
            timestamp), ``parser``, ``total_pages``, ``total_sections``
            and ``total_tables``.
          - ``"sections"``: one dict per non-empty text item with ``type``
            (``"header"`` or ``"text"``), ``level`` (the level yielded by
            ``doc.iterate_items()``), ``text`` and ``page_num``.
          - ``"tables"``: one dict per successfully exported table.
          - ``"full_markdown"``: markdown export of the whole document.

    Raises:
        Any exception raised by the conversion or the full-markdown export.
        A failure while exporting an individual table is logged as a
        warning and that table is omitted.
    """
    log.info(f"  Parsing: {pdf_path.name}")

    result = converter.convert(str(pdf_path))
    doc    = result.document

    # Imported once per call instead of once per iterated item.
    from docling.datamodel.document import SectionHeaderItem

    # ── Text sections ────────────────────────────────────────────────────────
    sections = []
    for item, level in doc.iterate_items():
        text = getattr(item, "text", None)
        if not text or not text.strip():
            # Items without a text attribute or with blank text are skipped.
            continue

        item_type = "header" if isinstance(item, SectionHeaderItem) else "text"
        page_num  = item.prov[0].page_no if item.prov else None

        sections.append({
            "type"    : item_type,
            "level"   : level,
            "text"    : text.strip(),
            "page_num": page_num,
        })

    # ── Tables ───────────────────────────────────────────────────────────────
    tables = []
    for i, table in enumerate(doc.tables):
        try:
            df       = table.export_to_dataframe()
            markdown = table.export_to_markdown()
            page_num = table.prov[0].page_no if table.prov else None

            tables.append({
                "index"    : i,
                "page_num" : page_num,
                "markdown" : markdown,
                "rows"     : len(df),
                "cols"     : len(df.columns),
                "headers"  : list(df.columns.astype(str)),
                "data"     : df.values.tolist(),
                "is_atomic": True,   # never split this chunk
            })
        except Exception as e:
            # One bad table should not discard the rest of the document.
            log.warning(f"    Table {i} export failed: {e}")

    # ── Full markdown export (for quick inspection) ───────────────────────────
    full_markdown = doc.export_to_markdown()

    # Highest page number carried by any extracted section or table. Tables
    # are included so a trailing page that holds only a table still counts.
    # Stays 0 when no item has page provenance.
    page_numbers = [s["page_num"] for s in sections if s["page_num"]]
    page_numbers += [t["page_num"] for t in tables if t["page_num"]]
    total_pages = max(page_numbers, default=0)

    parsed = {
        "metadata"      : {
            **metadata,
            "parsed_at"    : datetime.now(timezone.utc).isoformat(),
            "parser"       : "docling",
            "total_pages"  : total_pages,
            "total_sections": len(sections),
            "total_tables"  : len(tables),
        },
        "sections"      : sections,
        "tables"        : tables,
        "full_markdown" : full_markdown,
    }

    return parsed


def save_parsed(data: dict, out_path: Path):
    """
    Write ``data`` to ``out_path`` as indented UTF-8 JSON and log its size.

    The JSON is first written to a sibling ``<name>.tmp`` file and then
    renamed over ``out_path``. The processing functions in this module skip
    any document whose output file already exists, so a truncated file left
    behind by a failed write would otherwise never be regenerated.

    Args:
        data: JSON-serialisable structure; values ``json`` cannot encode
            natively are converted with ``str``.
        out_path: Destination file. Missing parent directories are created.

    Raises:
        OSError: If the directory or file cannot be created or written.
        Any exception raised by ``json.dump`` while serialising ``data``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    tmp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters,
        # which the platform's default encoding may be unable to represent.
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=str)
        tmp_path.replace(out_path)
    finally:
        # After a successful rename the temp file no longer exists, so this
        # only removes leftovers from a failed write.
        if tmp_path.exists():
            tmp_path.unlink()

    size_kb = out_path.stat().st_size / 1024
    log.info(f"    Saved: {out_path.name}  ({size_kb:.1f} KB)")


# ── Morningstar PDFs ───────────────────────────────────────────────────────────
def process_morningstar(converter):
    """
    Parse every ``*.pdf`` in ``MORNINGSTAR_RAW`` into ``MORNINGSTAR_OUT``.

    Each PDF produces ``<stem>.json``. PDFs whose output file already exists
    are skipped. A failure on one PDF is logged with its traceback and does
    not stop the remaining files.

    Args:
        converter: Converter object passed through to ``parse_pdf``.
    """
    log.info("\n=== Morningstar PDFs ===")
    # Sorted so every run handles the files in the same order, matching the
    # sorted iteration used for the SEC filings.
    pdfs = sorted(MORNINGSTAR_RAW.glob("*.pdf"))
    log.info(f"Found {len(pdfs)} PDFs")

    for pdf in pdfs:
        out_path = MORNINGSTAR_OUT / f"{pdf.stem}.json"
        if out_path.exists():
            log.info(f"  SKIP  {pdf.name}  (already parsed)")
            continue

        metadata = {
            "source"     : "morningstar",
            "doc_type"   : "research_report",
            "file_name"  : pdf.name,
            "file_path"  : str(pdf),
            "license"    : "proprietary",
            "access_level": "internal",
        }

        try:
            parsed = parse_pdf(pdf, metadata, converter)
            save_parsed(parsed, out_path)
            log.info(
                f"    Sections: {parsed['metadata']['total_sections']}  "
                f"Tables: {parsed['metadata']['total_tables']}  "
                f"Pages: {parsed['metadata']['total_pages']}"
            )
        except Exception as e:
            # Batch boundary: keep going, but record the traceback so the
            # failure can be diagnosed from the log.
            log.error(f"  FAILED {pdf.name}: {e}", exc_info=True)


# ── SEC Filings ────────────────────────────────────────────────────────────────
def _load_filing_metadata(meta_file: Path) -> dict:
    """Return the JSON object stored in ``meta_file``, or ``{}`` if unusable."""
    if not meta_file.exists():
        return {}
    try:
        with open(meta_file, encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        log.warning(f"    Could not read {meta_file}: {e}")
        return {}
    if not isinstance(loaded, dict):
        log.warning(f"    Ignoring {meta_file}: expected a JSON object")
        return {}
    return loaded


def process_sec_filings(converter):
    """
    Parse the AAPL SEC filings found under ``SEC_RAW`` into ``SEC_OUT``.

    For each filing type (10-K, 10-Q, 8-K), every sub-folder containing a
    ``filing.htm`` is parsed and written to ``<type>_<folder name>.json``.
    Filings whose output file already exists are skipped. Fields from an
    optional ``metadata.json`` next to the filing are merged into the output
    metadata; an unreadable or malformed ``metadata.json`` is logged and
    treated as absent. A failure on one filing is logged with its traceback
    and does not stop the remaining filings.

    Args:
        converter: Converter object passed through to ``parse_pdf``.
    """
    log.info("\n=== SEC Filings (AAPL) ===")

    for ftype in ["10-K", "10-Q", "8-K"]:
        ftype_dir = SEC_RAW / ftype
        # is_dir() rather than exists(): iterdir() would raise on a plain file.
        if not ftype_dir.is_dir():
            continue

        for folder in sorted(ftype_dir.iterdir()):
            htm = folder / "filing.htm"
            if not htm.is_file():
                continue

            out_name = f"{ftype}_{folder.name}.json"
            out_path = SEC_OUT / out_name

            if out_path.exists():
                log.info(f"  SKIP  {out_name}  (already parsed)")
                continue

            # Load filing metadata
            file_meta = _load_filing_metadata(folder / "metadata.json")

            metadata = {
                "source"      : "sec_edgar",
                "doc_type"    : ftype,
                "ticker"      : "AAPL",
                "company"     : "Apple Inc.",
                # Falls back to the first four characters of the folder name.
                "fiscal_year" : file_meta.get("fiscal_year", folder.name[:4]),
                "filing_date" : file_meta.get("filing_date", ""),
                "accession"   : file_meta.get("accession", ""),
                "file_name"   : htm.name,
                "file_path"   : str(htm),
                "license"     : "public",
                "access_level": "public",
            }

            log.info(f"  Parsing {ftype}/{folder.name} ...")
            try:
                parsed = parse_pdf(htm, metadata, converter)
                save_parsed(parsed, out_path)
                log.info(
                    f"    Sections: {parsed['metadata']['total_sections']}  "
                    f"Tables: {parsed['metadata']['total_tables']}  "
                    f"Pages: {parsed['metadata']['total_pages']}"
                )
            except Exception as e:
                # Batch boundary: keep going, but record the traceback so
                # the failure can be diagnosed from the log.
                log.error(f"  FAILED {out_name}: {e}", exc_info=True)


# ── Entry point ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry: build one converter, run both pipelines, list the outputs.
    rule = "=" * 60

    log.info(rule)
    log.info("Phase 2 – Document Parser")
    log.info(rule)

    log.info("Loading Docling converter ...")
    converter = build_converter()
    log.info("Converter ready.")

    # Morningstar reports first, then the SEC filings, sharing one converter.
    for run_stage in (process_morningstar, process_sec_filings):
        run_stage(converter)

    # Summary: every JSON file under the processed directory with its size.
    log.info("\n" + rule)
    log.info("Parsing complete. Output files:")
    for json_path in sorted(PROCESSED_DIR.rglob("*.json")):
        kilobytes = json_path.stat().st_size / 1024
        log.info(f"  {json_path.relative_to(PROCESSED_DIR)}  ({kilobytes:.1f} KB)")
    log.info(rule)