Financial_bot / scripts /doc_parser.py
Pushkya's picture
Upload 30 files
8299003 verified
Raw
History Blame Contribute Delete
10.3 kB
"""
Phase 2 – Document Parser
==========================
Parses all raw documents (Morningstar PDFs + SEC filings) using Docling.
Outputs structured JSON per document with:
- Text sections (with hierarchy / heading level)
- Tables (as markdown + dataframe-ready dict)
- Metadata (source, type, page, fiscal year, etc.)
Usage:
python doc_parser.py
Output:
data/processed/
├── morningstar/
│   ├── a-wide-moat-focus-provides-differentiation.json
│   └── ptc01302411420.json
└── sec_filings/
    └── AAPL/
        ├── 10-K_2023.json
        ├── 10-K_2024.json
        └── ...
"""
import json
import logging
from pathlib import Path
from datetime import datetime, timezone
# ── Paths ──────────────────────────────────────────────────────────────────────
# Project root is two levels above this file (the script sits in a subfolder).
BASE_DIR = Path(__file__).parent.parent
LOG_DIR = BASE_DIR.joinpath("logs")
RAW_DIR = BASE_DIR.joinpath("data", "raw")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")

# Input folders.
MORNINGSTAR_RAW = RAW_DIR.joinpath("morningstar")
SEC_RAW = RAW_DIR.joinpath("sec_filings", "AAPL")

# Output folders (same layout as the inputs, under data/processed).
MORNINGSTAR_OUT = PROCESSED_DIR.joinpath("morningstar")
SEC_OUT = PROCESSED_DIR.joinpath("sec_filings", "AAPL")

# The log folder has to exist before the FileHandler below opens its file.
LOG_DIR.mkdir(parents=True, exist_ok=True)

# ── Logging ────────────────────────────────────────────────────────────────────
# Every record goes both to logs/doc_parser.log and to the console stream.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    handlers=[
        logging.FileHandler(LOG_DIR / "doc_parser.log"),
        logging.StreamHandler(),
    ],
)
log = logging.getLogger(__name__)
# ── Docling setup ──────────────────────────────────────────────────────────────
def build_converter():
    """
    Create the Docling converter used for every document in this script.

    The Docling imports live inside the function, so they only run when a
    converter is actually built.

    Returns a DocumentConverter whose PDF pipeline has table-structure
    recognition enabled and OCR / picture-image generation disabled.
    """
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.document_converter import DocumentConverter, PdfFormatOption

    pdf_options = PdfPipelineOptions()
    pdf_options.generate_picture_images = False  # skip figure image extraction
    pdf_options.do_ocr = False                   # these are digital PDFs, skip OCR
    pdf_options.do_table_structure = True        # preserve financial tables

    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})
# ── Parse one PDF ──────────────────────────────────────────────────────────────
def parse_pdf(pdf_path: Path, metadata: dict, converter) -> dict:
    """
    Parse a single document with Docling.

    Despite the name, this function is also called with SEC ``filing.htm``
    files; the path is simply handed to ``converter.convert``.

    Args:
        pdf_path: Path of the document to convert.
        metadata: Caller-supplied metadata. It is copied into the output's
            "metadata" entry, ahead of the parser-generated fields.
        converter: Object exposing ``convert(str)`` whose result has a
            ``document`` attribute (see ``build_converter``).

    Returns:
        Dict with the keys "metadata", "sections", "tables" and
        "full_markdown".

    Raises:
        Anything raised by the conversion or by the full-markdown export.
        A table whose export fails is logged as a warning and skipped.
    """
    # Imported once per call rather than once per document item.
    from docling.datamodel.document import SectionHeaderItem

    log.info(f" Parsing: {pdf_path.name}")
    result = converter.convert(str(pdf_path))
    doc = result.document

    # ── Text sections ────────────────────────────────────────────────────────
    sections = []
    for item, level in doc.iterate_items():
        text = getattr(item, "text", None)
        if not text or not text.strip():
            continue  # items without (non-blank) text are not sections
        item_type = "header" if isinstance(item, SectionHeaderItem) else "text"
        page_num = item.prov[0].page_no if item.prov else None
        sections.append({
            "type": item_type,
            "level": level,
            "text": text.strip(),
            "page_num": page_num,
        })

    # ── Tables ───────────────────────────────────────────────────────────────
    tables = []
    for i, table in enumerate(doc.tables):
        try:
            df = table.export_to_dataframe()
            markdown = table.export_to_markdown()
            page_num = table.prov[0].page_no if table.prov else None
            tables.append({
                "index": i,
                "page_num": page_num,
                "markdown": markdown,
                "rows": len(df),
                "cols": len(df.columns),
                "headers": list(df.columns.astype(str)),
                "data": df.values.tolist(),
                "is_atomic": True,  # never split this chunk
            })
        except Exception as e:
            # Best effort: one broken table must not discard the document.
            log.warning(f" Table {i} export failed: {e}")

    # ── Full markdown export (for quick inspection) ───────────────────────────
    full_markdown = doc.export_to_markdown()

    # Highest page number seen on any section OR table, so that a trailing
    # page holding only a table is still counted. 0 when no item carries
    # page provenance.
    page_numbers = [
        entry["page_num"] for entry in (*sections, *tables) if entry["page_num"]
    ]

    parsed = {
        "metadata": {
            **metadata,
            "parsed_at": datetime.now(timezone.utc).isoformat(),
            "parser": "docling",
            "total_pages": max(page_numbers, default=0),
            "total_sections": len(sections),
            "total_tables": len(tables),
        },
        "sections": sections,
        "tables": tables,
        "full_markdown": full_markdown,
    }
    return parsed
def save_parsed(data: dict, out_path: Path):
    """
    Write a parsed document to ``out_path`` as indented UTF-8 JSON.

    Args:
        data: JSON-serialisable dict; values json cannot encode natively are
            stringified via ``default=str``.
        out_path: Destination file. Missing parent directories are created.

    Raises:
        OSError: If the file cannot be written or renamed into place.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling temp file and rename it into place. The processing
    # functions skip any document whose output file already exists, so a
    # half-written JSON left behind by a failed dump would never be
    # regenerated. The ".tmp" suffix also keeps it out of "*.json" listings.
    tmp_path = out_path.with_name(out_path.name + ".tmp")
    try:
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # must be explicit instead of depending on the platform default.
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=str)
        tmp_path.replace(out_path)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)

    size_kb = out_path.stat().st_size / 1024
    log.info(f" Saved: {out_path.name} ({size_kb:.1f} KB)")
# ── Morningstar PDFs ───────────────────────────────────────────────────────────
def process_morningstar(converter):
    """
    Parse every ``*.pdf`` in MORNINGSTAR_RAW into MORNINGSTAR_OUT.

    A PDF whose ``<stem>.json`` output already exists is skipped. A failure
    on one PDF is logged (with traceback) and processing continues with the
    next file.

    Args:
        converter: Docling converter passed through to ``parse_pdf``.
    """
    log.info("\n=== Morningstar PDFs ===")
    # Sorted so every run processes (and logs) the files in a stable order,
    # matching the sorted traversal used for the SEC filings.
    pdfs = sorted(MORNINGSTAR_RAW.glob("*.pdf"))
    log.info(f"Found {len(pdfs)} PDFs")

    for pdf in pdfs:
        out_path = MORNINGSTAR_OUT / f"{pdf.stem}.json"
        if out_path.exists():
            log.info(f" SKIP {pdf.name} (already parsed)")
            continue

        metadata = {
            "source": "morningstar",
            "doc_type": "research_report",
            "file_name": pdf.name,
            "file_path": str(pdf),
            "license": "proprietary",
            "access_level": "internal",
        }

        try:
            parsed = parse_pdf(pdf, metadata, converter)
            save_parsed(parsed, out_path)
        except Exception as e:
            # Batch boundary: keep going, but keep the traceback in the log.
            log.error(f" FAILED {pdf.name}: {e}", exc_info=True)
            continue

        log.info(
            f" Sections: {parsed['metadata']['total_sections']} "
            f"Tables: {parsed['metadata']['total_tables']} "
            f"Pages: {parsed['metadata']['total_pages']}"
        )
# ── SEC Filings ────────────────────────────────────────────────────────────────
def process_sec_filings(converter):
    """
    Parse the AAPL SEC filings found under SEC_RAW into SEC_OUT.

    For each filing type (10-K, 10-Q, 8-K) every sub-folder containing a
    ``filing.htm`` is parsed into ``<type>_<folder>.json``. Existing outputs
    are skipped. An optional ``metadata.json`` in the folder supplies
    ``fiscal_year``, ``filing_date`` and ``accession``; when it is missing or
    unreadable, defaults are used (fiscal year falls back to the first four
    characters of the folder name).

    Args:
        converter: Docling converter passed through to ``parse_pdf``.
    """
    log.info("\n=== SEC Filings (AAPL) ===")
    for ftype in ["10-K", "10-Q", "8-K"]:
        ftype_dir = SEC_RAW / ftype
        if not ftype_dir.exists():
            continue

        for folder in sorted(ftype_dir.iterdir()):
            htm = folder / "filing.htm"
            if not htm.is_file():
                continue  # also skips stray non-directory entries

            out_name = f"{ftype}_{folder.name}.json"
            out_path = SEC_OUT / out_name
            if out_path.exists():
                log.info(f" SKIP {out_name} (already parsed)")
                continue

            # Load filing metadata. A corrupt or unreadable metadata.json
            # must not abort the whole run, so fall back to the defaults.
            meta_file = folder / "metadata.json"
            file_meta = {}
            if meta_file.exists():
                try:
                    with open(meta_file, encoding="utf-8") as f:
                        loaded = json.load(f)
                except (OSError, ValueError) as e:
                    # ValueError covers JSONDecodeError and UnicodeDecodeError.
                    log.warning(f" Could not read {meta_file}: {e} (using defaults)")
                else:
                    if isinstance(loaded, dict):
                        file_meta = loaded
                    else:
                        log.warning(f" Ignoring {meta_file}: not a JSON object")

            metadata = {
                "source": "sec_edgar",
                "doc_type": ftype,
                "ticker": "AAPL",
                "company": "Apple Inc.",
                "fiscal_year": file_meta.get("fiscal_year", folder.name[:4]),
                "filing_date": file_meta.get("filing_date", ""),
                "accession": file_meta.get("accession", ""),
                "file_name": htm.name,
                "file_path": str(htm),
                "license": "public",
                "access_level": "public",
            }

            log.info(f" Parsing {ftype}/{folder.name} ...")
            try:
                parsed = parse_pdf(htm, metadata, converter)
                save_parsed(parsed, out_path)
            except Exception as e:
                # Batch boundary: keep going, but keep the traceback in the log.
                log.error(f" FAILED {out_name}: {e}", exc_info=True)
                continue

            log.info(
                f" Sections: {parsed['metadata']['total_sections']} "
                f"Tables: {parsed['metadata']['total_tables']} "
                f"Pages: {parsed['metadata']['total_pages']}"
            )
# ── Entry point ────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Script entry: build one converter, run both parsing stages with it,
    # then list every JSON file found under PROCESSED_DIR.
    rule = "=" * 60
    for line in (rule, "Phase 2 – Document Parser", rule, "Loading Docling converter ..."):
        log.info(line)
    converter = build_converter()
    log.info("Converter ready.")

    for stage in (process_morningstar, process_sec_filings):
        stage(converter)

    # Summary
    log.info("\n" + rule)
    log.info("Parsing complete. Output files:")
    for json_path in sorted(PROCESSED_DIR.rglob("*.json")):
        kb = json_path.stat().st_size / 1024
        log.info(f" {json_path.relative_to(PROCESSED_DIR)} ({kb:.1f} KB)")
    log.info(rule)