Spaces:

Pushkya
/

Financial_bot

Running

App Files Files Community

Financial_bot / scripts /doc_parser.py

Pushkya

Upload 30 files

8299003 verified 1 day ago

Raw

History Blame Contribute Delete

10.3 kB

	"""
	Phase 2 – Document Parser
	==========================
	Parses all raw documents (Morningstar PDFs + SEC filings) using Docling.
	Outputs structured JSON per document with:
	- Text sections (with hierarchy / heading level)
	- Tables (as markdown + dataframe-ready dict)
	- Metadata (source, type, page, fiscal year, etc.)

	Usage:
	python doc_parser.py

	Output:
	data/processed/
	├── morningstar/
	│   ├── a-wide-moat-focus-provides-differentiation.json
	│   └── ptc01302411420.json
	└── sec_filings/
	    └── AAPL/
	        ├── 10-K_2023.json
	        ├── 10-K_2024.json
	        └── ...
	"""

	import json
	import logging
	from pathlib import Path
	from datetime import datetime, timezone

	# ── Paths ──────────────────────────────────────────────────────────────────────
	# Project root: the directory one level above the folder that holds this script.
	BASE_DIR = Path(__file__).parent.parent
	RAW_DIR = BASE_DIR.joinpath("data", "raw")
	PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")
	LOG_DIR = BASE_DIR.joinpath("logs")

	# Input locations, one per document source.
	MORNINGSTAR_RAW = RAW_DIR.joinpath("morningstar")
	SEC_RAW = RAW_DIR.joinpath("sec_filings", "AAPL")

	# Output locations, laid out under data/processed/ like the inputs under data/raw/.
	MORNINGSTAR_OUT = PROCESSED_DIR.joinpath("morningstar")
	SEC_OUT = PROCESSED_DIR.joinpath("sec_filings", "AAPL")

	# Created at import time so the log directory exists before anything writes to it.
	LOG_DIR.mkdir(parents=True, exist_ok=True)

	# ── Logging ────────────────────────────────────────────────────────────────────
	# Root logger setup: INFO and above go to logs/doc_parser.log and to a
	# StreamHandler (which writes to standard error by default).
	_LOG_FORMAT = "%(asctime)s %(levelname)-8s %(message)s"
	_LOG_HANDLERS = [
	    logging.FileHandler(LOG_DIR / "doc_parser.log"),
	    logging.StreamHandler(),
	]
	logging.basicConfig(handlers=_LOG_HANDLERS, format=_LOG_FORMAT, level=logging.INFO)

	# Module-level logger used by every function in this script.
	log = logging.getLogger(__name__)


	# ── Docling setup ──────────────────────────────────────────────────────────────
	def build_converter():
	    """
	    Create the Docling converter used for every document in this run.

	    Docling is imported inside the function, so the package is only loaded
	    when a converter is actually built.

	    Returns:
	        A DocumentConverter whose PDF pipeline has table-structure
	        recognition switched on and OCR and picture-image generation
	        switched off.
	    """
	    from docling.datamodel.base_models import InputFormat
	    from docling.datamodel.pipeline_options import PdfPipelineOptions
	    from docling.document_converter import DocumentConverter, PdfFormatOption

	    pdf_options = PdfPipelineOptions()
	    pdf_options.generate_picture_images = False  # skip figure image extraction
	    pdf_options.do_ocr = False                   # these are digital PDFs, skip OCR
	    pdf_options.do_table_structure = True        # preserve financial tables

	    per_format = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}
	    return DocumentConverter(format_options=per_format)


	# ── Parse one PDF ──────────────────────────────────────────────────────────────
	def parse_pdf(pdf_path: Path, metadata: dict, converter) -> dict:
	    """
	    Parse a single document with Docling.

	    Despite the name, the path is passed to ``converter.convert`` unchanged;
	    this script also calls the function for the SEC ``filing.htm`` files.

	    Args:
	        pdf_path: File to convert.
	        metadata: Caller-supplied metadata. It is copied into the result's
	            "metadata" entry first, so the parser-generated fields listed
	            below win if a key clashes.
	        converter: Object with a ``convert(str)`` method whose result has a
	            ``document`` attribute (see ``build_converter`` in this script).

	    Returns:
	        A dict with four keys:
	        - "metadata": the input metadata plus ``parsed_at`` (UTC ISO
	          timestamp), ``parser``, ``total_pages``, ``total_sections`` and
	          ``total_tables``.
	        - "sections": one entry per item with non-blank text
	          (``type``, ``level``, ``text``, ``page_num``).
	        - "tables": one entry per table that exported successfully.
	        - "full_markdown": Markdown export of the whole document.

	    Raises:
	        Anything the converter or the document export raises. Only failures
	        while exporting an individual table are caught (and logged).
	    """
	    # Imported once per call rather than once per item, and kept local so
	    # Docling is only needed when a document is actually parsed.
	    from docling.datamodel.document import SectionHeaderItem

	    log.info(f" Parsing: {pdf_path.name}")

	    result = converter.convert(str(pdf_path))
	    doc = result.document

	    def first_page(item):
	        """Return the page number of the item's first provenance entry, or None."""
	        return item.prov[0].page_no if item.prov else None

	    # ── Text sections ────────────────────────────────────────────────────────
	    sections = []
	    for item, level in doc.iterate_items():
	        # Items without a text attribute (or with blank text) are skipped.
	        text = getattr(item, "text", None)
	        if not text or not text.strip():
	            continue

	        sections.append({
	            "type": "header" if isinstance(item, SectionHeaderItem) else "text",
	            # NOTE(review): this is the level yielded by iterate_items, which is
	            # presumably the tree depth rather than the heading's own level — confirm.
	            "level": level,
	            "text": text.strip(),
	            "page_num": first_page(item),
	        })

	    # ── Tables ───────────────────────────────────────────────────────────────
	    tables = []
	    for i, table in enumerate(doc.tables):
	        try:
	            df = table.export_to_dataframe()
	            markdown = table.export_to_markdown()
	            page_num = first_page(table)

	            tables.append({
	                "index": i,
	                "page_num": page_num,
	                "markdown": markdown,
	                "rows": len(df),
	                "cols": len(df.columns),
	                "headers": list(df.columns.astype(str)),
	                "data": df.values.tolist(),
	                "is_atomic": True,  # never split this chunk
	            })
	        except Exception as e:
	            # Best effort: one broken table must not discard the whole document.
	            log.warning(f" Table {i} export failed: {e}")

	    # ── Full markdown export (for quick inspection) ───────────────────────────
	    full_markdown = doc.export_to_markdown()

	    # Highest page number carried by any section or table. Tables are included
	    # so a trailing page that holds only a table is still counted; 0 when no
	    # item carries a page number.
	    page_numbers = [
	        entry["page_num"] for entry in (*sections, *tables) if entry["page_num"]
	    ]

	    parsed = {
	        "metadata": {
	            **metadata,
	            "parsed_at": datetime.now(timezone.utc).isoformat(),
	            "parser": "docling",
	            "total_pages": max(page_numbers, default=0),
	            "total_sections": len(sections),
	            "total_tables": len(tables),
	        },
	        "sections": sections,
	        "tables": tables,
	        "full_markdown": full_markdown,
	    }

	    return parsed


	def save_parsed(data: dict, out_path: Path):
	    """
	    Write a parsed document to ``out_path`` as indented JSON and log its size.

	    Missing parent directories are created first. The file is always written
	    as UTF-8: ``ensure_ascii=False`` emits non-ASCII characters verbatim, so
	    relying on the platform's default encoding could raise
	    ``UnicodeEncodeError`` or produce a file that a UTF-8 reader cannot load.
	    Values the json module cannot serialise natively are converted with
	    ``str``.

	    Args:
	        data: JSON-serialisable structure to store.
	        out_path: Destination file; overwritten if it already exists.
	    """
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    with open(out_path, "w", encoding="utf-8") as f:
	        json.dump(data, f, indent=2, ensure_ascii=False, default=str)
	    size_kb = out_path.stat().st_size / 1024
	    log.info(f" Saved: {out_path.name} ({size_kb:.1f} KB)")


	# ── Morningstar PDFs ───────────────────────────────────────────────────────────
	def process_morningstar(converter):
	    """
	    Parse every ``*.pdf`` directly inside MORNINGSTAR_RAW.

	    Each PDF is written to MORNINGSTAR_OUT as ``<pdf stem>.json``. A PDF whose
	    output file already exists is skipped, so the run can be repeated without
	    re-parsing. A failure on one PDF is logged with its traceback and does not
	    stop the remaining files.

	    Args:
	        converter: Converter passed through to ``parse_pdf``.
	    """
	    log.info("\n=== Morningstar PDFs ===")
	    # Sorted because glob order is arbitrary; this keeps the processing and
	    # log order reproducible between runs.
	    pdfs = sorted(MORNINGSTAR_RAW.glob("*.pdf"))
	    log.info(f"Found {len(pdfs)} PDFs")

	    for pdf in pdfs:
	        out_path = MORNINGSTAR_OUT / f"{pdf.stem}.json"
	        if out_path.exists():
	            log.info(f" SKIP {pdf.name} (already parsed)")
	            continue

	        metadata = {
	            "source": "morningstar",
	            "doc_type": "research_report",
	            "file_name": pdf.name,
	            "file_path": str(pdf),
	            "license": "proprietary",
	            "access_level": "internal",
	        }

	        try:
	            parsed = parse_pdf(pdf, metadata, converter)
	            save_parsed(parsed, out_path)
	        except Exception as e:
	            # Batch boundary: record the failure (with traceback) and move on.
	            log.exception(f" FAILED {pdf.name}: {e}")
	            continue

	        stats = parsed["metadata"]
	        log.info(
	            f" Sections: {stats['total_sections']} "
	            f"Tables: {stats['total_tables']} "
	            f"Pages: {stats['total_pages']}"
	        )


	# ── SEC Filings ────────────────────────────────────────────────────────────────
	def process_sec_filings(converter):
	    """
	    Parse the AAPL SEC filings stored under SEC_RAW.

	    For each filing type (10-K, 10-Q, 8-K) every sub-folder that contains a
	    ``filing.htm`` file is parsed and written to SEC_OUT as
	    ``<type>_<folder name>.json``. Filings whose output already exists are
	    skipped. An optional ``metadata.json`` next to the filing supplies
	    ``fiscal_year``, ``filing_date`` and ``accession``; without it the fiscal
	    year falls back to the first four characters of the folder name and the
	    other two fields are empty strings.

	    A failure on one filing, or an unreadable ``metadata.json``, is logged and
	    does not stop the remaining filings.

	    Args:
	        converter: Converter passed through to ``parse_pdf``.
	    """
	    log.info("\n=== SEC Filings (AAPL) ===")

	    for ftype in ["10-K", "10-Q", "8-K"]:
	        ftype_dir = SEC_RAW / ftype
	        if not ftype_dir.exists():
	            continue

	        for folder in sorted(ftype_dir.iterdir()):
	            # The file name is fixed, so a direct check replaces the glob.
	            htm = folder / "filing.htm"
	            if not htm.is_file():
	                continue

	            out_name = f"{ftype}_{folder.name}.json"
	            out_path = SEC_OUT / out_name

	            if out_path.exists():
	                log.info(f" SKIP {out_name} (already parsed)")
	                continue

	            # Load filing metadata
	            file_meta = _load_filing_metadata(folder / "metadata.json")

	            metadata = {
	                "source": "sec_edgar",
	                "doc_type": ftype,
	                "ticker": "AAPL",
	                "company": "Apple Inc.",
	                "fiscal_year": file_meta.get("fiscal_year", folder.name[:4]),
	                "filing_date": file_meta.get("filing_date", ""),
	                "accession": file_meta.get("accession", ""),
	                "file_name": htm.name,
	                "file_path": str(htm),
	                "license": "public",
	                "access_level": "public",
	            }

	            log.info(f" Parsing {ftype}/{folder.name} ...")
	            try:
	                parsed = parse_pdf(htm, metadata, converter)
	                save_parsed(parsed, out_path)
	            except Exception as e:
	                # Batch boundary: record the failure (with traceback) and move on.
	                log.exception(f" FAILED {out_name}: {e}")
	                continue

	            stats = parsed["metadata"]
	            log.info(
	                f" Sections: {stats['total_sections']} "
	                f"Tables: {stats['total_tables']} "
	                f"Pages: {stats['total_pages']}"
	            )


	def _load_filing_metadata(meta_file: Path) -> dict:
	    """Return the JSON object stored in ``meta_file``, or {} if it is missing or unusable."""
	    if not meta_file.exists():
	        return {}

	    try:
	        with open(meta_file, encoding="utf-8") as f:
	            loaded = json.load(f)
	    except (OSError, ValueError) as e:
	        # ValueError covers both malformed JSON and undecodable bytes; a bad
	        # side-car file must not abort the whole batch.
	        log.warning(f" Could not read {meta_file}: {e}")
	        return {}

	    if not isinstance(loaded, dict):
	        # The caller uses .get(), so anything other than a JSON object is unusable.
	        log.warning(f" Ignoring {meta_file}: expected a JSON object")
	        return {}

	    return loaded


	# ── Entry point ────────────────────────────────────────────────────────────────
	if __name__ == "__main__":
	    # Opening banner.
	    banner = "=" * 60
	    log.info(banner)
	    log.info("Phase 2 – Document Parser")
	    log.info(banner)

	    # A single converter instance is shared by both processing passes.
	    log.info("Loading Docling converter ...")
	    converter = build_converter()
	    log.info("Converter ready.")

	    process_morningstar(converter)
	    process_sec_filings(converter)

	    # Summary: every JSON file under the processed directory, with its size.
	    log.info("\n" + banner)
	    log.info("Parsing complete. Output files:")
	    for json_path in sorted(PROCESSED_DIR.rglob("*.json")):
	        kilobytes = json_path.stat().st_size / 1024
	        log.info(f" {json_path.relative_to(PROCESSED_DIR)} ({kilobytes:.1f} KB)")
	    log.info(banner)