Spaces:

Pushkya
/

Financial_bot

Running

App Files Files Community

Financial_bot / src /sec_processor.py

Pushkya

Upload 30 files

8299003 verified 1 day ago

Raw

History Blame Contribute Delete

17.8 kB

	"""
	sec_processor.py
	================
	Phase 2b – SEC Filing Processor

	Processes Apple SEC HTML filings (10-K, 10-Q, 8-K) through Docling and saves:
	- {stem}.json → structured JSON (sections, tables, metadata)
	- {stem}_docling.json → native DoclingDocument (required for HybridChunker)

	Why not reuse pdf_processor.py?
	---------------------------------
	pdf_processor.py is built around PDFs:
	- Page-based noise filter (cover page, TOC, disclaimer pages)
	- Page numbers tracked throughout
	- Assumes DocLayNet layout detection

	SEC HTML filings are structurally different:
	- No pages — HTML has no page layout concept
	- Boilerplate is at the START of the document (cover section), not spread
	across specific pages
	- HTML headings (h1/h2/h3) map to SectionHeaderItem automatically
	- Tables use standard <table> tags — no OCR or TableFormer needed

	What stays the same
	--------------------
	- Docling converter with do_table_structure=True
	- export_to_dataframe(doc) / export_to_markdown(doc) for tables
	- doc.model_dump_json() → _docling.json (for HybridChunker)
	- cleaned_text, parent_header on every section

	Output format per chunk (after Phase 3 chunking)
	--------------------------------------------------
	{
	"chunk_id" : "10-K_2024_text_0042",
	"doc_id" : "10-K_2024",
	"chunk_type": "text" | "table",
	"text" : "...",
	"metadata" : {
	"source" : "sec_edgar",
	"doc_type" : "10-K",
	"ticker" : "AAPL",
	"company" : "Apple Inc.",
	"fiscal_year" : "2024",
	"filing_date" : "2024-11-01",
	"accession" : "0000320193-24-000123",
	"heading_path": "PART I > Item 1. Business",
	...
	}
	}

	Usage (as a module)
	-------------------
	from src.sec_processor import SECProcessor
	processor = SECProcessor()
	processor.process_all()

	Usage (as a script)
	-------------------
	python src/sec_processor.py
	python src/sec_processor.py --force
	"""

	import re
	import json
	import logging
	from pathlib import Path
	from datetime import datetime, timezone

	# ── Logging ────────────────────────────────────────────────────────────────────
	# Root logger: INFO and above, timestamp + padded level name + message.
	# `log` is the module-level logger used by everything below.
	logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level=logging.INFO)
	log = logging.getLogger(__name__)

	# ── Paths ──────────────────────────────────────────────────────────────────────
	# BASE_DIR is two directory levels above this file; raw inputs and processed
	# outputs live in parallel trees beneath BASE_DIR/data.
	BASE_DIR = Path(__file__).parent.parent
	RAW_SEC_DIR = BASE_DIR.joinpath("data", "raw", "sec_filings", "AAPL")
	PROCESSED_DIR = BASE_DIR.joinpath("data", "processed", "sec_filings", "AAPL")

	# ── SEC boilerplate detection ──────────────────────────────────────────────────
	# SEC filings open with a cover section made of form labels, legal
	# boilerplate and administrative identifiers. Those fragments are short and
	# add no analytical signal for RAG queries, so they are flagged.
	#
	# Entries are compared against stripped, lower-cased text, so every entry
	# here must itself be lower-case.
	_BOILERPLATE_EXACT = {
	    # form checkboxes
	    "☒",
	    "☐",
	    # cover-page connective / instruction lines
	    "or",
	    "(mark one)",
	    "for the transition period from to .",
	    # agency letterhead
	    "united states",
	    "securities and exchange commission",
	    "washington, d.c. 20549",
	}

	# Prefix patterns for cover-page fragments (matched case-insensitively from
	# the start of the text).
	#
	# The alternatives are joined with a real "|" alternation operator. Writing
	# the separator as "\|" inside a raw string makes it a LITERAL pipe, which
	# turns the whole pattern into one long literal that never matches any
	# filing text.
	_BOILERPLATE_RE = re.compile(
	    r"^("
	    + "|".join(
	        (
	            r"form \d+[\-/][a-z]+",      # FORM 10-K, FORM 10-Q
	            r"commission file",          # Commission File Number
	            r"irs employer",             # IRS Employer Identification
	            r"state or other",           # State or other jurisdiction
	            r"jurisdiction",             # of incorporation
	            r"\(exact name",             # (Exact name of Registrant...)
	            r"\(zip code",               # (Zip Code)
	            r"indicate by check",        # Indicate by check mark...
	            r"securities registered",    # Securities registered...
	            r"aggregate market value",   # Aggregate market value...
	            r"number of shares",         # Number of shares outstanding
	            r"☒",                        # form checkbox (checked)
	            r"☐",                        # form checkbox (unchecked)
	        )
	    )
	    + r")",
	    re.IGNORECASE,
	)


	def _df_to_markdown(df) -> str:
	    """
	    Build a clean markdown table from a pandas DataFrame.

	    Why not use table.export_to_markdown(doc)?
	    Docling's HTML→markdown export produces blank cells for SEC HTML tables that
	    use iXBRL inline tags or complex colspan/rowspan structures. The DataFrame
	    export correctly populates cell values; we build the markdown from that instead.

	    SEC HTML tables often expand colspan cells into N identical columns (e.g. a
	    cell spanning 3 columns becomes ['Americas','Americas','Americas']). We
	    de-duplicate consecutive identical values in each row before rendering so the
	    markdown stays readable.

	    Cell handling:
	    - None and float NaN render as an empty cell.
	    - Every other value is rendered with str(), so numeric zero stays "0"
	      instead of being blanked out.
	    - Line breaks inside a cell are folded into single spaces and "|" is
	      escaped as "\\|", so one DataFrame row is always one markdown row.

	    Args:
	        df: DataFrame whose rows are iterated with ``iterrows()``. The column
	            labels are NOT used; the first non-empty row becomes the header.

	    Returns:
	        The markdown table as a single string, or "" when every row is empty.
	    """
	    def _cell_text(value) -> str:
	        """Render one cell value as single-line, pipe-safe text."""
	        if value is None:
	            return ""
	        if isinstance(value, float) and value != value:  # NaN is the only float != itself
	            return ""
	        pieces = (part.strip() for part in str(value).splitlines())
	        flat = " ".join(piece for piece in pieces if piece)
	        return flat.replace("|", "\\|")

	    def _dedup(cells: list[str]) -> list[str]:
	        """Remove consecutive identical tokens (colspan artefacts)."""
	        return [
	            cell
	            for pos, cell in enumerate(cells)
	            if pos == 0 or cell != cells[pos - 1]
	        ]

	    rows_md = [
	        _dedup([_cell_text(value) for value in row.values])
	        for _, row in df.iterrows()
	    ]

	    # Drop rows that are entirely empty after dedup
	    rows_md = [cells for cells in rows_md if any(cells)]
	    if not rows_md:
	        return ""

	    # Normalise column count to the widest row (dedup can shorten rows unevenly)
	    width = max(len(cells) for cells in rows_md)
	    rows_md = [cells + [""] * (width - len(cells)) for cells in rows_md]

	    def _render(cells: list[str]) -> str:
	        """Join cells into one markdown table line."""
	        return "| " + " | ".join(cells) + " |"

	    # Treat the first non-empty row as the header, followed by the separator row
	    header, *data_rows = rows_md
	    lines = [_render(header), _render(["---"] * width)]
	    lines.extend(_render(cells) for cells in data_rows)

	    return "\n".join(lines)


	def _is_boilerplate(text: str) -> bool:
	    """
	    Decide whether a text fragment is SEC cover-page boilerplate.

	    A fragment is flagged when any of the following holds for its stripped form:
	    - its lower-cased text is one of the exact phrases in _BOILERPLATE_EXACT,
	    - it is shorter than 5 characters,
	    - it starts with one of the _BOILERPLATE_RE prefixes.
	    """
	    stripped = text.strip()
	    lowered = stripped.lower()
	    return (
	        lowered in _BOILERPLATE_EXACT
	        or len(lowered) < 5
	        or _BOILERPLATE_RE.match(stripped) is not None
	    )


	# ── Text cleaning ──────────────────────────────────────────────────────────────

	def clean_text(text: str) -> str:
	    """
	    Normalise extracted text for downstream use.

	    Steps, in order:
	    1. drop soft hyphens (U+00AD) and zero-width spaces (U+200B),
	    2. squeeze every run of spaces/tabs into a single space,
	    3. cap runs of three or more newlines at two,
	    4. strip leading and trailing whitespace.

	    Falsy input (empty string, None) yields "".
	    """
	    if not text:
	        return ""

	    invisible = {0x00AD: None, 0x200B: None}
	    visible = text.translate(invisible)
	    single_spaced = re.sub(r"[ \t]+", " ", visible)
	    compact = re.sub(r"\n{3,}", "\n\n", single_spaced)
	    return compact.strip()


	# ══════════════════════════════════════════════════════════════════════════════
	# MAIN PROCESSOR CLASS
	# ══════════════════════════════════════════════════════════════════════════════

	class SECProcessor:
	    """
	    Processes Apple SEC HTML filings through Docling.

	    Saves two files per filing:
	    {stem}.json — structured JSON for inspection and table extraction
	    {stem}_docling.json — native DoclingDocument for HybridChunker (Phase 3)

	    Every JSON file this class reads or writes is opened explicitly as UTF-8.
	    The structured JSON is dumped with ensure_ascii=False, so relying on the
	    platform default encoding could raise UnicodeEncodeError (or produce files
	    that cannot be read back) where the locale encoding is not UTF-8.
	    """

	    # Encoding used for every JSON file read or written by this class.
	    _JSON_ENCODING = "utf-8"

	    def __init__(self, output_dir: Path = PROCESSED_DIR):
	        """
	        Args:
	            output_dir : directory that receives the output files; created
	                         (including parents) if it does not exist yet.
	        """
	        self.output_dir = Path(output_dir)
	        self.output_dir.mkdir(parents=True, exist_ok=True)
	        self._converter = None  # built on first access of ``converter``

	    # ── Lazy-loaded Docling converter ──────────────────────────────────────────

	    @property
	    def converter(self):
	        """Build the Docling converter on first use (slow import)."""
	        if self._converter is None:
	            from docling.document_converter import DocumentConverter, PdfFormatOption
	            from docling.datamodel.pipeline_options import PdfPipelineOptions
	            from docling.datamodel.base_models import InputFormat

	            # NOTE(review): these options are registered for InputFormat.PDF
	            # only; whether they have any effect on .htm inputs is not visible
	            # from this file — confirm against the Docling documentation.
	            opts = PdfPipelineOptions()
	            opts.do_table_structure = True  # reconstruct table rows/cols
	            opts.do_ocr = False  # HTML — no OCR needed
	            opts.generate_picture_images = False  # skip figure images

	            self._converter = DocumentConverter(
	                format_options={
	                    InputFormat.PDF: PdfFormatOption(pipeline_options=opts)
	                }
	            )
	            log.info("Docling converter ready.")
	        return self._converter

	    # ── Process one filing ─────────────────────────────────────────────────────

	    def process_filing(
	        self,
	        htm_path: Path,
	        metadata: dict,
	        force: bool = False,
	    ) -> dict:
	        """
	        Parse one SEC HTML filing and save JSON + _docling.json.

	        Args:
	            htm_path : path to filing.htm
	            metadata : dict containing doc_stem, source, doc_type, ticker, etc.
	                       ``doc_stem`` is required; it names the output files and
	                       is not copied into the saved metadata.
	            force    : re-process even if output already exists

	        Returns:
	            parsed document dict with keys "metadata", "sections", "tables".
	            When both output files already exist and ``force`` is False, the
	            previously saved structured JSON is loaded and returned instead.

	        Raises:
	            KeyError : if ``metadata`` has no "doc_stem" entry.
	            Any exception raised by the converter or by file I/O propagates.
	        """
	        stem = metadata["doc_stem"]
	        out_path = self.output_dir / f"{stem}.json"
	        docling_path = self.output_dir / f"{stem}_docling.json"

	        # Skip if both outputs already exist
	        if out_path.exists() and docling_path.exists() and not force:
	            log.info(f"SKIP {stem} (already processed → {out_path.name})")
	            with open(out_path, encoding=self._JSON_ENCODING) as f:
	                return json.load(f)

	        log.info(f"Processing: {stem} ({htm_path.name})")

	        # ── Parse with Docling ────────────────────────────────────────────────
	        result = self.converter.convert(str(htm_path))
	        doc = result.document

	        from docling.datamodel.document import SectionHeaderItem, TableItem

	        # ── Extract sections ──────────────────────────────────────────────────
	        sections = []
	        current_header = ""

	        for item, level in doc.iterate_items():
	            text = getattr(item, "text", None)
	            if not text or not text.strip():
	                continue  # items without usable text carry nothing to index
	            if isinstance(item, TableItem):
	                continue  # tables handled separately below

	            raw = text.strip()
	            cleaned = clean_text(raw)
	            is_hdr = isinstance(item, SectionHeaderItem)

	            sections.append({
	                "type": "header" if is_hdr else "text",
	                "level": level,
	                "text": raw,
	                "cleaned_text": cleaned,
	                "page_num": None,  # HTML has no page numbers
	                "parent_header": current_header,
	                "is_boilerplate": _is_boilerplate(raw),
	            })

	            # Updated AFTER the append, so a header's own parent_header is the
	            # header that preceded it, not itself.
	            if is_hdr:
	                current_header = raw

	        # ── Extract tables ────────────────────────────────────────────────────
	        tables = []
	        for i, table in enumerate(doc.tables):
	            # Best effort: one unparseable table is logged and skipped rather
	            # than failing the whole filing.
	            try:
	                df = table.export_to_dataframe(doc)

	                if df.empty or len(df) < 2:
	                    continue

	                # Build markdown from the DataFrame values, not from
	                # export_to_markdown() which produces blank cells for SEC HTML.
	                markdown = _df_to_markdown(df)
	                if not markdown:
	                    continue

	                tables.append({
	                    "index": i,  # position in doc.tables, gaps mark skipped tables
	                    "page_num": None,  # HTML has no page numbers
	                    "markdown": markdown,
	                    "headers": list(df.columns.astype(str)),
	                    "rows": len(df),
	                    "cols": len(df.columns),
	                    "data": df.fillna("").values.tolist(),
	                    "is_atomic": True,
	                })
	            except Exception as e:
	                log.warning(f" Table {i} skipped: {e}")

	        # ── Build document metadata ───────────────────────────────────────────
	        doc_meta = {
	            k: v for k, v in metadata.items() if k != "doc_stem"
	        }
	        doc_meta.update({
	            "parsed_at": datetime.now(timezone.utc).isoformat(),
	            "parser": "docling",
	            "total_pages": 0,
	            "total_sections": len(sections),
	            "total_tables": len(tables),
	            "removed_pages": [],  # no pages in HTML — nothing to remove
	        })

	        parsed = {
	            "metadata": doc_meta,
	            "sections": sections,
	            "tables": tables,
	        }

	        # ── Save structured JSON ──────────────────────────────────────────────
	        with open(out_path, "w", encoding=self._JSON_ENCODING) as f:
	            json.dump(parsed, f, indent=2, ensure_ascii=False, default=str)
	        size_kb = out_path.stat().st_size / 1024
	        log.info(f" Saved JSON : {out_path.name} ({size_kb:.1f} KB)")

	        # ── Save native DoclingDocument (for HybridChunker) ───────────────────
	        with open(docling_path, "w", encoding=self._JSON_ENCODING) as f:
	            f.write(doc.model_dump_json())
	        dl_kb = docling_path.stat().st_size / 1024
	        log.info(f" Saved _docling : {docling_path.name} ({dl_kb:.1f} KB)")

	        boilerplate_n = sum(1 for s in sections if s.get("is_boilerplate"))
	        log.info(
	            f" Sections: {len(sections)} "
	            f"(boilerplate: {boilerplate_n}) "
	            f"Tables: {len(tables)}"
	        )

	        return parsed

	    # ── Batch process all filings ──────────────────────────────────────────────

	    def process_all(
	        self,
	        raw_dir: Path = RAW_SEC_DIR,
	        force: bool = False,
	    ) -> list[dict]:
	        """
	        Process all 10-K, 10-Q, and 8-K filings under raw_dir.

	        Expected layout: ``raw_dir/<doc_type>/<period>/filing.htm`` with an
	        optional ``metadata.json`` next to it. Entries without a filing.htm
	        and missing doc-type directories are skipped silently. A filing that
	        raises during processing is logged and left out of the result.

	        Args:
	            raw_dir : root directory holding one sub-directory per form type
	            force   : passed through to process_filing()

	        Returns:
	            list of parsed document dicts
	        """
	        results = []

	        for doc_type in ["10-K", "10-Q", "8-K"]:
	            type_dir = Path(raw_dir) / doc_type
	            if not type_dir.exists():
	                continue

	            log.info(f"\n── {doc_type} filings ────────────────────────────")

	            for period_dir in sorted(type_dir.iterdir()):
	                htm = period_dir / "filing.htm"
	                if not htm.exists():
	                    continue

	                # Load filing metadata
	                meta_file = period_dir / "metadata.json"
	                file_meta = {}
	                if meta_file.exists():
	                    with open(meta_file, encoding=self._JSON_ENCODING) as f:
	                        file_meta = json.load(f)

	                period = period_dir.name
	                stem = f"{doc_type}_{period}"
	                metadata = {
	                    "doc_stem": stem,
	                    "source": "sec_edgar",
	                    "doc_type": doc_type,
	                    "ticker": "AAPL",
	                    "company": "Apple Inc.",
	                    # falls back to the first four characters of the directory name
	                    "fiscal_year": file_meta.get("fiscal_year", period[:4]),
	                    "filing_date": file_meta.get("filing_date", ""),
	                    "accession": file_meta.get("accession", ""),
	                    "file_name": htm.name,
	                    "file_path": str(htm),
	                    "license": "public",
	                    "access_level": "public",
	                }

	                # One failing filing must not abort the rest of the batch.
	                try:
	                    parsed = self.process_filing(htm, metadata, force=force)
	                    results.append(parsed)
	                except Exception as e:
	                    log.error(f" FAILED {stem}: {e}")

	        return results


	# ── Entry point ────────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    # Script mode: process every filing, then log a summary and the list of
	    # structured-JSON outputs. Pass --force to re-process existing outputs.
	    import sys

	    rule = "=" * 60
	    force = "--force" in sys.argv

	    for banner_line in (rule, "Phase 2b – SEC Filing Processor", rule):
	        log.info(banner_line)

	    processor = SECProcessor()
	    results = processor.process_all(force=force)

	    section_total = sum(doc["metadata"]["total_sections"] for doc in results)
	    table_total = sum(doc["metadata"]["total_tables"] for doc in results)

	    log.info("\n" + rule)
	    log.info("Processing complete.")
	    log.info(f" Filings processed : {len(results)}")
	    log.info(f" Total sections : {section_total}")
	    log.info(f" Total tables : {table_total}")
	    log.info("\nOutput files:")
	    for f in sorted(PROCESSED_DIR.rglob("*.json")):
	        # The native Docling dumps are omitted from the listing.
	        if f.name.endswith("_docling.json"):
	            continue
	        size_kb = f.stat().st_size / 1024
	        log.info(f" {f.name:40s} ({size_kb:.1f} KB)")
	    log.info(rule)