Spaces:

Pushkya
/

Financial_bot

Running

App Files Files Community

Financial_bot / src /chunker.py

Pushkya

Upload 30 files

8299003 verified 2 days ago

Raw

History Blame Contribute Delete

17.9 kB

	"""
	chunker.py
	==========
	Phase 3 – Document Chunking

	Converts the structured output from pdf_processor.py into chunks ready for
	embedding and storage in ChromaDB.

	Why not LangChain SemanticChunker?
	-----------------------------------
	LangChain's SemanticChunker treats every document as a flat string. It has
	no concept of headings, section hierarchy, or whether a block of text is a
	table or a paragraph. Docling's HybridChunker, on the other hand, works on
	the native DoclingDocument object and understands the full visual structure
	parsed by the DocLayNet layout model.

	Why not HybridChunker for tables?
	-----------------------------------
	HybridChunker splits tables to fit within the token limit. For financial
	documents, a holdings table with 30 rows becomes 5 fragments — each fragment
	loses the column-header context. An LLM reading "Ticker = GS. Sector =
	Financial Services." with no table headers cannot reconstruct the original
	table structure reliably.

	Solution: best of both worlds
	-------------------------------
	TEXT → Docling HybridChunker (tokenizer-aware, structure-aware,
	full heading-path injection, respects document hierarchy)

	TABLES → Atomic markdown pass-through (one complete table = one chunk,
	markdown preserves column/row alignment, never split)

	This gives semantic coherence for text AND lossless fidelity for tables.

	Output format per chunk
	------------------------
	{
	"chunk_id" : "ptc01302411420_text_0042",
	"doc_id" : "ptc01302411420",
	"chunk_type" : "text" \| "table",
	"text" : "...", # prose or full markdown table
	"is_atomic" : false \| true,
	"metadata" : {
	"source" : "morningstar" \| "sec_edgar",
	"doc_type" : "research_report" \| "10-K" \| "10-Q" \| "8-K",
	"file_name" : "ptc01302411420.pdf",
	"page_num" : 5,
	"section_title" : "Financial Summary", # immediate heading
	"heading_path" : "Results > Financial Summary", # full hierarchy
	... # all keys from document metadata
	}
	}

	Usage (as a module)
	-------------------
	from src.chunker import DocumentChunker
	chunker = DocumentChunker()
	chunks = chunker.chunk_document("data/processed/morningstar/ptc01302411420.json")

	Usage (as a script)
	-------------------
	python src/chunker.py
	python src/chunker.py --force # re-chunk even if output exists
	"""

	import json
	import logging
	from pathlib import Path
	from datetime import datetime, timezone

	# ── Logging ────────────────────────────────────────────────────────────────────
	logging.basicConfig(
	level = logging.INFO,
	format = "%(asctime)s %(levelname)-8s %(message)s"
	)
	log = logging.getLogger(__name__)

	# ── Paths ──────────────────────────────────────────────────────────────────────
	BASE_DIR = Path(__file__).parent.parent
	PROCESSED_DIR = BASE_DIR / "data" / "processed"
	CHUNKS_DIR = BASE_DIR / "data" / "chunks"

	# ── Constants ──────────────────────────────────────────────────────────────────
	EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
	MAX_TOKENS = 256 # all-MiniLM-L6-v2 context window


	# ══════════════════════════════════════════════════════════════════════════════
	# TEXT CHUNKING — Docling HybridChunker
	# ──────────────────────────────────────────────────────────────────────────────
	# HybridChunker works on the native DoclingDocument, understanding:
	# • Section headings and their hierarchy
	# • Reading order determined by DocLayNet
	# • Token limits of the target embedding model
	#
	# For each text chunk it produces:
	# chunk.text → the prose content
	# chunk.meta.headings → list of heading titles from root to this section
	# e.g. ["Results", "Financial Summary", "Revenue"]
	# chunk.meta.doc_items → original DocItem objects (for page number lookup)
	#
	# We SKIP chunks whose doc_items contain a TableItem — tables are handled
	# separately with our atomic markdown approach.
	# ══════════════════════════════════════════════════════════════════════════════

	def chunk_text_with_hybrid(
	docling_doc_path : Path,
	doc_meta : dict,
	removed_pages : set,
	) -> list[dict]:
	"""
	Use Docling HybridChunker to split text sections of a document.

	Tables are skipped here — they are handled by chunk_tables_atomic().

	Args:
	docling_doc_path : path to the _docling.json file saved by Phase 2
	doc_meta : document-level metadata dict (from processed JSON)
	removed_pages : set of page numbers filtered out by the noise filter

	Returns:
	list of text chunk dicts (chunk_type="text")
	"""
	from docling.chunking import HybridChunker
	from docling.datamodel.document import DoclingDocument, TableItem

	# Load native DoclingDocument
	with open(docling_doc_path) as f:
	dl_doc = DoclingDocument.model_validate_json(f.read())

	chunker = HybridChunker(
	tokenizer = EMBEDDING_MODEL,
	max_tokens = MAX_TOKENS,
	merge_peers = True, # merge small adjacent chunks sharing same heading
	)

	chunks: list[dict] = []

	for chunk in chunker.chunk(dl_doc):
	# ── Skip table chunks ─────────────────────────────────────────────
	# HybridChunker splits large tables by token count, which destroys
	# column-header context. We handle tables separately with full
	# markdown representation (one table = one chunk, never split).
	if any(isinstance(item, TableItem) for item in chunk.meta.doc_items):
	continue

	# ── Get page number from first doc item ───────────────────────────
	page_num = None
	for item in chunk.meta.doc_items:
	if item.prov:
	page_num = item.prov[0].page_no
	break

	# ── Skip chunks on noise pages ────────────────────────────────────
	if page_num in removed_pages:
	continue

	# ── Skip empty chunks ─────────────────────────────────────────────
	text = chunk.text.strip()
	if not text:
	continue

	# ── Build heading metadata ────────────────────────────────────────
	headings = chunk.meta.headings or []
	section_title = headings[-1] if headings else ""
	heading_path = " > ".join(headings) if headings else ""

	chunks.append({
	"chunk_type" : "text",
	"text" : text,
	"is_atomic" : False,
	"metadata" : {
	**doc_meta,
	"section_title": section_title,
	"heading_path" : heading_path,
	"page_num" : page_num,
	},
	})

	return chunks


	# ══════════════════════════════════════════════════════════════════════════════
	# TABLE CHUNKING — Atomic markdown pass-through
	# ──────────────────────────────────────────────────────────────────────────────
	# Why markdown and not HybridChunker's key=value format?
	#
	# HybridChunker serialises a table cell as:
	# "Goldman Sachs, Top 10 Holdings.Ticker = GS. Goldman Sachs, Top 10
	# Holdings.Sector = Financial Services."
	#
	# Our markdown representation:
	# \| Company \| Ticker \| Sector \|
	# \|Goldman Sachs \| GS \| Financial Services \|
	#
	# LLMs are trained on markdown and read it far more accurately than the
	# key=value format. A query about "Goldman Sachs sector" against the
	# markdown chunk will produce a correct, well-grounded answer.
	# ══════════════════════════════════════════════════════════════════════════════

	def chunk_tables_atomic(tables: list[dict], doc_meta: dict) -> list[dict]:
	"""
	Convert extracted tables into atomic chunks using markdown representation.

	Each table = exactly one chunk. The `is_atomic` flag instructs downstream
	systems to never split this chunk.

	Args:
	tables : list of table dicts from processed JSON (with 'markdown' key)
	doc_meta : document-level metadata dict

	Returns:
	list of table chunk dicts (chunk_type="table")
	"""
	chunks: list[dict] = []

	for t in tables:
	markdown = t.get("markdown", "").strip()
	if not markdown:
	continue

	chunks.append({
	"chunk_type": "table",
	"text" : markdown,
	"is_atomic" : True,
	"metadata" : {
	**doc_meta,
	"page_num" : t.get("page_num"),
	"table_index" : t.get("index"),
	"rows" : t.get("rows"),
	"cols" : t.get("cols"),
	"col_headers" : t.get("headers", []),
	},
	})

	return chunks


	# ══════════════════════════════════════════════════════════════════════════════
	# CHUNK ID ASSIGNMENT
	# ──────────────────────────────────────────────────────────────────────────────
	# IDs are deterministic: {doc_stem}_{type}_{zero_padded_index}
	# Deterministic IDs allow ChromaDB upserts (add-or-update) without duplicates
	# when the chunker is re-run after adding new documents.
	# ══════════════════════════════════════════════════════════════════════════════

	def _assign_ids(chunks: list[dict], doc_stem: str) -> list[dict]:
	"""Attach chunk_id and doc_id to every chunk in-place."""
	text_idx = table_idx = 0
	for chunk in chunks:
	if chunk["chunk_type"] == "text":
	chunk["chunk_id"] = f"{doc_stem}_text_{text_idx:04d}"
	text_idx += 1
	else:
	chunk["chunk_id"] = f"{doc_stem}_table_{table_idx:03d}"
	table_idx += 1
	chunk["doc_id"] = doc_stem
	return chunks


	# ══════════════════════════════════════════════════════════════════════════════
	# MAIN CHUNKER CLASS
	# ══════════════════════════════════════════════════════════════════════════════

	class DocumentChunker:
	"""
	End-to-end document chunker.

	Loads a processed JSON + its paired _docling.json (from PDFProcessor),
	applies HybridChunker to text sections, passes tables through atomically
	as markdown, and saves a chunks JSON.
	"""

	def __init__(self, chunks_dir: Path = CHUNKS_DIR):
	self.chunks_dir = Path(chunks_dir)
	self.chunks_dir.mkdir(parents=True, exist_ok=True)

	def chunk_document(
	self,
	json_path : str \| Path,
	force : bool = False,
	) -> list[dict]:
	"""
	Chunk a single processed document.

	Args:
	json_path : path to the processed JSON (output of PDFProcessor)
	force : if True, re-chunk even if output already exists

	Returns:
	list of chunk dicts
	"""
	json_path = Path(json_path).resolve()

	# Mirror processed/ directory structure under chunks/
	rel = json_path.relative_to(PROCESSED_DIR.resolve())
	out_path = self.chunks_dir / rel.parent / f"{json_path.stem}_chunks.json"
	doc_stem = json_path.stem

	if out_path.exists() and not force:
	log.info(f"SKIP {json_path.name} (already chunked → {out_path.name})")
	with open(out_path) as f:
	return json.load(f)["chunks"]

	log.info(f"Chunking: {json_path.name}")

	# ── Load processed JSON ───────────────────────────────────────────
	with open(json_path) as f:
	doc = json.load(f)

	doc_meta = doc["metadata"]
	tables = doc.get("tables", [])
	removed_pages = set(doc_meta.get("removed_pages", []))

	# ── Locate paired _docling.json ───────────────────────────────────
	docling_path = json_path.with_name(json_path.stem + "_docling.json")
	if not docling_path.exists():
	log.warning(
	f" _docling.json not found for {json_path.name}. "
	f"Re-run pdf_processor.py with force=True to regenerate it."
	)
	return []

	# ── Text chunks via HybridChunker ─────────────────────────────────
	text_chunks = chunk_text_with_hybrid(docling_path, doc_meta, removed_pages)
	log.info(f" Text chunks : {len(text_chunks)}")

	# ── Table chunks via atomic markdown pass-through ─────────────────
	table_chunks = chunk_tables_atomic(tables, doc_meta)
	log.info(f" Table chunks : {len(table_chunks)}")

	# ── Combine, assign IDs, save ─────────────────────────────────────
	all_chunks = _assign_ids(text_chunks + table_chunks, doc_stem)

	out_path.parent.mkdir(parents=True, exist_ok=True)
	with open(out_path, "w") as f:
	json.dump({
	"doc_id" : doc_stem,
	"source_file" : str(json_path),
	"chunked_at" : datetime.now(timezone.utc).isoformat(),
	"total_chunks" : len(all_chunks),
	"text_chunks" : len(text_chunks),
	"table_chunks" : len(table_chunks),
	"chunks" : all_chunks,
	}, f, indent=2, ensure_ascii=False)

	size_kb = out_path.stat().st_size / 1024
	log.info(f" Saved → {out_path.name} ({size_kb:.1f} KB) "
	f"[{len(text_chunks)} text + {len(table_chunks)} table = "
	f"{len(all_chunks)} total]")

	return all_chunks

	def chunk_all(
	self,
	processed_dir : Path = PROCESSED_DIR,
	force : bool = False,
	) -> dict:
	"""Chunk all processed JSON files. Returns {filename: chunk_count}."""
	json_files = sorted(Path(processed_dir).rglob("*.json"))
	# Only process main processed JSONs, not the _docling.json files
	json_files = [f for f in json_files if not f.stem.endswith("_docling")]
	log.info(f"Found {len(json_files)} processed documents")

	summary = {}
	for jf in json_files:
	try:
	chunks = self.chunk_document(jf, force=force)
	summary[jf.name] = len(chunks)
	except Exception as e:
	log.error(f" FAILED {jf.name}: {e}")
	summary[jf.name] = 0

	return summary


	# ── Entry point ────────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	import sys

	force = "--force" in sys.argv

	log.info("=" * 60)
	log.info("Phase 3 – Document Chunker (HybridChunker + atomic tables)")
	log.info("=" * 60)

	chunker = DocumentChunker()
	summary = chunker.chunk_all(force=force)

	log.info("\n" + "=" * 60)
	log.info("Chunking complete.")
	total = 0
	for fname, n in summary.items():
	log.info(f" {fname:50s} {n:>5} chunks")
	total += n
	log.info(f" {'TOTAL':50s} {total:>5} chunks")
	log.info("=" * 60)