"""Finance-aware text chunking."""

import re
from typing import Dict, List

from config import settings
from utils.text_splitter import split_text

# Regexes (case-insensitive) that identify well-known financial-report
# sections. They are tried in list order by
# ``FinanceAwareChunker._detect_section``; the first pattern that matches
# anywhere in a chunk decides the chunk's section label.
FINANCE_SECTION_PATTERNS = [
    r"(?i)(income\s+statement|profit\s+and\s+loss)",
    r"(?i)(balance\s+sheet|statement\s+of\s+financial\s+position)",
    r"(?i)(cash\s+flow\s+statement)",
    # Word boundaries keep "eps" from matching inside ordinary words such as
    # "steps" or "keeps".
    r"(?i)(earnings\s+per\s+share|\beps\b)",
    r"(?i)(management\s+discussion|md&a)",
    r"(?i)(risk\s+factors)",
    r"(?i)(notes\s+to\s+(financial\s+)?statements)",
    r"(?i)(quarterly\s+results|annual\s+results)",
]

# Narrow patterns — avoid firing Modal chart OCR on every page that mentions "chart".
CHART_PAGE_PATTERNS = re.compile(
    r"(?i)(?:"
    r"\b(?:figure|fig\.)\s*\d+\b"
    r"|\b(?:bar|line|pie|area)\s+(?:chart|graph)\b"
    r"|\bchart\s+(?:shows|below|above|illustrates|depicts)\b"
    r")",
)

# A run of two or more consecutive lines that each start and end with "|",
# i.e. a markdown table block. Compiled once at import time instead of on
# every call to ``_split_preserving_tables``.
_TABLE_BLOCK_PATTERN = re.compile(
    r"(\|[^\n]+\|\n(?:\|[^\n]+\|\n?)+)", re.MULTILINE
)


class FinanceAwareChunker:
    """Split page text into chunks tagged with a financial-report section.

    Markdown table blocks are kept whole; all other text is delegated to
    ``split_text`` with the configured size, overlap and separators.
    """

    def __init__(
        self,
        chunk_size: int | None = None,
        chunk_overlap: int | None = None,
    ):
        """Configure the chunker.

        Args:
            chunk_size: Passed to ``split_text`` as ``chunk_size``. ``None``
                (or 0, which is not a usable size) falls back to
                ``settings.CHUNK_SIZE``.
            chunk_overlap: Passed to ``split_text`` as ``chunk_overlap``.
                ``None`` falls back to ``settings.CHUNK_OVERLAP``; an
                explicit ``0`` is honoured.
        """
        self.chunk_size = chunk_size or settings.CHUNK_SIZE
        # Compare against None rather than relying on truthiness so that an
        # explicit chunk_overlap=0 is not replaced by the configured default.
        self.chunk_overlap = (
            chunk_overlap
            if chunk_overlap is not None
            else settings.CHUNK_OVERLAP
        )
        # Separators handed to split_text, listed from coarse to fine.
        # NOTE(review): the ". " entry was reconstructed from a
        # whitespace-mangled source — confirm its exact characters.
        self.separators = ["\n\n", "\n", ". ", ", ", " ", ""]

    def _split(self, text: str) -> List[str]:
        """Delegate to ``split_text`` using this instance's configuration."""
        return split_text(
            text,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=self.separators,
        )

    def _detect_section(self, text: str) -> str:
        """Return a section label for *text*.

        Patterns in ``FINANCE_SECTION_PATTERNS`` are tried in list order and
        the first pattern that matches wins (pattern order, not position in
        the text). The label is the matched text, stripped, in its original
        casing. Returns ``"general"`` when no pattern matches.
        """
        for pattern in FINANCE_SECTION_PATTERNS:
            match = re.search(pattern, text)
            if match:
                return match.group(0).strip()
        return "general"

    @staticmethod
    def looks_like_chart(text: str) -> bool:
        """Alias for ``should_extract_chart`` using the default length limit."""
        return FinanceAwareChunker.should_extract_chart(text)

    @staticmethod
    def should_extract_chart(text: str, max_text_chars: int | None = None) -> bool:
        """True only for short, chart-like pages — skips dense text pages.

        Args:
            text: Page text to inspect; surrounding whitespace is ignored.
            max_text_chars: Maximum stripped length still considered. When
                ``None``, ``settings.CHART_OCR_MAX_TEXT_CHARS`` is used.

        Returns:
            ``False`` if the stripped text is longer than the limit;
            otherwise whether ``CHART_PAGE_PATTERNS`` matches anywhere in it.
        """
        limit = (
            max_text_chars
            if max_text_chars is not None
            else settings.CHART_OCR_MAX_TEXT_CHARS
        )
        stripped = text.strip()
        if len(stripped) > limit:
            return False
        return bool(CHART_PAGE_PATTERNS.search(stripped))

    def _split_preserving_tables(self, text: str) -> List[str]:
        """Split text but keep markdown table blocks intact.

        Text between table blocks goes through ``_split``; each table block
        is emitted as a single part regardless of its length. If nothing was
        produced (e.g. whitespace-only input), the whole text is passed to
        ``_split`` as a fallback.
        """
        parts: List[str] = []
        last_end = 0
        for match in _TABLE_BLOCK_PATTERN.finditer(text):
            before = text[last_end : match.start()]
            if before.strip():
                parts.extend(self._split(before))
            # The table block itself is never split.
            parts.append(match.group(0))
            last_end = match.end()

        remaining = text[last_end:]
        if remaining.strip():
            parts.extend(self._split(remaining))

        return parts if parts else self._split(text)

    def chunk(
        self,
        text: str,
        page_num: int = 0,
        source: str = "embedded",
        section_override: str | None = None,
    ) -> List[Dict]:
        """Chunk *text* and attach per-chunk metadata.

        Args:
            text: Text to chunk.
            page_num: Stored under ``"page_number"`` on every chunk.
            source: Stored under ``"source"`` on every chunk.
            section_override: When truthy, used as the section of every
                chunk instead of running section detection.

        Returns:
            One dict per chunk with keys ``"text"``, ``"chunk_index"``
            (0-based position in the result), ``"section"``, ``"source"``
            and ``"page_number"``.
        """
        return [
            {
                "text": chunk_text,
                "chunk_index": index,
                "section": section_override or self._detect_section(chunk_text),
                "source": source,
                "page_number": page_num,
            }
            for index, chunk_text in enumerate(
                self._split_preserving_tables(text)
            )
        ]