Spaces:
Sleeping
Sleeping
| """Finance-aware text chunking.""" | |
| import re | |
| from typing import Dict, List | |
| from config import settings | |
| from utils.text_splitter import split_text | |
# Regex sources (kept as plain strings, not compiled) that the section detector
# tries in order; the text matched by the first pattern that hits anywhere in a
# chunk becomes that chunk's section label. All patterns are case-insensitive.
FINANCE_SECTION_PATTERNS = [
    r"(?i)(income\s+statement|profit\s+and\s+loss)",
    r"(?i)(balance\s+sheet|statement\s+of\s+financial\s+position)",
    r"(?i)(cash\s+flow\s+statement)",
    # Word boundaries keep the bare "eps" abbreviation from matching inside
    # ordinary words such as "steps", "keeps" or "concepts".
    r"(?i)(earnings\s+per\s+share|\beps\b)",
    r"(?i)(management\s+discussion|md&a)",
    r"(?i)(risk\s+factors)",
    r"(?i)(notes\s+to\s+(financial\s+)?statements)",
    r"(?i)(quarterly\s+results|annual\s+results)",
]
# Deliberately narrow cues, so that Modal chart OCR is not fired on every page
# that merely mentions the word "chart". A page qualifies only when it contains
# one of the alternatives below.
_CHART_CUES = (
    # Numbered figure reference: "Figure 3", "fig. 12".
    r"\b(?:figure|fig\.)\s*\d+\b",
    # A named chart type: "bar chart", "line graph", "pie chart", ...
    r"\b(?:bar|line|pie|area)\s+(?:chart|graph)\b",
    # "chart" followed by a verb/preposition that points at a visual.
    r"\bchart\s+(?:shows|below|above|illustrates|depicts)\b",
)

# Single case-insensitive regex: the cues above OR-ed inside one non-capturing group.
CHART_PAGE_PATTERNS = re.compile(r"(?i)(?:" + "|".join(_CHART_CUES) + r")")
| class FinanceAwareChunker: | |
| def __init__( | |
| self, | |
| chunk_size: int | None = None, | |
| chunk_overlap: int | None = None, | |
| ): | |
| self.chunk_size = chunk_size or settings.CHUNK_SIZE | |
| self.chunk_overlap = chunk_overlap or settings.CHUNK_OVERLAP | |
| self.separators = ["\n\n", "\n", ". ", ", ", " ", ""] | |
| def _split(self, text: str) -> List[str]: | |
| return split_text( | |
| text, | |
| chunk_size=self.chunk_size, | |
| chunk_overlap=self.chunk_overlap, | |
| separators=self.separators, | |
| ) | |
| def _detect_section(self, text: str) -> str: | |
| for pattern in FINANCE_SECTION_PATTERNS: | |
| match = re.search(pattern, text) | |
| if match: | |
| return match.group(0).strip() | |
| return "general" | |
| def looks_like_chart(text: str) -> bool: | |
| return FinanceAwareChunker.should_extract_chart(text) | |
| def should_extract_chart(text: str, max_text_chars: int | None = None) -> bool: | |
| """True only for short, chart-like pages — skips dense text pages.""" | |
| from config import settings | |
| limit = max_text_chars if max_text_chars is not None else settings.CHART_OCR_MAX_TEXT_CHARS | |
| stripped = text.strip() | |
| if len(stripped) > limit: | |
| return False | |
| return bool(CHART_PAGE_PATTERNS.search(stripped)) | |
| def _split_preserving_tables(self, text: str) -> List[str]: | |
| """Split text but keep markdown table blocks intact.""" | |
| table_pattern = re.compile(r"(\|[^\n]+\|\n(?:\|[^\n]+\|\n?)+)", re.MULTILINE) | |
| parts = [] | |
| last_end = 0 | |
| for match in table_pattern.finditer(text): | |
| before = text[last_end : match.start()] | |
| if before.strip(): | |
| parts.extend(self._split(before)) | |
| parts.append(match.group(0)) | |
| last_end = match.end() | |
| remaining = text[last_end:] | |
| if remaining.strip(): | |
| parts.extend(self._split(remaining)) | |
| return parts if parts else self._split(text) | |
| def chunk( | |
| self, | |
| text: str, | |
| page_num: int = 0, | |
| source: str = "embedded", | |
| section_override: str | None = None, | |
| ) -> List[Dict]: | |
| chunks = self._split_preserving_tables(text) | |
| result = [] | |
| for i, chunk_text in enumerate(chunks): | |
| section = section_override or self._detect_section(chunk_text) | |
| result.append( | |
| { | |
| "text": chunk_text, | |
| "chunk_index": i, | |
| "section": section, | |
| "source": source, | |
| "page_number": page_num, | |
| } | |
| ) | |
| return result | |