FinSightAI / backend /utils /chunker.py
Aniket2003333333's picture
start
7248d39
Raw
History Blame Contribute Delete
3.61 kB
"""Finance-aware text chunking."""
import re
from typing import Dict, List
from config import settings
from utils.text_splitter import split_text
# Case-insensitive regexes for recognising well-known financial-report
# sections. FinanceAwareChunker._detect_section tries them in list order and
# uses the text matched by the first pattern that hits as the section label,
# so order expresses priority.
FINANCE_SECTION_PATTERNS = [
    r"(?i)(income\s+statement|profit\s+and\s+loss)",
    r"(?i)(balance\s+sheet|statement\s+of\s+financial\s+position)",
    r"(?i)(cash\s+flow\s+statement)",
    # The abbreviation is anchored with word boundaries; a bare "eps" would
    # also match inside ordinary words such as "steps" or "keeps".
    r"(?i)(earnings\s+per\s+share|\beps\b)",
    r"(?i)(management\s+discussion|md&a)",
    r"(?i)(risk\s+factors)",
    r"(?i)(notes\s+to\s+(financial\s+)?statements)",
    r"(?i)(quarterly\s+results|annual\s+results)",
]
# Deliberately narrow, so that Modal chart OCR is not fired on every page that
# merely mentions "chart". A page qualifies only if it matches one of the
# alternatives below (case-insensitively).
_CHART_PAGE_REGEX = (
    r"(?i)(?:"
    r"\b(?:figure|fig\.)\s*\d+\b"  # numbered figure reference: "Figure 3", "fig. 12"
    r"|\b(?:bar|line|pie|area)\s+(?:chart|graph)\b"  # named chart type: "pie chart"
    r"|\bchart\s+(?:shows|below|above|illustrates|depicts)\b"  # "chart shows ..."
    r")"
)
CHART_PAGE_PATTERNS = re.compile(_CHART_PAGE_REGEX)
class FinanceAwareChunker:
def __init__(
self,
chunk_size: int | None = None,
chunk_overlap: int | None = None,
):
self.chunk_size = chunk_size or settings.CHUNK_SIZE
self.chunk_overlap = chunk_overlap or settings.CHUNK_OVERLAP
self.separators = ["\n\n", "\n", ". ", ", ", " ", ""]
def _split(self, text: str) -> List[str]:
return split_text(
text,
chunk_size=self.chunk_size,
chunk_overlap=self.chunk_overlap,
separators=self.separators,
)
def _detect_section(self, text: str) -> str:
for pattern in FINANCE_SECTION_PATTERNS:
match = re.search(pattern, text)
if match:
return match.group(0).strip()
return "general"
@staticmethod
def looks_like_chart(text: str) -> bool:
return FinanceAwareChunker.should_extract_chart(text)
@staticmethod
def should_extract_chart(text: str, max_text_chars: int | None = None) -> bool:
"""True only for short, chart-like pages — skips dense text pages."""
from config import settings
limit = max_text_chars if max_text_chars is not None else settings.CHART_OCR_MAX_TEXT_CHARS
stripped = text.strip()
if len(stripped) > limit:
return False
return bool(CHART_PAGE_PATTERNS.search(stripped))
def _split_preserving_tables(self, text: str) -> List[str]:
"""Split text but keep markdown table blocks intact."""
table_pattern = re.compile(r"(\|[^\n]+\|\n(?:\|[^\n]+\|\n?)+)", re.MULTILINE)
parts = []
last_end = 0
for match in table_pattern.finditer(text):
before = text[last_end : match.start()]
if before.strip():
parts.extend(self._split(before))
parts.append(match.group(0))
last_end = match.end()
remaining = text[last_end:]
if remaining.strip():
parts.extend(self._split(remaining))
return parts if parts else self._split(text)
def chunk(
self,
text: str,
page_num: int = 0,
source: str = "embedded",
section_override: str | None = None,
) -> List[Dict]:
chunks = self._split_preserving_tables(text)
result = []
for i, chunk_text in enumerate(chunks):
section = section_override or self._detect_section(chunk_text)
result.append(
{
"text": chunk_text,
"chunk_index": i,
"section": section,
"source": source,
"page_number": page_num,
}
)
return result