Spaces:

build-small-hackathon
/

FinSightAI

Sleeping

App Files Files Community

FinSightAI / backend /utils /chunker.py

Aniket2003333333

start

7248d39 18 days ago

Raw

History Blame Contribute Delete

3.61 kB

	"""Finance-aware text chunking."""

	import re
	from typing import Dict, List

	from config import settings
	from utils.text_splitter import split_text

	# Case-insensitive regexes for well-known financial-report section headings.
	# Order matters: FinanceAwareChunker._detect_section tries them top to bottom
	# and labels a chunk with the text matched by the first pattern that hits.
	FINANCE_SECTION_PATTERNS = [
	    r"(?i)(income\s+statement|profit\s+and\s+loss)",
	    r"(?i)(balance\s+sheet|statement\s+of\s+financial\s+position)",
	    r"(?i)(cash\s+flow\s+statement)",
	    # Word boundaries keep "eps" from matching inside words such as "steps".
	    r"(?i)(earnings\s+per\s+share|\beps\b)",
	    r"(?i)(management\s+discussion|md&a)",
	    r"(?i)(risk\s+factors)",
	    r"(?i)(notes\s+to\s+(financial\s+)?statements)",
	    r"(?i)(quarterly\s+results|annual\s+results)",
	]

	# Narrow patterns — avoid firing Modal chart OCR on every page that mentions "chart".
	# A page matches when it has a numbered figure reference ("Figure 3", "Fig. 2"),
	# a named chart type ("bar chart", "line graph"), or a phrase that points at a
	# chart ("chart shows", "chart below", ...).
	CHART_PAGE_PATTERNS = re.compile(
	    r"(?i)(?:"
	    r"\b(?:figure|fig\.)\s*\d+\b"
	    r"|\b(?:bar|line|pie|area)\s+(?:chart|graph)\b"
	    r"|\bchart\s+(?:shows|below|above|illustrates|depicts)\b"
	    r")",
	)


	class FinanceAwareChunker:
	def __init__(
	self,
	chunk_size: int \| None = None,
	chunk_overlap: int \| None = None,
	):
	self.chunk_size = chunk_size or settings.CHUNK_SIZE
	self.chunk_overlap = chunk_overlap or settings.CHUNK_OVERLAP
	self.separators = ["\n\n", "\n", ". ", ", ", " ", ""]

	def _split(self, text: str) -> List[str]:
	return split_text(
	text,
	chunk_size=self.chunk_size,
	chunk_overlap=self.chunk_overlap,
	separators=self.separators,
	)

	def _detect_section(self, text: str) -> str:
	for pattern in FINANCE_SECTION_PATTERNS:
	match = re.search(pattern, text)
	if match:
	return match.group(0).strip()
	return "general"

	@staticmethod
	def looks_like_chart(text: str) -> bool:
	return FinanceAwareChunker.should_extract_chart(text)

	@staticmethod
	def should_extract_chart(text: str, max_text_chars: int \| None = None) -> bool:
	"""True only for short, chart-like pages — skips dense text pages."""
	from config import settings

	limit = max_text_chars if max_text_chars is not None else settings.CHART_OCR_MAX_TEXT_CHARS
	stripped = text.strip()
	if len(stripped) > limit:
	return False
	return bool(CHART_PAGE_PATTERNS.search(stripped))

	def _split_preserving_tables(self, text: str) -> List[str]:
	"""Split text but keep markdown table blocks intact."""
	table_pattern = re.compile(r"(\\|[^\n]+\\|\n(?:\\|[^\n]+\\|\n?)+)", re.MULTILINE)
	parts = []
	last_end = 0
	for match in table_pattern.finditer(text):
	before = text[last_end : match.start()]
	if before.strip():
	parts.extend(self._split(before))
	parts.append(match.group(0))
	last_end = match.end()
	remaining = text[last_end:]
	if remaining.strip():
	parts.extend(self._split(remaining))
	return parts if parts else self._split(text)

	def chunk(
	self,
	text: str,
	page_num: int = 0,
	source: str = "embedded",
	section_override: str \| None = None,
	) -> List[Dict]:
	chunks = self._split_preserving_tables(text)
	result = []
	for i, chunk_text in enumerate(chunks):
	section = section_override or self._detect_section(chunk_text)
	result.append(
	{
	"text": chunk_text,
	"chunk_index": i,
	"section": section,
	"source": source,
	"page_number": page_num,
	}
	)
	return result