"""Finance-aware text chunking."""

import re
from typing import Dict, List

from config import settings
from utils.text_splitter import split_text

# Case-insensitive patterns for recognising which financial-report section a
# piece of text belongs to. Order matters for callers that stop at the first
# pattern that matches.
FINANCE_SECTION_PATTERNS = [
    r"(?i)(income\s+statement|profit\s+and\s+loss)",
    r"(?i)(balance\s+sheet|statement\s+of\s+financial\s+position)",
    r"(?i)(cash\s+flow\s+statement)",
    # Word boundaries keep the "eps" abbreviation from matching inside
    # ordinary words such as "steps" or "keeps".
    r"(?i)(earnings\s+per\s+share|\beps\b)",
    r"(?i)(management\s+discussion|md&a)",
    r"(?i)(risk\s+factors)",
    r"(?i)(notes\s+to\s+(financial\s+)?statements)",
    r"(?i)(quarterly\s+results|annual\s+results)",
]

# Deliberately narrow: a page must contain a numbered figure reference, a named
# chart/graph type, or a "chart shows/below/..." phrase. A bare mention of
# "chart" is not enough, so Modal chart OCR is not fired on every such page.
CHART_PAGE_PATTERNS = re.compile(
    r"(?i)(?:"
    + "|".join(
        (
            r"\b(?:figure|fig\.)\s*\d+\b",  # "Figure 3", "fig. 12"
            r"\b(?:bar|line|pie|area)\s+(?:chart|graph)\b",  # "pie chart", "line graph"
            r"\bchart\s+(?:shows|below|above|illustrates|depicts)\b",  # "chart below"
        )
    )
    + r")"
)


class FinanceAwareChunker:
    def __init__(
        self,
        chunk_size: int | None = None,
        chunk_overlap: int | None = None,
    ):
        self.chunk_size = chunk_size or settings.CHUNK_SIZE
        self.chunk_overlap = chunk_overlap or settings.CHUNK_OVERLAP
        self.separators = ["\n\n", "\n", ". ", ", ", " ", ""]

    def _split(self, text: str) -> List[str]:
        return split_text(
            text,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=self.separators,
        )

    def _detect_section(self, text: str) -> str:
        for pattern in FINANCE_SECTION_PATTERNS:
            match = re.search(pattern, text)
            if match:
                return match.group(0).strip()
        return "general"

    @staticmethod
    def looks_like_chart(text: str) -> bool:
        return FinanceAwareChunker.should_extract_chart(text)

    @staticmethod
    def should_extract_chart(text: str, max_text_chars: int | None = None) -> bool:
        """True only for short, chart-like pages — skips dense text pages."""
        from config import settings

        limit = max_text_chars if max_text_chars is not None else settings.CHART_OCR_MAX_TEXT_CHARS
        stripped = text.strip()
        if len(stripped) > limit:
            return False
        return bool(CHART_PAGE_PATTERNS.search(stripped))

    def _split_preserving_tables(self, text: str) -> List[str]:
        """Split text but keep markdown table blocks intact."""
        table_pattern = re.compile(r"(\|[^\n]+\|\n(?:\|[^\n]+\|\n?)+)", re.MULTILINE)
        parts = []
        last_end = 0
        for match in table_pattern.finditer(text):
            before = text[last_end : match.start()]
            if before.strip():
                parts.extend(self._split(before))
            parts.append(match.group(0))
            last_end = match.end()
        remaining = text[last_end:]
        if remaining.strip():
            parts.extend(self._split(remaining))
        return parts if parts else self._split(text)

    def chunk(
        self,
        text: str,
        page_num: int = 0,
        source: str = "embedded",
        section_override: str | None = None,
    ) -> List[Dict]:
        chunks = self._split_preserving_tables(text)
        result = []
        for i, chunk_text in enumerate(chunks):
            section = section_override or self._detect_section(chunk_text)
            result.append(
                {
                    "text": chunk_text,
                    "chunk_index": i,
                    "section": section,
                    "source": source,
                    "page_number": page_num,
                }
            )
        return result