"""
processing.chunker — Paragraph-aware text chunker with overlap.

Splits text into ~300-word chunks, preferring paragraph boundaries.
Falls back to word-level splitting for very long paragraphs.
Maintains configurable word overlap between adjacent chunks
to preserve context continuity for embeddings.
"""

import re
from typing import List


def chunk_text(
    text: str,
    max_words: int = 300,
    overlap: int = 50,
    min_chunk_words: int = 15,
) -> List[str]:
    """
    Split text into chunks of approximately `max_words`, aligned
    to paragraph boundaries where possible.

    Paragraphs (separated by one or more blank lines) are packed into the
    current chunk while they fit. When the next paragraph does not fit, the
    current chunk is emitted and its last `overlap` words seed the next
    chunk, so a packed chunk may hold up to `max_words + overlap` words.
    A single paragraph longer than `max_words` is cut with a sliding window
    of `max_words` words advancing by `max_words - overlap`; overlap is not
    carried into or out of such a paragraph.

    Whitespace inside a chunk is normalised: words are re-joined with a
    single space.

    Args:
        text:            Input text to chunk.
        max_words:       Target maximum words per chunk. Must be >= 1.
        overlap:         Words of overlap between consecutive chunks.
                         Clamped to the range [0, max_words - 1].
        min_chunk_words: Discard the trailing chunk if it is smaller than
                         this (counting any overlap words it carries). Only
                         the final, paragraph-packed chunk is subject to
                         this check, so a text shorter than this yields [].

    Returns:
        List of text chunks. Empty for empty or whitespace-only input.

    Raises:
        ValueError: If `max_words` is smaller than 1 (for non-empty text).
    """
    if not text or not text.strip():
        return []

    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words}")

    # A negative overlap would make the window stride larger than the window
    # (skipping words); an overlap >= max_words would never advance. Clamp so
    # the stride is always in [1, max_words].
    overlap = max(0, min(overlap, max_words - 1))
    step = max_words - overlap

    chunks: List[str] = []
    current_words: List[str] = []

    # Paragraphs are separated by one or more blank lines.
    for para in re.split(r"\n\n+", text):
        words = para.split()
        if not words:
            continue

        # Paragraph fits in the chunk being built: keep packing.
        if len(current_words) + len(words) <= max_words:
            current_words.extend(words)
            continue

        # It does not fit: emit what we have and keep the tail as overlap.
        if current_words:
            chunks.append(" ".join(current_words))
            current_words = current_words[-overlap:] if overlap else []

        if len(words) <= max_words:
            current_words.extend(words)
            continue

        # Oversized paragraph: slide a fixed window across it. Stop as soon
        # as the window reaches the end, otherwise the next window would
        # contain only words already present in the previous chunk.
        start = 0
        while True:
            chunks.append(" ".join(words[start : start + max_words]))
            if start + max_words >= len(words):
                break
            start += step

        # The window chunks stand alone; start the next chunk fresh.
        current_words = []

    # Flush remaining words if they form a meaningful chunk
    if current_words and len(current_words) >= min_chunk_words:
        chunks.append(" ".join(current_words))

    return chunks