"""
Text chunking module
Intelligently splits legal documents into meaningful chunks
"""

import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from .config import (
    CHUNK_SIZE_MIN_WORDS,
    CHUNK_SIZE_MAX_WORDS,
    CHUNK_SIZE_TARGET_WORDS,
    CHUNK_OVERLAP_WORDS,
    COMPILED_SECTION_PATTERNS
)
from .models import DocumentChunk, ChunkMetadata

logger = logging.getLogger(__name__)


class LegalDocumentChunker:
    """Chunks legal documents with section/article awareness.

    The text is first cut at lines matching one of
    ``COMPILED_SECTION_PATTERNS``. A section of at most ``max_words`` words
    becomes a single chunk; a longer section is cut into overlapping windows
    of up to ``target_words`` words, preferring to end each window on a
    sentence boundary.
    """

    def __init__(
        self,
        min_words: int = CHUNK_SIZE_MIN_WORDS,
        max_words: int = CHUNK_SIZE_MAX_WORDS,
        target_words: int = CHUNK_SIZE_TARGET_WORDS,
        overlap_words: int = CHUNK_OVERLAP_WORDS
    ):
        """
        Initialize chunker

        Args:
            min_words: Minimum words per chunk. Stored on the instance; the
                methods of this class do not currently enforce it.
            max_words: Largest section (in words) that is kept as a single
                chunk instead of being split into windows.
            target_words: Window size, in words, used when a section has to
                be split.
            overlap_words: Words shared between consecutive windows of a
                split section.
        """
        self.min_words = min_words
        self.max_words = max_words
        self.target_words = target_words
        self.overlap_words = overlap_words

    def chunk_document(
        self,
        text: str,
        source_file: str,
        pages_data: Optional[List[Dict[str, Any]]] = None
    ) -> List[DocumentChunk]:
        """
        Chunk a document into meaningful pieces

        Args:
            text: Full document text
            source_file: Source filename; its stem prefixes every chunk id
            pages_data: Optional page data. Accepted for interface
                compatibility; this method does not currently read it.

        Returns:
            List of DocumentChunk objects, numbered consecutively across
            sections. Empty when the text contains no words.
        """
        logger.info("Chunking document: %s", source_file)

        all_chunks: List[DocumentChunk] = []
        for section_title, section_text in self._split_by_sections(text):
            # The running chunk count doubles as the next chunk number, so
            # ids stay contiguous across section boundaries.
            all_chunks.extend(
                self._chunk_section(
                    section_text,
                    section_title,
                    source_file,
                    len(all_chunks)
                )
            )

        logger.info("Created %d chunks from %s", len(all_chunks), source_file)

        return all_chunks

    def _split_by_sections(self, text: str) -> List[Tuple[Optional[str], str]]:
        """
        Split text by sections/articles

        A new section starts at every line for which ``_detect_section``
        returns a title; that heading line is kept as the first line of the
        section text. Text before the first heading is returned with a title
        of ``None``. Segments containing only whitespace are dropped so they
        cannot turn into empty chunks.

        Returns:
            List of (section_title, section_text) tuples. If nothing
            qualifies, a single ``(None, text)`` entry is returned.
        """
        sections: List[Tuple[Optional[str], str]] = []
        current_title: Optional[str] = None
        current_lines: List[str] = []

        for line in text.split('\n'):
            title = self._detect_section(line)
            if title:
                # Close the segment collected so far, unless it is blank.
                segment = '\n'.join(current_lines)
                if segment.strip():
                    sections.append((current_title, segment))
                current_title = title
                # Include the section header line in the text
                current_lines = [line]
            else:
                current_lines.append(line)

        # Add final section
        segment = '\n'.join(current_lines)
        if segment.strip():
            sections.append((current_title, segment))

        # Nothing usable found (e.g. blank input): fall back to the raw text
        if not sections:
            sections.append((None, text))

        logger.info("Detected %d sections in document", len(sections))

        return sections

    def _detect_section(self, line: str) -> Optional[str]:
        """
        Detect if a line contains a section/article marker

        Patterns from ``COMPILED_SECTION_PATTERNS`` are tried in order and
        the first one that matches anywhere in the line wins.

        Returns:
            ``"<group 1>. <group 2>"`` when the matching pattern captured
            both of its first two groups, otherwise the full matched text;
            ``None`` when no pattern matches.
        """
        for pattern in COMPILED_SECTION_PATTERNS:
            match = pattern.search(line)
            if not match:
                continue

            groups = match.groups()
            # groups() has one entry per group in the pattern, with None for
            # groups that did not take part in the match; formatting those
            # would yield titles such as "11. None".
            if len(groups) >= 2 and groups[0] is not None and groups[1] is not None:
                return f"{groups[0]}. {groups[1]}"
            return match.group(0)

        return None

    def _chunk_section(
        self,
        section_text: str,
        section_title: Optional[str],
        source_file: str,
        start_counter: int
    ) -> List[DocumentChunk]:
        """
        Chunk a single section into appropriate sizes

        Args:
            section_text: Text of the section
            section_title: Title/identifier of the section
            source_file: Source filename
            start_counter: Starting chunk number

        Returns:
            List of chunks for this section. Empty when the section has no
            words. A section of at most ``max_words`` words is returned as
            one chunk with its original text; longer sections are returned
            as overlapping windows whose words are re-joined with single
            spaces.
        """
        words = section_text.split()
        word_count = len(words)

        # A blank section carries no content worth indexing.
        if word_count == 0:
            return []

        id_prefix = f"{Path(source_file).stem}_chunk_"

        # If section is small enough, keep as single chunk
        if word_count <= self.max_words:
            return [
                self._create_chunk(
                    text=section_text,
                    chunk_id=f"{id_prefix}{start_counter:04d}",
                    source_file=source_file,
                    article_section=section_title
                )
            ]

        # Otherwise, split into multiple overlapping windows
        chunks: List[DocumentChunk] = []
        # A negative overlap would skip words between windows.
        overlap_limit = max(0, self.overlap_words)
        start_idx = 0
        chunk_num = start_counter

        while start_idx < word_count:
            end_idx = min(start_idx + self.target_words, word_count)

            # Only reachable with a non-positive target_words; bail out
            # rather than loop without making progress.
            if end_idx <= start_idx:
                logger.warning(
                    "Chunking issue: end_idx (%s) <= start_idx (%s), breaking",
                    end_idx,
                    start_idx
                )
                break

            # Prefer to end on a sentence boundary, except for the last
            # window, which always runs to the end of the section.
            if end_idx < word_count:
                end_idx = start_idx + self._sentence_break(words[start_idx:end_idx])

            chunks.append(
                self._create_chunk(
                    text=' '.join(words[start_idx:end_idx]),
                    chunk_id=f"{id_prefix}{chunk_num:04d}",
                    source_file=source_file,
                    article_section=section_title
                )
            )
            chunk_num += 1

            # Stop once the section is fully covered. Stepping back by the
            # overlap here would only re-emit ever-shorter copies of the tail.
            if end_idx >= word_count:
                break

            # Step back by the overlap, but always advance by at least one
            # word relative to the window just emitted.
            start_idx = end_idx - min(overlap_limit, end_idx - start_idx - 1)

        return chunks

    @staticmethod
    def _sentence_break(window_words: List[str]) -> int:
        """
        Return how many leading words of the window to keep

        Looks for the last ". ", "! " or "? " in the space-joined window. If
        it lies past the midpoint of the window (measured in characters), the
        window is cut right after that sentence; otherwise the whole window
        is kept.
        """
        window_text = ' '.join(window_words)
        last_end = max(window_text.rfind(mark) for mark in ('. ', '! ', '? '))

        if last_end > len(window_text) * 0.5:  # At least 50% through
            # +1 keeps the punctuation mark attached to its word.
            return len(window_text[:last_end + 1].split())

        return len(window_words)

    def _create_chunk(
        self,
        text: str,
        chunk_id: str,
        source_file: str,
        article_section: Optional[str] = None
    ) -> DocumentChunk:
        """
        Create a DocumentChunk object

        The metadata records the source file, the section title, the
        whitespace-separated word count and the character length of ``text``.
        """
        metadata = ChunkMetadata(
            source_file=source_file,
            article_section=article_section,
            word_count=len(text.split()),
            char_count=len(text)
        )

        return DocumentChunk(
            chunk_id=chunk_id,
            text=text,
            metadata=metadata
        )