"""
Text chunking module
Intelligently splits legal documents into meaningful chunks
"""
import logging
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from .config import (
    CHUNK_SIZE_MIN_WORDS,
    CHUNK_SIZE_MAX_WORDS,
    CHUNK_SIZE_TARGET_WORDS,
    CHUNK_OVERLAP_WORDS,
    COMPILED_SECTION_PATTERNS
)
from .models import DocumentChunk, ChunkMetadata
logger = logging.getLogger(__name__)
class LegalDocumentChunker:
    """Chunks legal documents with section/article awareness.

    The document is first split along detected section/article headings
    (matched by COMPILED_SECTION_PATTERNS), then each section is split
    into word-count-bounded chunks. Consecutive chunks within a section
    overlap by up to ``overlap_words`` words, and the splitter prefers
    to end a chunk at a sentence boundary when one falls in the second
    half of the candidate chunk.
    """

    def __init__(
        self,
        min_words: int = CHUNK_SIZE_MIN_WORDS,
        max_words: int = CHUNK_SIZE_MAX_WORDS,
        target_words: int = CHUNK_SIZE_TARGET_WORDS,
        overlap_words: int = CHUNK_OVERLAP_WORDS
    ):
        """
        Initialize chunker.

        Args:
            min_words: Minimum words per chunk.
                NOTE(review): stored but never consulted by the chunking
                logic below — undersized trailing chunks are not merged.
                Confirm whether a merge step is still intended.
            max_words: Maximum words per chunk; sections at or under this
                size are kept as a single chunk.
            target_words: Target words per chunk when a section must be split.
            overlap_words: Words to overlap between consecutive chunks.
        """
        self.min_words = min_words
        self.max_words = max_words
        self.target_words = target_words
        self.overlap_words = overlap_words

    def chunk_document(
        self,
        text: str,
        source_file: str,
        pages_data: Optional[List[Dict[str, Any]]] = None
    ) -> List[DocumentChunk]:
        """
        Chunk a document into meaningful pieces.

        Args:
            text: Full document text.
            source_file: Source filename; its stem becomes the chunk-id prefix.
            pages_data: Optional page data for page number tracking.
                NOTE(review): accepted but not used anywhere in this class —
                confirm whether page tracking is still planned.

        Returns:
            List of DocumentChunk objects, numbered sequentially across
            all sections of the document.
        """
        logger.info("Chunking document: %s", source_file)

        # Split along detected section/article headings first so that chunk
        # boundaries respect the document's structure where possible.
        sections = self._split_by_sections(text)

        all_chunks: List[DocumentChunk] = []
        chunk_counter = 0
        for section_title, section_text in sections:
            section_chunks = self._chunk_section(
                section_text,
                section_title,
                source_file,
                chunk_counter
            )
            all_chunks.extend(section_chunks)
            # Keep chunk ids globally sequential across sections.
            chunk_counter += len(section_chunks)

        logger.info("Created %d chunks from %s", len(all_chunks), source_file)
        return all_chunks

    def _split_by_sections(self, text: str) -> List[Tuple[Optional[str], str]]:
        """
        Split text by sections/articles.

        Returns:
            List of (section_title, section_text) tuples. The title is None
            for any leading text before the first detected heading (and for
            the whole document when no headings are detected). The heading
            line itself is kept at the start of its section's text.
        """
        sections: List[Tuple[Optional[str], str]] = []
        current_section: Optional[str] = None
        current_text: List[str] = []

        for line in text.split('\n'):
            section_match = self._detect_section(line)
            if section_match:
                # Flush the previous section if it accumulated any lines.
                if current_text:
                    sections.append((current_section, '\n'.join(current_text)))
                # Start a new section; the heading line stays in the body.
                current_section = section_match
                current_text = [line]
            else:
                current_text.append(line)

        # Flush the final section.
        if current_text:
            sections.append((current_section, '\n'.join(current_text)))

        # No headings detected: treat the entire document as one section.
        if len(sections) == 0:
            sections.append((None, text))

        logger.info("Detected %d sections in document", len(sections))
        return sections

    def _detect_section(self, line: str) -> Optional[str]:
        """
        Detect if a line contains a section/article marker.

        Returns:
            Section title if detected, None otherwise.
        """
        for pattern in COMPILED_SECTION_PATTERNS:
            match = pattern.search(line)
            if match:
                if len(match.groups()) >= 2:
                    # Pattern captured both a number and a title, e.g.
                    # "11. Citizenship:" -> "11. Citizenship".
                    return f"{match.group(1)}. {match.group(2)}"
                # Pattern captured just an identifier; return the full match.
                return match.group(0)
        return None

    def _chunk_section(
        self,
        section_text: str,
        section_title: Optional[str],
        source_file: str,
        start_counter: int
    ) -> List[DocumentChunk]:
        """
        Chunk a single section into appropriate sizes.

        Args:
            section_text: Text of the section.
            section_title: Title/identifier of the section (None if unknown).
            source_file: Source filename.
            start_counter: Starting chunk number for this section's ids.

        Returns:
            List of chunks for this section.
        """
        words = section_text.split()
        word_count = len(words)

        # Small sections are kept whole so headings stay with their bodies.
        if word_count <= self.max_words:
            chunk = self._create_chunk(
                text=section_text,
                chunk_id=f"{Path(source_file).stem}_chunk_{start_counter:04d}",
                source_file=source_file,
                article_section=section_title
            )
            return [chunk]

        chunks: List[DocumentChunk] = []
        start_idx = 0
        chunk_num = start_counter
        # start_idx advances by at least one word per iteration, so
        # word_count iterations is a hard upper bound (infinite-loop guard).
        max_iterations = word_count
        iteration_count = 0
        while start_idx < word_count and iteration_count < max_iterations:
            iteration_count += 1
            end_idx = min(start_idx + self.target_words, word_count)

            # Defensive check: end_idx must move past start_idx.
            if end_idx <= start_idx:
                logger.warning(
                    "Chunking issue: end_idx (%d) <= start_idx (%d), breaking",
                    end_idx, start_idx
                )
                break

            # Prefer ending at a sentence boundary, but only when one lies
            # past the midpoint of the candidate chunk (avoids tiny chunks).
            if end_idx < word_count:
                candidate_text = ' '.join(words[start_idx:end_idx])
                last_period = max(
                    candidate_text.rfind('. '),
                    candidate_text.rfind('! '),
                    candidate_text.rfind('? ')
                )
                if last_period > len(candidate_text) * 0.5:
                    words_before_period = candidate_text[:last_period + 1].split()
                    new_end_idx = start_idx + len(words_before_period)
                    # Only accept the boundary if it still moves forward.
                    if new_end_idx > start_idx:
                        end_idx = new_end_idx

            chunk_text = ' '.join(words[start_idx:end_idx])
            chunk = self._create_chunk(
                text=chunk_text,
                chunk_id=f"{Path(source_file).stem}_chunk_{chunk_num:04d}",
                source_file=source_file,
                article_section=section_title
            )
            chunks.append(chunk)

            # Step to the next chunk, overlapping this chunk's tail; cap the
            # overlap so we always advance by at least one word.
            overlap = min(self.overlap_words, end_idx - start_idx - 1)
            next_start_idx = end_idx - overlap
            if next_start_idx <= start_idx:
                next_start_idx = start_idx + 1
            start_idx = next_start_idx
            chunk_num += 1

        if iteration_count >= max_iterations:
            logger.warning("Hit max iterations (%d) while chunking section", max_iterations)

        return chunks

    def _create_chunk(
        self,
        text: str,
        chunk_id: str,
        source_file: str,
        article_section: Optional[str] = None
    ) -> DocumentChunk:
        """Create a DocumentChunk with word/char-count metadata for *text*."""
        metadata = ChunkMetadata(
            source_file=source_file,
            article_section=article_section,
            word_count=len(text.split()),
            char_count=len(text)
        )
        return DocumentChunk(
            chunk_id=chunk_id,
            text=text,
            metadata=metadata
        )