File size: 4,367 Bytes
c0f31c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""

PageChunker.py



A module for page-level document chunking with token counting and preprocessing.



Features:

- Page-based document splitting

- Content validation

- Blank page detection

- Document metadata enrichment

"""

from typing import List, Optional
import logging
from langchain_core.documents import Document
from core.BaseChunker import BaseChunker

logger = logging.getLogger(__name__)

class PageChunker(BaseChunker):
    """Chunk PDF documents at the page level.

    Each non-blank page of the source document becomes one ``Document``
    whose metadata carries per-page statistics (character, token,
    sentence, and word counts) produced by ``BaseChunker.analyze_text``.
    """

    # Maps '\n', '\r', '\t' to deletion in a single C-level pass.
    _WHITESPACE_TABLE = str.maketrans('', '', '\n\r\t')

    def __init__(self, model_name: Optional[str] = None, embedding_model=None):
        """
        Initialize page chunker with specified models.

        Args:
            model_name: Name of the model for tokenization.
            embedding_model: Model for generating embeddings.
        """
        super().__init__(model_name, embedding_model)
        # Human-readable notes about skipped (blank) pages; reset per document.
        self.page_stats: List[str] = []

    def _is_blank_page(self, text: str) -> bool:
        """Check if page is blank or contains only whitespace/special characters.

        A page counts as blank when, after trimming and removing newline,
        carriage-return, and tab characters, fewer than ``BLANK_THRESHOLD``
        characters remain (``BLANK_THRESHOLD`` is defined on ``BaseChunker``).
        """
        cleaned_text = text.strip().translate(self._WHITESPACE_TABLE)
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_single_page(self, content: str, page_number: int, preprocess: bool) -> Optional[Document]:
        """
        Process a single page with optional preprocessing and analysis.

        Args:
            content: The page content.
            page_number: The 1-based page number.
            preprocess: Whether to preprocess the text.

        Returns:
            Document object with processed content and metadata, or None if
            the page is blank (a note is appended to ``self.page_stats``).
        """
        if self._is_blank_page(content):
            self.page_stats.append(f"Page {page_number} is blank.")
            return None

        # Optionally preprocess the text
        if preprocess:
            content = self.preprocess_text(content)

        # Analyze the page and generate metadata
        stats = self.analyze_text(content)

        metadata = {
            "page": page_number,
            "char_count": stats["char_count"],
            "token_count": stats["token_count"],
            "sentence_count": stats["sentence_count"],
            "word_count": stats["word_count"],
            # Stored as strings for metadata-store compatibility.
            "has_ocr": str(stats.get("has_content", True)),
            "is_blank": "false"
        }

        return Document(page_content=content, metadata=metadata)

    def page_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process PDF document page by page with analysis and optional preprocessing.

        Args:
            file_path: Path to the PDF file.
            preprocess: Whether to preprocess page text.

        Returns:
            List of Document objects, one per non-blank page.

        Raises:
            Exception: Re-raises any error from loading or page processing
            after logging it.
        """
        try:
            self.page_stats = []  # Reset stats for this document
            raw_pages = self.load_document(file_path)
            processed_pages = []

            logger.info("Processing document with %d pages", len(raw_pages))

            # Pages are numbered from 1 to match human/PDF-viewer conventions.
            for page_number, page in enumerate(raw_pages, start=1):
                processed_page = self._process_single_page(page.page_content, page_number, preprocess)
                if processed_page is not None:
                    processed_pages.append(processed_page)

            # Output skipped pages for transparency
            if self.page_stats:
                logger.info("\n".join(self.page_stats))

            logger.info("Processed %d non-blank pages", len(processed_pages))
            return processed_pages

        except Exception:
            # logger.exception records the traceback alongside the message.
            logger.exception("Error in page_process_document")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using page chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file.
            preprocess: Whether to preprocess page text.

        Returns:
            List of Document objects, one per non-blank page.
        """
        return self.page_process_document(file_path, preprocess)