""" HierarchicalChunker.py A module for hierarchical document chunking that combines page-level and semantic chunking. Features: - Multi-level document representation (pages and chunks) - Semantic chunking with sentence boundaries - Size and overlap controls - Hierarchical metadata """ import logging import spacy from typing import Dict, List, Optional, Any from langchain_core.documents import Document from core.PageChunker import PageChunker logger = logging.getLogger(__name__) class HierarchicalChunker(PageChunker): """Handles document chunking at multiple hierarchical levels.""" def __init__( self, model_name: Optional[str] = None, embedding_model: Optional[Any] = None, chunk_size: int = 500, chunk_overlap: int = 50, similarity_threshold: float = 0.85 ): """ Initialize hierarchical chunker with specified models and parameters. Args: model_name: Name of the model for tokenization embedding_model: Model for generating embeddings chunk_size: Maximum size of semantic chunks chunk_overlap: Overlap between chunks similarity_threshold: Similarity threshold for merging chunks """ super().__init__(model_name, embedding_model) self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.similarity_threshold = similarity_threshold # Initialize spaCy for NLP tasks try: self.nlp = spacy.load("en_core_web_sm") except OSError: logger.info("Installing spaCy model...") import subprocess subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"], capture_output=True) self.nlp = spacy.load("en_core_web_sm") def _create_semantic_chunks(self, content: str, page_number: int) -> List[Document]: """ Create semantic chunks with detailed metadata. Args: content: The page content to chunk page_number: The page number Returns: List of Document objects representing semantic chunks """ if not content.strip(): return [] sentences = list(self.nlp(content).sents) chunks = [] current_chunk = [] current_length = 0 for sent in sentences: sent_text = sent.text.strip() sent_length = len(sent_text) if current_length + sent_length > self.chunk_size: if current_chunk: chunk_text = " ".join(current_chunk) stats = self.analyze_text(chunk_text) chunks.append(Document( page_content=chunk_text, metadata={ "level": "chunk", "page_num": page_number, "chunk_num": len(chunks) + 1, "parent_page": page_number, "char_count": stats["char_count"], "token_count": stats["token_count"], "sentence_count": stats["sentence_count"], "word_count": stats["word_count"], "has_ocr": stats.get("has_content", "true") } )) current_chunk = [sent_text] current_length = sent_length else: current_chunk.append(sent_text) current_length += sent_length # Handle final chunk if current_chunk: chunk_text = " ".join(current_chunk) stats = self.analyze_text(chunk_text) chunks.append(Document( page_content=chunk_text, metadata={ "level": "chunk", "page_num": page_number, "chunk_num": len(chunks) + 1, "parent_page": page_number, "char_count": stats["char_count"], "token_count": stats["token_count"], "sentence_count": stats["sentence_count"], "word_count": stats["word_count"], "has_ocr": stats.get("has_content", "true") } )) self.page_stats.append(f"Created {len(chunks)} chunks for page {page_number}") return chunks def hierarchical_process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]: """ Process document with hierarchical chunking strategy. Args: file_path: Path to the PDF file preprocess: Whether to preprocess text Returns: Dictionary with 'pages' and 'chunks' lists of Documents """ self.page_stats = [] # Reset stats # First get the page-level documents using PageChunker page_docs = super().page_process_document(file_path, preprocess) # Now create chunk-level documents chunk_docs = [] total_chunks = 0 for page_doc in page_docs: page_num = page_doc.metadata["page"] # Mark this as a page-level document page_doc.metadata["level"] = "page" # Create chunks for this page page_chunks = self._create_semantic_chunks( page_doc.page_content, page_num ) chunk_docs.extend(page_chunks) total_chunks += len(page_chunks) # Log summary information logger.info(f"\nHierarchical Processing Summary:") logger.info(f"Total Pages: {len(page_docs)}") logger.info(f"Total Chunks: {total_chunks}") logger.info("\n".join(self.page_stats)) return { "pages": page_docs, "chunks": chunk_docs } def process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]: """ Process document using hierarchical chunking strategy (implements abstract method). Args: file_path: Path to the PDF file preprocess: Whether to preprocess text Returns: Dictionary with 'pages' and 'chunks' lists of Documents """ return self.hierarchical_process_document(file_path, preprocess)