Spaces:
Sleeping
Sleeping
| """ | |
| HierarchicalChunker.py | |
| A module for hierarchical document chunking that combines page-level and semantic chunking. | |
| Features: | |
| - Multi-level document representation (pages and chunks) | |
| - Semantic chunking with sentence boundaries | |
| - Size and overlap controls | |
| - Hierarchical metadata | |
| """ | |
| import logging | |
| import spacy | |
| from typing import Dict, List, Optional, Any | |
| from langchain_core.documents import Document | |
| from core.PageChunker import PageChunker | |
| logger = logging.getLogger(__name__) | |
class HierarchicalChunker(PageChunker):
    """Chunk a document at two levels: whole pages and sentence-aligned chunks.

    Page-level documents are produced by ``PageChunker.page_process_document``;
    each page is then split on spaCy sentence boundaries into chunks of at most
    ``chunk_size`` characters, optionally repeating trailing sentences of one
    chunk at the start of the next (``chunk_overlap``).
    """

    # spaCy pipeline used for sentence segmentation.
    _SPACY_MODEL = "en_core_web_sm"

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        similarity_threshold: float = 0.85
    ):
        """
        Initialize hierarchical chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization (forwarded to
                PageChunker).
            embedding_model: Model for generating embeddings (forwarded to
                PageChunker).
            chunk_size: Maximum size of a semantic chunk, in characters.
            chunk_overlap: Maximum number of characters of trailing whole
                sentences repeated at the start of the next chunk. Values
                <= 0 disable overlap.
            similarity_threshold: Similarity threshold for merging chunks.
                Stored on the instance; no method of this class reads it.

        Raises:
            OSError: If the spaCy model cannot be loaded even after the
                download attempt.
        """
        super().__init__(model_name, embedding_model)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold
        # Initialize spaCy for NLP tasks
        self.nlp = self._load_spacy_model()

    def _load_spacy_model(self):
        """Load the spaCy pipeline, downloading it once if it is missing."""
        try:
            return spacy.load(self._SPACY_MODEL)
        except OSError:
            logger.info("Installing spaCy model...")
            import subprocess
            import sys

            # Use the running interpreter rather than whatever "python" is on
            # PATH, so the model lands in the environment that imports spaCy.
            result = subprocess.run(
                [sys.executable, "-m", "spacy", "download", self._SPACY_MODEL],
                capture_output=True,
                text=True,
            )
            if result.returncode != 0:
                # Surface the reason; the load below will then raise OSError,
                # which is the same exception type callers saw before.
                logger.error(
                    "spaCy model download failed (exit code %s): %s",
                    result.returncode,
                    (result.stderr or "").strip(),
                )
            return spacy.load(self._SPACY_MODEL)

    @staticmethod
    def _overlap_tail(sentences: List[str], budget: int) -> List[str]:
        """Return the trailing sentences whose space-joined length fits in budget."""
        tail: List[str] = []
        tail_length = 0
        for sentence in reversed(sentences):
            extra = len(sentence) + (1 if tail else 0)
            if tail_length + extra > budget:
                break
            tail.append(sentence)
            tail_length += extra
        tail.reverse()
        return tail

    def _build_chunk_document(
        self, chunk_text: str, page_number: int, chunk_number: int
    ) -> Document:
        """Wrap chunk text in a Document carrying chunk-level metadata."""
        stats = self.analyze_text(chunk_text)
        return Document(
            page_content=chunk_text,
            metadata={
                "level": "chunk",
                "page_num": page_number,
                "chunk_num": chunk_number,
                "parent_page": page_number,
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                # NOTE(review): "has_ocr" is filled from the "has_content"
                # stat and defaults to the string "true"; confirm this
                # mapping against analyze_text.
                "has_ocr": stats.get("has_content", "true")
            }
        )

    def _create_semantic_chunks(self, content: str, page_number: int) -> List[Document]:
        """
        Create semantic chunks with detailed metadata.

        Sentences are packed greedily: a chunk is closed as soon as adding the
        next sentence (plus its joining space) would exceed ``chunk_size``.
        A single sentence longer than ``chunk_size`` is kept whole as its own
        chunk rather than being cut mid-sentence.

        Args:
            content: The page content to chunk
            page_number: The page number

        Returns:
            List of Document objects representing semantic chunks
        """
        if not content.strip():
            return []

        chunks: List[Document] = []
        pending: List[str] = []  # sentences of the chunk being assembled
        pending_length = 0       # len(" ".join(pending)), kept incrementally

        for sent in self.nlp(content).sents:
            sent_text = sent.text.strip()
            if not sent_text:
                # Whitespace-only "sentences" add nothing but stray spaces.
                continue

            needed = len(sent_text) + (1 if pending else 0)
            if pending and pending_length + needed > self.chunk_size:
                chunks.append(self._build_chunk_document(
                    " ".join(pending), page_number, len(chunks) + 1
                ))
                # Carry trailing sentences into the next chunk, but never more
                # than still fits next to the incoming sentence.
                budget = min(
                    self.chunk_overlap,
                    self.chunk_size - len(sent_text) - 1,
                )
                pending = self._overlap_tail(pending, budget)
                pending_length = sum(len(s) for s in pending) + max(len(pending) - 1, 0)
                needed = len(sent_text) + (1 if pending else 0)

            pending.append(sent_text)
            pending_length += needed

        # Handle final chunk
        if pending:
            chunks.append(self._build_chunk_document(
                " ".join(pending), page_number, len(chunks) + 1
            ))

        self.page_stats.append(f"Created {len(chunks)} chunks for page {page_number}")
        return chunks

    def hierarchical_process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document with hierarchical chunking strategy.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        self.page_stats = []  # Reset stats

        # First get the page-level documents using PageChunker
        page_docs = super().page_process_document(file_path, preprocess)

        # Now create chunk-level documents
        chunk_docs: List[Document] = []
        for page_doc in page_docs:
            # Mark this as a page-level document
            page_doc.metadata["level"] = "page"
            # Create chunks for this page
            chunk_docs.extend(
                self._create_semantic_chunks(
                    page_doc.page_content,
                    page_doc.metadata["page"],
                )
            )

        # Log summary information
        logger.info("\nHierarchical Processing Summary:")
        logger.info("Total Pages: %s", len(page_docs))
        logger.info("Total Chunks: %s", len(chunk_docs))
        logger.info("\n".join(self.page_stats))

        return {
            "pages": page_docs,
            "chunks": chunk_docs
        }

    def process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document using hierarchical chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        return self.hierarchical_process_document(file_path, preprocess)