"""
HierarchicalChunker.py
A module for hierarchical document chunking that combines page-level and semantic chunking.
Features:
- Multi-level document representation (pages and chunks)
- Semantic chunking with sentence boundaries
- Size and overlap controls
- Hierarchical metadata
"""
import logging
import spacy
from typing import Dict, List, Optional, Any
from langchain_core.documents import Document
from core.PageChunker import PageChunker
# Module-level logger, named after this module so log records can be filtered per module.
logger = logging.getLogger(__name__)
class HierarchicalChunker(PageChunker):
    """Chunk a document at two levels: whole pages and sentence-aligned chunks.

    Page-level documents are obtained from ``PageChunker.page_process_document``.
    Each page is then split into chunks made of whole spaCy sentences, and every
    chunk carries metadata that links it back to its parent page.
    """

    # Name of the spaCy pipeline used for sentence segmentation.
    _SPACY_MODEL = "en_core_web_sm"

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        similarity_threshold: float = 0.85,
    ):
        """
        Initialize hierarchical chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization (forwarded to PageChunker).
            embedding_model: Model for generating embeddings (forwarded to PageChunker).
            chunk_size: Maximum size of a semantic chunk, in characters.
            chunk_overlap: Overlap between chunks. Stored on the instance, but
                no method of this class currently applies it.
            similarity_threshold: Similarity threshold for merging chunks. Stored
                on the instance, but no method of this class currently applies it.

        Raises:
            OSError: If the spaCy model cannot be loaded even after a download attempt.
        """
        super().__init__(model_name, embedding_model)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold
        # Initialize spaCy for NLP tasks (sentence segmentation).
        self.nlp = self._load_spacy_model()

    def _load_spacy_model(self):
        """Load the spaCy pipeline, attempting one download if it is not installed."""
        try:
            return spacy.load(self._SPACY_MODEL)
        except OSError:
            logger.info("Installing spaCy model...")

        import subprocess
        import sys

        # Use the running interpreter so the model is installed into the same
        # environment that will load it; a bare "python" may resolve elsewhere.
        result = subprocess.run(
            [sys.executable or "python", "-m", "spacy", "download", self._SPACY_MODEL],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            # Surface the failure instead of discarding it; the load below
            # still raises OSError, as it did before.
            logger.warning(
                "spaCy model download failed (exit code %s): %s",
                result.returncode,
                (result.stderr or "").strip(),
            )
        return spacy.load(self._SPACY_MODEL)

    def _build_chunk_document(
        self, sentences: List[str], page_number: int, chunk_number: int
    ) -> Document:
        """Join sentences into one chunk Document and attach its metadata."""
        chunk_text = " ".join(sentences)
        # analyze_text is not defined in this class (presumably inherited from
        # PageChunker); it must return the statistic keys read below.
        stats = self.analyze_text(chunk_text)
        return Document(
            page_content=chunk_text,
            metadata={
                "level": "chunk",
                "page_num": page_number,
                "chunk_num": chunk_number,
                "parent_page": page_number,
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                # NOTE(review): "has_ocr" is filled from the "has_content"
                # stat and defaults to the string "true" - confirm intended.
                "has_ocr": stats.get("has_content", "true"),
            },
        )

    def _create_semantic_chunks(self, content: str, page_number: int) -> List[Document]:
        """
        Create semantic chunks with detailed metadata.

        Sentences are packed greedily: a chunk is closed as soon as adding the
        next sentence (plus its joining space) would exceed ``chunk_size``.
        A single sentence longer than ``chunk_size`` is kept whole as its own
        chunk rather than being split mid-sentence.

        Also appends a summary line to ``self.page_stats``, which must already
        exist (``hierarchical_process_document`` resets it before calling this).

        Args:
            content: The page content to chunk
            page_number: The page number

        Returns:
            List of Document objects representing semantic chunks; empty when
            ``content`` is blank.
        """
        if not content.strip():
            return []

        chunks: List[Document] = []
        pending: List[str] = []
        pending_length = 0

        for sent in self.nlp(content).sents:
            sent_text = sent.text.strip()
            if not sent_text:
                # Whitespace-only sentences would only add stray spaces.
                continue

            # Length of the joined chunk if this sentence is added: one
            # separator space is needed when the chunk already has content.
            candidate_length = pending_length + len(sent_text) + (1 if pending else 0)

            if pending and candidate_length > self.chunk_size:
                chunks.append(
                    self._build_chunk_document(pending, page_number, len(chunks) + 1)
                )
                pending = [sent_text]
                pending_length = len(sent_text)
            else:
                pending.append(sent_text)
                pending_length = candidate_length

        # Handle final chunk
        if pending:
            chunks.append(
                self._build_chunk_document(pending, page_number, len(chunks) + 1)
            )

        self.page_stats.append(f"Created {len(chunks)} chunks for page {page_number}")
        return chunks

    def hierarchical_process_document(
        self, file_path: str, preprocess: bool = True
    ) -> Dict[str, List[Document]]:
        """
        Process document with hierarchical chunking strategy.

        Each page document is tagged in place with ``metadata["level"] = "page"``
        and then split into chunk-level documents.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        self.page_stats = []  # Reset stats

        # First get the page-level documents using PageChunker
        page_docs = super().page_process_document(file_path, preprocess)

        # Now create chunk-level documents
        chunk_docs: List[Document] = []
        for page_doc in page_docs:
            page_num = page_doc.metadata["page"]
            # Mark this as a page-level document
            page_doc.metadata["level"] = "page"
            chunk_docs.extend(
                self._create_semantic_chunks(page_doc.page_content, page_num)
            )

        # Log summary information
        logger.info("\nHierarchical Processing Summary:")
        logger.info("Total Pages: %s", len(page_docs))
        logger.info("Total Chunks: %s", len(chunk_docs))
        logger.info("\n".join(self.page_stats))

        return {
            "pages": page_docs,
            "chunks": chunk_docs,
        }

    def process_document(
        self, file_path: str, preprocess: bool = True
    ) -> Dict[str, List[Document]]:
        """
        Process document using hierarchical chunking strategy (implements abstract method).

        Thin wrapper around ``hierarchical_process_document``.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        return self.hierarchical_process_document(file_path, preprocess)