"""
TokenChunker.py
A module for token-based document chunking with configurable overlap and preprocessing.
Features:
- Token-based document splitting with overlap
- Content validation and token counting
- Smart boundary detection to preserve word integrity
- Compatible with multiple tokenizer types (tiktoken, transformers, basic)
"""
import logging
import re
from typing import List, Optional, Dict, Any
from langchain_core.documents import Document
from core.BaseChunker import BaseChunker
logger = logging.getLogger(__name__)
class TokenChunker(BaseChunker):
    """Handles document chunking at the token level with configurable overlap."""

    def __init__(
        self,
        model_name=None,
        embedding_model=None,
        chunk_size: int = 256,
        chunk_overlap: int = 50,
        min_chunk_size: int = 50
    ):
        """
        Initialize token chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
            chunk_size: Maximum tokens per chunk (must be positive)
            chunk_overlap: Number of tokens to overlap between chunks
                (must be non-negative and strictly less than chunk_size)
            min_chunk_size: Minimum tokens for a valid chunk (must be positive)

        Raises:
            ValueError: If any chunking parameter is out of range or the
                parameters are mutually inconsistent.
        """
        super().__init__(model_name, embedding_model)
        # Fail fast on misconfiguration: a negative overlap would make
        # _create_token_chunks jump PAST the current window (start = end -
        # overlap > end), silently dropping tokens, and a non-positive
        # chunk_size would never produce a window at all.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be non-negative")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        if min_chunk_size <= 0:
            raise ValueError("min_chunk_size must be positive")
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        # Human-readable log of per-chunk decisions; reset per document.
        self.chunk_stats = []
        logger.info(f"TokenChunker initialized: chunk_size={chunk_size}, overlap={chunk_overlap}, min_size={min_chunk_size}")
def _smart_tokenize(self, text: str) -> List[str]:
"""
Tokenize text while preserving word boundaries for reconstruction.
Args:
text: The text content to tokenize
Returns:
List of tokens that can be cleanly rejoined
"""
if not text.strip():
return []
try:
if self.uses_tiktoken:
# For tiktoken, we need a hybrid approach to preserve boundaries
return self._tiktoken_boundary_aware_split(text)
elif hasattr(self.tokenizer, 'tokenize'):
# For transformers tokenizers
tokens = self.tokenizer.tokenize(text)
return self._clean_subword_tokens(tokens)
else:
# Fallback to intelligent word splitting
return self._word_boundary_split(text)
except Exception as e:
logger.warning(f"Tokenization failed: {e}. Using word boundary fallback.")
return self._word_boundary_split(text)
def _tiktoken_boundary_aware_split(self, text: str) -> List[str]:
"""
Split text in a way that's compatible with tiktoken while preserving boundaries.
Args:
text: Input text
Returns:
List of text segments that approximate tokens
"""
# Get actual token count for validation
target_token_count = self.count_tokens(text)
# Split on natural boundaries (spaces, punctuation)
words = re.findall(r'\S+|\s+', text)
# If we have roughly the right number of words, use them
if abs(len(words) - target_token_count) / max(target_token_count, 1) < 0.3:
return [w for w in words if w.strip()]
# Otherwise, use a more granular split
segments = re.findall(r'\w+|[^\w\s]|\s+', text)
return [s for s in segments if s.strip()]
def _clean_subword_tokens(self, tokens: List[str]) -> List[str]:
"""
Clean subword tokens for better reconstruction.
Args:
tokens: Raw tokens from tokenizer
Returns:
Cleaned tokens
"""
cleaned = []
for token in tokens:
# Remove special tokens but keep the content
if token.startswith('##'):
# BERT-style subwords
cleaned.append(token[2:])
elif token.startswith('▁'):
# SentencePiece-style
cleaned.append(' ' + token[1:])
else:
cleaned.append(token)
return [t for t in cleaned if t.strip()]
def _word_boundary_split(self, text: str) -> List[str]:
"""
Split text on word boundaries as fallback tokenization.
Args:
text: Input text
Returns:
List of words
"""
# Split on whitespace but preserve some punctuation as separate tokens
tokens = re.findall(r'\w+|[.!?;,]', text)
return tokens
def _detokenize(self, tokens: List[str]) -> str:
"""
Reconstruct text from tokens, handling different tokenizer types.
Args:
tokens: List of token strings
Returns:
Reconstructed text
"""
if not tokens:
return ""
if self.uses_tiktoken or not hasattr(self.tokenizer, 'tokenize'):
# For tiktoken and basic tokenizers, use space joining with smart spacing
result = ""
for i, token in enumerate(tokens):
if not token.strip():
continue
if i == 0:
result = token
elif token in '.,!?;:':
result += token
elif result and result[-1] in '.,!?;:':
result += " " + token
else:
result += " " + token
return result
else:
# For transformers tokenizers, handle subword reconstruction
text = "".join(tokens)
# Clean up spacing around punctuation
text = re.sub(r'\s+([.!?;,])', r'\1', text)
text = re.sub(r'\s+', ' ', text)
return text.strip()
def _create_token_chunks(self, tokens: List[str]) -> List[List[str]]:
"""
Split tokens into overlapping chunks of specified size.
Args:
tokens: List of token strings
Returns:
List of token chunks
"""
if not tokens:
return []
chunks = []
start = 0
while start < len(tokens):
# Calculate end position for this chunk
end = min(start + self.chunk_size, len(tokens))
# Extract the chunk
chunk_tokens = tokens[start:end]
# Only add chunks that meet minimum size requirement
if len(chunk_tokens) >= self.min_chunk_size:
chunks.append(chunk_tokens)
self.chunk_stats.append(f"Created chunk with {len(chunk_tokens)} tokens")
else:
self.chunk_stats.append(f"Skipped small chunk with {len(chunk_tokens)} tokens")
# Break if we've reached the end
if end >= len(tokens):
break
# Calculate next start position with overlap
start = end - self.chunk_overlap
# Ensure forward progress
if start <= 0:
start = end
return chunks
def _process_single_chunk(self, chunk_tokens: List[str], chunk_index: int,
                          source_metadata: Dict[str, Any]) -> Optional[Document]:
    """
    Process a single token chunk into a Document with metadata.

    Args:
        chunk_tokens: List of tokens for this chunk
        chunk_index: Index of this chunk in the document
        source_metadata: Metadata from source document (not mutated)

    Returns:
        Document object with processed content and metadata, or None if invalid
    """
    chunk_text = self._detokenize(chunk_tokens)
    # Drop chunks whose reconstructed text fails the validity check.
    if not self.is_content_valid(chunk_text, min_tokens=self.min_chunk_size):
        self.chunk_stats.append(f"Chunk {chunk_index} failed validation")
        return None
    stats = self.analyze_text(chunk_text)
    # Start from a copy of the source metadata, then layer on the
    # chunk-level details (chunk-level keys win on collision).
    metadata = {
        **source_metadata,
        "chunk_index": chunk_index,
        "chunk_type": "token",
        "chunking_method": "token_based",
        "token_count": len(chunk_tokens),
        "char_count": stats["char_count"],
        "sentence_count": stats["sentence_count"],
        "word_count": stats["word_count"],
        "chunk_size_limit": self.chunk_size,
        "chunk_overlap": self.chunk_overlap,
    }
    return Document(page_content=chunk_text, metadata=metadata)
def token_process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
    """
    Process document using token-based chunking with overlap.

    Loads the document, concatenates all valid pages into one text,
    tokenizes it, slices the tokens into overlapping windows and wraps
    each window in a Document carrying combined page metadata.

    Args:
        file_path: Path to the document file
        preprocess: Whether to preprocess text content

    Returns:
        List of Document objects, one per valid token chunk

    Raises:
        Exception: Re-raises any error from loading or chunking after logging it.
    """
    try:
        self.chunk_stats = []  # Reset stats for this document
        raw_pages = self.load_document(file_path)
        processed_chunks = []
        logger.info(f"Processing document with {len(raw_pages)} pages using token chunking")
        # Combine all pages into a single text for token-based processing
        full_text = ""
        combined_metadata = {}
        page_info = []  # Track which pages contributed to the text
        for page_idx, page in enumerate(raw_pages):
            content = page.page_content
            # Skip invalid content
            if not self.is_content_valid(content):
                logger.debug(f"Skipping invalid content on page {page_idx + 1}")
                continue
            # Preprocess if requested; a page may become invalid afterwards.
            if preprocess:
                content = self.preprocess_text(content)
                if not self.is_content_valid(content):
                    continue
            # Track page information
            page_info.append({
                "page_number": page_idx + 1,
                "original_metadata": page.metadata
            })
            # Combine text with page separation (blank line between pages)
            if full_text:
                full_text += "\n\n" + content
            else:
                full_text = content
            # Refresh the base metadata from this page. NOTE(review): this
            # runs on EVERY valid page, so after the loop it holds the LAST
            # valid page's metadata, not the first's — confirm intended.
            combined_metadata = page.metadata.copy()
        # Update combined metadata to reflect all pages
        if page_info:
            combined_metadata.update({
                "total_pages_processed": len(page_info),
                "page_range": f"{page_info[0]['page_number']}-{page_info[-1]['page_number']}",
                "source_pages": [str(p["page_number"]) for p in page_info]  # page numbers as strings
            })
            # Remove the single "page" field since this represents multiple pages
            combined_metadata.pop("page", None)
        if not full_text.strip():
            logger.warning("No valid content found in document")
            return []
        # Tokenize the entire combined document text
        all_tokens = self._smart_tokenize(full_text)
        logger.info(f"Document tokenized into {len(all_tokens)} tokens")
        if len(all_tokens) < self.min_chunk_size:
            logger.warning(f"Document too short for chunking ({len(all_tokens)} tokens)")
            return []
        # Create overlapping token chunks
        token_chunks = self._create_token_chunks(all_tokens)
        logger.info(f"Created {len(token_chunks)} token chunks")
        # Convert token chunks to Document objects; invalid chunks return None
        for chunk_idx, chunk_tokens in enumerate(token_chunks):
            chunk_doc = self._process_single_chunk(
                chunk_tokens,
                chunk_idx,
                combined_metadata
            )
            if chunk_doc:
                processed_chunks.append(chunk_doc)
        # Output accumulated per-chunk processing statistics
        if self.chunk_stats:
            logger.info("\n".join(self.chunk_stats))
        logger.info(f"Processed {len(processed_chunks)} valid token chunks")
        return processed_chunks
    except Exception as e:
        logger.error(f"Error in token_process_document: {e}")
        raise
def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
    """
    Process document using token chunking strategy (implements abstract method).

    Thin wrapper that delegates directly to ``token_process_document``.

    Args:
        file_path: Path to the document file
        preprocess: Whether to preprocess text content

    Returns:
        List of Document objects, one per valid token chunk
    """
    return self.token_process_document(file_path, preprocess)
def process_text_file(self, file_path: str, preprocess: bool = True) -> List[Document]:
    """
    Process text file directly using token-based chunking with overlap.

    Unlike ``token_process_document`` this skips page loading: the file is
    read as one string, cleaned, tokenized and chunked in a single pass.

    Args:
        file_path: Path to the text file
        preprocess: Whether to preprocess text content

    Returns:
        List of Document objects, one per valid token chunk

    Raises:
        Exception: Re-raises any error from reading or chunking after logging it.
    """
    try:
        from pathlib import Path
        from datetime import datetime
        self.chunk_stats = []  # Reset stats for this document
        # Load the text file directly
        content = self.load_text_file(file_path)
        # Clean the text using the same logic as PDF conversion
        content = self.clean_text_for_processing(content)
        # Basic validation
        if not self.is_content_valid(content):
            logger.warning("Text file content failed validation")
            return []
        # Light preprocessing if requested (no header/footer removal for txt files)
        if preprocess:
            # Only apply basic text cleaning, not aggressive preprocessing
            content = ' '.join(content.split())  # Normalize whitespace
        # Create file-level metadata
        file_path_obj = Path(file_path)
        file_metadata = {
            "source": file_path,
            "file_name": file_path_obj.name,
            "file_type": "txt",
            "total_characters": len(content),
            # NOTE(review): naive local-time timestamp (no tz) — confirm OK.
            "processing_timestamp": datetime.now().isoformat(),
        }
        logger.info(f"Processing text file: {file_path_obj.name} ({len(content)} characters)")
        # Tokenize the entire document
        all_tokens = self._smart_tokenize(content)
        logger.info(f"Text file tokenized into {len(all_tokens)} tokens")
        if len(all_tokens) < self.min_chunk_size:
            logger.warning(f"Text file too short for chunking ({len(all_tokens)} tokens)")
            return []
        # Create overlapping token chunks
        token_chunks = self._create_token_chunks(all_tokens)
        logger.info(f"Created {len(token_chunks)} token chunks from text file")
        # Convert token chunks to Document objects; invalid chunks return None
        processed_chunks = []
        for chunk_idx, chunk_tokens in enumerate(token_chunks):
            chunk_doc = self._process_single_chunk(
                chunk_tokens,
                chunk_idx,
                file_metadata
            )
            if chunk_doc:
                processed_chunks.append(chunk_doc)
        # Output accumulated per-chunk processing statistics
        if self.chunk_stats:
            logger.info("\n".join(self.chunk_stats))
        logger.info(f"Processed {len(processed_chunks)} valid token chunks from text file")
        return processed_chunks
    except Exception as e:
        logger.error(f"Error processing text file: {e}")
        raise