Spaces:
Sleeping
Sleeping
Upload 10 files
Browse files- src/core/BaseChunker.py +367 -0
- src/core/ChunkingManager.py +354 -0
- src/core/HierarchicalChunker.py +183 -0
- src/core/OCREnhancedPDFLoader.py +89 -0
- src/core/PageChunker.py +119 -0
- src/core/ParagraphChunker.py +315 -0
- src/core/SemanticChunker.py +200 -0
- src/core/TextPreprocessor.py +198 -0
- src/core/TokenChunker.py +458 -0
- src/core/__init__.py +0 -0
src/core/BaseChunker.py
ADDED
|
@@ -0,0 +1,367 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BaseChunker.py
|
| 3 |
+
|
| 4 |
+
An abstract base class defining the interface for document chunking strategies.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import logging
|
| 8 |
+
from core.OCREnhancedPDFLoader import OCREnhancedPDFLoader
|
| 9 |
+
from core.TextPreprocessor import TextPreprocessor
|
| 10 |
+
import numpy as np
|
| 11 |
+
from abc import ABC, abstractmethod
|
| 12 |
+
from typing import List, Dict, Any, Optional, Union
|
| 13 |
+
|
| 14 |
+
import spacy
|
| 15 |
+
from langchain_core.documents import Document
|
| 16 |
+
|
| 17 |
+
# Import tiktoken at the module level
|
| 18 |
+
try:
|
| 19 |
+
import tiktoken
|
| 20 |
+
TIKTOKEN_AVAILABLE = True
|
| 21 |
+
except ImportError:
|
| 22 |
+
TIKTOKEN_AVAILABLE = False
|
| 23 |
+
logging.warning("tiktoken not installed. Some tokenization features will be limited. "
|
| 24 |
+
"Install with: pip install tiktoken")
|
| 25 |
+
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
class BaseChunker(ABC):
    """Abstract base class for document chunking strategies."""

    # Minimum stripped characters for text to be considered non-blank.
    BLANK_THRESHOLD = 20
    # Minimum token count for text to be considered valid content.
    TOKEN_THRESHOLD = 10

    # Substrings that mark models handled via tiktoken encodings.
    TIKTOKEN_MODELS = ["gpt", "davinci", "curie", "babbage", "ada"]
    # Substrings that mark models handled via basic whitespace tokenization.
    BASIC_TOKENIZER_MODELS = ["llama", "mistral", "granite"]

    def __init__(self, model_name: Optional[str] = None, embedding_model: Optional[Any] = None):
        """
        Initialize base chunker with model settings.

        Args:
            model_name: Name of the model used for tokenization.
            embedding_model: Model used for generating embeddings (must
                expose an ``encode(text)`` method).
        """
        self.model_name = model_name
        self.embedding_model = embedding_model

        # Tokenizer state; resolved by _initialize_tokenizer() below.
        self.tokenizer = None
        self.uses_tiktoken = False
        self.uses_basic_tokenizer = False
        self._initialize_tokenizer()

        # spaCy pipeline used for sentence segmentation in analyze_text().
        # NOTE(review): assumes en_core_web_sm is installed — load fails otherwise.
        self.nlp = spacy.load("en_core_web_sm")
def _initialize_tokenizer(self):
|
| 58 |
+
"""Initialize the appropriate tokenizer based on model name."""
|
| 59 |
+
if not self.model_name:
|
| 60 |
+
logger.warning("No model name provided. Using basic tokenization.")
|
| 61 |
+
self.uses_basic_tokenizer = True
|
| 62 |
+
return
|
| 63 |
+
|
| 64 |
+
# Check if model is supported by tiktoken
|
| 65 |
+
if TIKTOKEN_AVAILABLE and self.model_name in ["cl100k_base", "p50k_base", "r50k_base", "gpt2"]:
|
| 66 |
+
try:
|
| 67 |
+
encoding = tiktoken.get_encoding(self.model_name)
|
| 68 |
+
|
| 69 |
+
# Create a tokenizer-like interface for tiktoken
|
| 70 |
+
class TiktokenWrapper:
|
| 71 |
+
def __init__(self, encoding):
|
| 72 |
+
self.encoding = encoding
|
| 73 |
+
|
| 74 |
+
def tokenize(self, text):
|
| 75 |
+
return self.encoding.encode(text)
|
| 76 |
+
|
| 77 |
+
self.tokenizer = TiktokenWrapper(encoding)
|
| 78 |
+
self.uses_tiktoken = True
|
| 79 |
+
logger.info(f"Initialized tiktoken tokenizer for model: {self.model_name}")
|
| 80 |
+
return
|
| 81 |
+
except Exception as e:
|
| 82 |
+
logger.warning(f"Error with specified tiktoken model: {e}")
|
| 83 |
+
# Fall back to a standard encoding
|
| 84 |
+
try:
|
| 85 |
+
encoding = tiktoken.get_encoding("cl100k_base")
|
| 86 |
+
|
| 87 |
+
class TiktokenWrapper:
|
| 88 |
+
def __init__(self, encoding):
|
| 89 |
+
self.encoding = encoding
|
| 90 |
+
|
| 91 |
+
def tokenize(self, text):
|
| 92 |
+
return self.encoding.encode(text)
|
| 93 |
+
|
| 94 |
+
self.tokenizer = TiktokenWrapper(encoding)
|
| 95 |
+
self.uses_tiktoken = True
|
| 96 |
+
logger.info("Initialized tiktoken with cl100k_base encoding")
|
| 97 |
+
except Exception as e:
|
| 98 |
+
logger.warning(f"Error initializing tiktoken: {e}")
|
| 99 |
+
self.uses_basic_tokenizer = True
|
| 100 |
+
|
| 101 |
+
if TIKTOKEN_AVAILABLE and (
|
| 102 |
+
any(model in self.model_name.lower() for model in self.TIKTOKEN_MODELS) or
|
| 103 |
+
self.model_name.startswith("gpt-") or
|
| 104 |
+
self.model_name.endswith("-base")
|
| 105 |
+
):
|
| 106 |
+
try:
|
| 107 |
+
encoding = tiktoken.get_encoding(self.model_name)
|
| 108 |
+
|
| 109 |
+
# Create a tokenizer-like interface for tiktoken
|
| 110 |
+
class TiktokenWrapper:
|
| 111 |
+
def __init__(self, encoding):
|
| 112 |
+
self.encoding = encoding
|
| 113 |
+
|
| 114 |
+
def tokenize(self, text):
|
| 115 |
+
return self.encoding.encode(text)
|
| 116 |
+
|
| 117 |
+
self.tokenizer = TiktokenWrapper(encoding)
|
| 118 |
+
self.uses_tiktoken = True
|
| 119 |
+
logger.info(f"Initialized tiktoken tokenizer for model: {self.model_name}")
|
| 120 |
+
except Exception as e:
|
| 121 |
+
logger.warning(f"Error with specified tiktoken model: {e}")
|
| 122 |
+
# Fall back to a standard encoding
|
| 123 |
+
try:
|
| 124 |
+
encoding = tiktoken.get_encoding("cl100k_base")
|
| 125 |
+
|
| 126 |
+
class TiktokenWrapper:
|
| 127 |
+
def __init__(self, encoding):
|
| 128 |
+
self.encoding = encoding
|
| 129 |
+
|
| 130 |
+
def tokenize(self, text):
|
| 131 |
+
return self.encoding.encode(text)
|
| 132 |
+
|
| 133 |
+
self.tokenizer = TiktokenWrapper(encoding)
|
| 134 |
+
self.uses_tiktoken = True
|
| 135 |
+
logger.info("Initialized tiktoken with cl100k_base encoding")
|
| 136 |
+
except Exception as e:
|
| 137 |
+
logger.warning(f"Error initializing tiktoken: {e}")
|
| 138 |
+
self.uses_basic_tokenizer = True
|
| 139 |
+
|
| 140 |
+
# Check if model uses basic tokenization
|
| 141 |
+
elif any(model in self.model_name.lower() for model in self.BASIC_TOKENIZER_MODELS):
|
| 142 |
+
self.uses_basic_tokenizer = True
|
| 143 |
+
logger.info("Using basic tokenization for model")
|
| 144 |
+
|
| 145 |
+
# Fall back to transformers tokenizer
|
| 146 |
+
else:
|
| 147 |
+
try:
|
| 148 |
+
from transformers import AutoTokenizer
|
| 149 |
+
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 150 |
+
logger.info(f"Initialized transformers tokenizer for model: {self.model_name}")
|
| 151 |
+
except Exception as e:
|
| 152 |
+
logger.warning(f"Error initializing transformer tokenizer: {e}")
|
| 153 |
+
logger.warning("Falling back to basic tokenization")
|
| 154 |
+
self.uses_basic_tokenizer = True
|
| 155 |
+
|
| 156 |
+
def count_tokens(self, text: str) -> int:
    """Count tokens in a text string using the available tokenizer.

    Falls back to a whitespace word count, then to a rough chars/4
    estimate when no tokenizer is usable.
    """
    if not text:
        return 0

    try:
        if self.tokenizer:
            # Both the tiktoken wrapper and transformers tokenizers expose
            # tokenize(); the token count is the length of the result.
            return len(self.tokenizer.tokenize(text))
    except Exception as e:
        logging.getLogger(__name__).warning(f"Primary tokenization failed: {e}")

    # Basic fallback: whitespace-split word count.
    if self.uses_basic_tokenizer or not self.tokenizer:
        return len(text.split())

    # Last resort: rough character-to-token ratio.
    return len(text) // 4
def get_embedding(self, text: str) -> Optional[np.ndarray]:
    """Return the embedding vector for *text*, or None when unavailable.

    None is returned for blank input, a missing embedding model, or any
    encoder failure (which is logged).
    """
    if not (text.strip() and self.embedding_model):
        return None

    try:
        return self.embedding_model.encode(text)
    except Exception as e:
        logging.getLogger(__name__).error(f"Error generating embedding: {e}")
        return None
def analyze_text(self, text: str) -> Dict[str, Any]:
    """Return summary statistics for *text*.

    Keys: char_count, token_count, sentence_count, word_count,
    embedding_dim, has_content. When the NLP or embedding pipeline
    fails, a degraded result keeps only the cheap counts.
    """
    if not text.strip():
        # Blank input: everything is zero / False.
        return {
            "char_count": 0,
            "token_count": 0,
            "sentence_count": 0,
            "word_count": 0,
            "embedding_dim": 0,
            "has_content": False,
        }

    try:
        vector = self.get_embedding(text)
        parsed = self.nlp(text)
        return {
            "char_count": len(text),
            "token_count": self.count_tokens(text),
            "sentence_count": len(list(parsed.sents)),
            "word_count": len(text.split()),
            "embedding_dim": len(vector) if vector is not None else 0,
            "has_content": bool(text.strip()),
        }
    except Exception as e:
        logging.getLogger(__name__).error(f"Error analyzing text: {e}")
        # Degraded result: keep counts that need no external models.
        return {
            "char_count": len(text),
            "token_count": 0,
            "sentence_count": 0,
            "word_count": len(text.split()),
            "embedding_dim": 0,
            "has_content": bool(text.strip()),
        }
def is_content_valid(self, text: str, min_chars: Optional[int] = None, min_tokens: Optional[int] = None) -> bool:
    """Check whether *text* meets minimum size requirements.

    Args:
        text: Candidate chunk text.
        min_chars: Minimum stripped character count; defaults to
            BLANK_THRESHOLD when None.
        min_tokens: Minimum token count; defaults to TOKEN_THRESHOLD
            when None.

    Returns:
        True when both thresholds are met.

    Fix: the original annotated the defaults as plain ``int`` and used
    ``min_chars or self.BLANK_THRESHOLD``, which silently replaced an
    explicit 0 with the default; an identity check honors 0 now.
    """
    if not text.strip():
        return False

    if min_chars is None:
        min_chars = self.BLANK_THRESHOLD
    if min_tokens is None:
        min_tokens = self.TOKEN_THRESHOLD

    if len(text.strip()) < min_chars:
        return False

    return self.count_tokens(text) >= min_tokens
def validate_documents(self, documents):
    """Validate and clean documents before sending to a vector database.

    Drops documents with empty or whitespace-only content and strips
    BOM / zero-width characters from the start of the remaining ones
    (mutating their page_content in place).

    Args:
        documents: Iterable of objects with a ``page_content`` attribute.

    Returns:
        List of cleaned, non-empty documents.

    Fix: replaced bare ``print`` calls with the module logging used
    everywhere else in this file, and removed the redundant
    ``if doc.page_content and len(doc.page_content) > 0`` re-check that
    the empty-content guard above it already guarantees.
    """
    log = logging.getLogger(__name__)
    valid_documents = []

    for i, doc in enumerate(documents):
        # Skip documents with no usable text.
        if not doc.page_content or not doc.page_content.strip():
            log.info(f"Skipping document {i}: Empty content")
            continue

        # Strip BOM and zero-width/invisible characters from the start.
        doc.page_content = doc.page_content.lstrip(
            '\ufeff\u200b\u200c\u200d\u200e\u200f\u2060'
        )
        valid_documents.append(doc)

    log.info(f"Validated {len(valid_documents)} of {len(documents)} documents")
    return valid_documents
def debug_documents(self, documents, num_chars=50):
    """Print diagnostic information about documents.

    For each document: its length, the first characters alongside their
    code points, and a repr preview. Empty documents are flagged.
    """
    print(f"\nDEBUG INFO: Examining {len(documents)} documents")

    for idx, doc in enumerate(documents):
        text = doc.page_content
        if not text:
            print(f"  Doc {idx}: EMPTY CONTENT")
            continue

        preview = text[:num_chars]
        # Pair each of the first 10 characters with its code point.
        codes = "".join(f"{ch}({ord(ch)})" for ch in preview[:10])

        print(f"  Doc {idx}: Length={len(text)}, First chars: {codes}")
        print(f"    Preview: {preview!r}")

    print("DEBUG INFO END\n")
def load_document(self, file_path: str) -> List[Document]:
    """Load a PDF via OCREnhancedPDFLoader, debug-dump it, and validate it.

    Returns:
        The validated documents (empty/invalid entries removed).

    Raises:
        Re-raises any loader failure after logging it.
    """
    try:
        raw_docs = OCREnhancedPDFLoader(file_path).load()
        self.debug_documents(raw_docs)
        return self.validate_documents(raw_docs)
    except Exception as e:
        logger.error(f"Error loading document: {e}")
        raise
def preprocess_text(self, text: str, remove_headers_footers: bool = True) -> str:
    """Preprocess *text* with TextPreprocessor.

    Returns the input unchanged when preprocessing fails (logged as an
    error rather than raised).
    """
    try:
        return TextPreprocessor().preprocess(text, remove_headers_footers)
    except Exception as e:
        logger.error(f"Error preprocessing text: {e}")
        return text
@abstractmethod
def process_document(self, file_path: str, preprocess: bool = True) -> Union[List[Document], Dict[str, List[Document]]]:
    """Process document using specific chunking strategy.

    Args:
        file_path: Path to the source document.
        preprocess: Whether to run text preprocessing before chunking.

    Returns:
        Either a flat list of chunk Documents or a mapping of names to
        chunk lists, depending on the concrete strategy.
    """
    pass
def load_text_file(self, file_path: str) -> str:
    """
    Load raw text file content.

    Args:
        file_path: Path to the text file

    Returns:
        Raw text content

    Raises:
        Re-raises any I/O or decoding error after logging it.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            content = handle.read()
        logger.info(f"Loaded text file: {file_path} ({len(content)} characters)")
    except Exception as e:
        logger.error(f"Error loading text file {file_path}: {e}")
        raise
    return content
def clean_text_for_processing(self, text: str) -> str:
    """
    Clean text using Unicode character replacement (same replacement
    table as the PDF conversion logic).

    Args:
        text: Raw text content

    Returns:
        Cleaned text content
    """
    # Typographic / invisible Unicode characters mapped to ASCII equivalents.
    translation = {
        '\u2019': "'", '\u2018': "'", '\u201c': '"', '\u201d': '"',
        '\u2014': '-', '\u2013': '-', '\u2026': '...',
        '\u200b': '', '\u00a0': ' ', '\u2022': '*',
        '\u2192': '->', '\u2190': '<-',
    }

    for source_char, replacement in translation.items():
        text = text.replace(source_char, replacement)

    return text
def process_text_file(self, file_path: str, preprocess: bool = True) -> List[Document]:
    """
    Default text-file processing hook; concrete chunkers such as
    ParagraphChunker and TokenChunker must override this.

    Args:
        file_path: Path to the text file
        preprocess: Whether to preprocess the text

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError("Subclasses must implement process_text_file method")
src/core/ChunkingManager.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ChunkingManager.py
|
| 3 |
+
|
| 4 |
+
A manager class that orchestrates document chunking using different strategies.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from typing import Dict, List, Optional, Union
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from sentence_transformers import SentenceTransformer
|
| 10 |
+
from langchain_core.documents import Document
|
| 11 |
+
|
| 12 |
+
# Import chunker strategies
|
| 13 |
+
from core.BaseChunker import BaseChunker
|
| 14 |
+
from core.PageChunker import PageChunker
|
| 15 |
+
from core.ParagraphChunker import ParagraphChunker
|
| 16 |
+
from core.SemanticChunker import SemanticChunker
|
| 17 |
+
from core.HierarchicalChunker import HierarchicalChunker
|
| 18 |
+
from core.TokenChunker import TokenChunker
|
| 19 |
+
import logging
|
| 20 |
+
|
| 21 |
+
# Configure logging
|
| 22 |
+
logging.basicConfig(level=logging.INFO)
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
class ChunkingStrategy:
    """Enumeration of available chunking strategies.

    Plain string constants (rather than enum.Enum) so values can be
    compared directly against user-supplied strategy strings.
    """
    PAGE = "page"
    PARAGRAPH = "paragraph"
    SEMANTIC = "semantic"
    HIERARCHICAL = "hierarchical"
    TOKEN = "token"
class ChunkingManager:
    """Manager class for document chunking strategies."""

    def __init__(
        self,
        embedding_model_name: str = "all-mpnet-base-v2",
        token_model_name: Optional[str] = None
    ):
        """
        Initialize chunking manager.

        Args:
            embedding_model_name: Name of the sentence transformer model
            token_model_name: Name of the token counting model
        """
        self.embedding_model_name = embedding_model_name
        self.token_model_name = token_model_name
        # Lazily created resources: see the embedding_model property and
        # _get_chunker for when these are populated.
        self._embedding_model = None
        self._chunkers = {}
@property
def embedding_model(self):
    """Lazily load and cache the embedding model.

    SentenceTransformer models are loaded on first access. Names that
    look like OpenAI/token-only models ("gpt", "text-embedding",
    "openai"), an empty name, or any load failure yield a dummy encoder
    returning a constant 384-dim zero vector.

    Fix: the identical DummyEmbedder class was defined twice (success
    and error branches); it is now defined once and shared.
    """
    if self._embedding_model is None:
        log = logging.getLogger(__name__)

        class DummyEmbedder:
            """Placeholder encoder used when no real model is available."""

            def encode(self, text, **kwargs):
                return [0.0] * 384  # constant dummy vector

        name = self.embedding_model_name
        try:
            # Only treat the name as a SentenceTransformer model when it is
            # not an OpenAI-style embedding/token model name.
            if name and not any(x in name.lower() for x in ["gpt", "text-embedding", "openai"]):
                log.info(f"Loading embedding model: {name}")
                self._embedding_model = SentenceTransformer(name)
            else:
                log.info("Using dummy embedding model for tokenization only")
                self._embedding_model = DummyEmbedder()
        except Exception as e:
            log.error(f"Error loading embedding model: {e}")
            self._embedding_model = DummyEmbedder()
    return self._embedding_model
def _get_chunker(self, strategy: str) -> BaseChunker:
    """Return the cached chunker for *strategy*, creating it on first use.

    Raises:
        ValueError: for an unrecognized strategy name.
    """
    strategy = strategy.lower()

    if strategy in self._chunkers:
        return self._chunkers[strategy]

    if strategy == ChunkingStrategy.PAGE:
        chunker = PageChunker(
            model_name=self.token_model_name,
            embedding_model=self.embedding_model,
        )
    elif strategy == ChunkingStrategy.PARAGRAPH:
        chunker = ParagraphChunker(
            model_name=self.token_model_name,
            embedding_model=self.embedding_model,
        )
    elif strategy == ChunkingStrategy.SEMANTIC:
        chunker = SemanticChunker(
            embedding_model=self.embedding_model,
            model_name=self.token_model_name,
        )
    elif strategy == ChunkingStrategy.HIERARCHICAL:
        chunker = HierarchicalChunker(
            model_name=self.token_model_name,
            embedding_model=self.embedding_model,
        )
    elif strategy == ChunkingStrategy.TOKEN:
        chunker = TokenChunker(
            model_name=self.token_model_name,
            embedding_model=self.embedding_model,
            chunk_size=256,  # Default values, could be made configurable
            chunk_overlap=50,
        )
    else:
        raise ValueError(f"Unknown chunking strategy: {strategy}")

    self._chunkers[strategy] = chunker
    return chunker
def process_document(
    self,
    file_path: str,
    strategy: str = ChunkingStrategy.PARAGRAPH,
    preprocess: bool = True
) -> Union[List[Document], Dict[str, List[Document]]]:
    """
    Process document using specified chunking strategy.

    Args:
        file_path: Path to document file
        strategy: Chunking strategy to use
        preprocess: Whether to preprocess text

    Returns:
        Chunked document(s) according to strategy

    Raises:
        FileNotFoundError: when *file_path* does not exist.
        ValueError: for an unsupported file type or strategy.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    file_extension = path.suffix.lower()

    # CSV and TXT have dedicated handlers; everything else must be a PDF.
    if file_extension == '.csv':
        return self._process_csv(file_path, strategy)
    if file_extension == '.txt':
        return self._process_txt(file_path, strategy, preprocess)
    if file_extension != '.pdf':
        raise ValueError(f"Unsupported file type: {file_extension}. Supported types: .pdf, .csv, .txt")

    chunker = self._get_chunker(strategy)
    logger.info(f"Processing document using {strategy} chunking strategy")

    # Route PDFs to the chunker's strategy-specific entry point.
    if strategy == ChunkingStrategy.PAGE:
        return chunker.page_process_document(file_path, preprocess)
    if strategy == ChunkingStrategy.PARAGRAPH:
        return chunker.paragraph_process_document(file_path, preprocess)
    if strategy == ChunkingStrategy.SEMANTIC:
        return chunker.semantic_process_document(file_path, preprocess)
    if strategy == ChunkingStrategy.HIERARCHICAL:
        return chunker.hierarchical_process_document(file_path, preprocess)
    if strategy == ChunkingStrategy.TOKEN:
        return chunker.token_process_document(file_path, preprocess)
    raise ValueError(f"Unknown chunking strategy: {strategy}")
def process_directory(
    self,
    dir_path: str,
    strategy: str = ChunkingStrategy.PARAGRAPH,
    preprocess: bool = True
) -> Dict[str, Union[List[Document], Dict[str, List[Document]]]]:
    """
    Process all supported documents (PDF/CSV/TXT) in a directory tree.

    Args:
        dir_path: Directory containing files
        strategy: Chunking strategy to use
        preprocess: Whether to preprocess text

    Returns:
        Mapping of file name to its processed chunks, or to an
        {"error": message} dict for files that failed.

    Raises:
        NotADirectoryError: when *dir_path* is not a directory.
    """
    root = Path(dir_path)
    if not root.is_dir():
        raise NotADirectoryError(f"Not a directory: {dir_path}")

    # Collect supported files recursively.
    pdf_files = list(root.glob("**/*.pdf"))
    csv_files = list(root.glob("**/*.csv"))
    txt_files = list(root.glob("**/*.txt"))

    logger.info(
        f"Found {len(pdf_files)} PDF files, {len(csv_files)} CSV files, "
        f"and {len(txt_files)} TXT files in {dir_path}"
    )

    results = {}
    for candidate in pdf_files + csv_files + txt_files:
        try:
            logger.info(f"Processing {candidate.name}")
            results[candidate.name] = self.process_document(
                str(candidate),
                strategy=strategy,
                preprocess=preprocess
            )
        except Exception as e:
            # Per-file failures are recorded, not fatal to the batch.
            logger.error(f"Error processing {candidate.name}: {e}")
            results[candidate.name] = {"error": str(e)}

    return results
def _process_txt(self, file_path: str, strategy: str, preprocess: bool) -> List[Document]:
    """Process a TXT file into document chunks.

    Only the paragraph and token strategies are supported for plain
    text, and both delegate to the chunker's process_text_file().

    Args:
        file_path: Path to the TXT file.
        strategy: Chunking strategy name.
        preprocess: Whether to preprocess the text.

    Raises:
        ValueError: for any strategy other than paragraph/token.

    Fix: this method was defined twice with identical bodies (the
    second definition silently shadowed the first); the duplicate is
    removed, and the two identical strategy branches are collapsed
    into a single delegation.
    """
    logger.info(f"Processing TXT file: {file_path}")

    # Validate strategy for TXT files.
    if strategy not in (ChunkingStrategy.PARAGRAPH, ChunkingStrategy.TOKEN):
        raise ValueError(f"TXT files only support paragraph and token chunking strategies. Got: {strategy}")

    # Both supported strategies use the same text-file entry point.
    return self._get_chunker(strategy).process_text_file(file_path, preprocess)
| 253 |
+
def _process_csv(self, file_path: str, strategy: str) -> List[Document]:
    """Process a CSV file into document chunks.

    Strategy mapping: paragraph -> one chunk per row; page -> multi-row
    chunks; hierarchical -> row chunks wrapped in a {"chunks": ...} dict.

    Raises:
        ValueError: for any other strategy.
    """
    import pandas as pd

    logger.info(f"Loading CSV file: {file_path}")
    frame = pd.read_csv(file_path)

    if strategy == ChunkingStrategy.PARAGRAPH:
        # Each row becomes a separate document with columns combined
        # into a structured text format.
        return self._chunk_csv_by_row(frame, file_path)
    if strategy == ChunkingStrategy.PAGE:
        # Larger chunks containing multiple rows each.
        return self._chunk_csv_by_page(frame, file_path)
    if strategy == ChunkingStrategy.HIERARCHICAL:
        # Hierarchical output wraps the row chunks in a mapping.
        return {"chunks": self._chunk_csv_by_row(frame, file_path)}

    raise ValueError(f"Unsupported chunking strategy for CSV: {strategy}")
def _chunk_csv_by_row(self, df, file_path: str) -> List[Document]:
|
| 277 |
+
"""Convert each CSV row to a document chunk."""
|
| 278 |
+
chunks = []
|
| 279 |
+
file_name = Path(file_path).name
|
| 280 |
+
|
| 281 |
+
# Get column names
|
| 282 |
+
columns = df.columns.tolist()
|
| 283 |
+
|
| 284 |
+
# Process each row
|
| 285 |
+
for i, row in df.iterrows():
|
| 286 |
+
# Convert row to formatted text
|
| 287 |
+
content = "\n".join([f"{col}: {row[col]}" for col in columns])
|
| 288 |
+
|
| 289 |
+
# Create metadata
|
| 290 |
+
metadata = {
|
| 291 |
+
"source": file_path,
|
| 292 |
+
"file_name": file_name,
|
| 293 |
+
"file_type": "csv",
|
| 294 |
+
"row_index": i,
|
| 295 |
+
"chunk_type": "csv_row",
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
# Add columns as additional metadata
|
| 299 |
+
for col in columns:
|
| 300 |
+
# Convert to string to ensure compatibility
|
| 301 |
+
metadata[f"csv_{col}"] = str(row[col])
|
| 302 |
+
|
| 303 |
+
# Create document
|
| 304 |
+
doc = Document(page_content=content, metadata=metadata)
|
| 305 |
+
chunks.append(doc)
|
| 306 |
+
|
| 307 |
+
logger.info(f"Created {len(chunks)} chunks from CSV (row-based)")
|
| 308 |
+
return chunks
|
| 309 |
+
|
| 310 |
+
def _chunk_csv_by_page(self, df, file_path: str, rows_per_chunk: int = 20) -> List[Document]:
|
| 311 |
+
"""Convert CSV into larger chunks with multiple rows per chunk."""
|
| 312 |
+
chunks = []
|
| 313 |
+
file_name = Path(file_path).name
|
| 314 |
+
columns = df.columns.tolist()
|
| 315 |
+
|
| 316 |
+
# Calculate number of chunks
|
| 317 |
+
total_rows = len(df)
|
| 318 |
+
chunk_count = (total_rows + rows_per_chunk - 1) // rows_per_chunk # Ceiling division
|
| 319 |
+
|
| 320 |
+
# Generate chunks
|
| 321 |
+
for chunk_idx in range(chunk_count):
|
| 322 |
+
start_row = chunk_idx * rows_per_chunk
|
| 323 |
+
end_row = min(start_row + rows_per_chunk, total_rows)
|
| 324 |
+
|
| 325 |
+
chunk_df = df.iloc[start_row:end_row]
|
| 326 |
+
|
| 327 |
+
# Format the chunk content
|
| 328 |
+
content = f"CSV Data (Rows {start_row+1}-{end_row}):\n\n"
|
| 329 |
+
|
| 330 |
+
# Add header row
|
| 331 |
+
content += " | ".join(columns) + "\n"
|
| 332 |
+
content += "-" * (sum(len(col) for col in columns) + 3 * (len(columns) - 1)) + "\n"
|
| 333 |
+
|
| 334 |
+
# Add data rows
|
| 335 |
+
for _, row in chunk_df.iterrows():
|
| 336 |
+
content += " | ".join(str(row[col]) for col in columns) + "\n"
|
| 337 |
+
|
| 338 |
+
# Create metadata
|
| 339 |
+
metadata = {
|
| 340 |
+
"source": file_path,
|
| 341 |
+
"file_name": file_name,
|
| 342 |
+
"file_type": "csv",
|
| 343 |
+
"chunk_type": "csv_page",
|
| 344 |
+
"start_row": start_row,
|
| 345 |
+
"end_row": end_row - 1,
|
| 346 |
+
"row_count": end_row - start_row,
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
# Create document
|
| 350 |
+
doc = Document(page_content=content, metadata=metadata)
|
| 351 |
+
chunks.append(doc)
|
| 352 |
+
|
| 353 |
+
logger.info(f"Created {len(chunks)} chunks from CSV (page-based)")
|
| 354 |
+
return chunks
|
src/core/HierarchicalChunker.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HierarchicalChunker.py
|
| 3 |
+
|
| 4 |
+
A module for hierarchical document chunking that combines page-level and semantic chunking.
|
| 5 |
+
|
| 6 |
+
Features:
|
| 7 |
+
- Multi-level document representation (pages and chunks)
|
| 8 |
+
- Semantic chunking with sentence boundaries
|
| 9 |
+
- Size and overlap controls
|
| 10 |
+
- Hierarchical metadata
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import spacy
|
| 15 |
+
from typing import Dict, List, Optional, Any
|
| 16 |
+
from langchain_core.documents import Document
|
| 17 |
+
from core.PageChunker import PageChunker
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
class HierarchicalChunker(PageChunker):
    """Handles document chunking at multiple hierarchical levels.

    Builds a two-level view of a PDF: page-level Documents (produced by
    the inherited PageChunker) plus sentence-aligned semantic chunks whose
    metadata links them back to their parent page.
    """

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        similarity_threshold: float = 0.85
    ):
        """
        Initialize hierarchical chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
            chunk_size: Maximum size (in characters) of semantic chunks
            chunk_overlap: Overlap between chunks
            similarity_threshold: Similarity threshold for merging chunks
        """
        super().__init__(model_name, embedding_model)
        self.chunk_size = chunk_size
        # NOTE(review): chunk_overlap and similarity_threshold are stored
        # for API compatibility but are not used by the current chunking
        # logic below — confirm before relying on them.
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold

        # Initialize spaCy for sentence segmentation; spacy.load raises
        # OSError when the model package is missing, in which case we
        # download it once and retry.
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            logger.info("Installing spaCy model...")
            import subprocess
            import sys
            # Use the running interpreter rather than a bare "python",
            # which may be absent from PATH (e.g. python3-only systems).
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           capture_output=True)
            self.nlp = spacy.load("en_core_web_sm")

    def _build_chunk_doc(self, chunk_text: str, page_number: int, chunk_number: int) -> Document:
        """Create one chunk-level Document with analysis metadata.

        Args:
            chunk_text: Joined sentence text for the chunk
            page_number: Page the chunk was taken from
            chunk_number: 1-based index of the chunk within its page

        Returns:
            Document tagged as level "chunk" with parent-page linkage
        """
        stats = self.analyze_text(chunk_text)
        return Document(
            page_content=chunk_text,
            metadata={
                "level": "chunk",
                "page_num": page_number,
                "chunk_num": chunk_number,
                "parent_page": page_number,
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                "has_ocr": stats.get("has_content", "true")
            }
        )

    def _create_semantic_chunks(self, content: str, page_number: int) -> List[Document]:
        """
        Create semantic chunks with detailed metadata.

        Sentences are accumulated until adding the next one would exceed
        chunk_size characters; the accumulated text is then emitted as one
        chunk and accumulation restarts with the current sentence.

        Args:
            content: The page content to chunk
            page_number: The page number

        Returns:
            List of Document objects representing semantic chunks
        """
        if not content.strip():
            return []

        sentences = list(self.nlp(content).sents)
        chunks = []
        current_chunk = []
        current_length = 0

        for sent in sentences:
            sent_text = sent.text.strip()
            sent_length = len(sent_text)

            if current_length + sent_length > self.chunk_size:
                # Flush the accumulated sentences as a chunk, then start a
                # fresh chunk with the sentence that overflowed.
                if current_chunk:
                    chunks.append(self._build_chunk_doc(
                        " ".join(current_chunk), page_number, len(chunks) + 1
                    ))
                current_chunk = [sent_text]
                current_length = sent_length
            else:
                current_chunk.append(sent_text)
                current_length += sent_length

        # Handle final chunk
        if current_chunk:
            chunks.append(self._build_chunk_doc(
                " ".join(current_chunk), page_number, len(chunks) + 1
            ))

        self.page_stats.append(f"Created {len(chunks)} chunks for page {page_number}")
        return chunks

    def hierarchical_process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document with hierarchical chunking strategy.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        self.page_stats = []  # Reset stats

        # Page-level documents come from the inherited PageChunker.
        page_docs = super().page_process_document(file_path, preprocess)

        # Derive chunk-level documents from each page.
        chunk_docs = []
        total_chunks = 0

        for page_doc in page_docs:
            page_num = page_doc.metadata["page"]

            # Mark the parent as a page-level document.
            page_doc.metadata["level"] = "page"

            page_chunks = self._create_semantic_chunks(
                page_doc.page_content,
                page_num
            )

            chunk_docs.extend(page_chunks)
            total_chunks += len(page_chunks)

        # Log summary information
        logger.info(f"\nHierarchical Processing Summary:")
        logger.info(f"Total Pages: {len(page_docs)}")
        logger.info(f"Total Chunks: {total_chunks}")
        logger.info("\n".join(self.page_stats))

        return {
            "pages": page_docs,
            "chunks": chunk_docs
        }

    def process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document using hierarchical chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        return self.hierarchical_process_document(file_path, preprocess)
|
src/core/OCREnhancedPDFLoader.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pytesseract
|
| 3 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
| 4 |
+
from langchain_core.documents import Document
|
| 5 |
+
from pdf2image import convert_from_path
|
| 6 |
+
|
| 7 |
+
class OCREnhancedPDFLoader:
    """Loads PDFs with OCR support for text extraction"""

    # Pages whose condensed text is shorter than this are treated as blank.
    BLANK_THRESHOLD = 10

    def __init__(self, file_path: str, tesseract_path: str = None):
        """Validate inputs and optionally pin the tesseract executable."""
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF file not found at path: {file_path}")

        self.file_path = file_path
        self.skipped_pages = []

        # Override the tesseract command only when an explicit path is
        # supplied; otherwise rely on the executable being on PATH.
        if tesseract_path:
            if not os.path.isfile(tesseract_path):
                raise ValueError(f"Tesseract executable not found at path: {tesseract_path}")
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def _is_blank_page(self, text: str) -> bool:
        """Return True when the page carries no meaningful text."""
        if not text or not text.strip():
            return True
        condensed = text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
        return len(condensed) < self.BLANK_THRESHOLD

    def _process_page(self, doc, img, page_number: int):
        """Build an enriched Document for one page, or None if it is blank."""
        native_text = doc.page_content

        page_text = native_text
        ocr_used = False
        # Fall back to OCR only when the extracted text is too thin to be
        # a real page of content.
        if len(native_text.strip()) <= self.BLANK_THRESHOLD * 5:
            try:
                page_text = pytesseract.image_to_string(img)
                ocr_used = True
            except Exception as e:
                print(f"Error applying OCR to page {page_number}: {e}")
                page_text = native_text
                ocr_used = False

        if self._is_blank_page(page_text):
            self.skipped_pages.append(page_number)
            return None

        return Document(
            page_content=page_text,
            metadata={
                **doc.metadata,
                "source": "ocr" if ocr_used else "text_extraction",
                "page": page_number,
                "is_blank": "false",
                "has_ocr": str(ocr_used)
            }
        )

    def load(self):
        """Load the PDF, OCR-ing thin pages and dropping blank ones."""
        try:
            # Standard text extraction first.
            base_docs = PyMuPDFLoader(self.file_path).load()

            # Rasterize pages for the OCR fallback (Linux needs poppler-utils).
            page_images = convert_from_path(self.file_path, dpi=300)

            results = []
            for position, (doc, img) in enumerate(zip(base_docs, page_images)):
                enhanced = self._process_page(doc, img, position + 1)
                if enhanced:
                    results.append(enhanced)

            if self.skipped_pages:
                print(f"Skipped blank pages: {self.skipped_pages}")

            return results

        except Exception as e:
            print(f"Error in OCR-enhanced loading: {e}")
            raise
|
src/core/PageChunker.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PageChunker.py
|
| 3 |
+
|
| 4 |
+
A module for page-level document chunking with token counting and preprocessing.
|
| 5 |
+
|
| 6 |
+
Features:
|
| 7 |
+
- Page-based document splitting
|
| 8 |
+
- Content validation
|
| 9 |
+
- Blank page detection
|
| 10 |
+
- Document metadata enrichment
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from typing import List, Optional
|
| 14 |
+
import logging
|
| 15 |
+
from langchain_core.documents import Document
|
| 16 |
+
from core.BaseChunker import BaseChunker
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
class PageChunker(BaseChunker):
    """Handles document chunking at the page level."""

    def __init__(self, model_name=None, embedding_model=None):
        """
        Initialize page chunker with specified models.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        super().__init__(model_name, embedding_model)
        # Human-readable notes about blank/skipped pages, reset per document.
        self.page_stats = []

    def _is_blank_page(self, text: str) -> bool:
        """Check if page is blank or contains only whitespace/special characters."""
        condensed = text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
        return len(condensed) < self.BLANK_THRESHOLD

    def _process_single_page(self, content: str, page_number: int, preprocess: bool) -> Optional[Document]:
        """
        Turn one page into a Document, or drop it when blank.

        Args:
            content: The page content
            page_number: The page number
            preprocess: Whether to preprocess the text

        Returns:
            Document with content and analysis metadata, or None for a blank page
        """
        if self._is_blank_page(content):
            self.page_stats.append(f"Page {page_number} is blank.")
            return None

        if preprocess:
            content = self.preprocess_text(content)

        # Attach text-analysis statistics as metadata.
        stats = self.analyze_text(content)
        return Document(
            page_content=content,
            metadata={
                "page": page_number,
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                "has_ocr": str(stats.get("has_content", True)),
                "is_blank": "false"
            }
        )

    def page_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process PDF document page by page with analysis and optional preprocessing.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page
        """
        try:
            self.page_stats = []  # Reset stats for this document
            pages = self.load_document(file_path)
            kept = []

            logger.info(f"Processing document with {len(pages)} pages")

            for index, page in enumerate(pages):
                document = self._process_single_page(page.page_content, index + 1, preprocess)
                if document:
                    kept.append(document)

            # Surface skipped pages for transparency.
            if self.page_stats:
                logger.info("\n".join(self.page_stats))

            logger.info(f"Processed {len(kept)} non-blank pages")
            return kept

        except Exception as e:
            logger.error(f"Error in page_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Entry point required by BaseChunker: chunk a PDF by page.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page
        """
        return self.page_process_document(file_path, preprocess)
|
src/core/ParagraphChunker.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ParagraphChunker.py
|
| 3 |
+
|
| 4 |
+
A module for paragraph-level document chunking with token counting and preprocessing.
|
| 5 |
+
|
| 6 |
+
Features:
|
| 7 |
+
- Paragraph-based document splitting
|
| 8 |
+
- Content validation
|
| 9 |
+
- Multi-level delimiter detection
|
| 10 |
+
- Smart paragraph boundary detection
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import spacy
|
| 15 |
+
from typing import List, Optional
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from datetime import datetime
|
| 18 |
+
from langchain_core.documents import Document
|
| 19 |
+
from core.BaseChunker import BaseChunker
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
class ParagraphChunker(BaseChunker):
|
| 24 |
+
"""Handles document chunking at the paragraph level with token counting."""
|
| 25 |
+
|
| 26 |
+
PARAGRAPH_MIN_LENGTH = 50 # Minimum characters for a valid paragraph
|
| 27 |
+
|
| 28 |
+
def __init__(self, model_name=None, embedding_model=None):
|
| 29 |
+
"""
|
| 30 |
+
Initialize paragraph chunker with specified models.
|
| 31 |
+
|
| 32 |
+
Args:
|
| 33 |
+
model_name: Name of the model for tokenization
|
| 34 |
+
embedding_model: Model for generating embeddings
|
| 35 |
+
"""
|
| 36 |
+
super().__init__(model_name, embedding_model)
|
| 37 |
+
self.page_stats = []
|
| 38 |
+
|
| 39 |
+
# Initialize spaCy for NLP tasks
|
| 40 |
+
try:
|
| 41 |
+
self.nlp = spacy.load("en_core_web_sm")
|
| 42 |
+
except Exception as e:
|
| 43 |
+
logger.error(f"Error loading spaCy model: {e}")
|
| 44 |
+
import subprocess
|
| 45 |
+
logger.info("Installing spaCy model...")
|
| 46 |
+
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"],
|
| 47 |
+
capture_output=True)
|
| 48 |
+
self.nlp = spacy.load("en_core_web_sm")
|
| 49 |
+
|
| 50 |
+
def _split_into_paragraphs(self, text: str) -> List[str]:
|
| 51 |
+
"""
|
| 52 |
+
Split text into paragraphs using length and punctuation heuristics.
|
| 53 |
+
|
| 54 |
+
Args:
|
| 55 |
+
text: The text content to split
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
List of paragraphs
|
| 59 |
+
"""
|
| 60 |
+
# Pre-clean the text
|
| 61 |
+
text = text.replace('\r', '\n')
|
| 62 |
+
|
| 63 |
+
# First, try double line breaks
|
| 64 |
+
paragraphs = text.split('\n\n')
|
| 65 |
+
|
| 66 |
+
# If that fails (PDF extraction issue), use sentence-based reconstruction
|
| 67 |
+
if len(paragraphs) <= 3:
|
| 68 |
+
print(f"PDF extraction flattened structure. Reconstructing from sentences...")
|
| 69 |
+
|
| 70 |
+
# Use spaCy for sentence detection
|
| 71 |
+
doc = self.nlp(text)
|
| 72 |
+
paragraphs = []
|
| 73 |
+
current_para = []
|
| 74 |
+
current_length = 0
|
| 75 |
+
|
| 76 |
+
for sent in doc.sents:
|
| 77 |
+
sent_text = sent.text.strip()
|
| 78 |
+
if not sent_text:
|
| 79 |
+
continue
|
| 80 |
+
|
| 81 |
+
# Add sentence to current paragraph
|
| 82 |
+
current_para.append(sent_text)
|
| 83 |
+
current_length += len(sent_text)
|
| 84 |
+
|
| 85 |
+
# Check if we should end the current paragraph
|
| 86 |
+
should_end_paragraph = (
|
| 87 |
+
# Paragraph is getting long (300-600 chars is typical)
|
| 88 |
+
current_length > 300 and
|
| 89 |
+
# Current sentence ends with proper punctuation
|
| 90 |
+
sent_text.endswith(('.', '!', '?')) and
|
| 91 |
+
# We have substantial content
|
| 92 |
+
len(current_para) >= 2
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
if should_end_paragraph:
|
| 96 |
+
paragraphs.append(' '.join(current_para))
|
| 97 |
+
current_para = []
|
| 98 |
+
current_length = 0
|
| 99 |
+
|
| 100 |
+
# Add the last paragraph
|
| 101 |
+
if current_para:
|
| 102 |
+
paragraphs.append(' '.join(current_para))
|
| 103 |
+
|
| 104 |
+
print(f"Reconstructed {len(paragraphs)} paragraphs using length heuristics")
|
| 105 |
+
|
| 106 |
+
# Clean and filter paragraphs
|
| 107 |
+
cleaned_paragraphs = []
|
| 108 |
+
for para in paragraphs:
|
| 109 |
+
clean_para = ' '.join(para.split())
|
| 110 |
+
if len(clean_para) >= self.PARAGRAPH_MIN_LENGTH:
|
| 111 |
+
cleaned_paragraphs.append(clean_para)
|
| 112 |
+
|
| 113 |
+
print(f"Final paragraph count: {len(cleaned_paragraphs)}")
|
| 114 |
+
return cleaned_paragraphs
|
| 115 |
+
|
| 116 |
+
def _process_single_paragraph(self, content: str, page_number: int,
|
| 117 |
+
para_number: int, preprocess: bool) -> Optional[Document]:
|
| 118 |
+
"""
|
| 119 |
+
Process a single paragraph with analysis and metadata.
|
| 120 |
+
|
| 121 |
+
Args:
|
| 122 |
+
content: The paragraph content
|
| 123 |
+
page_number: The page number
|
| 124 |
+
para_number: The paragraph number
|
| 125 |
+
preprocess: Whether to preprocess the text
|
| 126 |
+
|
| 127 |
+
Returns:
|
| 128 |
+
Document object with processed content and metadata, or None if paragraph is invalid
|
| 129 |
+
"""
|
| 130 |
+
# First check character length
|
| 131 |
+
if len(content.strip()) < self.PARAGRAPH_MIN_LENGTH:
|
| 132 |
+
self.page_stats.append(f"Paragraph {para_number} on page {page_number} is too short.")
|
| 133 |
+
return None
|
| 134 |
+
|
| 135 |
+
# Optionally preprocess the text
|
| 136 |
+
if preprocess:
|
| 137 |
+
content = self.preprocess_text(content)
|
| 138 |
+
|
| 139 |
+
# Analyze the paragraph and generate metadata
|
| 140 |
+
stats = self.analyze_text(content)
|
| 141 |
+
|
| 142 |
+
# Check token threshold
|
| 143 |
+
if stats["token_count"] < self.TOKEN_THRESHOLD:
|
| 144 |
+
self.page_stats.append(
|
| 145 |
+
f"Paragraph {para_number} on page {page_number} dropped: "
|
| 146 |
+
f"only {stats['token_count']} tokens"
|
| 147 |
+
)
|
| 148 |
+
return None
|
| 149 |
+
|
| 150 |
+
metadata = {
|
| 151 |
+
"page": page_number,
|
| 152 |
+
"paragraph": para_number,
|
| 153 |
+
"char_count": stats["char_count"],
|
| 154 |
+
"token_count": stats["token_count"],
|
| 155 |
+
"sentence_count": stats["sentence_count"],
|
| 156 |
+
"word_count": stats["word_count"],
|
| 157 |
+
"has_ocr": str(stats.get("has_content", True))
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
return Document(page_content=content, metadata=metadata)
|
| 161 |
+
|
| 162 |
+
def paragraph_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
|
| 163 |
+
"""
|
| 164 |
+
Process PDF document paragraph by paragraph with analysis.
|
| 165 |
+
|
| 166 |
+
Args:
|
| 167 |
+
file_path: Path to the PDF file
|
| 168 |
+
preprocess: Whether to preprocess paragraph text
|
| 169 |
+
|
| 170 |
+
Returns:
|
| 171 |
+
List of Document objects, one per valid paragraph
|
| 172 |
+
"""
|
| 173 |
+
try:
|
| 174 |
+
self.page_stats = [] # Reset stats for this document
|
| 175 |
+
raw_pages = self.load_document(file_path)
|
| 176 |
+
processed_paragraphs = []
|
| 177 |
+
|
| 178 |
+
logger.info(f"Processing document with {len(raw_pages)} pages")
|
| 179 |
+
|
| 180 |
+
for page_idx, page in enumerate(raw_pages):
|
| 181 |
+
paragraphs = self._split_into_paragraphs(page.page_content)
|
| 182 |
+
logger.info(f"Page {page_idx+1}: Found {len(paragraphs)} paragraphs")
|
| 183 |
+
|
| 184 |
+
for para_idx, paragraph in enumerate(paragraphs):
|
| 185 |
+
processed_para = self._process_single_paragraph(
|
| 186 |
+
paragraph,
|
| 187 |
+
page_idx + 1,
|
| 188 |
+
para_idx + 1,
|
| 189 |
+
preprocess
|
| 190 |
+
)
|
| 191 |
+
if processed_para:
|
| 192 |
+
processed_paragraphs.append(processed_para)
|
| 193 |
+
|
| 194 |
+
# Output skipped paragraphs for transparency
|
| 195 |
+
if self.page_stats:
|
| 196 |
+
logger.info("\n".join(self.page_stats))
|
| 197 |
+
|
| 198 |
+
logger.info(f"Processed {len(processed_paragraphs)} valid paragraphs")
|
| 199 |
+
return processed_paragraphs
|
| 200 |
+
|
| 201 |
+
except Exception as e:
|
| 202 |
+
logger.error(f"Error in paragraph_process_document: {e}")
|
| 203 |
+
raise
|
| 204 |
+
|
| 205 |
+
def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
|
| 206 |
+
"""
|
| 207 |
+
Process document using paragraph chunking strategy (implements abstract method).
|
| 208 |
+
|
| 209 |
+
Args:
|
| 210 |
+
file_path: Path to the PDF file
|
| 211 |
+
preprocess: Whether to preprocess paragraph text
|
| 212 |
+
|
| 213 |
+
Returns:
|
| 214 |
+
List of Document objects, one per valid paragraph
|
| 215 |
+
"""
|
| 216 |
+
return self.paragraph_process_document(file_path, preprocess)
|
| 217 |
+
|
| 218 |
+
def process_text_file(self, file_path: str, preprocess: bool = False) -> List[Document]:
|
| 219 |
+
"""
|
| 220 |
+
Process text file directly, preserving paragraph structure.
|
| 221 |
+
|
| 222 |
+
Args:
|
| 223 |
+
file_path: Path to the text file
|
| 224 |
+
preprocess: Whether to preprocess paragraph text
|
| 225 |
+
|
| 226 |
+
Returns:
|
| 227 |
+
List of Document objects, one per valid paragraph
|
| 228 |
+
"""
|
| 229 |
+
try:
|
| 230 |
+
# Load the text file directly
|
| 231 |
+
content = self.load_text_file(file_path)
|
| 232 |
+
|
| 233 |
+
# Clean the text using the same logic as PDF conversion
|
| 234 |
+
content = self.clean_text_for_processing(content)
|
| 235 |
+
|
| 236 |
+
# Split into paragraphs using double line breaks
|
| 237 |
+
paragraphs = content.split('\n\n')
|
| 238 |
+
|
| 239 |
+
logger.info(f"Found {len(paragraphs)} paragraphs in text file: {file_path}")
|
| 240 |
+
|
| 241 |
+
processed_paragraphs = []
|
| 242 |
+
file_name = Path(file_path).name
|
| 243 |
+
|
| 244 |
+
for para_idx, paragraph in enumerate(paragraphs):
|
| 245 |
+
paragraph = paragraph.strip()
|
| 246 |
+
if paragraph:
|
| 247 |
+
processed_para = self._process_single_paragraph_from_text(
|
| 248 |
+
paragraph,
|
| 249 |
+
file_path,
|
| 250 |
+
file_name,
|
| 251 |
+
para_idx + 1,
|
| 252 |
+
preprocess
|
| 253 |
+
)
|
| 254 |
+
if processed_para:
|
| 255 |
+
processed_paragraphs.append(processed_para)
|
| 256 |
+
|
| 257 |
+
logger.info(f"Processed {len(processed_paragraphs)} valid paragraphs from text file")
|
| 258 |
+
return processed_paragraphs
|
| 259 |
+
|
| 260 |
+
except Exception as e:
|
| 261 |
+
logger.error(f"Error processing text file: {e}")
|
| 262 |
+
raise
|
| 263 |
+
|
| 264 |
+
def _process_single_paragraph_from_text(self, content: str, file_path: str,
                                        file_name: str, para_number: int,
                                        preprocess: bool) -> Optional[Document]:
    """
    Process a single paragraph from a text file with analysis and metadata.

    Args:
        content: The paragraph content.
        file_path: Full path to the source file.
        file_name: Name of the source file.
        para_number: 1-based paragraph number within the file.
        preprocess: Whether to preprocess the text.

    Returns:
        Document with processed content and metadata, or None if the
        paragraph is too short or fails content validation.
    """
    # First check character length (against the stripped content).
    stripped_length = len(content.strip())
    if stripped_length < self.PARAGRAPH_MIN_LENGTH:
        # BUGFIX: previously logged len(content), which could differ from the
        # stripped length actually compared against the threshold.
        logger.debug(f"Paragraph {para_number} too short ({stripped_length} chars), skipping")
        return None

    # Preprocess if requested. Header/footer removal is skipped because text
    # files were already cleaned upstream.
    if preprocess:
        content = self.preprocess_text(content, remove_headers_footers=False)

    # Validate content quality BEFORE analyzing it, so invalid paragraphs
    # don't pay the analysis cost (analysis has no side effects).
    if not self.is_content_valid(content):
        logger.debug(f"Paragraph {para_number} failed content validation, skipping")
        return None

    # Compute text statistics for the metadata payload.
    analysis = self.analyze_text(content)

    metadata = {
        "source": file_path,
        "file_name": file_name,
        "file_type": "txt",
        "paragraph": para_number,
        "char_count": analysis["char_count"],
        "token_count": analysis["token_count"],
        "sentence_count": analysis["sentence_count"],
        "word_count": analysis["word_count"],
        "chunk_type": "paragraph",
        "processing_timestamp": datetime.now().isoformat(),
    }

    doc = Document(page_content=content, metadata=metadata)
    logger.debug(f"Created paragraph {para_number}: {analysis['char_count']} chars, {analysis['token_count']} tokens")

    return doc
src/core/SemanticChunker.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SemanticChunker.py
|
| 3 |
+
A module for semantic-aware text chunking using embeddings and similarity metrics.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
from typing import List, Optional, Any
|
| 8 |
+
import numpy as np
|
| 9 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 10 |
+
from langchain_core.documents import Document
|
| 11 |
+
# FIXED IMPORT: Updated for LangChain v0.2+
|
| 12 |
+
from langchain_text_splitters import SpacyTextSplitter
|
| 13 |
+
from sentence_transformers import SentenceTransformer
|
| 14 |
+
from core.BaseChunker import BaseChunker
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
class SemanticChunker(BaseChunker):
    """Chunks text based on semantic similarity and size constraints."""

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        similarity_threshold: float = 0.9,
        separator: str = " "
    ):
        """
        Initialize the semantic chunker with configurable parameters.

        Args:
            model_name: Optional model name forwarded to BaseChunker.
            embedding_model: Optional pre-built embedding model. If omitted
                (or detected to be a zero-vector dummy), a SentenceTransformer
                is loaded instead.
            chunk_size: Maximum characters allowed in a final chunk (> 0).
            chunk_overlap: Overlap passed to the initial text splitter.
            similarity_threshold: Cosine similarity in [0, 1] required to
                merge adjacent base chunks into one semantic chunk.
            separator: Separator passed to the initial text splitter.

        Raises:
            ValueError: If chunk_size or similarity_threshold is out of range.
        """
        # Validate parameters
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer.")
        if not (0 <= similarity_threshold <= 1):
            raise ValueError("similarity_threshold must be between 0 and 1.")

        # Initialize BaseChunker first
        super().__init__(model_name, embedding_model)

        # Set semantic chunking parameters
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold
        self.separator = separator

        # Detect a "dummy" embedding model by its probe output: a 384-long
        # list of all zeros (the placeholder shape used elsewhere in this
        # codebase's fallback embedder).
        is_dummy = False
        if embedding_model is not None:
            try:
                test_output = embedding_model.encode("test")
                if isinstance(test_output, list) and len(test_output) == 384 and all(x == 0.0 for x in test_output):
                    is_dummy = True
            except Exception:
                # BUGFIX: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt. A failed probe simply means
                # the provided model is used as-is.
                pass

        if embedding_model is None or is_dummy:
            try:
                self.sentence_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
                self.embedding_model = self.sentence_model
                logger.info("Initialized SentenceTransformer for semantic chunking")
            except Exception as e:
                logger.error(f"Error loading SentenceTransformer: {e}")

                # Last-resort fallback so the chunker still functions
                # (with degenerate similarity) when the model cannot be
                # downloaded or loaded.
                class DummyEmbedder:
                    def encode(self, text, **kwargs):
                        return [0.0] * 384
                self.sentence_model = DummyEmbedder()
                self.embedding_model = self.sentence_model
        else:
            self.sentence_model = embedding_model
            logger.info("Using provided embedding model for semantic chunking")

        # Initialize text splitter for the initial (pre-semantic) chunking.
        self.text_splitter = SpacyTextSplitter(
            chunk_size=self.chunk_size - self.chunk_overlap,
            chunk_overlap=self.chunk_overlap,
            separator=self.separator
        )

    def _enforce_size_immediately(self, text: str) -> List[str]:
        """
        Greedily split text into word-preserving pieces of at most
        chunk_size characters (counting the joining spaces).
        """
        if not text.strip():
            return []

        chunks = []
        current_chunk = []
        words = text.split()

        for word in words:
            # len(current_chunk) approximates the number of joining spaces.
            if sum(len(w) for w in current_chunk) + len(word) + len(current_chunk) <= self.chunk_size:
                current_chunk.append(word)
            else:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [word]

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def get_semantic_chunks(self, documents: List[Document]) -> List[Document]:
        """
        Group base chunks into semantic chunks.

        Adjacent base chunks are merged while their cosine similarity to the
        group's FIRST member stays above the threshold and the combined text
        fits within chunk_size.

        Returns:
            The semantic chunks; the input documents unchanged on error.
        """
        if not documents:
            logger.warning("No documents provided for semantic chunking")
            return []

        try:
            base_chunks = self.text_splitter.split_documents(documents)
            logger.info(f"Initial splitting created {len(base_chunks)} base chunks")

            if not base_chunks:
                return []

            chunk_contents = [doc.page_content for doc in base_chunks]
            chunk_embeddings = self.sentence_model.encode(chunk_contents)

            grouped_chunks = []
            current_group = []
            # Embedding of the group's first member; candidates are compared
            # against it rather than a running centroid.
            current_embedding = None

            for i, base_chunk in enumerate(base_chunks):
                if not current_group:
                    current_group.append(base_chunk)
                    current_embedding = chunk_embeddings[i].reshape(1, -1)
                    continue

                similarity = cosine_similarity(current_embedding, chunk_embeddings[i].reshape(1, -1))[0][0]
                combined_content = " ".join([doc.page_content for doc in current_group] + [base_chunk.page_content])

                if similarity >= self.similarity_threshold and len(combined_content) <= self.chunk_size:
                    current_group.append(base_chunk)
                else:
                    grouped_chunks.extend(self._finalize_chunk_group(current_group))
                    current_group = [base_chunk]
                    current_embedding = chunk_embeddings[i].reshape(1, -1)

            # Flush the trailing group.
            if current_group:
                grouped_chunks.extend(self._finalize_chunk_group(current_group))

            logger.info(f"Created {len(grouped_chunks)} semantic chunks")
            return grouped_chunks

        except Exception as e:
            logger.error(f"Error in semantic chunking: {e}")
            return documents

    def _finalize_chunk_group(self, group: List[Document]) -> List[Document]:
        """
        Merge a group of documents, re-split to honor chunk_size, and attach
        per-chunk statistics to copies of the first document's metadata.
        """
        if not group:
            return []

        processed_chunks = []
        content = " ".join([doc.page_content for doc in group])
        size_limited_chunks = self._enforce_size_immediately(content)

        base_metadata = group[0].metadata.copy()

        for i, chunk in enumerate(size_limited_chunks):
            stats = self.analyze_text(chunk)
            metadata = base_metadata.copy()
            metadata.update({
                "chunk_index": i + 1,
                "chunk_count": len(size_limited_chunks),
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                "chunk_type": "semantic"
            })

            processed_chunks.append(Document(page_content=chunk, metadata=metadata))

        return processed_chunks

    def semantic_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Load a document, optionally preprocess it, and return semantic chunks.

        Raises:
            Exception: Propagates any loading/chunking error after logging it.
        """
        try:
            logger.info(f"Processing document with semantic chunking: {file_path}")

            raw_documents = self.load_document(file_path)

            processed_documents = []
            for doc in raw_documents:
                content = doc.page_content
                if preprocess:
                    content = self.preprocess_text(content)
                processed_documents.append(Document(
                    page_content=content,
                    metadata=doc.metadata
                ))

            documents = self.get_semantic_chunks(processed_documents)
            logger.info(f"Created {len(documents)} semantic chunks")

            return documents

        except Exception as e:
            logger.error(f"Error in semantic_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """Standard chunker entry point; delegates to semantic_process_document."""
        return self.semantic_process_document(file_path, preprocess)
src/core/TextPreprocessor.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from nltk.corpus import stopwords
|
| 3 |
+
from nltk.stem import WordNetLemmatizer
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
class TextPreprocessor:
    """
    Text normalization pipeline: case folding, punctuation/whitespace
    cleanup, stopword removal and lemmatization, plus heuristics for
    stripping headers/footers and PDF artifacts.
    """

    def __init__(self):
        """
        Initialize NLTK-backed resources (English stopwords, WordNet lemmatizer).

        Raises:
            Exception: Propagated if the NLTK corpora are unavailable.
        """
        # BUGFIX: create the logger BEFORE the fallible NLTK setup.
        # Previously self.logger was assigned last, so the except handler
        # raised AttributeError instead of logging the real failure.
        self.logger = logging.getLogger(__name__)
        try:
            self.stopwords = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except Exception as e:
            self.logger.error(f"Failed to initialize NLTK resources: {e}")
            raise

    def standardize_case(self, text):
        """Lower-case the entire text."""
        return text.lower()

    def remove_punctuation(self, text):
        """Strip every character that is not a word character or whitespace."""
        return re.sub(r'[^\w\s]', '', text)

    def normalize_whitespace(self, text):
        """Collapse whitespace runs to single spaces and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def remove_stopwords(self, words):
        """Drop English stopwords from a list of words."""
        return [word for word in words if word not in self.stopwords]

    def lemmatize_words(self, words):
        """Lemmatize each word with WordNet."""
        return [self.lemmatizer.lemmatize(word) for word in words]

    def remove_headers_and_footers(self, text, aggressive=False, pattern=None):
        """
        Remove likely header/footer lines from text.

        The strategy depends on the document shape: slide-like content gets
        pattern-based cleanup only; `aggressive` trims two lines from each
        end; otherwise only pattern matches are removed. If more than 30% of
        the lines would be lost, the original lines are kept.

        Args:
            text: Input text (may be empty).
            aggressive: Trim fixed line counts from the top and bottom.
            pattern: Optional custom regex identifying header/footer lines.

        Returns:
            The cleaned text; the original text on error or for short input.
        """
        try:
            if not text or not text.strip():
                return text

            lines = text.splitlines()
            if len(lines) <= 4:  # For very short text, don't remove anything
                return text

            # Store original lines for fallback
            original_lines = lines.copy()

            # Use different strategies based on document characteristics
            if self._appears_to_be_slide(lines):
                # Slide-friendly approach - only remove obvious headers/footers
                cleaned_lines = self._clean_slide_headers_footers(lines, pattern)
            elif aggressive:
                # Traditional document approach - remove first/last few lines
                num_lines = 2
                cleaned_lines = lines[num_lines:-num_lines]
            else:
                # Conservative approach - only remove based on patterns
                cleaned_lines = self._pattern_based_removal(lines, pattern)

            # If we removed too much (over 30% of content), revert to original
            if len(cleaned_lines) < len(lines) * 0.7:
                self.logger.warning("Header/footer removal eliminated too much content, reverting")
                cleaned_lines = original_lines

            # Additional heuristic: Remove single-word lines that might be page numbers
            cleaned_lines = [line for line in cleaned_lines
                             if not (len(line.strip().split()) == 1 and
                                     line.strip().isdigit())]

            # Join lines back into text
            return '\n'.join(cleaned_lines)

        except Exception as e:
            self.logger.error(f"Error removing headers/footers: {e}")
            return text  # Return original text on error

    def _appears_to_be_slide(self, lines):
        """Detect if the content appears to be from a slide/presentation."""
        # Characteristics of slides:
        # - Shorter overall text
        # - Fewer lines
        # - More bullet points
        # - Title followed by bullet points

        if len(lines) < 15:  # Short content
            return True

        # Check for bullet point patterns
        bullet_pattern = r'^\s*[•\-\*\>\◦\○\◆\◇\▪\▫\⚫\⚪\✓\✔\✕\✖\✗\✘]'
        bullet_lines = sum(1 for line in lines if re.match(bullet_pattern, line))

        # If more than 20% of lines are bullets, likely a slide
        if bullet_lines > len(lines) * 0.2:
            return True

        # If first non-empty line is short (likely a title) and followed by bullet points
        non_empty_lines = [line for line in lines if line.strip()]
        if non_empty_lines and len(non_empty_lines[0].strip()) < 60:
            # Check for bullet points in the following lines
            for line in non_empty_lines[1:4]:  # Check next few lines
                if re.match(bullet_pattern, line):
                    return True

        return False

    def _clean_slide_headers_footers(self, lines, pattern=None):
        """Clean headers/footers from slide-based content."""
        cleaned_lines = lines.copy()

        # For slides, we primarily rely on pattern matching rather than line position
        if pattern:
            cleaned_lines = [line for line in cleaned_lines
                             if not re.search(pattern, line)]

        # Common slide footer patterns to remove
        footer_patterns = [
            r'^\s*\d+\s*$',          # Standalone page number
            r'confidential',          # Confidentiality notices
            r'all rights reserved',
            r'proprietary',
            r'^\s*www\.',             # Website in footer
            r'^\s*https?://',         # URL in footer
            r'\bpage\s+\d+\b',        # "Page X" footer
            r'^\s*[©Ⓒ]\s*\d{4}'      # Copyright notice
        ]

        # Combine all patterns
        combined_pattern = '|'.join(f'({p})' for p in footer_patterns)

        # Filter out footer lines
        if combined_pattern:
            cleaned_lines = [line for line in cleaned_lines
                             if not re.search(combined_pattern, line, re.IGNORECASE)]

        return cleaned_lines

    def _pattern_based_removal(self, lines, pattern=None):
        """Remove headers/footers based only on patterns, not position."""
        if not pattern:
            # Default patterns for headers/footers
            patterns = [
                r'^\s*\d+\s*$',                        # Standalone page numbers
                r'^\s*page\s+\d+\s+of\s+\d+\s*$',      # Page X of Y
                r'^\s*[©Ⓒ]\s*\d{4}.*$',               # Copyright lines
                r'^\s*confidential\s*$',               # Confidentiality markers
                r'^\s*https?://.*$',                   # URLs alone on a line
                r'^\s*www\..*$',                       # Website alone on a line
                r'^\s*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\s*$'  # Email addresses
            ]
            combined_pattern = '|'.join(f'({p})' for p in patterns)
        else:
            combined_pattern = pattern

        return [line for line in lines
                if not re.search(combined_pattern, line, re.IGNORECASE)]

    def remove_common_pdf_artifacts(self, text):
        """Strip form-field markers, PDF annotations and object references."""
        try:
            # Remove form field indicators
            text = re.sub(r'\[\s*\]\s*|\[\s*X\s*\]|\(\s*\)\s*|\(\s*X\s*\)', '', text)

            # Remove common PDF annotations
            text = re.sub(r'<<[^>]*>>', '', text)

            # Remove artifact markers often found in PDFs
            text = re.sub(r'obj\s*\d+\s*\d+\s*R', '', text)

            return text

        except Exception as e:
            self.logger.error(f"Error removing PDF artifacts: {e}")
            return text

    def preprocess(self, text, remove_headers_footers=True, aggressive_removal=False):
        """
        Run the full normalization pipeline.

        Order: header/footer removal (optional) -> PDF artifact removal ->
        lower-casing -> punctuation removal -> whitespace normalization ->
        stopword removal -> lemmatization.

        Returns:
            Space-joined normalized words.

        Raises:
            Exception: Propagated after logging on any failure.
        """
        try:
            if remove_headers_footers:
                text = self.remove_headers_and_footers(text, aggressive=aggressive_removal)

            text = self.remove_common_pdf_artifacts(text)

            text = self.standardize_case(text)
            text = self.remove_punctuation(text)
            text = self.normalize_whitespace(text)

            words = text.split()
            words = self.remove_stopwords(words)
            words = self.lemmatize_words(words)

            return ' '.join(words)
        except Exception as e:
            self.logger.error(f"Error preprocessing text: {e}")
            raise
src/core/TokenChunker.py
ADDED
|
@@ -0,0 +1,458 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
TokenChunker.py
|
| 3 |
+
|
| 4 |
+
A module for token-based document chunking with configurable overlap and preprocessing.
|
| 5 |
+
|
| 6 |
+
Features:
|
| 7 |
+
- Token-based document splitting with overlap
|
| 8 |
+
- Content validation and token counting
|
| 9 |
+
- Smart boundary detection to preserve word integrity
|
| 10 |
+
- Compatible with multiple tokenizer types (tiktoken, transformers, basic)
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import re
|
| 15 |
+
from typing import List, Optional, Dict, Any
|
| 16 |
+
from langchain_core.documents import Document
|
| 17 |
+
from core.BaseChunker import BaseChunker
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
class TokenChunker(BaseChunker):
|
| 22 |
+
"""Handles document chunking at the token level with configurable overlap."""
|
| 23 |
+
|
| 24 |
+
def __init__(
    self,
    model_name=None,
    embedding_model=None,
    chunk_size: int = 256,
    chunk_overlap: int = 50,
    min_chunk_size: int = 50
):
    """
    Initialize token chunker with specified models and parameters.

    Args:
        model_name: Name of the model for tokenization.
        embedding_model: Model for generating embeddings.
        chunk_size: Maximum tokens per chunk (must be positive).
        chunk_overlap: Number of tokens to overlap between chunks
            (must be non-negative and less than chunk_size).
        min_chunk_size: Minimum tokens for a valid chunk (must be positive).

    Raises:
        ValueError: If any chunking parameter is out of range.
    """
    # Validate chunking parameters BEFORE base-class initialization so bad
    # arguments fail fast without loading any models.
    # IMPROVEMENT: previously chunk_size <= 0 and negative chunk_overlap
    # were silently accepted.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must be non-negative")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be less than chunk_size")
    if min_chunk_size <= 0:
        raise ValueError("min_chunk_size must be positive")

    super().__init__(model_name, embedding_model)

    self.chunk_size = chunk_size
    self.chunk_overlap = chunk_overlap
    self.min_chunk_size = min_chunk_size
    # Human-readable log of per-chunk decisions (created/skipped/failed).
    self.chunk_stats = []

    logger.info(f"TokenChunker initialized: chunk_size={chunk_size}, overlap={chunk_overlap}, min_size={min_chunk_size}")
| 57 |
+
def _smart_tokenize(self, text: str) -> List[str]:
    """
    Tokenize text while keeping word boundaries recoverable.

    Dispatches to the strategy matching the configured tokenizer (tiktoken,
    transformers-style, or basic) and falls back to plain word-boundary
    splitting if tokenization raises.

    Args:
        text: The text content to tokenize.

    Returns:
        List of tokens that can be cleanly rejoined; empty for blank input.
    """
    if not text.strip():
        return []

    try:
        if self.uses_tiktoken:
            # tiktoken needs a hybrid approach to preserve boundaries.
            return self._tiktoken_boundary_aware_split(text)
        if hasattr(self.tokenizer, 'tokenize'):
            # transformers tokenizer: clean up subword markers afterwards.
            return self._clean_subword_tokens(self.tokenizer.tokenize(text))
        # Basic tokenizer: intelligent word splitting.
        return self._word_boundary_split(text)
    except Exception as e:
        logger.warning(f"Tokenization failed: {e}. Using word boundary fallback.")
        return self._word_boundary_split(text)
| 88 |
+
def _tiktoken_boundary_aware_split(self, text: str) -> List[str]:
    """
    Approximate tiktoken tokens while keeping natural word boundaries.

    The real token count is used only to decide how granular the split must
    be; the returned segments are whitespace/punctuation pieces, not actual
    BPE tokens.

    Args:
        text: Input text.

    Returns:
        List of non-blank text segments approximating tokens.
    """
    expected_tokens = self.count_tokens(text)

    # Coarse pass: alternate runs of non-space and space characters.
    coarse_segments = re.findall(r'\S+|\s+', text)

    # If the coarse segment count is within 30% of the true token count,
    # it is a good enough approximation.
    deviation = abs(len(coarse_segments) - expected_tokens) / max(expected_tokens, 1)
    if deviation < 0.3:
        return [segment for segment in coarse_segments if segment.strip()]

    # Fine pass: separate words, punctuation and whitespace.
    fine_segments = re.findall(r'\w+|[^\w\s]|\s+', text)
    return [segment for segment in fine_segments if segment.strip()]
| 112 |
+
def _clean_subword_tokens(self, tokens: List[str]) -> List[str]:
|
| 113 |
+
"""
|
| 114 |
+
Clean subword tokens for better reconstruction.
|
| 115 |
+
|
| 116 |
+
Args:
|
| 117 |
+
tokens: Raw tokens from tokenizer
|
| 118 |
+
|
| 119 |
+
Returns:
|
| 120 |
+
Cleaned tokens
|
| 121 |
+
"""
|
| 122 |
+
cleaned = []
|
| 123 |
+
for token in tokens:
|
| 124 |
+
# Remove special tokens but keep the content
|
| 125 |
+
if token.startswith('##'):
|
| 126 |
+
# BERT-style subwords
|
| 127 |
+
cleaned.append(token[2:])
|
| 128 |
+
elif token.startswith('▁'):
|
| 129 |
+
# SentencePiece-style
|
| 130 |
+
cleaned.append(' ' + token[1:])
|
| 131 |
+
else:
|
| 132 |
+
cleaned.append(token)
|
| 133 |
+
return [t for t in cleaned if t.strip()]
|
| 134 |
+
|
| 135 |
+
def _word_boundary_split(self, text: str) -> List[str]:
|
| 136 |
+
"""
|
| 137 |
+
Split text on word boundaries as fallback tokenization.
|
| 138 |
+
|
| 139 |
+
Args:
|
| 140 |
+
text: Input text
|
| 141 |
+
|
| 142 |
+
Returns:
|
| 143 |
+
List of words
|
| 144 |
+
"""
|
| 145 |
+
# Split on whitespace but preserve some punctuation as separate tokens
|
| 146 |
+
tokens = re.findall(r'\w+|[.!?;,]', text)
|
| 147 |
+
return tokens
|
| 148 |
+
|
| 149 |
+
def _detokenize(self, tokens: List[str]) -> str:
|
| 150 |
+
"""
|
| 151 |
+
Reconstruct text from tokens, handling different tokenizer types.
|
| 152 |
+
|
| 153 |
+
Args:
|
| 154 |
+
tokens: List of token strings
|
| 155 |
+
|
| 156 |
+
Returns:
|
| 157 |
+
Reconstructed text
|
| 158 |
+
"""
|
| 159 |
+
if not tokens:
|
| 160 |
+
return ""
|
| 161 |
+
|
| 162 |
+
if self.uses_tiktoken or not hasattr(self.tokenizer, 'tokenize'):
|
| 163 |
+
# For tiktoken and basic tokenizers, use space joining with smart spacing
|
| 164 |
+
result = ""
|
| 165 |
+
for i, token in enumerate(tokens):
|
| 166 |
+
if not token.strip():
|
| 167 |
+
continue
|
| 168 |
+
|
| 169 |
+
if i == 0:
|
| 170 |
+
result = token
|
| 171 |
+
elif token in '.,!?;:':
|
| 172 |
+
result += token
|
| 173 |
+
elif result and result[-1] in '.,!?;:':
|
| 174 |
+
result += " " + token
|
| 175 |
+
else:
|
| 176 |
+
result += " " + token
|
| 177 |
+
return result
|
| 178 |
+
|
| 179 |
+
else:
|
| 180 |
+
# For transformers tokenizers, handle subword reconstruction
|
| 181 |
+
text = "".join(tokens)
|
| 182 |
+
# Clean up spacing around punctuation
|
| 183 |
+
text = re.sub(r'\s+([.!?;,])', r'\1', text)
|
| 184 |
+
text = re.sub(r'\s+', ' ', text)
|
| 185 |
+
return text.strip()
|
| 186 |
+
|
| 187 |
+
def _create_token_chunks(self, tokens: List[str]) -> List[List[str]]:
|
| 188 |
+
"""
|
| 189 |
+
Split tokens into overlapping chunks of specified size.
|
| 190 |
+
|
| 191 |
+
Args:
|
| 192 |
+
tokens: List of token strings
|
| 193 |
+
|
| 194 |
+
Returns:
|
| 195 |
+
List of token chunks
|
| 196 |
+
"""
|
| 197 |
+
if not tokens:
|
| 198 |
+
return []
|
| 199 |
+
|
| 200 |
+
chunks = []
|
| 201 |
+
start = 0
|
| 202 |
+
|
| 203 |
+
while start < len(tokens):
|
| 204 |
+
# Calculate end position for this chunk
|
| 205 |
+
end = min(start + self.chunk_size, len(tokens))
|
| 206 |
+
|
| 207 |
+
# Extract the chunk
|
| 208 |
+
chunk_tokens = tokens[start:end]
|
| 209 |
+
|
| 210 |
+
# Only add chunks that meet minimum size requirement
|
| 211 |
+
if len(chunk_tokens) >= self.min_chunk_size:
|
| 212 |
+
chunks.append(chunk_tokens)
|
| 213 |
+
self.chunk_stats.append(f"Created chunk with {len(chunk_tokens)} tokens")
|
| 214 |
+
else:
|
| 215 |
+
self.chunk_stats.append(f"Skipped small chunk with {len(chunk_tokens)} tokens")
|
| 216 |
+
|
| 217 |
+
# Break if we've reached the end
|
| 218 |
+
if end >= len(tokens):
|
| 219 |
+
break
|
| 220 |
+
|
| 221 |
+
# Calculate next start position with overlap
|
| 222 |
+
start = end - self.chunk_overlap
|
| 223 |
+
|
| 224 |
+
# Ensure forward progress
|
| 225 |
+
if start <= 0:
|
| 226 |
+
start = end
|
| 227 |
+
|
| 228 |
+
return chunks
|
| 229 |
+
|
| 230 |
+
def _process_single_chunk(self, chunk_tokens: List[str], chunk_index: int,
                          source_metadata: Dict[str, Any]) -> Optional[Document]:
    """
    Build a Document from a single token chunk.

    Args:
        chunk_tokens: List of tokens for this chunk
        chunk_index: Index of this chunk in the document
        source_metadata: Metadata from source document

    Returns:
        Document object with processed content and metadata, or None if invalid
    """
    # Turn the token sequence back into readable text
    chunk_text = self._detokenize(chunk_tokens)

    # Reject chunks whose reconstructed text fails validation
    if not self.is_content_valid(chunk_text, min_tokens=self.min_chunk_size):
        self.chunk_stats.append(f"Chunk {chunk_index} failed validation")
        return None

    # Gather per-chunk statistics for the metadata
    stats = self.analyze_text(chunk_text)

    # Merge source metadata with chunk-specific fields in one expression
    metadata = {
        **source_metadata,
        "chunk_index": chunk_index,
        "chunk_type": "token",
        "chunking_method": "token_based",
        "token_count": len(chunk_tokens),
        "char_count": stats["char_count"],
        "sentence_count": stats["sentence_count"],
        "word_count": stats["word_count"],
        "chunk_size_limit": self.chunk_size,
        "chunk_overlap": self.chunk_overlap,
    }

    return Document(page_content=chunk_text, metadata=metadata)
|
| 269 |
+
|
| 270 |
+
def token_process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
    """
    Process document using token-based chunking with overlap.

    Loads every page, concatenates the valid page texts into one string
    (pages separated by blank lines), tokenizes the whole document, slices
    the token stream into overlapping chunks, and wraps each surviving
    chunk in a Document carrying combined page metadata.

    Args:
        file_path: Path to the document file
        preprocess: Whether to preprocess text content

    Returns:
        List of Document objects, one per valid token chunk

    Raises:
        Exception: any error from loading/tokenizing is logged, then re-raised.
    """
    try:
        self.chunk_stats = []  # Reset stats for this document
        raw_pages = self.load_document(file_path)
        processed_chunks = []

        logger.info(f"Processing document with {len(raw_pages)} pages using token chunking")

        # Combine all pages into a single text for token-based processing
        full_text = ""
        combined_metadata = {}
        page_info = []  # Track which pages contributed to the text

        for page_idx, page in enumerate(raw_pages):
            content = page.page_content

            # Skip invalid content
            if not self.is_content_valid(content):
                logger.debug(f"Skipping invalid content on page {page_idx + 1}")
                continue

            # Preprocess if requested; drop the page if cleaning empties it
            if preprocess:
                content = self.preprocess_text(content)
                if not self.is_content_valid(content):
                    continue

            # Track page information (1-based page numbers)
            page_info.append({
                "page_number": page_idx + 1,
                "original_metadata": page.metadata
            })

            # Combine text with page separation
            if full_text:
                full_text += "\n\n" + content
            else:
                full_text = content
                # Use metadata from first valid page as base
                combined_metadata = page.metadata.copy()

        # Update combined metadata to reflect all pages
        if page_info:
            combined_metadata.update({
                "total_pages_processed": len(page_info),
                "page_range": f"{page_info[0]['page_number']}-{page_info[-1]['page_number']}",
                # stored as list of strings for metadata-store compatibility
                "source_pages": [str(p["page_number"]) for p in page_info]
            })
            # Remove the single "page" field since this represents multiple pages
            combined_metadata.pop("page", None)

        if not full_text.strip():
            logger.warning("No valid content found in document")
            return []

        # Tokenize the entire document
        all_tokens = self._smart_tokenize(full_text)
        logger.info(f"Document tokenized into {len(all_tokens)} tokens")

        if len(all_tokens) < self.min_chunk_size:
            logger.warning(f"Document too short for chunking ({len(all_tokens)} tokens)")
            return []

        # Create overlapping token chunks
        token_chunks = self._create_token_chunks(all_tokens)
        logger.info(f"Created {len(token_chunks)} token chunks")

        # Convert token chunks to Document objects; invalid chunks return None
        for chunk_idx, chunk_tokens in enumerate(token_chunks):
            chunk_doc = self._process_single_chunk(
                chunk_tokens,
                chunk_idx,
                combined_metadata
            )
            if chunk_doc:
                processed_chunks.append(chunk_doc)

        # Output processing statistics
        if self.chunk_stats:
            logger.info("\n".join(self.chunk_stats))

        logger.info(f"Processed {len(processed_chunks)} valid token chunks")
        return processed_chunks

    except Exception as e:
        logger.error(f"Error in token_process_document: {e}")
        raise
|
| 367 |
+
|
| 368 |
+
def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
    """
    Process document using token chunking strategy (implements abstract method).

    Thin wrapper that delegates directly to ``token_process_document``.

    Args:
        file_path: Path to the document file
        preprocess: Whether to preprocess text content

    Returns:
        List of Document objects, one per valid token chunk
    """
    return self.token_process_document(file_path, preprocess)
|
| 380 |
+
|
| 381 |
+
def process_text_file(self, file_path: str, preprocess: bool = True) -> List[Document]:
    """
    Process text file directly using token-based chunking with overlap.

    Reads the file, cleans the text, optionally normalizes whitespace,
    tokenizes it, and converts overlapping token chunks into Documents.

    Args:
        file_path: Path to the text file
        preprocess: Whether to preprocess text content

    Returns:
        List of Document objects, one per valid token chunk
    """
    try:
        from pathlib import Path
        from datetime import datetime

        # Fresh statistics log for this file
        self.chunk_stats = []

        # Read the raw text and clean it the same way PDF text is cleaned
        raw = self.load_text_file(file_path)
        content = self.clean_text_for_processing(raw)

        # Basic validation of the cleaned content
        if not self.is_content_valid(content):
            logger.warning("Text file content failed validation")
            return []

        # Plain-text files only get whitespace normalization here; the
        # heavier header/footer stripping is reserved for PDF processing.
        if preprocess:
            content = ' '.join(content.split())

        # File-level metadata shared by every chunk from this file
        source = Path(file_path)
        file_metadata = {
            "source": file_path,
            "file_name": source.name,
            "file_type": "txt",
            "total_characters": len(content),
            "processing_timestamp": datetime.now().isoformat(),
        }

        logger.info(f"Processing text file: {source.name} ({len(content)} characters)")

        # Tokenize the entire document
        all_tokens = self._smart_tokenize(content)
        logger.info(f"Text file tokenized into {len(all_tokens)} tokens")

        if len(all_tokens) < self.min_chunk_size:
            logger.warning(f"Text file too short for chunking ({len(all_tokens)} tokens)")
            return []

        # Create overlapping token chunks
        token_chunks = self._create_token_chunks(all_tokens)
        logger.info(f"Created {len(token_chunks)} token chunks from text file")

        # Keep only the chunks that survive per-chunk validation
        processed_chunks = []
        for idx, toks in enumerate(token_chunks):
            doc = self._process_single_chunk(toks, idx, file_metadata)
            if doc:
                processed_chunks.append(doc)

        # Output processing statistics
        if self.chunk_stats:
            logger.info("\n".join(self.chunk_stats))

        logger.info(f"Processed {len(processed_chunks)} valid token chunks from text file")
        return processed_chunks

    except Exception as e:
        logger.error(f"Error processing text file: {e}")
        raise
|
src/core/__init__.py
ADDED
|
File without changes
|