NavyDevilDoc commited on
Commit
c0f31c1
·
verified ·
1 Parent(s): 3755446

Upload 10 files

Browse files
src/core/BaseChunker.py ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BaseChunker.py
3
+
4
+ An abstract base class defining the interface for document chunking strategies.
5
+ """
6
+
7
+ import logging
8
+ from core.OCREnhancedPDFLoader import OCREnhancedPDFLoader
9
+ from core.TextPreprocessor import TextPreprocessor
10
+ import numpy as np
11
+ from abc import ABC, abstractmethod
12
+ from typing import List, Dict, Any, Optional, Union
13
+
14
+ import spacy
15
+ from langchain_core.documents import Document
16
+
17
+ # Import tiktoken at the module level
18
+ try:
19
+ import tiktoken
20
+ TIKTOKEN_AVAILABLE = True
21
+ except ImportError:
22
+ TIKTOKEN_AVAILABLE = False
23
+ logging.warning("tiktoken not installed. Some tokenization features will be limited. "
24
+ "Install with: pip install tiktoken")
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
class BaseChunker(ABC):
    """Abstract base class for document chunking strategies.

    Provides the shared machinery concrete chunkers rely on: tokenizer
    selection (tiktoken / transformers / whitespace fallback), token
    counting, embedding generation, text analysis, and document
    loading/validation helpers.
    """

    # Common constants
    BLANK_THRESHOLD = 20  # Minimum characters for non-blank text
    TOKEN_THRESHOLD = 10  # Minimum tokens for valid content

    # Model type indicators
    TIKTOKEN_MODELS = ["gpt", "davinci", "curie", "babbage", "ada"]
    BASIC_TOKENIZER_MODELS = ["llama", "mistral", "granite"]

    # Encoding names tiktoken accepts directly via get_encoding()
    TIKTOKEN_ENCODINGS = ["cl100k_base", "p50k_base", "r50k_base", "gpt2"]

    def __init__(self, model_name: Optional[str] = None, embedding_model: Optional[Any] = None):
        """
        Initialize base chunker with model settings.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        self.model_name = model_name
        self.embedding_model = embedding_model
        self.uses_tiktoken = False
        self.uses_basic_tokenizer = False
        self.tokenizer = None
        self._initialize_tokenizer()

        # Initialize NLP pipeline for text analysis (sentence segmentation)
        self.nlp = spacy.load("en_core_web_sm")

    @staticmethod
    def _wrap_tiktoken(encoding):
        """Adapt a tiktoken encoding to the tokenize() interface used below."""
        class TiktokenWrapper:
            def __init__(self, encoding):
                self.encoding = encoding

            def tokenize(self, text):
                return self.encoding.encode(text)

        return TiktokenWrapper(encoding)

    def _use_tiktoken_encoding(self, encoding) -> None:
        """Install *encoding* as the active tiktoken-backed tokenizer."""
        self.tokenizer = self._wrap_tiktoken(encoding)
        self.uses_tiktoken = True

    def _tiktoken_fallback(self) -> None:
        """Fall back to the standard cl100k_base encoding, else basic tokenization."""
        try:
            self._use_tiktoken_encoding(tiktoken.get_encoding("cl100k_base"))
            logger.info("Initialized tiktoken with cl100k_base encoding")
        except Exception as e:
            logger.warning(f"Error initializing tiktoken: {e}")
            self.uses_basic_tokenizer = True

    def _initialize_tokenizer(self):
        """Initialize the appropriate tokenizer based on model name.

        Resolution order:
          1. model_name is itself a tiktoken encoding name
          2. model_name looks like an OpenAI-family model (tiktoken)
          3. model_name is known to need only basic tokenization
          4. transformers AutoTokenizer, else basic tokenization
        """
        if not self.model_name:
            logger.warning("No model name provided. Using basic tokenization.")
            self.uses_basic_tokenizer = True
            return

        # Case 1: explicit tiktoken encoding name.
        if TIKTOKEN_AVAILABLE and self.model_name in self.TIKTOKEN_ENCODINGS:
            try:
                self._use_tiktoken_encoding(tiktoken.get_encoding(self.model_name))
                logger.info(f"Initialized tiktoken tokenizer for model: {self.model_name}")
            except Exception as e:
                logger.warning(f"Error with specified tiktoken model: {e}")
                self._tiktoken_fallback()
            # BUGFIX: the original fell through into the branches below after a
            # successful fallback, letting the transformers/basic fallback
            # clobber an already-working tiktoken tokenizer.
            return

        # Case 2: OpenAI-family model name (e.g. "gpt-4").
        if TIKTOKEN_AVAILABLE and (
            any(model in self.model_name.lower() for model in self.TIKTOKEN_MODELS) or
            self.model_name.startswith("gpt-") or
            self.model_name.endswith("-base")
        ):
            try:
                # BUGFIX: model names require encoding_for_model();
                # get_encoding() only accepts encoding names and would raise
                # for every real model name, forcing the fallback path.
                self._use_tiktoken_encoding(tiktoken.encoding_for_model(self.model_name))
                logger.info(f"Initialized tiktoken tokenizer for model: {self.model_name}")
            except Exception as e:
                logger.warning(f"Error with specified tiktoken model: {e}")
                self._tiktoken_fallback()

        # Case 3: models known to use basic tokenization.
        elif any(model in self.model_name.lower() for model in self.BASIC_TOKENIZER_MODELS):
            self.uses_basic_tokenizer = True
            logger.info("Using basic tokenization for model")

        # Case 4: fall back to a transformers tokenizer.
        else:
            try:
                from transformers import AutoTokenizer
                self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
                logger.info(f"Initialized transformers tokenizer for model: {self.model_name}")
            except Exception as e:
                logger.warning(f"Error initializing transformer tokenizer: {e}")
                logger.warning("Falling back to basic tokenization")
                self.uses_basic_tokenizer = True

    def count_tokens(self, text: str) -> int:
        """Count tokens in *text* using the best available tokenizer.

        Falls back to whitespace splitting, then to a rough 4-chars-per-token
        estimate, so this never raises.
        """
        if not text:
            return 0

        try:
            if self.tokenizer:
                # Both the tiktoken wrapper and transformers tokenizers expose
                # tokenize(); the length of the token list is the count.
                return len(self.tokenizer.tokenize(text))
        except Exception as e:
            logger.warning(f"Primary tokenization failed: {e}")

        # Basic tokenization fallback: approximate with word count.
        if self.uses_basic_tokenizer or not self.tokenizer:
            return len(text.split())

        # Tokenizer exists but failed: rough character-to-token ratio.
        return len(text) // 4

    def get_embedding(self, text: str) -> Optional[np.ndarray]:
        """Generate an embedding vector for *text*.

        Returns None for blank input, when no embedding model is configured,
        or when the encoder raises.
        """
        if not text.strip() or not self.embedding_model:
            return None

        try:
            return self.embedding_model.encode(text)
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return None

    def analyze_text(self, text: str) -> Dict[str, Any]:
        """Perform detailed analysis of text content.

        Returns:
            Dict with char_count, token_count, sentence_count, word_count,
            embedding_dim and has_content. Counts degrade to zero when the
            NLP/embedding analysis fails.
        """
        if not text.strip():
            return {
                "char_count": 0,
                "token_count": 0,
                "sentence_count": 0,
                "word_count": 0,
                "embedding_dim": 0,
                "has_content": False
            }

        try:
            embedding = self.get_embedding(text)
            doc = self.nlp(text)

            return {
                "char_count": len(text),
                "token_count": self.count_tokens(text),
                "sentence_count": len(list(doc.sents)),
                "word_count": len(text.split()),
                "embedding_dim": len(embedding) if embedding is not None else 0,
                "has_content": bool(text.strip())
            }

        except Exception as e:
            logger.error(f"Error analyzing text: {e}")
            # Degraded result: only cheap, exception-free counts.
            return {
                "char_count": len(text),
                "token_count": 0,
                "sentence_count": 0,
                "word_count": len(text.split()),
                "embedding_dim": 0,
                "has_content": bool(text.strip())
            }

    def is_content_valid(self, text: str, min_chars: Optional[int] = None,
                         min_tokens: Optional[int] = None) -> bool:
        """Check if content meets minimum length requirements.

        Args:
            text: Content to check
            min_chars: Minimum characters (defaults to BLANK_THRESHOLD)
            min_tokens: Minimum tokens (defaults to TOKEN_THRESHOLD)
        """
        if not text.strip():
            return False

        # BUGFIX: explicit None checks so callers may pass 0 to disable a
        # threshold (the original `min_chars or ...` treated 0 as "default").
        if min_chars is None:
            min_chars = self.BLANK_THRESHOLD
        if min_tokens is None:
            min_tokens = self.TOKEN_THRESHOLD

        if len(text.strip()) < min_chars:
            return False

        return self.count_tokens(text) >= min_tokens

    def validate_documents(self, documents: List[Document]) -> List[Document]:
        """Validate documents before sending to a vector database.

        Drops empty documents and strips BOM/zero-width characters from the
        start of each remaining document's content (in place).
        """
        valid_documents = []

        for i, doc in enumerate(documents):
            # Skip documents whose content is empty or only whitespace.
            if not doc.page_content or not doc.page_content.strip():
                print(f"Skipping document {i}: Empty content")
                continue

            # Remove any potential BOM or invisible characters at the start;
            # these can confuse downstream embedders/vector stores.
            doc.page_content = doc.page_content.lstrip(
                '\ufeff\u200b\u200c\u200d\u200e\u200f\u2060')

            valid_documents.append(doc)

        print(f"Validated {len(valid_documents)} of {len(documents)} documents")
        return valid_documents

    def debug_documents(self, documents: List[Document], num_chars: int = 50) -> None:
        """Print diagnostic information (length, leading char codes, preview)
        for each document."""
        print(f"\nDEBUG INFO: Examining {len(documents)} documents")

        for i, doc in enumerate(documents):
            content = doc.page_content
            if not content:
                print(f"  Doc {i}: EMPTY CONTENT")
                continue

            # Show the first few characters and their Unicode code points to
            # expose invisible/garbled leading characters.
            first_chars = content[:num_chars]
            char_codes = [f"{c}({ord(c)})" for c in first_chars[:10]]

            print(f"  Doc {i}: Length={len(content)}, First chars: {''.join(char_codes)}")
            print(f"  Preview: {first_chars!r}")

        print("DEBUG INFO END\n")

    def load_document(self, file_path: str) -> List[Document]:
        """Load a PDF using OCREnhancedPDFLoader, then debug and validate it.

        Raises:
            Exception: Re-raises any loader failure after logging it.
        """
        try:
            loader = OCREnhancedPDFLoader(file_path)
            documents = loader.load()
            self.debug_documents(documents)
            return self.validate_documents(documents)
        except Exception as e:
            logger.error(f"Error loading document: {e}")
            raise

    def preprocess_text(self, text: str, remove_headers_footers: bool = True) -> str:
        """Preprocess text using TextPreprocessor; returns the input unchanged
        if preprocessing fails."""
        try:
            preprocessor = TextPreprocessor()
            return preprocessor.preprocess(text, remove_headers_footers)
        except Exception as e:
            logger.error(f"Error preprocessing text: {e}")
            return text

    @abstractmethod
    def process_document(self, file_path: str, preprocess: bool = True) -> Union[List[Document], Dict[str, List[Document]]]:
        """Process document using the concrete chunking strategy."""
        pass

    def load_text_file(self, file_path: str) -> str:
        """
        Load raw text file content.

        Args:
            file_path: Path to the text file

        Returns:
            Raw text content

        Raises:
            OSError/UnicodeDecodeError: Re-raised after logging.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            logger.info(f"Loaded text file: {file_path} ({len(content)} characters)")
            return content
        except Exception as e:
            logger.error(f"Error loading text file {file_path}: {e}")
            raise

    def clean_text_for_processing(self, text: str) -> str:
        """
        Clean text using Unicode character replacement (same as PDF conversion logic).

        Smart quotes, dashes, ellipses, bullets and arrows are mapped to ASCII
        equivalents; zero-width spaces are dropped.

        Args:
            text: Raw text content

        Returns:
            Cleaned text content
        """
        replacements = {
            '\u2019': "'", '\u2018': "'", '\u201c': '"', '\u201d': '"',
            '\u2014': '-', '\u2013': '-', '\u2026': '...',
            '\u200b': '', '\u00a0': ' ', '\u2022': '*',
            '\u2192': '->', '\u2190': '<-',
        }

        for old, new in replacements.items():
            text = text.replace(old, new)

        return text

    def process_text_file(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Default text file processing method; must be overridden by chunkers
        that support plain-text input (e.g. ParagraphChunker, TokenChunker).

        Args:
            file_path: Path to the text file
            preprocess: Whether to preprocess the text

        Returns:
            List of Document objects

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError("Subclasses must implement process_text_file method")
src/core/ChunkingManager.py ADDED
@@ -0,0 +1,354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ChunkingManager.py
3
+
4
+ A manager class that orchestrates document chunking using different strategies.
5
+ """
6
+
7
+ from typing import Dict, List, Optional, Union
8
+ from pathlib import Path
9
+ from sentence_transformers import SentenceTransformer
10
+ from langchain_core.documents import Document
11
+
12
+ # Import chunker strategies
13
+ from core.BaseChunker import BaseChunker
14
+ from core.PageChunker import PageChunker
15
+ from core.ParagraphChunker import ParagraphChunker
16
+ from core.SemanticChunker import SemanticChunker
17
+ from core.HierarchicalChunker import HierarchicalChunker
18
+ from core.TokenChunker import TokenChunker
19
+ import logging
20
+
21
+ # Configure logging
22
+ logging.basicConfig(level=logging.INFO)
23
+ logger = logging.getLogger(__name__)
24
+
25
class ChunkingStrategy:
    """Enumeration of available chunking strategies."""
    # Plain string constants (not enum.Enum) so callers can pass bare strings;
    # ChunkingManager lower-cases its input before comparing against these.
    PAGE = "page"
    PARAGRAPH = "paragraph"
    SEMANTIC = "semantic"
    HIERARCHICAL = "hierarchical"
    TOKEN = "token"
32
+
33
class ChunkingManager:
    """Manager class for document chunking strategies.

    Lazily constructs one chunker instance per strategy and dispatches PDF,
    CSV, and TXT files to the appropriate processing routine.
    """

    class _DummyEmbedder:
        """Stand-in embedder used when a real SentenceTransformer is
        unavailable or unsuitable; encode() returns a fixed zero vector."""

        def encode(self, text, **kwargs):
            return [0.0] * 384  # Return dummy vector

    def __init__(
        self,
        embedding_model_name: str = "all-mpnet-base-v2",
        token_model_name: Optional[str] = None
    ):
        """
        Initialize chunking manager.

        Args:
            embedding_model_name: Name of the sentence transformer model
            token_model_name: Name of the token counting model
        """
        self.token_model_name = token_model_name
        self.embedding_model_name = embedding_model_name
        self._embedding_model = None
        self._chunkers = {}  # strategy name -> cached chunker instance

    @property
    def embedding_model(self):
        """Lazy-load the embedding model (or a dummy for tokenization-only use)."""
        if self._embedding_model is None:
            try:
                # Only try to load as SentenceTransformer if it's a known
                # SentenceTransformer model (not an OpenAI-style name).
                if self.embedding_model_name and not any(
                    x in self.embedding_model_name.lower()
                    for x in ["gpt", "text-embedding", "openai"]
                ):
                    logger.info(f"Loading embedding model: {self.embedding_model_name}")
                    self._embedding_model = SentenceTransformer(self.embedding_model_name)
                else:
                    logger.info("Using dummy embedding model for tokenization only")
                    self._embedding_model = self._DummyEmbedder()
            except Exception as e:
                logger.error(f"Error loading embedding model: {e}")
                self._embedding_model = self._DummyEmbedder()
        return self._embedding_model

    def _get_chunker(self, strategy: str) -> BaseChunker:
        """Get or create the cached chunker for *strategy* (case-insensitive).

        Raises:
            ValueError: If the strategy is unknown.
        """
        strategy = strategy.lower()

        if strategy not in self._chunkers:
            if strategy == ChunkingStrategy.PAGE:
                self._chunkers[strategy] = PageChunker(
                    model_name=self.token_model_name,
                    embedding_model=self.embedding_model
                )
            elif strategy == ChunkingStrategy.PARAGRAPH:
                self._chunkers[strategy] = ParagraphChunker(
                    model_name=self.token_model_name,
                    embedding_model=self.embedding_model
                )
            elif strategy == ChunkingStrategy.SEMANTIC:
                self._chunkers[strategy] = SemanticChunker(
                    embedding_model=self.embedding_model,
                    model_name=self.token_model_name
                )
            elif strategy == ChunkingStrategy.HIERARCHICAL:
                self._chunkers[strategy] = HierarchicalChunker(
                    model_name=self.token_model_name,
                    embedding_model=self.embedding_model
                )
            elif strategy == ChunkingStrategy.TOKEN:
                self._chunkers[strategy] = TokenChunker(
                    model_name=self.token_model_name,
                    embedding_model=self.embedding_model,
                    chunk_size=256,  # Default values, could be made configurable
                    chunk_overlap=50
                )
            else:
                raise ValueError(f"Unknown chunking strategy: {strategy}")

        return self._chunkers[strategy]

    def process_document(
        self,
        file_path: str,
        strategy: str = ChunkingStrategy.PARAGRAPH,
        preprocess: bool = True
    ) -> Union[List[Document], Dict[str, List[Document]]]:
        """
        Process document using specified chunking strategy.

        Args:
            file_path: Path to document file
            strategy: Chunking strategy to use
            preprocess: Whether to preprocess text

        Returns:
            Chunked document(s) according to strategy

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: For unsupported file types or strategies.
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Dispatch on file type.
        file_extension = path.suffix.lower()

        if file_extension == '.csv':
            return self._process_csv(file_path, strategy)
        elif file_extension == '.txt':
            return self._process_txt(file_path, strategy, preprocess)
        elif file_extension == '.pdf':
            chunker = self._get_chunker(strategy)

            logger.info(f"Processing document using {strategy} chunking strategy")

            # Each chunker exposes a strategy-specific entry point.
            if strategy == ChunkingStrategy.PAGE:
                return chunker.page_process_document(file_path, preprocess)
            elif strategy == ChunkingStrategy.PARAGRAPH:
                return chunker.paragraph_process_document(file_path, preprocess)
            elif strategy == ChunkingStrategy.SEMANTIC:
                return chunker.semantic_process_document(file_path, preprocess)
            elif strategy == ChunkingStrategy.HIERARCHICAL:
                return chunker.hierarchical_process_document(file_path, preprocess)
            elif strategy == ChunkingStrategy.TOKEN:
                return chunker.token_process_document(file_path, preprocess)
            else:
                raise ValueError(f"Unknown chunking strategy: {strategy}")
        else:
            raise ValueError(f"Unsupported file type: {file_extension}. Supported types: .pdf, .csv, .txt")

    def process_directory(
        self,
        dir_path: str,
        strategy: str = ChunkingStrategy.PARAGRAPH,
        preprocess: bool = True
    ) -> Dict[str, Union[List[Document], Dict[str, List[Document]]]]:
        """
        Process all supported documents in a directory (recursively).

        Per-file failures are captured as {"error": message} entries instead
        of aborting the whole run.

        Args:
            dir_path: Directory containing files
            strategy: Chunking strategy to use
            preprocess: Whether to preprocess text

        Returns:
            Dictionary mapping filenames to their processed documents

        Raises:
            NotADirectoryError: If dir_path is not a directory.
        """
        path = Path(dir_path)
        if not path.is_dir():
            raise NotADirectoryError(f"Not a directory: {dir_path}")

        results = {}

        # Find supported files (PDFs, CSVs, and TXT files)
        pdf_files = list(path.glob("**/*.pdf"))
        csv_files = list(path.glob("**/*.csv"))
        txt_files = list(path.glob("**/*.txt"))
        all_files = pdf_files + csv_files + txt_files

        logger.info(f"Found {len(pdf_files)} PDF files, {len(csv_files)} CSV files, and {len(txt_files)} TXT files in {dir_path}")

        for file in all_files:
            try:
                logger.info(f"Processing {file.name}")
                results[file.name] = self.process_document(
                    str(file),
                    strategy=strategy,
                    preprocess=preprocess
                )
            except Exception as e:
                logger.error(f"Error processing {file.name}: {e}")
                results[file.name] = {"error": str(e)}

        return results

    # BUGFIX: this method was defined twice verbatim in the original; the
    # duplicate silently shadowed the first definition and has been removed.
    def _process_txt(self, file_path: str, strategy: str, preprocess: bool) -> List[Document]:
        """Process a TXT file into document chunks.

        Raises:
            ValueError: If strategy is not paragraph or token.
        """
        logger.info(f"Processing TXT file: {file_path}")

        # Only chunkers that implement process_text_file support TXT input.
        if strategy not in [ChunkingStrategy.PARAGRAPH, ChunkingStrategy.TOKEN]:
            raise ValueError(f"TXT files only support paragraph and token chunking strategies. Got: {strategy}")

        chunker = self._get_chunker(strategy)
        return chunker.process_text_file(file_path, preprocess)

    def _process_csv(self, file_path: str, strategy: str) -> Union[List[Document], Dict[str, List[Document]]]:
        """Process a CSV file into document chunks.

        BUGFIX: return annotation widened — the hierarchical branch returns a
        dict, which the original `List[Document]` annotation misrepresented.

        Raises:
            ValueError: If strategy is unsupported for CSV input.
        """
        import pandas as pd

        logger.info(f"Loading CSV file: {file_path}")

        df = pd.read_csv(file_path)

        if strategy == ChunkingStrategy.PARAGRAPH:
            # Treat each row as a separate document with columns combined
            # into a structured text format.
            return self._chunk_csv_by_row(df, file_path)
        elif strategy == ChunkingStrategy.PAGE:
            # Create larger chunks containing multiple rows.
            return self._chunk_csv_by_page(df, file_path)
        elif strategy == ChunkingStrategy.HIERARCHICAL:
            # Hierarchical output mirrors the PDF shape: a dict of chunks.
            return {"chunks": self._chunk_csv_by_row(df, file_path)}
        else:
            raise ValueError(f"Unsupported chunking strategy for CSV: {strategy}")

    def _chunk_csv_by_row(self, df, file_path: str) -> List[Document]:
        """Convert each CSV row to one Document chunk.

        Each column value also lands in metadata under a `csv_<col>` key
        (stringified for vector-store compatibility).
        """
        chunks = []
        file_name = Path(file_path).name
        columns = df.columns.tolist()

        for i, row in df.iterrows():
            # Render the row as "column: value" lines.
            content = "\n".join([f"{col}: {row[col]}" for col in columns])

            metadata = {
                "source": file_path,
                "file_name": file_name,
                "file_type": "csv",
                "row_index": i,
                "chunk_type": "csv_row",
            }

            # Add columns as additional metadata (stringified).
            for col in columns:
                metadata[f"csv_{col}"] = str(row[col])

            chunks.append(Document(page_content=content, metadata=metadata))

        logger.info(f"Created {len(chunks)} chunks from CSV (row-based)")
        return chunks

    def _chunk_csv_by_page(self, df, file_path: str, rows_per_chunk: int = 20) -> List[Document]:
        """Convert the CSV into larger "page" chunks of *rows_per_chunk* rows,
        rendered as a pipe-separated table with a header line."""
        chunks = []
        file_name = Path(file_path).name
        columns = df.columns.tolist()

        total_rows = len(df)
        chunk_count = (total_rows + rows_per_chunk - 1) // rows_per_chunk  # Ceiling division

        for chunk_idx in range(chunk_count):
            start_row = chunk_idx * rows_per_chunk
            end_row = min(start_row + rows_per_chunk, total_rows)

            chunk_df = df.iloc[start_row:end_row]

            # Header block (1-based row numbers for human readability).
            content = f"CSV Data (Rows {start_row+1}-{end_row}):\n\n"
            content += " | ".join(columns) + "\n"
            content += "-" * (sum(len(col) for col in columns) + 3 * (len(columns) - 1)) + "\n"

            # Data rows.
            for _, row in chunk_df.iterrows():
                content += " | ".join(str(row[col]) for col in columns) + "\n"

            metadata = {
                "source": file_path,
                "file_name": file_name,
                "file_type": "csv",
                "chunk_type": "csv_page",
                "start_row": start_row,
                "end_row": end_row - 1,
                "row_count": end_row - start_row,
            }

            chunks.append(Document(page_content=content, metadata=metadata))

        logger.info(f"Created {len(chunks)} chunks from CSV (page-based)")
        return chunks
src/core/HierarchicalChunker.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HierarchicalChunker.py
3
+
4
+ A module for hierarchical document chunking that combines page-level and semantic chunking.
5
+
6
+ Features:
7
+ - Multi-level document representation (pages and chunks)
8
+ - Semantic chunking with sentence boundaries
9
+ - Size and overlap controls
10
+ - Hierarchical metadata
11
+ """
12
+
13
+ import logging
14
+ import spacy
15
+ from typing import Dict, List, Optional, Any
16
+ from langchain_core.documents import Document
17
+ from core.PageChunker import PageChunker
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
class HierarchicalChunker(PageChunker):
    """Handles document chunking at multiple hierarchical levels.

    Produces a two-level representation: page-level Documents (from
    PageChunker) plus sentence-aligned semantic chunks within each page.
    """

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        similarity_threshold: float = 0.85
    ):
        """
        Initialize hierarchical chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
            chunk_size: Maximum size (characters) of semantic chunks
            chunk_overlap: Overlap between chunks
            similarity_threshold: Similarity threshold for merging chunks
        """
        super().__init__(model_name, embedding_model)
        self.chunk_size = chunk_size
        # NOTE(review): chunk_overlap and similarity_threshold are stored but
        # never referenced by the chunking logic below — confirm intended use.
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold

        # Initialize spaCy for sentence segmentation, downloading the model
        # on first use if it is missing.
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            logger.info("Installing spaCy model...")
            import subprocess
            subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"],
                           capture_output=True)
            self.nlp = spacy.load("en_core_web_sm")

    def _build_chunk_doc(self, chunk_text: str, page_number: int, chunk_num: int) -> Document:
        """Create one chunk-level Document with hierarchical metadata.

        Extracted helper: the original duplicated this 15-line construction
        verbatim in both flush paths of _create_semantic_chunks.
        """
        stats = self.analyze_text(chunk_text)
        return Document(
            page_content=chunk_text,
            metadata={
                "level": "chunk",
                "page_num": page_number,
                "chunk_num": chunk_num,
                "parent_page": page_number,
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                # Preserved from the original: falls back to the STRING "true"
                # (not a bool) when analyze_text omits has_content — callers
                # may depend on this; confirm before normalizing.
                "has_ocr": stats.get("has_content", "true")
            }
        )

    def _create_semantic_chunks(self, content: str, page_number: int) -> List[Document]:
        """
        Create semantic chunks with detailed metadata.

        Sentences are accumulated until adding the next one would exceed
        chunk_size characters, at which point the current chunk is flushed.

        Args:
            content: The page content to chunk
            page_number: The page number

        Returns:
            List of Document objects representing semantic chunks
        """
        if not content.strip():
            return []

        sentences = list(self.nlp(content).sents)
        chunks = []
        current_chunk = []
        current_length = 0

        for sent in sentences:
            sent_text = sent.text.strip()
            sent_length = len(sent_text)

            if current_length + sent_length > self.chunk_size:
                # Flush the accumulated chunk and start a new one.
                if current_chunk:
                    chunks.append(self._build_chunk_doc(
                        " ".join(current_chunk), page_number, len(chunks) + 1))
                current_chunk = [sent_text]
                current_length = sent_length
            else:
                current_chunk.append(sent_text)
                current_length += sent_length

        # Handle final chunk
        if current_chunk:
            chunks.append(self._build_chunk_doc(
                " ".join(current_chunk), page_number, len(chunks) + 1))

        self.page_stats.append(f"Created {len(chunks)} chunks for page {page_number}")
        return chunks

    def hierarchical_process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document with hierarchical chunking strategy.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        self.page_stats = []  # Reset stats

        # First get the page-level documents using PageChunker.
        page_docs = super().page_process_document(file_path, preprocess)

        # Now create chunk-level documents.
        chunk_docs = []
        total_chunks = 0

        for page_doc in page_docs:
            page_num = page_doc.metadata["page"]

            # Mark this as a page-level document.
            page_doc.metadata["level"] = "page"

            # Create chunks for this page.
            page_chunks = self._create_semantic_chunks(
                page_doc.page_content,
                page_num
            )

            chunk_docs.extend(page_chunks)
            total_chunks += len(page_chunks)

        # Log summary information.
        logger.info(f"\nHierarchical Processing Summary:")
        logger.info(f"Total Pages: {len(page_docs)}")
        logger.info(f"Total Chunks: {total_chunks}")
        logger.info("\n".join(self.page_stats))

        return {
            "pages": page_docs,
            "chunks": chunk_docs
        }

    def process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document using hierarchical chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        return self.hierarchical_process_document(file_path, preprocess)
src/core/OCREnhancedPDFLoader.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pytesseract
3
+ from langchain_community.document_loaders import PyMuPDFLoader
4
+ from langchain_core.documents import Document
5
+ from pdf2image import convert_from_path
6
+
7
class OCREnhancedPDFLoader:
    """Loads PDFs with OCR support for text extraction.

    Uses PyMuPDF for native text extraction and falls back to Tesseract
    OCR (via a page image rendered by pdf2image) when a page has little
    or no embedded text.  Blank pages are skipped and recorded in
    ``self.skipped_pages``.
    """

    # Minimum characters (after whitespace stripping) for a page to be
    # considered non-blank.
    BLANK_THRESHOLD = 10

    def __init__(self, file_path: str, tesseract_path: str = None):
        """
        Args:
            file_path: Path to the PDF file to load.
            tesseract_path: Optional explicit path to the tesseract
                executable.  When omitted, the binary is resolved from
                the system PATH.

        Raises:
            FileNotFoundError: If the PDF does not exist.
            ValueError: If an explicit tesseract path is given but no
                file exists there.
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"PDF file not found at path: {file_path}")

        self.file_path = file_path
        self.skipped_pages = []

        # Only set cmd if specific path provided, otherwise trust PATH.
        if tesseract_path:
            if not os.path.isfile(tesseract_path):
                raise ValueError(f"Tesseract executable not found at path: {tesseract_path}")
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

    def _is_blank_page(self, text: str) -> bool:
        """Return True if the page text is empty or below BLANK_THRESHOLD
        meaningful characters once whitespace control chars are removed."""
        if not text or not text.strip():
            return True
        cleaned_text = text.strip().replace('\n', '').replace('\r', '').replace('\t', '')
        return len(cleaned_text) < self.BLANK_THRESHOLD

    def _process_page(self, doc, img, page_number: int):
        """Build a Document for one page, OCR-ing only when needed.

        Args:
            doc: The page Document produced by PyMuPDFLoader.
            img: The rendered page image (PIL) for OCR fallback.
            page_number: 1-based page number.

        Returns:
            A Document with enriched metadata, or None if the page is
            blank (the page number is then recorded in skipped_pages).
        """
        existing_text = doc.page_content

        # Use existing text if substantial; the factor of 5 means a page
        # needs a few words of embedded text before we trust it over OCR.
        if len(existing_text.strip()) > self.BLANK_THRESHOLD * 5:
            combined_text = existing_text
            ocr_used = False
        else:
            # Fallback to OCR
            try:
                ocr_text = pytesseract.image_to_string(img)
                combined_text = ocr_text
                ocr_used = True
            except Exception as e:
                # Best effort: keep whatever embedded text we had.
                print(f"Error applying OCR to page {page_number}: {e}")
                combined_text = existing_text
                ocr_used = False

        if self._is_blank_page(combined_text):
            self.skipped_pages.append(page_number)
            return None

        return Document(
            page_content=combined_text,
            metadata={
                **doc.metadata,
                "source": "ocr" if ocr_used else "text_extraction",
                "page": page_number,
                "is_blank": "false",
                "has_ocr": str(ocr_used)
            }
        )

    def load(self):
        """Load the PDF, returning a list of non-blank page Documents.

        Raises:
            Exception: Re-raises any loading/conversion failure after
                printing a diagnostic.
        """
        try:
            # 1. Standard load of embedded text.
            loader = PyMuPDFLoader(self.file_path)
            text_documents = loader.load()

            # 2. Image conversion for OCR fallback (requires poppler-utils
            #    on Linux).
            images = convert_from_path(self.file_path, dpi=300)

            # zip() truncates to the shorter sequence; warn instead of
            # silently dropping pages when the counts disagree.
            if len(text_documents) != len(images):
                print(f"Warning: page count mismatch (text pages: {len(text_documents)}, "
                      f"images: {len(images)}); trailing pages will be skipped")

            enhanced_documents = []
            for idx, (doc, img) in enumerate(zip(text_documents, images)):
                page_number = idx + 1
                enhanced_doc = self._process_page(doc, img, page_number)

                if enhanced_doc:
                    enhanced_documents.append(enhanced_doc)

            if self.skipped_pages:
                print(f"Skipped blank pages: {self.skipped_pages}")

            return enhanced_documents

        except Exception as e:
            print(f"Error in OCR-enhanced loading: {e}")
            raise
src/core/PageChunker.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PageChunker.py
3
+
4
+ A module for page-level document chunking with token counting and preprocessing.
5
+
6
+ Features:
7
+ - Page-based document splitting
8
+ - Content validation
9
+ - Blank page detection
10
+ - Document metadata enrichment
11
+ """
12
+
13
+ from typing import List, Optional
14
+ import logging
15
+ from langchain_core.documents import Document
16
+ from core.BaseChunker import BaseChunker
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
class PageChunker(BaseChunker):
    """Splits a PDF into one Document per non-blank page."""

    def __init__(self, model_name=None, embedding_model=None):
        """
        Initialize page chunker with specified models.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        super().__init__(model_name, embedding_model)
        self.page_stats = []

    def _is_blank_page(self, text: str) -> bool:
        """Check if page is blank or contains only whitespace/special characters."""
        stripped = text.strip()
        for control_char in ('\n', '\r', '\t'):
            stripped = stripped.replace(control_char, '')
        return len(stripped) < self.BLANK_THRESHOLD

    def _process_single_page(self, content: str, page_number: int, preprocess: bool) -> Optional[Document]:
        """
        Process a single page with optional preprocessing and analysis.

        Args:
            content: The page content
            page_number: The page number
            preprocess: Whether to preprocess the text

        Returns:
            Document object with processed content and metadata, or None if page is blank
        """
        # Blank pages are recorded and dropped.
        if self._is_blank_page(content):
            self.page_stats.append(f"Page {page_number} is blank.")
            return None

        text = self.preprocess_text(content) if preprocess else content

        # Analyze the page and attach the statistics as metadata.
        analysis = self.analyze_text(text)
        page_metadata = {
            "page": page_number,
            "char_count": analysis["char_count"],
            "token_count": analysis["token_count"],
            "sentence_count": analysis["sentence_count"],
            "word_count": analysis["word_count"],
            "has_ocr": str(analysis.get("has_content", True)),
            "is_blank": "false",
        }

        return Document(page_content=text, metadata=page_metadata)

    def page_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process PDF document page by page with analysis and optional preprocessing.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page
        """
        try:
            self.page_stats = []  # Fresh stats for every document
            pages = self.load_document(file_path)
            results = []

            logger.info(f"Processing document with {len(pages)} pages")

            for page_index, raw_page in enumerate(pages):
                document = self._process_single_page(raw_page.page_content, page_index + 1, preprocess)
                if document:
                    results.append(document)

            # Surface any skipped-page messages for transparency.
            if self.page_stats:
                logger.info("\n".join(self.page_stats))

            logger.info(f"Processed {len(results)} non-blank pages")
            return results

        except Exception as e:
            logger.error(f"Error in page_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using page chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess page text

        Returns:
            List of Document objects, one per non-blank page
        """
        return self.page_process_document(file_path, preprocess)
src/core/ParagraphChunker.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ParagraphChunker.py
3
+
4
+ A module for paragraph-level document chunking with token counting and preprocessing.
5
+
6
+ Features:
7
+ - Paragraph-based document splitting
8
+ - Content validation
9
+ - Multi-level delimiter detection
10
+ - Smart paragraph boundary detection
11
+ """
12
+
13
+ import logging
14
+ import spacy
15
+ from typing import List, Optional
16
+ from pathlib import Path
17
+ from datetime import datetime
18
+ from langchain_core.documents import Document
19
+ from core.BaseChunker import BaseChunker
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
class ParagraphChunker(BaseChunker):
    """Handles document chunking at the paragraph level with token counting."""

    PARAGRAPH_MIN_LENGTH = 50  # Minimum characters for a valid paragraph

    def __init__(self, model_name=None, embedding_model=None):
        """
        Initialize paragraph chunker with specified models.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
        """
        super().__init__(model_name, embedding_model)
        self.page_stats = []

        # Initialize spaCy for NLP tasks
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except Exception as e:
            logger.error(f"Error loading spaCy model: {e}")
            import subprocess
            import sys
            logger.info("Installing spaCy model...")
            # Use the running interpreter, not whatever "python" resolves
            # to on PATH (may be a different environment or missing).
            subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                           capture_output=True)
            self.nlp = spacy.load("en_core_web_sm")

    def _split_into_paragraphs(self, text: str) -> List[str]:
        """
        Split text into paragraphs using length and punctuation heuristics.

        Args:
            text: The text content to split

        Returns:
            List of paragraphs
        """
        # Pre-clean the text
        text = text.replace('\r', '\n')

        # First, try double line breaks
        paragraphs = text.split('\n\n')

        # If that fails (PDF extraction issue), use sentence-based reconstruction
        if len(paragraphs) <= 3:
            # Use the module logger (not print) so output respects the
            # application's logging configuration.
            logger.info("PDF extraction flattened structure. Reconstructing from sentences...")

            # Use spaCy for sentence detection
            doc = self.nlp(text)
            paragraphs = []
            current_para = []
            current_length = 0

            for sent in doc.sents:
                sent_text = sent.text.strip()
                if not sent_text:
                    continue

                # Add sentence to current paragraph
                current_para.append(sent_text)
                current_length += len(sent_text)

                # Check if we should end the current paragraph
                should_end_paragraph = (
                    # Paragraph is getting long (300-600 chars is typical)
                    current_length > 300 and
                    # Current sentence ends with proper punctuation
                    sent_text.endswith(('.', '!', '?')) and
                    # We have substantial content
                    len(current_para) >= 2
                )

                if should_end_paragraph:
                    paragraphs.append(' '.join(current_para))
                    current_para = []
                    current_length = 0

            # Add the last paragraph
            if current_para:
                paragraphs.append(' '.join(current_para))

            logger.info(f"Reconstructed {len(paragraphs)} paragraphs using length heuristics")

        # Clean and filter paragraphs: collapse internal whitespace and
        # drop anything below the minimum length.
        cleaned_paragraphs = []
        for para in paragraphs:
            clean_para = ' '.join(para.split())
            if len(clean_para) >= self.PARAGRAPH_MIN_LENGTH:
                cleaned_paragraphs.append(clean_para)

        logger.info(f"Final paragraph count: {len(cleaned_paragraphs)}")
        return cleaned_paragraphs

    def _process_single_paragraph(self, content: str, page_number: int,
                                  para_number: int, preprocess: bool) -> Optional[Document]:
        """
        Process a single paragraph with analysis and metadata.

        Args:
            content: The paragraph content
            page_number: The page number
            para_number: The paragraph number
            preprocess: Whether to preprocess the text

        Returns:
            Document object with processed content and metadata, or None if paragraph is invalid
        """
        # First check character length
        if len(content.strip()) < self.PARAGRAPH_MIN_LENGTH:
            self.page_stats.append(f"Paragraph {para_number} on page {page_number} is too short.")
            return None

        # Optionally preprocess the text
        if preprocess:
            content = self.preprocess_text(content)

        # Analyze the paragraph and generate metadata
        stats = self.analyze_text(content)

        # Check token threshold
        if stats["token_count"] < self.TOKEN_THRESHOLD:
            self.page_stats.append(
                f"Paragraph {para_number} on page {page_number} dropped: "
                f"only {stats['token_count']} tokens"
            )
            return None

        metadata = {
            "page": page_number,
            "paragraph": para_number,
            "char_count": stats["char_count"],
            "token_count": stats["token_count"],
            "sentence_count": stats["sentence_count"],
            "word_count": stats["word_count"],
            "has_ocr": str(stats.get("has_content", True))
        }

        return Document(page_content=content, metadata=metadata)

    def paragraph_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process PDF document paragraph by paragraph with analysis.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess paragraph text

        Returns:
            List of Document objects, one per valid paragraph
        """
        try:
            self.page_stats = []  # Reset stats for this document
            raw_pages = self.load_document(file_path)
            processed_paragraphs = []

            logger.info(f"Processing document with {len(raw_pages)} pages")

            for page_idx, page in enumerate(raw_pages):
                paragraphs = self._split_into_paragraphs(page.page_content)
                logger.info(f"Page {page_idx+1}: Found {len(paragraphs)} paragraphs")

                for para_idx, paragraph in enumerate(paragraphs):
                    processed_para = self._process_single_paragraph(
                        paragraph,
                        page_idx + 1,
                        para_idx + 1,
                        preprocess
                    )
                    if processed_para:
                        processed_paragraphs.append(processed_para)

            # Output skipped paragraphs for transparency
            if self.page_stats:
                logger.info("\n".join(self.page_stats))

            logger.info(f"Processed {len(processed_paragraphs)} valid paragraphs")
            return processed_paragraphs

        except Exception as e:
            logger.error(f"Error in paragraph_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using paragraph chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess paragraph text

        Returns:
            List of Document objects, one per valid paragraph
        """
        return self.paragraph_process_document(file_path, preprocess)

    def process_text_file(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """
        Process text file directly, preserving paragraph structure.

        Args:
            file_path: Path to the text file
            preprocess: Whether to preprocess paragraph text

        Returns:
            List of Document objects, one per valid paragraph
        """
        try:
            # Load the text file directly
            content = self.load_text_file(file_path)

            # Clean the text using the same logic as PDF conversion
            content = self.clean_text_for_processing(content)

            # Split into paragraphs using double line breaks
            paragraphs = content.split('\n\n')

            logger.info(f"Found {len(paragraphs)} paragraphs in text file: {file_path}")

            processed_paragraphs = []
            file_name = Path(file_path).name

            for para_idx, paragraph in enumerate(paragraphs):
                paragraph = paragraph.strip()
                if paragraph:
                    processed_para = self._process_single_paragraph_from_text(
                        paragraph,
                        file_path,
                        file_name,
                        para_idx + 1,
                        preprocess
                    )
                    if processed_para:
                        processed_paragraphs.append(processed_para)

            logger.info(f"Processed {len(processed_paragraphs)} valid paragraphs from text file")
            return processed_paragraphs

        except Exception as e:
            logger.error(f"Error processing text file: {e}")
            raise

    def _process_single_paragraph_from_text(self, content: str, file_path: str,
                                            file_name: str, para_number: int,
                                            preprocess: bool) -> Optional[Document]:
        """
        Process a single paragraph from text file with analysis and metadata.

        Args:
            content: The paragraph content
            file_path: Full path to the source file
            file_name: Name of the source file
            para_number: The paragraph number
            preprocess: Whether to preprocess the text

        Returns:
            Document object with processed content and metadata, or None if paragraph is invalid
        """
        # First check character length
        if len(content.strip()) < self.PARAGRAPH_MIN_LENGTH:
            logger.debug(f"Paragraph {para_number} too short ({len(content)} chars), skipping")
            return None

        # Preprocess if requested
        if preprocess:
            content = self.preprocess_text(content, remove_headers_footers=False)

        # Analyze the paragraph
        analysis = self.analyze_text(content)

        # Validate content quality
        if not self.is_content_valid(content):
            logger.debug(f"Paragraph {para_number} failed content validation, skipping")
            return None

        # Create metadata
        metadata = {
            "source": file_path,
            "file_name": file_name,
            "file_type": "txt",
            "paragraph": para_number,
            "char_count": analysis["char_count"],
            "token_count": analysis["token_count"],
            "sentence_count": analysis["sentence_count"],
            "word_count": analysis["word_count"],
            "chunk_type": "paragraph",
            "processing_timestamp": datetime.now().isoformat(),
        }

        # Create and return document
        doc = Document(page_content=content, metadata=metadata)
        logger.debug(f"Created paragraph {para_number}: {analysis['char_count']} chars, {analysis['token_count']} tokens")

        return doc
src/core/SemanticChunker.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SemanticChunker.py
3
+ A module for semantic-aware text chunking using embeddings and similarity metrics.
4
+ """
5
+
6
+ import logging
7
+ from typing import List, Optional, Any
8
+ import numpy as np
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
+ from langchain_core.documents import Document
11
+ # FIXED IMPORT: Updated for LangChain v0.2+
12
+ from langchain_text_splitters import SpacyTextSplitter
13
+ from sentence_transformers import SentenceTransformer
14
+ from core.BaseChunker import BaseChunker
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
class SemanticChunker(BaseChunker):
    """Chunks text based on semantic similarity and size constraints"""

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        similarity_threshold: float = 0.9,
        separator: str = " "
    ):
        """
        Initialize the semantic chunker with configurable parameters.

        Args:
            model_name: Name of the model for tokenization.
            embedding_model: Model for generating embeddings; a working
                SentenceTransformer is loaded if this is None or a dummy.
            chunk_size: Maximum characters per emitted chunk.
            chunk_overlap: Overlap passed to the initial text splitter.
            similarity_threshold: Cosine similarity required to merge
                adjacent base chunks (0..1).
            separator: Separator used by the initial text splitter.

        Raises:
            ValueError: If chunk_size or similarity_threshold is invalid.
        """
        # Validate parameters
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer.")
        if not (0 <= similarity_threshold <= 1):
            raise ValueError("similarity_threshold must be between 0 and 1.")

        # Initialize BaseChunker first
        super().__init__(model_name, embedding_model)

        # Set semantic chunking parameters
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold
        self.separator = separator

        # Detect the known dummy embedder (a 384-dim all-zero vector) so a
        # real model can be substituted.
        is_dummy = False
        if embedding_model is not None:
            try:
                test_output = embedding_model.encode("test")
                if isinstance(test_output, list) and len(test_output) == 384 and all(x == 0.0 for x in test_output):
                    is_dummy = True
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit are not swallowed; any encode failure simply
                # means "not the dummy".
                pass

        if embedding_model is None or is_dummy:
            try:
                self.sentence_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
                self.embedding_model = self.sentence_model
                logger.info("Initialized SentenceTransformer for semantic chunking")
            except Exception as e:
                logger.error(f"Error loading SentenceTransformer: {e}")

                # Last-resort stand-in so chunking degrades instead of crashing.
                class DummyEmbedder:
                    def encode(self, text, **kwargs):
                        return [0.0] * 384
                self.sentence_model = DummyEmbedder()
                self.embedding_model = self.sentence_model
        else:
            self.sentence_model = embedding_model
            logger.info("Using provided embedding model for semantic chunking")

        # Initialize text splitter for initial chunking
        self.text_splitter = SpacyTextSplitter(
            chunk_size=self.chunk_size - self.chunk_overlap,
            chunk_overlap=self.chunk_overlap,
            separator=self.separator
        )

    def _enforce_size_immediately(self, text: str) -> List[str]:
        """Greedily re-split text into word-boundary chunks of at most
        self.chunk_size characters (counting joining spaces)."""
        if not text.strip():
            return []

        chunks = []
        current_chunk = []
        words = text.split()

        for word in words:
            # len(current_chunk) accounts for the spaces that will join words.
            if sum(len(w) for w in current_chunk) + len(word) + len(current_chunk) <= self.chunk_size:
                current_chunk.append(word)
            else:
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [word]

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    def get_semantic_chunks(self, documents: List[Document]) -> List[Document]:
        """Group base chunks by embedding similarity, then enforce size.

        Returns the original documents unchanged if chunking fails.
        """
        if not documents:
            logger.warning("No documents provided for semantic chunking")
            return []

        try:
            base_chunks = self.text_splitter.split_documents(documents)
            logger.info(f"Initial splitting created {len(base_chunks)} base chunks")

            if not base_chunks:
                return []

            chunk_contents = [doc.page_content for doc in base_chunks]
            chunk_embeddings = self.sentence_model.encode(chunk_contents)

            grouped_chunks = []
            current_group = []
            current_embedding = None

            for i, base_chunk in enumerate(base_chunks):
                if not current_group:
                    current_group.append(base_chunk)
                    current_embedding = chunk_embeddings[i].reshape(1, -1)
                    continue

                # Similarity is measured against the first chunk of the
                # current group (its embedding anchors the group).
                similarity = cosine_similarity(current_embedding, chunk_embeddings[i].reshape(1, -1))[0][0]
                combined_content = " ".join([doc.page_content for doc in current_group] + [base_chunk.page_content])

                if similarity >= self.similarity_threshold and len(combined_content) <= self.chunk_size:
                    current_group.append(base_chunk)
                else:
                    grouped_chunks.extend(self._finalize_chunk_group(current_group))
                    current_group = [base_chunk]
                    current_embedding = chunk_embeddings[i].reshape(1, -1)

            if current_group:
                grouped_chunks.extend(self._finalize_chunk_group(current_group))

            logger.info(f"Created {len(grouped_chunks)} semantic chunks")
            return grouped_chunks

        except Exception as e:
            logger.error(f"Error in semantic chunking: {e}")
            return documents

    def _finalize_chunk_group(self, group: List[Document]) -> List[Document]:
        """Merge a group of Documents, re-split to size, and attach
        per-chunk analysis metadata (inherits metadata from the first
        Document in the group)."""
        if not group:
            return []

        processed_chunks = []
        content = " ".join([doc.page_content for doc in group])
        size_limited_chunks = self._enforce_size_immediately(content)

        base_metadata = group[0].metadata.copy()

        for i, chunk in enumerate(size_limited_chunks):
            stats = self.analyze_text(chunk)
            metadata = base_metadata.copy()
            metadata.update({
                "chunk_index": i + 1,
                "chunk_count": len(size_limited_chunks),
                "char_count": stats["char_count"],
                "token_count": stats["token_count"],
                "sentence_count": stats["sentence_count"],
                "word_count": stats["word_count"],
                "chunk_type": "semantic"
            })

            processed_chunks.append(Document(page_content=chunk, metadata=metadata))

        return processed_chunks

    def semantic_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """Load a PDF, optionally preprocess each page, and return
        semantically grouped chunks.

        Raises:
            Exception: Re-raises any loading/processing failure after logging.
        """
        try:
            logger.info(f"Processing document with semantic chunking: {file_path}")

            raw_documents = self.load_document(file_path)

            processed_documents = []
            for doc in raw_documents:
                content = doc.page_content
                if preprocess:
                    content = self.preprocess_text(content)
                processed_documents.append(Document(
                    page_content=content,
                    metadata=doc.metadata
                ))

            documents = self.get_semantic_chunks(processed_documents)
            logger.info(f"Created {len(documents)} semantic chunks")

            return documents

        except Exception as e:
            logger.error(f"Error in semantic_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """Entry point required by BaseChunker; delegates to
        semantic_process_document."""
        return self.semantic_process_document(file_path, preprocess)
src/core/TextPreprocessor.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from nltk.corpus import stopwords
3
+ from nltk.stem import WordNetLemmatizer
4
+ import re
5
+
6
class TextPreprocessor:
    """Cleans and normalizes extracted document text.

    Pipeline: header/footer removal, PDF-artifact stripping, case
    folding, punctuation removal, whitespace normalization, stopword
    removal, and lemmatization (NLTK).
    """

    def __init__(self):
        # Assign the logger BEFORE touching NLTK: the original order set
        # self.logger last, so a failure in stopwords.words() made the
        # except handler crash with AttributeError on self.logger.
        self.logger = logging.getLogger(__name__)
        try:
            self.stopwords = set(stopwords.words('english'))
            self.lemmatizer = WordNetLemmatizer()
        except Exception as e:
            self.logger.error(f"Failed to initialize NLTK resources: {e}")
            raise

    def standardize_case(self, text):
        """Lowercase the text."""
        return text.lower()

    def remove_punctuation(self, text):
        """Strip everything that is not a word character or whitespace."""
        return re.sub(r'[^\w\s]', '', text)

    def normalize_whitespace(self, text):
        """Collapse runs of whitespace to single spaces and trim ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def remove_stopwords(self, words):
        """Drop English stopwords from a word list."""
        return [word for word in words if word not in self.stopwords]

    def lemmatize_words(self, words):
        """Lemmatize each word with WordNet."""
        return [self.lemmatizer.lemmatize(word) for word in words]

    def remove_headers_and_footers(self, text, aggressive=False, pattern=None):
        """Remove likely header/footer lines from a page of text.

        Args:
            text: Page text.
            aggressive: If True (and not slide-like), trim the first and
                last two lines outright.
            pattern: Optional regex; matching lines are removed.

        Returns:
            Cleaned text; the original text on error or when removal
            would delete more than 30% of the lines.
        """
        try:
            if not text or not text.strip():
                return text

            lines = text.splitlines()
            if len(lines) <= 4:  # For very short text, don't remove anything
                return text

            # Store original lines for fallback
            original_lines = lines.copy()

            # Use different strategies based on document characteristics
            if self._appears_to_be_slide(lines):
                # Slide-friendly approach - only remove obvious headers/footers
                cleaned_lines = self._clean_slide_headers_footers(lines, pattern)
            elif aggressive:
                # Traditional document approach - remove first/last few lines
                num_lines = 2
                cleaned_lines = lines[num_lines:-num_lines]
            else:
                # Conservative approach - only remove based on patterns
                cleaned_lines = self._pattern_based_removal(lines, pattern)

            # If we removed too much (over 30% of content), revert to original
            if len(cleaned_lines) < len(lines) * 0.7:
                self.logger.warning("Header/footer removal eliminated too much content, reverting")
                cleaned_lines = original_lines

            # Additional heuristic: Remove single-word lines that might be page numbers
            cleaned_lines = [line for line in cleaned_lines
                             if not (len(line.strip().split()) == 1 and
                                     line.strip().isdigit())]

            # Join lines back into text
            return '\n'.join(cleaned_lines)

        except Exception as e:
            self.logger.error(f"Error removing headers/footers: {e}")
            return text  # Return original text on error

    def _appears_to_be_slide(self, lines):
        """Detect if the content appears to be from a slide/presentation."""
        # Characteristics of slides:
        # - Shorter overall text
        # - Fewer lines
        # - More bullet points
        # - Title followed by bullet points

        if len(lines) < 15:  # Short content
            return True

        # Check for bullet point patterns
        bullet_pattern = r'^\s*[•\-\*\>\◦\○\◆\◇\▪\▫\⚫\⚪\✓\✔\✕\✖\✗\✘]'
        bullet_lines = sum(1 for line in lines if re.match(bullet_pattern, line))

        # If more than 20% of lines are bullets, likely a slide
        if bullet_lines > len(lines) * 0.2:
            return True

        # If first non-empty line is short (likely a title) and followed by bullet points
        non_empty_lines = [line for line in lines if line.strip()]
        if non_empty_lines and len(non_empty_lines[0].strip()) < 60:
            # Check for bullet points in the following lines
            for line in non_empty_lines[1:4]:  # Check next few lines
                if re.match(bullet_pattern, line):
                    return True

        return False

    def _clean_slide_headers_footers(self, lines, pattern=None):
        """Clean headers/footers from slide-based content."""
        cleaned_lines = lines.copy()

        # For slides, we primarily rely on pattern matching rather than line position
        if pattern:
            cleaned_lines = [line for line in cleaned_lines
                             if not re.search(pattern, line)]

        # Common slide footer patterns to remove
        footer_patterns = [
            r'^\s*\d+\s*$',          # Standalone page number
            r'confidential',          # Confidentiality notices
            r'all rights reserved',
            r'proprietary',
            r'^\s*www\.',             # Website in footer
            r'^\s*https?://',         # URL in footer
            r'\bpage\s+\d+\b',        # "Page X" footer
            r'^\s*[©Ⓒ]\s*\d{4}'       # Copyright notice
        ]

        # Combine all patterns
        combined_pattern = '|'.join(f'({p})' for p in footer_patterns)

        # Filter out footer lines
        if combined_pattern:
            cleaned_lines = [line for line in cleaned_lines
                             if not re.search(combined_pattern, line, re.IGNORECASE)]

        return cleaned_lines

    def _pattern_based_removal(self, lines, pattern=None):
        """Remove headers/footers based only on patterns, not position."""
        if not pattern:
            # Default patterns for headers/footers
            patterns = [
                r'^\s*\d+\s*$',                     # Standalone page numbers
                r'^\s*page\s+\d+\s+of\s+\d+\s*$',   # Page X of Y
                r'^\s*[©Ⓒ]\s*\d{4}.*$',             # Copyright lines
                r'^\s*confidential\s*$',            # Confidentiality markers
                r'^\s*https?://.*$',                # URLs alone on a line
                r'^\s*www\..*$',                    # Website alone on a line
                r'^\s*[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\s*$'  # Email addresses
            ]
            combined_pattern = '|'.join(f'({p})' for p in patterns)
        else:
            combined_pattern = pattern

        return [line for line in lines
                if not re.search(combined_pattern, line, re.IGNORECASE)]

    def remove_common_pdf_artifacts(self, text):
        """Strip form-field markers, PDF annotations, and object refs."""
        try:
            # Remove form field indicators
            text = re.sub(r'\[\s*\]\s*|\[\s*X\s*\]|\(\s*\)\s*|\(\s*X\s*\)', '', text)

            # Remove common PDF annotations
            text = re.sub(r'<<[^>]*>>', '', text)

            # Remove artifact markers often found in PDFs
            text = re.sub(r'obj\s*\d+\s*\d+\s*R', '', text)

            return text

        except Exception as e:
            self.logger.error(f"Error removing PDF artifacts: {e}")
            return text

    def preprocess(self, text, remove_headers_footers=True, aggressive_removal=False):
        """Run the full cleaning pipeline and return normalized text.

        Raises:
            Exception: Re-raises any pipeline failure after logging.
        """
        try:
            if remove_headers_footers:
                text = self.remove_headers_and_footers(text, aggressive=aggressive_removal)

            text = self.remove_common_pdf_artifacts(text)

            text = self.standardize_case(text)
            text = self.remove_punctuation(text)
            text = self.normalize_whitespace(text)

            words = text.split()
            words = self.remove_stopwords(words)
            words = self.lemmatize_words(words)

            return ' '.join(words)
        except Exception as e:
            self.logger.error(f"Error preprocessing text: {e}")
            raise
src/core/TokenChunker.py ADDED
@@ -0,0 +1,458 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TokenChunker.py
3
+
4
+ A module for token-based document chunking with configurable overlap and preprocessing.
5
+
6
+ Features:
7
+ - Token-based document splitting with overlap
8
+ - Content validation and token counting
9
+ - Smart boundary detection to preserve word integrity
10
+ - Compatible with multiple tokenizer types (tiktoken, transformers, basic)
11
+ """
12
+
13
+ import logging
14
+ import re
15
+ from typing import List, Optional, Dict, Any
16
+ from langchain_core.documents import Document
17
+ from core.BaseChunker import BaseChunker
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
class TokenChunker(BaseChunker):
    """Handles document chunking at the token level with configurable overlap."""

    def __init__(
        self,
        model_name=None,
        embedding_model=None,
        chunk_size: int = 256,
        chunk_overlap: int = 50,
        min_chunk_size: int = 50
    ):
        """
        Initialize token chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
            chunk_size: Maximum tokens per chunk (must be positive)
            chunk_overlap: Number of tokens to overlap between chunks
                (must be non-negative and less than chunk_size)
            min_chunk_size: Minimum tokens for a valid chunk (must be positive)

        Raises:
            ValueError: If any chunking parameter is out of range.
        """
        super().__init__(model_name, embedding_model)

        # Validate chunking parameters up front. A non-positive chunk_size or
        # a negative chunk_overlap previously slipped past validation and
        # caused _create_token_chunks to silently skip tokens (the window
        # start would jump past `end`), so fail loudly here instead.
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if chunk_overlap < 0:
            raise ValueError("chunk_overlap must be non-negative")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be less than chunk_size")
        if min_chunk_size <= 0:
            raise ValueError("min_chunk_size must be positive")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.chunk_stats = []  # per-document, human-readable chunking notes

        logger.info(f"TokenChunker initialized: chunk_size={chunk_size}, overlap={chunk_overlap}, min_size={min_chunk_size}")
+
57
+ def _smart_tokenize(self, text: str) -> List[str]:
58
+ """
59
+ Tokenize text while preserving word boundaries for reconstruction.
60
+
61
+ Args:
62
+ text: The text content to tokenize
63
+
64
+ Returns:
65
+ List of tokens that can be cleanly rejoined
66
+ """
67
+ if not text.strip():
68
+ return []
69
+
70
+ try:
71
+ if self.uses_tiktoken:
72
+ # For tiktoken, we need a hybrid approach to preserve boundaries
73
+ return self._tiktoken_boundary_aware_split(text)
74
+
75
+ elif hasattr(self.tokenizer, 'tokenize'):
76
+ # For transformers tokenizers
77
+ tokens = self.tokenizer.tokenize(text)
78
+ return self._clean_subword_tokens(tokens)
79
+
80
+ else:
81
+ # Fallback to intelligent word splitting
82
+ return self._word_boundary_split(text)
83
+
84
+ except Exception as e:
85
+ logger.warning(f"Tokenization failed: {e}. Using word boundary fallback.")
86
+ return self._word_boundary_split(text)
87
+
88
+ def _tiktoken_boundary_aware_split(self, text: str) -> List[str]:
89
+ """
90
+ Split text in a way that's compatible with tiktoken while preserving boundaries.
91
+
92
+ Args:
93
+ text: Input text
94
+
95
+ Returns:
96
+ List of text segments that approximate tokens
97
+ """
98
+ # Get actual token count for validation
99
+ target_token_count = self.count_tokens(text)
100
+
101
+ # Split on natural boundaries (spaces, punctuation)
102
+ words = re.findall(r'\S+|\s+', text)
103
+
104
+ # If we have roughly the right number of words, use them
105
+ if abs(len(words) - target_token_count) / max(target_token_count, 1) < 0.3:
106
+ return [w for w in words if w.strip()]
107
+
108
+ # Otherwise, use a more granular split
109
+ segments = re.findall(r'\w+|[^\w\s]|\s+', text)
110
+ return [s for s in segments if s.strip()]
111
+
112
+ def _clean_subword_tokens(self, tokens: List[str]) -> List[str]:
113
+ """
114
+ Clean subword tokens for better reconstruction.
115
+
116
+ Args:
117
+ tokens: Raw tokens from tokenizer
118
+
119
+ Returns:
120
+ Cleaned tokens
121
+ """
122
+ cleaned = []
123
+ for token in tokens:
124
+ # Remove special tokens but keep the content
125
+ if token.startswith('##'):
126
+ # BERT-style subwords
127
+ cleaned.append(token[2:])
128
+ elif token.startswith('▁'):
129
+ # SentencePiece-style
130
+ cleaned.append(' ' + token[1:])
131
+ else:
132
+ cleaned.append(token)
133
+ return [t for t in cleaned if t.strip()]
134
+
135
+ def _word_boundary_split(self, text: str) -> List[str]:
136
+ """
137
+ Split text on word boundaries as fallback tokenization.
138
+
139
+ Args:
140
+ text: Input text
141
+
142
+ Returns:
143
+ List of words
144
+ """
145
+ # Split on whitespace but preserve some punctuation as separate tokens
146
+ tokens = re.findall(r'\w+|[.!?;,]', text)
147
+ return tokens
148
+
149
+ def _detokenize(self, tokens: List[str]) -> str:
150
+ """
151
+ Reconstruct text from tokens, handling different tokenizer types.
152
+
153
+ Args:
154
+ tokens: List of token strings
155
+
156
+ Returns:
157
+ Reconstructed text
158
+ """
159
+ if not tokens:
160
+ return ""
161
+
162
+ if self.uses_tiktoken or not hasattr(self.tokenizer, 'tokenize'):
163
+ # For tiktoken and basic tokenizers, use space joining with smart spacing
164
+ result = ""
165
+ for i, token in enumerate(tokens):
166
+ if not token.strip():
167
+ continue
168
+
169
+ if i == 0:
170
+ result = token
171
+ elif token in '.,!?;:':
172
+ result += token
173
+ elif result and result[-1] in '.,!?;:':
174
+ result += " " + token
175
+ else:
176
+ result += " " + token
177
+ return result
178
+
179
+ else:
180
+ # For transformers tokenizers, handle subword reconstruction
181
+ text = "".join(tokens)
182
+ # Clean up spacing around punctuation
183
+ text = re.sub(r'\s+([.!?;,])', r'\1', text)
184
+ text = re.sub(r'\s+', ' ', text)
185
+ return text.strip()
186
+
187
+ def _create_token_chunks(self, tokens: List[str]) -> List[List[str]]:
188
+ """
189
+ Split tokens into overlapping chunks of specified size.
190
+
191
+ Args:
192
+ tokens: List of token strings
193
+
194
+ Returns:
195
+ List of token chunks
196
+ """
197
+ if not tokens:
198
+ return []
199
+
200
+ chunks = []
201
+ start = 0
202
+
203
+ while start < len(tokens):
204
+ # Calculate end position for this chunk
205
+ end = min(start + self.chunk_size, len(tokens))
206
+
207
+ # Extract the chunk
208
+ chunk_tokens = tokens[start:end]
209
+
210
+ # Only add chunks that meet minimum size requirement
211
+ if len(chunk_tokens) >= self.min_chunk_size:
212
+ chunks.append(chunk_tokens)
213
+ self.chunk_stats.append(f"Created chunk with {len(chunk_tokens)} tokens")
214
+ else:
215
+ self.chunk_stats.append(f"Skipped small chunk with {len(chunk_tokens)} tokens")
216
+
217
+ # Break if we've reached the end
218
+ if end >= len(tokens):
219
+ break
220
+
221
+ # Calculate next start position with overlap
222
+ start = end - self.chunk_overlap
223
+
224
+ # Ensure forward progress
225
+ if start <= 0:
226
+ start = end
227
+
228
+ return chunks
229
+
230
+ def _process_single_chunk(self, chunk_tokens: List[str], chunk_index: int,
231
+ source_metadata: Dict[str, Any]) -> Optional[Document]:
232
+ """
233
+ Process a single token chunk into a Document with metadata.
234
+
235
+ Args:
236
+ chunk_tokens: List of tokens for this chunk
237
+ chunk_index: Index of this chunk in the document
238
+ source_metadata: Metadata from source document
239
+
240
+ Returns:
241
+ Document object with processed content and metadata, or None if invalid
242
+ """
243
+ # Reconstruct text from tokens
244
+ chunk_text = self._detokenize(chunk_tokens)
245
+
246
+ # Validate chunk content
247
+ if not self.is_content_valid(chunk_text, min_tokens=self.min_chunk_size):
248
+ self.chunk_stats.append(f"Chunk {chunk_index} failed validation")
249
+ return None
250
+
251
+ # Analyze the chunk content
252
+ stats = self.analyze_text(chunk_text)
253
+
254
+ # Create comprehensive metadata
255
+ metadata = source_metadata.copy()
256
+ metadata.update({
257
+ "chunk_index": chunk_index,
258
+ "chunk_type": "token",
259
+ "chunking_method": "token_based",
260
+ "token_count": len(chunk_tokens),
261
+ "char_count": stats["char_count"],
262
+ "sentence_count": stats["sentence_count"],
263
+ "word_count": stats["word_count"],
264
+ "chunk_size_limit": self.chunk_size,
265
+ "chunk_overlap": self.chunk_overlap
266
+ })
267
+
268
+ return Document(page_content=chunk_text, metadata=metadata)
269
+
270
    def token_process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process document using token-based chunking with overlap.

        Loads all pages, concatenates the valid ones into a single text
        blob, tokenizes the blob as a whole, and slices the token stream
        into overlapping chunks — so chunks may span page boundaries.

        Args:
            file_path: Path to the document file
            preprocess: Whether to preprocess text content

        Returns:
            List of Document objects, one per valid token chunk

        Raises:
            Exception: Re-raised after logging if any stage fails.
        """
        try:
            self.chunk_stats = []  # Reset stats for this document
            raw_pages = self.load_document(file_path)
            processed_chunks = []

            logger.info(f"Processing document with {len(raw_pages)} pages using token chunking")

            # Combine all pages into a single text for token-based processing
            full_text = ""
            combined_metadata = {}
            page_info = []  # Track which pages contributed to the text

            for page_idx, page in enumerate(raw_pages):
                content = page.page_content

                # Skip invalid content (blank pages / too little text)
                if not self.is_content_valid(content):
                    logger.debug(f"Skipping invalid content on page {page_idx + 1}")
                    continue

                # Preprocess if requested; re-validate because preprocessing
                # can strip a page down to nothing.
                if preprocess:
                    content = self.preprocess_text(content)
                    if not self.is_content_valid(content):
                        continue

                # Track page information for the combined metadata below
                page_info.append({
                    "page_number": page_idx + 1,
                    "original_metadata": page.metadata
                })

                # Combine text with page separation (blank line between pages)
                if full_text:
                    full_text += "\n\n" + content
                else:
                    full_text = content
                    # Use metadata from first valid page as base
                    combined_metadata = page.metadata.copy()

            # Update combined metadata to reflect all pages
            if page_info:
                combined_metadata.update({
                    "total_pages_processed": len(page_info),
                    "page_range": f"{page_info[0]['page_number']}-{page_info[-1]['page_number']}",
                    "source_pages": [str(p["page_number"]) for p in page_info]  # stringified for serialization-friendly metadata
                })
                # Remove the single "page" field since this represents multiple pages
                combined_metadata.pop("page", None)

            if not full_text.strip():
                logger.warning("No valid content found in document")
                return []

            # Tokenize the entire document
            all_tokens = self._smart_tokenize(full_text)
            logger.info(f"Document tokenized into {len(all_tokens)} tokens")

            if len(all_tokens) < self.min_chunk_size:
                logger.warning(f"Document too short for chunking ({len(all_tokens)} tokens)")
                return []

            # Create overlapping token chunks
            token_chunks = self._create_token_chunks(all_tokens)
            logger.info(f"Created {len(token_chunks)} token chunks")

            # Convert token chunks to Document objects; helpers return None
            # for chunks that fail validation.
            for chunk_idx, chunk_tokens in enumerate(token_chunks):
                chunk_doc = self._process_single_chunk(
                    chunk_tokens,
                    chunk_idx,
                    combined_metadata
                )
                if chunk_doc:
                    processed_chunks.append(chunk_doc)

            # Output processing statistics accumulated by the helpers
            if self.chunk_stats:
                logger.info("\n".join(self.chunk_stats))

            logger.info(f"Processed {len(processed_chunks)} valid token chunks")
            return processed_chunks

        except Exception as e:
            logger.error(f"Error in token_process_document: {e}")
            raise
367
+
368
+ def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
369
+ """
370
+ Process document using token chunking strategy (implements abstract method).
371
+
372
+ Args:
373
+ file_path: Path to the document file
374
+ preprocess: Whether to preprocess text content
375
+
376
+ Returns:
377
+ List of Document objects, one per valid token chunk
378
+ """
379
+ return self.token_process_document(file_path, preprocess)
380
+
381
    def process_text_file(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """
        Process text file directly using token-based chunking with overlap.

        Unlike :meth:`token_process_document`, this path skips the page
        loader and header/footer removal: the whole file is read, lightly
        cleaned, tokenized, and chunked in one pass.

        Args:
            file_path: Path to the text file
            preprocess: Whether to preprocess text content

        Returns:
            List of Document objects, one per valid token chunk

        Raises:
            Exception: Re-raised after logging if any stage fails.
        """
        try:
            # Local imports keep these out of module import time.
            from pathlib import Path
            from datetime import datetime

            self.chunk_stats = []  # Reset stats for this document

            # Load the text file directly
            content = self.load_text_file(file_path)

            # Clean the text using the same logic as PDF conversion
            content = self.clean_text_for_processing(content)

            # Basic validation (blank / too-short content)
            if not self.is_content_valid(content):
                logger.warning("Text file content failed validation")
                return []

            # Light preprocessing if requested (no header/footer removal for txt files)
            if preprocess:
                # Only apply basic text cleaning, not aggressive preprocessing
                content = ' '.join(content.split())  # Normalize whitespace

            # Create file-level metadata shared by every chunk
            file_path_obj = Path(file_path)
            file_metadata = {
                "source": file_path,
                "file_name": file_path_obj.name,
                "file_type": "txt",
                "total_characters": len(content),
                "processing_timestamp": datetime.now().isoformat(),
            }

            logger.info(f"Processing text file: {file_path_obj.name} ({len(content)} characters)")

            # Tokenize the entire document
            all_tokens = self._smart_tokenize(content)
            logger.info(f"Text file tokenized into {len(all_tokens)} tokens")

            if len(all_tokens) < self.min_chunk_size:
                logger.warning(f"Text file too short for chunking ({len(all_tokens)} tokens)")
                return []

            # Create overlapping token chunks
            token_chunks = self._create_token_chunks(all_tokens)
            logger.info(f"Created {len(token_chunks)} token chunks from text file")

            # Convert token chunks to Document objects; invalid chunks are
            # dropped (helper returns None).
            processed_chunks = []
            for chunk_idx, chunk_tokens in enumerate(token_chunks):
                chunk_doc = self._process_single_chunk(
                    chunk_tokens,
                    chunk_idx,
                    file_metadata
                )
                if chunk_doc:
                    processed_chunks.append(chunk_doc)

            # Output processing statistics accumulated by the helpers
            if self.chunk_stats:
                logger.info("\n".join(self.chunk_stats))

            logger.info(f"Processed {len(processed_chunks)} valid token chunks from text file")
            return processed_chunks

        except Exception as e:
            logger.error(f"Error processing text file: {e}")
            raise
+ raise
src/core/__init__.py ADDED
File without changes