Kalpokoch committed on
Commit 1d57863 · verified · 1 Parent(s): 4a276db

Update core/chunking.py

Files changed (1):
  core/chunking.py  +210 -25
core/chunking.py CHANGED
@@ -4,43 +4,228 @@ import numpy as np
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import logging
+import re
+from typing import List, Optional
+
 
 logger = logging.getLogger(__name__)
 
-def semantic_chunker(text: str, model: SentenceTransformer, similarity_threshold: float = 0.55):
-    """
-    Splits text into chunks based on semantic similarity of sentences.
-    """
-    logger.info("Starting semantic chunking...")
-    # First, split the document into sentences. A simple split by newline and period.
-    sentences = [s.strip() for s in text.replace("\n", ". ").split(".") if s.strip()]
-    if not sentences:
-        return []
-
-    # Generate embeddings for each sentence
-    embeddings = model.encode(sentences, convert_to_numpy=True)
-
-    chunks = []
-    current_chunk_sentences = [sentences[0]]
-
-    for i in range(1, len(sentences)):
-        # Calculate similarity between the current sentence and the previous one
-        similarity = cosine_similarity(
-            embeddings[i].reshape(1, -1),
-            embeddings[i-1].reshape(1, -1)
-        )[0, 0]
-
-        # If similarity is below the threshold, it's a semantic break.
-        # Finalize the current chunk and start a new one.
-        if similarity < similarity_threshold:
-            chunks.append(" ".join(current_chunk_sentences))
-            current_chunk_sentences = []
-
-        current_chunk_sentences.append(sentences[i])
-
-    # Add the last remaining chunk
-    if current_chunk_sentences:
-        chunks.append(" ".join(current_chunk_sentences))
-
-    logger.info(f"Semantic chunking resulted in {len(chunks)} chunks.")
-    return chunks
+
+def _split_into_sentences(text: str) -> List[str]:
+    """
+    Improved sentence splitting that handles common edge cases.
+    """
+    # Handle common abbreviations that shouldn't cause splits
+    abbreviations = [
+        'Dr', 'Mr', 'Mrs', 'Ms', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'Inc', 'Ltd', 'Corp',
+        'U.S', 'U.K', 'U.N', 'E.U', 'NASA', 'FBI', 'CIA', 'GDP', 'CEO', 'CFO', 'CTO'
+    ]
+
+    # Temporarily replace abbreviations to protect them from splitting
+    protected_text = text
+    replacements = {}
+    for i, abbr in enumerate(abbreviations):
+        placeholder = f"__ABBR_{i}__"
+        protected_text = re.sub(rf'\b{re.escape(abbr)}\.', placeholder, protected_text, flags=re.IGNORECASE)
+        replacements[placeholder] = f"{abbr}."
+
+    # Split on sentence-ending punctuation followed by whitespace or end of string
+    sentence_pattern = r'[.!?]+(?:\s+|$)'
+    sentences = re.split(sentence_pattern, protected_text)
+
+    # Restore abbreviations and clean up
+    cleaned_sentences = []
+    for sentence in sentences:
+        if sentence.strip():
+            # Restore abbreviations
+            for placeholder, original in replacements.items():
+                sentence = sentence.replace(placeholder, original)
+            cleaned_sentences.append(sentence.strip())
+
+    return cleaned_sentences
+
+
+def _calculate_rolling_similarity(embeddings: np.ndarray, window_size: int = 3) -> List[float]:
+    """
+    Calculate rolling average similarity to smooth out noise and capture broader semantic shifts.
+    """
+    similarities = []
+
+    for i in range(1, len(embeddings)):
+        # Calculate similarity between the current and previous sentence
+        current_sim = cosine_similarity(
+            embeddings[i].reshape(1, -1),
+            embeddings[i-1].reshape(1, -1)
+        )[0, 0]
+        similarities.append(current_sim)
+
+    # Apply a rolling average to smooth the similarities
+    if len(similarities) <= window_size:
+        return similarities
+
+    smoothed = []
+    for i in range(len(similarities)):
+        start_idx = max(0, i - window_size // 2)
+        end_idx = min(len(similarities), i + window_size // 2 + 1)
+        window_similarities = similarities[start_idx:end_idx]
+        smoothed.append(np.mean(window_similarities))
+
+    return smoothed
+
+
+def _adaptive_threshold(similarities: List[float], base_threshold: float = 0.55) -> float:
+    """
+    Dynamically adjust the threshold based on the distribution of similarities in the text.
+    """
+    if not similarities:
+        return base_threshold
+
+    mean_sim = np.mean(similarities)
+    std_sim = np.std(similarities)
+
+    # Adjust the threshold based on text characteristics:
+    # if similarities are generally high, use a higher threshold;
+    # if similarities vary a lot, be more conservative.
+    adjusted_threshold = max(
+        base_threshold,
+        mean_sim - (0.5 * std_sim)
+    )
+
+    return min(adjusted_threshold, 0.8)  # Cap at 0.8 to avoid over-splitting
+
+
+def semantic_chunker(
+    text: str,
+    model: SentenceTransformer,
+    similarity_threshold: float = 0.55,
+    min_chunk_size: int = 50,
+    max_chunk_size: int = 1000,
+    adaptive_threshold_enabled: bool = True
+) -> List[str]:
+    """
+    Enhanced semantic chunking with improved sentence splitting, adaptive thresholding,
+    and chunk size controls.
+
+    Args:
+        text: Input text to chunk
+        model: SentenceTransformer model for embeddings
+        similarity_threshold: Base threshold for semantic breaks
+        min_chunk_size: Minimum characters per chunk
+        max_chunk_size: Maximum characters per chunk
+        adaptive_threshold_enabled: Whether to use adaptive thresholding
+
+    Returns:
+        List of text chunks
+    """
+    logger.info("Starting enhanced semantic chunking...")
+
+    if not text or not text.strip():
+        logger.warning("Empty or whitespace-only text provided")
+        return []
+
+    # Improved sentence splitting
+    sentences = _split_into_sentences(text)
+
+    if len(sentences) <= 1:
+        logger.info("Text contains only one sentence, returning as single chunk")
+        return [text.strip()]
+
+    logger.info(f"Split text into {len(sentences)} sentences")
+
+    try:
+        # Generate embeddings with error handling
+        embeddings = model.encode(sentences, convert_to_numpy=True, show_progress_bar=False)
+        logger.info("Generated sentence embeddings")
+    except Exception as e:
+        logger.error(f"Failed to generate embeddings: {e}")
+        # Fall back to returning the whole text if embedding fails
+        return [text]
+
+    # Calculate smoothed similarities
+    similarities = _calculate_rolling_similarity(embeddings)
+
+    if not similarities:
+        return [text.strip()]
+
+    # Adaptive threshold adjustment
+    if adaptive_threshold_enabled:
+        threshold = _adaptive_threshold(similarities, similarity_threshold)
+        logger.info(f"Adjusted threshold from {similarity_threshold:.3f} to {threshold:.3f}")
+    else:
+        threshold = similarity_threshold
+
+    # Enhanced chunking with size constraints
+    chunks = []
+    current_chunk_sentences = [sentences[0]]
+    current_chunk_length = len(sentences[0])
+
+    for i, similarity in enumerate(similarities):
+        sentence_idx = i + 1  # similarities[i] compares sentences[i+1] with sentences[i]
+        sentence = sentences[sentence_idx]
+        sentence_length = len(sentence)
+
+        # Check if we should create a new chunk
+        should_break = False
+
+        # Semantic break condition
+        if similarity < threshold:
+            should_break = True
+
+        # Maximum size constraint - force a break if adding the sentence exceeds max size
+        elif current_chunk_length + sentence_length > max_chunk_size:
+            should_break = True
+
+        # If we decide to break, finalize the current chunk
+        if should_break and current_chunk_sentences:
+            chunk_text = " ".join(current_chunk_sentences)
+
+            # Only emit the chunk if it meets the minimum size; otherwise keep
+            # accumulating so the short chunk merges with the next sentences
+            if len(chunk_text) >= min_chunk_size or not chunks:
+                chunks.append(chunk_text)
+                current_chunk_sentences = []
+                current_chunk_length = 0
+
+        # Add the current sentence to the chunk
+        current_chunk_sentences.append(sentence)
+        current_chunk_length += sentence_length + 1  # +1 for the joining space
+
+    # Handle the final chunk
+    if current_chunk_sentences:
+        final_chunk = " ".join(current_chunk_sentences)
+
+        # If the final chunk is too small, merge it with the previous chunk
+        if len(final_chunk) < min_chunk_size and chunks:
+            chunks[-1] = chunks[-1] + " " + final_chunk
+        else:
+            chunks.append(final_chunk)
+
+    # Post-processing: ensure no chunks are too large
+    final_chunks = []
+    for chunk in chunks:
+        if len(chunk) <= max_chunk_size:
+            final_chunks.append(chunk)
+        else:
+            # Split oversized chunks at sentence boundaries
+            chunk_sentences = _split_into_sentences(chunk)
+            temp_chunk = ""
+
+            for sent in chunk_sentences:
+                if len(temp_chunk) + len(sent) <= max_chunk_size:
+                    temp_chunk += (" " + sent) if temp_chunk else sent
+                else:
+                    if temp_chunk:
+                        final_chunks.append(temp_chunk)
+                    temp_chunk = sent
+
+            if temp_chunk:
+                final_chunks.append(temp_chunk)
+
+    logger.info(f"Enhanced semantic chunking resulted in {len(final_chunks)} chunks")
+
+    # Log chunk statistics for debugging
+    if final_chunks:
+        chunk_lengths = [len(chunk) for chunk in final_chunks]
+        logger.debug(f"Chunk length stats - Min: {min(chunk_lengths)}, "
+                     f"Max: {max(chunk_lengths)}, Mean: {np.mean(chunk_lengths):.1f}")
+
+    return final_chunks
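
Usage note: a minimal sketch of how the updated semantic_chunker could be called, assuming the repo root is on PYTHONPATH; the model name, sample text, and keyword values are illustrative, not part of this commit.

    from sentence_transformers import SentenceTransformer
    from core.chunking import semantic_chunker

    # Illustrative model choice; any SentenceTransformer model should work
    model = SentenceTransformer("all-MiniLM-L6-v2")

    text = (
        "Dr. Smith joined the lab in 2019. Her research focuses on protein folding. "
        "The cafeteria, meanwhile, serves lunch at noon. It is famous for its soup."
    )

    chunks = semantic_chunker(
        text,
        model,
        similarity_threshold=0.55,   # base value; may be raised adaptively
        min_chunk_size=50,
        max_chunk_size=1000,
        adaptive_threshold_enabled=True,
    )

    for i, chunk in enumerate(chunks):
        print(f"[{i}] ({len(chunk)} chars) {chunk}")

One tradeoff worth knowing: because _split_into_sentences consumes the period of every protected abbreviation, a genuine sentence end after a word like "Inc." or "U.S." no longer triggers a split, so such sentences stay merged with their successors.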
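To make the adaptive thresholding concrete, here is a worked example of the arithmetic in _adaptive_threshold; the similarity values are hypothetical:

    import numpy as np

    # Hypothetical smoothed similarities for a ten-sentence document (9 adjacent pairs)
    similarities = [0.82, 0.79, 0.85, 0.40, 0.81, 0.77, 0.35, 0.80, 0.83]

    mean_sim = np.mean(similarities)                 # ~0.713
    std_sim = np.std(similarities)                   # ~0.183
    adjusted = max(0.55, mean_sim - 0.5 * std_sim)   # ~0.622
    adjusted = min(adjusted, 0.8)                    # cap to avoid over-splitting

    # Because similarities run high overall, the threshold rises from the
    # base 0.55 to ~0.622, so only clearly dissimilar pairs break chunks
    print(round(adjusted, 3))  # 0.622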