Update src/ingestion.py
Browse files- src/ingestion.py +29 -23
src/ingestion.py
CHANGED
|
@@ -229,7 +229,7 @@ def get_hybrid_toc(text: str):
|
|
| 229 |
|
| 230 |
|
| 231 |
# ==========================================================
|
| 232 |
-
# 4️⃣ SMART CHUNKING (
|
| 233 |
# ==========================================================
|
| 234 |
def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
|
| 235 |
text_length = len(text)
|
|
@@ -248,24 +248,44 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
|
|
| 248 |
# --- Normalize ---
|
| 249 |
text = re.sub(r"\s+", " ", text.strip())
|
| 250 |
|
| 251 |
-
#
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
)
|
| 255 |
-
chunks = []
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
for block in procedure_blocks:
|
| 258 |
if not block.strip():
|
| 259 |
continue
|
| 260 |
|
| 261 |
-
# Keep full procedure blocks together if not too long
|
| 262 |
if len(block) < chunk_size * 1.5:
|
| 263 |
chunks.append(block.strip())
|
| 264 |
else:
|
| 265 |
-
# Fallback: split gracefully by sentence
|
| 266 |
chunks.extend(_split_by_sentence(block, chunk_size, overlap))
|
| 267 |
|
| 268 |
-
#
|
| 269 |
chunks = _merge_small_chunks(chunks, min_len=200)
|
| 270 |
final_chunks = []
|
| 271 |
for i, ch in enumerate(chunks):
|
|
@@ -275,23 +295,9 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
|
|
| 275 |
prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
|
| 276 |
final_chunks.append((prev_tail + " " + ch).strip())
|
| 277 |
|
| 278 |
-
print(f"✅ Final chunks created (procedure-aware): {len(final_chunks)}")
|
| 279 |
return final_chunks
|
| 280 |
|
| 281 |
-
def _split_by_sentence(text, chunk_size=800, overlap=80):
|
| 282 |
-
sentences = re.split(r"(?<=[.!?])\s+", text)
|
| 283 |
-
chunks, current = [], ""
|
| 284 |
-
for sent in sentences:
|
| 285 |
-
if len(current) + len(sent) + 1 <= chunk_size:
|
| 286 |
-
current += " " + sent
|
| 287 |
-
else:
|
| 288 |
-
if current.strip():
|
| 289 |
-
chunks.append(current.strip())
|
| 290 |
-
overlap_part = current[-overlap:] if overlap > 0 else ""
|
| 291 |
-
current = overlap_part + " " + sent
|
| 292 |
-
if current.strip():
|
| 293 |
-
chunks.append(current.strip())
|
| 294 |
-
return chunks
|
| 295 |
|
| 296 |
|
| 297 |
def _merge_small_chunks(chunks, min_len=150):
|
|
|
|
| 229 |
|
| 230 |
|
| 231 |
# ==========================================================
|
| 232 |
+
# 4️⃣ SMART CHUNKING (hierarchical + procedure-aware)
|
| 233 |
# ==========================================================
|
| 234 |
def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
|
| 235 |
text_length = len(text)
|
|
|
|
| 248 |
# --- Normalize ---
|
| 249 |
text = re.sub(r"\s+", " ", text.strip())
|
| 250 |
|
| 251 |
+
# ==========================================================
|
| 252 |
+
# 🧩 Step 1: Split by numbered section headers (major anchors)
|
| 253 |
+
# Example: 4.1 Preconditions | 3.2 Restrictions
|
| 254 |
+
# ==========================================================
|
| 255 |
+
section_blocks = re.split(
|
| 256 |
+
r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
|
| 257 |
+
text
|
| 258 |
)
|
|
|
|
| 259 |
|
| 260 |
+
# ==========================================================
|
| 261 |
+
# 🧩 Step 2: Within each section, detect procedural subsections
|
| 262 |
+
# ==========================================================
|
| 263 |
+
procedure_blocks = []
|
| 264 |
+
for sec in section_blocks:
|
| 265 |
+
if not sec.strip():
|
| 266 |
+
continue
|
| 267 |
+
|
| 268 |
+
sub_blocks = re.split(
|
| 269 |
+
r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
|
| 270 |
+
sec,
|
| 271 |
+
flags=re.IGNORECASE
|
| 272 |
+
)
|
| 273 |
+
procedure_blocks.extend(sub_blocks)
|
| 274 |
+
|
| 275 |
+
# ==========================================================
|
| 276 |
+
# 🧠 Step 3: Build final chunks (preserve continuity + overlap)
|
| 277 |
+
# ==========================================================
|
| 278 |
+
chunks = []
|
| 279 |
for block in procedure_blocks:
|
| 280 |
if not block.strip():
|
| 281 |
continue
|
| 282 |
|
|
|
|
| 283 |
if len(block) < chunk_size * 1.5:
|
| 284 |
chunks.append(block.strip())
|
| 285 |
else:
|
|
|
|
| 286 |
chunks.extend(_split_by_sentence(block, chunk_size, overlap))
|
| 287 |
|
| 288 |
+
# Merge and continuity
|
| 289 |
chunks = _merge_small_chunks(chunks, min_len=200)
|
| 290 |
final_chunks = []
|
| 291 |
for i, ch in enumerate(chunks):
|
|
|
|
| 295 |
prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
|
| 296 |
final_chunks.append((prev_tail + " " + ch).strip())
|
| 297 |
|
| 298 |
+
print(f"✅ Final chunks created (section-aware + procedure-aware): {len(final_chunks)}")
|
| 299 |
return final_chunks
|
| 300 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
|
| 303 |
def _merge_small_chunks(chunks, min_len=150):
|