Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 19

Commit

00eb202

verified ·

1 Parent(s): dd1dffd

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +29 -23

src/ingestion.py CHANGED Viewed

@@ -229,7 +229,7 @@ def get_hybrid_toc(text: str):
 # ==========================================================
-# 4️⃣ SMART CHUNKING (same as before)
 # ==========================================================
 def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     text_length = len(text)
@@ -248,24 +248,44 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     # --- Normalize ---
     text = re.sub(r"\s+", " ", text.strip())
-    # --- 🧩 Detect procedural sections (new) ---
-    procedure_blocks = re.split(
-        r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)", text, flags=re.IGNORECASE
     )
-    chunks = []
     for block in procedure_blocks:
         if not block.strip():
             continue
-        # Keep full procedure blocks together if not too long
         if len(block) < chunk_size * 1.5:
             chunks.append(block.strip())
         else:
-            # Fallback: split gracefully by sentence
             chunks.extend(_split_by_sentence(block, chunk_size, overlap))
-    # --- 🧠 Continuity preservation ---
     chunks = _merge_small_chunks(chunks, min_len=200)
     final_chunks = []
     for i, ch in enumerate(chunks):
@@ -275,23 +295,9 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
             prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
             final_chunks.append((prev_tail + " " + ch).strip())
-    print(f"✅ Final chunks created (procedure-aware): {len(final_chunks)}")
     return final_chunks
-def _split_by_sentence(text, chunk_size=800, overlap=80):
-    sentences = re.split(r"(?<=[.!?])\s+", text)
-    chunks, current = [], ""
-    for sent in sentences:
-        if len(current) + len(sent) + 1 <= chunk_size:
-            current += " " + sent
-        else:
-            if current.strip():
-                chunks.append(current.strip())
-            overlap_part = current[-overlap:] if overlap > 0 else ""
-            current = overlap_part + " " + sent
-    if current.strip():
-        chunks.append(current.strip())
-    return chunks
 def _merge_small_chunks(chunks, min_len=150):

 # ==========================================================
+# 4️⃣ SMART CHUNKING (hierarchical + procedure-aware)
 # ==========================================================
 def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
     text_length = len(text)
     # --- Normalize ---
     text = re.sub(r"\s+", " ", text.strip())
+    # ==========================================================
+    # 🧩 Step 1: Split by numbered section headers (major anchors)
+    # Example: 4.1 Preconditions | 3.2 Restrictions
+    # ==========================================================
+    section_blocks = re.split(
+        r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
+        text
     )
+    # ==========================================================
+    # 🧩 Step 2: Within each section, detect procedural subsections
+    # ==========================================================
+    procedure_blocks = []
+    for sec in section_blocks:
+        if not sec.strip():
+            continue
+        sub_blocks = re.split(
+            r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
+            sec,
+            flags=re.IGNORECASE
+        )
+        procedure_blocks.extend(sub_blocks)
+    # ==========================================================
+    # 🧠 Step 3: Build final chunks (preserve continuity + overlap)
+    # ==========================================================
+    chunks = []
     for block in procedure_blocks:
         if not block.strip():
             continue
         if len(block) < chunk_size * 1.5:
             chunks.append(block.strip())
         else:
             chunks.extend(_split_by_sentence(block, chunk_size, overlap))
+    # Merge and continuity
     chunks = _merge_small_chunks(chunks, min_len=200)
     final_chunks = []
     for i, ch in enumerate(chunks):
             prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
             final_chunks.append((prev_tail + " " + ch).strip())
+    print(f"✅ Final chunks created (section-aware + procedure-aware): {len(final_chunks)}")
     return final_chunks
 def _merge_small_chunks(chunks, min_len=150):