Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

Shubham170793 committed on Oct 17

Commit

f2fb7ac

verified ·

1 Parent(s): 85242e3

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +44 -21

src/ingestion.py CHANGED Viewed

@@ -88,36 +88,59 @@ def clean_text(text: str) -> str:
 # ==========================================================
 def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
     """
     Break text into chunks, keeping procedural steps together where possible.

     The text is whitespace-normalized and split in front of every
     "Step <n>" marker (case-insensitive, e.g. 'Step 1:', 'STEP 2.').
     When such markers exist, each step becomes its own chunk unless it is
     longer than ``chunk_size``, in which case it is handed to
     ``_split_by_sentence``. Without markers the whole text goes through
     ``_split_by_sentence``. The result is finally passed through
     ``_merge_small_chunks`` with ``min_len=150``.

     Args:
         text: Raw input text.
         chunk_size: Length threshold above which a step is sentence-split;
             also forwarded to ``_split_by_sentence``.
         overlap: Forwarded to ``_split_by_sentence``.

     Returns:
         Whatever ``_merge_small_chunks`` returns for the collected chunks.
     """
     # Collapse every whitespace run (including newlines) to a single space.
     normalized = re.sub(r'\s+', ' ', text.strip())

     # Zero-width split: the marker stays at the start of the piece it opens.
     pieces = re.split(r'(?=(?:Step\s*\d+[:.\s]))', normalized, flags=re.IGNORECASE)
     steps = [piece.strip() for piece in pieces if piece.strip()]

     chunks = []
     if len(steps) <= 1:
         # No step markers found: plain sentence-based chunking.
         chunks.extend(_split_by_sentence(normalized, chunk_size, overlap))
     else:
         for step in steps:
             if len(step) <= chunk_size:
                 chunks.append(step.strip())
             else:
                 chunks.extend(_split_by_sentence(step, chunk_size, overlap))

     # Fold very short chunks into their neighbours.
     chunks = _merge_small_chunks(chunks, min_len=150)
     print(f"✅ Final chunks created: {len(chunks)}")
     return chunks
 # ==========================================================

 # ==========================================================
 def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
     """
     Chunk text from structured enterprise PDFs (e.g. SAP guides).

     Processing order:
       1. Collapse all whitespace to single spaces.
       2. Split into sections in front of numbered headings
          ("3.1.2 Prerequisites ...") and step titles ("Step 2:", "STEP 2.").
       3. Per section, keep bullet lists together as one chunk (or as blocks
          of six bullets when the list is longer than 1.5 * chunk_size);
          everything else goes through ``_split_by_sentence``.
       4. Pass the chunks through ``_merge_small_chunks`` (``min_len=200``).
       5. Prefix every chunk except the first with the last ``overlap``
          characters of the preceding chunk.

     Args:
         text: Raw input text.
         chunk_size: Target chunk length; forwarded to ``_split_by_sentence``
             and used as the threshold for splitting long bullet lists.
         overlap: Number of trailing characters of the previous chunk that
             are prepended to each chunk; also forwarded to
             ``_split_by_sentence``. Values <= 0 disable the prefix.

     Returns:
         List of chunk strings.
     """
     # Newlines are collapsed here too, so every pattern below works on
     # single-line text and must not rely on "\n".
     text = re.sub(r"\s+", " ", text.strip())

     # --- Step 1: split into logical sections by headings or step titles ---
     # * Only non-capturing groups: re.split() inserts captured groups
     #   (including None for groups that did not participate) into its
     #   result, which previously crashed on ``None.strip()``.
     # * (?<![\w.]) anchors the heading to the START of a number; without it
     #   the zero-width match also fires between the digits of "2023" or
     #   inside "3.1.2".
     # * (?i:Step) keeps "STEP 2." working without making [A-Z] match
     #   lower-case letters.
     section_pattern = (
         r"(?=(?<![\w.])\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100}"
         r"|\b(?i:Step)\s*\d+[:.\s])"
     )
     sections = [s.strip() for s in re.split(section_pattern, text) if s.strip()]

     # Zero-width boundary in front of each bullet marker.
     bullet_boundary = re.compile(r"(?=\s*[-•▪‣]\s)")

     chunks = []
     for section in sections:
         # --- Step 2: detect bullet items inside the section ---
         bullets = [b.strip() for b in bullet_boundary.split(section) if b.strip()]

         if len(bullets) > 2:
             # Case A: several bullets -> keep them as one coherent block.
             combined = " ".join(bullets)
             if len(combined) > chunk_size * 1.5:
                 # Very long list: emit it in blocks of six bullets.
                 for start in range(0, len(bullets), 6):
                     chunks.append(" ".join(bullets[start:start + 6]).strip())
             else:
                 chunks.append(combined.strip())
         else:
             # Case B: single bullet or normal paragraph -> sentence split.
             chunks.extend(_split_by_sentence(section, chunk_size, overlap))

     # --- Step 3: merge small fragments to keep continuity ---
     chunks = _merge_small_chunks(chunks, min_len=200)

     # --- Step 4: prepend the tail of the previous chunk to each chunk ---
     # NOTE(review): _split_by_sentence also receives `overlap`; if it already
     # overlaps its own output, those chunks carry the overlap twice - confirm.
     final_chunks = list(chunks[:1])
     for previous, current in zip(chunks, chunks[1:]):
         prev_tail = previous[-overlap:] if overlap > 0 else ""
         final_chunks.append((prev_tail + " " + current).strip())

     print(f"✅ Final chunks created (continuity-aware): {len(final_chunks)}")
     return final_chunks
 # ==========================================================