Update src/ingestion.py

src/ingestion.py (+51 -30)
```diff
@@ -21,7 +21,7 @@ def extract_text_from_pdf(file_path: str) -> str:
             page_text = page.get_text("text").strip()
             if not page_text:
                 # Fallback: extract raw blocks (helps with weird PDFs)
-                blocks =
+                blocks = page.get_text("blocks")
                 page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
             text += page_text + "\n"
     except Exception as e:
```
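For context on what that fallback works with: in PyMuPDF, `page.get_text("blocks")` returns one tuple per layout block, `(x0, y0, x1, y1, text, block_no, block_type)`, so index 4 is the block's text and the `isinstance` guard above skips non-text payloads. A minimal standalone sketch of the same idea (`extract_blocks_text` is a hypothetical name, not part of this change):

```python
import fitz  # PyMuPDF, assumed to be the library behind extract_text_from_pdf

def extract_blocks_text(file_path: str) -> str:
    """Hypothetical helper mirroring the block-based fallback above."""
    text = ""
    with fitz.open(file_path) as pdf:
        for page in pdf:
            # Each block: (x0, y0, x1, y1, text, block_no, block_type)
            for block in page.get_text("blocks"):
                # block_type 0 = text; image blocks may carry non-str payloads
                if block[6] == 0 and isinstance(block[4], str):
                    text += block[4].strip() + " "
    return text.strip()
```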
```diff
@@ -33,44 +33,65 @@ def extract_text_from_pdf(file_path: str) -> str:
 
 
 # -----------------------------
-# SMART CHUNKING (Context Aware)
+# SMART CHUNKING (Step-Aware + Context Aware)
 # -----------------------------
-def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
+def chunk_text(text: str, chunk_size: int = 800, overlap: int = 200) -> list:
     """
-    Splits text into overlapping,
+    Splits text into overlapping, structured chunks.
+    Detects procedural steps (e.g., 'Step 1:', 'STEP 2.') and keeps them intact.
+    Falls back to sentence-based chunking for normal paragraphs.
 
     Args:
         text (str): Input text.
         chunk_size (int): Max characters per chunk (default: 800).
-        overlap (int): Overlapping characters for continuity (default: 150).
+        overlap (int): Overlapping characters for continuity (default: 200).
 
     Returns:
         list[str]: Chunked text segments.
     """
-    # Clean text
+    # Clean and normalize text
     text = re.sub(r'\s+', ' ', text.strip())
 
+    # Try to detect “Step” patterns
+    step_splits = re.split(r'(?=(?:Step\s*\d+[:.\s]))', text, flags=re.IGNORECASE)
+    step_splits = [s.strip() for s in step_splits if s.strip()]
 
+    chunks = []
 
+    # Case 1️⃣: Document has visible “Step” structure
+    if len(step_splits) > 1:
+        for step in step_splits:
+            if len(step) > chunk_size:
+                # If a step is too long → split by sentences within that step
+                sentences = re.split(r'(?<=[.!?])\s+', step)
+                current = ""
+                for sent in sentences:
+                    if len(current) + len(sent) + 1 <= chunk_size:
+                        current += " " + sent
+                    else:
+                        if current.strip():
+                            chunks.append(current.strip())
+                        overlap_part = current[-overlap:] if overlap > 0 else ""
+                        current = overlap_part + " " + sent
+                if current.strip():
+                    chunks.append(current.strip())
+            else:
+                chunks.append(step.strip())
 
+    # Case 2️⃣: No “Step” keywords — fall back to sentence-based chunking
+    else:
+        sentences = re.split(r'(?<=[.!?])\s+', text)
+        current = ""
+        for sent in sentences:
+            if len(current) + len(sent) + 1 <= chunk_size:
+                current += " " + sent
+            else:
+                if current.strip():
+                    chunks.append(current.strip())
+                overlap_part = current[-overlap:] if overlap > 0 else ""
+                current = overlap_part + " " + sent
+        if current.strip():
+            chunks.append(current.strip())
 
     return chunks
```
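The key trick here is the zero-width lookahead split: because the pattern consumes no characters, each "Step N" heading stays attached to the text that follows it (Python 3.7+ allows splitting on empty matches). A quick illustration with a made-up string:

```python
import re

sample = "Read the intro first. Step 1: Install it. Step 2: Run it."
parts = re.split(r'(?=(?:Step\s*\d+[:.\s]))', sample, flags=re.IGNORECASE)
print([p.strip() for p in parts if p.strip()])
# ['Read the intro first.', 'Step 1: Install it.', 'Step 2: Run it.']
```

Note also that the overlap is a raw character tail (`current[-overlap:]`), so a continuation chunk can begin mid-word; that keeps the code simple at the cost of slightly ragged chunk starts.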
```diff
@@ -80,12 +101,12 @@ def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
 # -----------------------------
 if __name__ == "__main__":
     sample_text = """
+    Step 1: Open the application.
+    Step 2: Navigate to the dashboard.
+    Step 3: Review the summary and click ‘Export’.
+    If the steps are missing, the function should still chunk by sentences.
     """
-    chunks = chunk_text(sample_text, chunk_size=
+    chunks = chunk_text(sample_text, chunk_size=100, overlap=20)
     print(f"✅ Chunks created: {len(chunks)}")
     for i, c in enumerate(chunks, 1):
-        print(f"\n--- Chunk {i}
+        print(f"\n--- Chunk {i} ---\n{c}")
```
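The `__main__` demo only exercises the step path. A companion check for the sentence fallback, assuming `src/` is importable (e.g., on `PYTHONPATH`) and using invented sample text:

```python
from ingestion import chunk_text  # assumes src/ is on PYTHONPATH

plain = (
    "RAG pipelines retrieve passages before generation. "
    "Chunk size controls retrieval granularity. "
    "Overlap carries context across chunk boundaries."
)
for i, c in enumerate(chunk_text(plain, chunk_size=80, overlap=15), 1):
    print(f"[{i}] {c}")
```

With `chunk_size=80` the three sentences cannot share one chunk, so each break re-seeds the next chunk with the previous 15 characters.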