Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Community

Shubham170793 committed on Oct 18

Commit

6d87461

verified ·

1 Parent(s): df1d611

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +15 -7

src/ingestion.py CHANGED Viewed

@@ -93,27 +93,34 @@ def clean_text(text: str) -> str:
 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION
 # ==========================================================
 def extract_table_of_contents(text: str):
     """
     Detects Table of Contents (TOC) in PDFs.
     Returns list of (section_number, section_title).
     """
     toc_entries = []
     lines = text.split("\n")
     toc_started = False
-    for line in lines:
-        # Detect start of TOC
-        if not toc_started and re.search(r"table\s*of\s*contents", line, re.IGNORECASE):
-            toc_started = True
-            continue
         if toc_started:
-            # Stop scanning when we reach main content
             if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
                 break
-            # Match TOC patterns like "3.2 Configure Endpoints ........ 13"
             match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
             if match:
                 section = match.group(1).strip()
@@ -124,6 +131,7 @@ def extract_table_of_contents(text: str):
     return toc_entries
 # ==========================================================
 # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
 # ==========================================================

 # ==========================================================
 # 3️⃣ TABLE OF CONTENTS DETECTION
 # ==========================================================
+# ==========================================================
+# 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
+# ==========================================================
 def extract_table_of_contents(text: str):
     """
     Detects Table of Contents (TOC) in PDFs.
+    Supports variants like 'Contents', 'Index', or 'Overview'.
     Returns list of (section_number, section_title).
     """
     toc_entries = []
     lines = text.split("\n")
     toc_started = False
+    for i, line in enumerate(lines):
+        # Detect possible TOC header variants
+        if not toc_started and re.search(r"\b(table\s*of\s*contents|contents|index|overview)\b", line, re.IGNORECASE):
+            # Confidence check — look ahead a few lines
+            next_lines = lines[i + 1 : i + 6]
+            if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
+                toc_started = True
+                continue
         if toc_started:
+            # Stop scanning when main content starts (e.g., "Step 1:" or "1. Introduction")
             if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
                 break
+            # Match lines like "3.2 Configure Endpoints ........ 13"
             match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
             if match:
                 section = match.group(1).strip()
     return toc_entries
 # ==========================================================
 # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
 # ==========================================================