Update src/ingestion.py
Browse files
- src/ingestion.py +15 -7
src/ingestion.py
CHANGED
|
@@ -93,27 +93,34 @@ def clean_text(text: str) -> str:
|
|
| 93 |
# ==========================================================
|
| 94 |
# 3️⃣ TABLE OF CONTENTS DETECTION
|
| 95 |
# ==========================================================
|
|
|
|
|
|
|
|
|
|
| 96 |
def extract_table_of_contents(text: str):
|
| 97 |
"""
|
| 98 |
Detects Table of Contents (TOC) in PDFs.
|
|
|
|
| 99 |
Returns list of (section_number, section_title).
|
| 100 |
"""
|
| 101 |
toc_entries = []
|
| 102 |
lines = text.split("\n")
|
| 103 |
toc_started = False
|
| 104 |
|
| 105 |
-
for line in lines:
|
| 106 |
-
# Detect the "Table of Contents" header line
|
| 107 |
-
if not toc_started and re.search(r"table\s*of\s*contents", line, re.IGNORECASE):
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
if toc_started:
|
| 112 |
-
# Stop scanning when main content starts
|
| 113 |
if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
|
| 114 |
break
|
| 115 |
|
| 116 |
-
# Match numbered section lines (section number followed by a capitalized title)
|
| 117 |
match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
|
| 118 |
if match:
|
| 119 |
section = match.group(1).strip()
|
|
@@ -124,6 +131,7 @@ def extract_table_of_contents(text: str):
|
|
| 124 |
return toc_entries
|
| 125 |
|
| 126 |
|
|
|
|
| 127 |
# ==========================================================
|
| 128 |
# 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
|
| 129 |
# ==========================================================
|
|
|
|
| 93 |
# ==========================================================
|
| 94 |
# 3️⃣ TABLE OF CONTENTS DETECTION
|
| 95 |
# ==========================================================
|
| 96 |
+
# ==========================================================
|
| 97 |
+
# 3️⃣ TABLE OF CONTENTS DETECTION (Improved)
|
| 98 |
+
# ==========================================================
|
| 99 |
def extract_table_of_contents(text: str):
|
| 100 |
"""
|
| 101 |
Detects Table of Contents (TOC) in PDFs.
|
| 102 |
+
Supports variants like 'Contents', 'Index', or 'Overview'.
|
| 103 |
Returns list of (section_number, section_title).
|
| 104 |
"""
|
| 105 |
toc_entries = []
|
| 106 |
lines = text.split("\n")
|
| 107 |
toc_started = False
|
| 108 |
|
| 109 |
+
for i, line in enumerate(lines):
|
| 110 |
+
# Detect possible TOC header variants
|
| 111 |
+
if not toc_started and re.search(r"\b(table\s*of\s*contents|contents|index|overview)\b", line, re.IGNORECASE):
|
| 112 |
+
# Confidence check — look ahead a few lines
|
| 113 |
+
next_lines = lines[i + 1 : i + 6]
|
| 114 |
+
if any(re.match(r"^\s*\d+(\.\d+)*\s+[A-Za-z]", l) for l in next_lines):
|
| 115 |
+
toc_started = True
|
| 116 |
+
continue
|
| 117 |
|
| 118 |
if toc_started:
|
| 119 |
+
# Stop scanning when main content starts (e.g., "Step 1:" or "1. Introduction")
|
| 120 |
if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
|
| 121 |
break
|
| 122 |
|
| 123 |
+
# Match lines like "3.2 Configure Endpoints ........ 13"
|
| 124 |
match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
|
| 125 |
if match:
|
| 126 |
section = match.group(1).strip()
|
|
|
|
| 131 |
return toc_entries
|
| 132 |
|
| 133 |
|
| 134 |
+
|
| 135 |
# ==========================================================
|
| 136 |
# 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
|
| 137 |
# ==========================================================
|