Update src/ingestion.py
Browse files- src/ingestion.py +44 -21
src/ingestion.py
CHANGED
|
@@ -88,36 +88,59 @@ def clean_text(text: str) -> str:
|
|
| 88 |
# ==========================================================
|
| 89 |
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
|
| 90 |
"""
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
"""
|
| 95 |
|
| 96 |
-
# Normalize whitespace
|
| 97 |
-
text = re.sub(r
|
| 98 |
|
| 99 |
-
#
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
| 102 |
|
| 103 |
chunks = []
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
else:
|
| 111 |
-
chunks.append(
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
|
| 117 |
-
# Merge
|
| 118 |
-
chunks = _merge_small_chunks(chunks, min_len=
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
|
| 123 |
# ==========================================================
|
|
|
|
| 88 |
# ==========================================================
|
| 89 |
def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
    """
    Split document text into chunks while keeping lists and sections together.

    The text is first cut into sections at numbered headings (e.g.
    "3.1.2 Prerequisites for Commerce Automation") and step titles
    (e.g. "Step 2:"). Within each section, bullet items are detected; a
    section with more than two bullet fragments is kept as one block (or
    split into groups of six fragments when it is much longer than
    ``chunk_size``), while any other section is passed to
    ``_split_by_sentence``. The resulting chunks go through
    ``_merge_small_chunks`` and finally each chunk after the first is
    prefixed with the tail of its predecessor.

    Args:
        text: Raw text to chunk. All whitespace runs (including newlines)
            are collapsed to single spaces before processing.
        chunk_size: Target chunk length in characters. Used as the threshold
            (x1.5) for splitting long bullet blocks and forwarded to
            ``_split_by_sentence``.
        overlap: Number of trailing characters of the previous chunk that is
            prepended to each following chunk. Values <= 0 disable the
            prefix. Also forwarded to ``_split_by_sentence``.

    Returns:
        A list of chunk strings in document order.

    Note:
        ``_split_by_sentence`` and ``_merge_small_chunks`` are defined
        elsewhere in this module; their exact behaviour is not visible here.
    """
    # Normalize whitespace. After this step the text contains no newlines,
    # so every pattern below works on single-space-separated text.
    text = re.sub(r"\s+", " ", text.strip())

    # --- Step 1: split into logical sections by headings or step titles ---
    # The pattern is a zero-width lookahead so the heading stays attached to
    # the section that follows it. Important details:
    #   * All groups are NON-capturing. re.split() inserts the value of every
    #     capturing group into its result, which would add None entries
    #     (crashing on .strip()) and stray ".1"-style fragments.
    #   * (?<![\w.]) stops the heading branch from matching in the middle of
    #     a number ("2023 Annual" -> "2", "0", ...) or of "3.1.2 Title".
    #   * (?<!Step ) keeps "Step 2 Configure ..." in one piece instead of
    #     cutting between "Step" and its number.
    section_pattern = (
        r"(?=(?:(?<![\w.])(?<!Step )\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})"
        r"|(?:\bStep\s*\d+[:.\s]))"
    )
    sections = [
        part.strip()
        for part in re.split(section_pattern, text)
        if part and part.strip()
    ]

    chunks = []

    for section in sections:
        # --- Step 2: detect bullet items inside the section ---
        # Newline-led bullets need no separate rewrite here: whitespace was
        # already collapsed above, so every bullet marker is preceded by a
        # plain space and is caught by the split below.
        bullets = [
            item.strip()
            for item in re.split(r"(?=\s*[-•▪‣]\s)", section)
            if item.strip()
        ]

        # Case A: multiple bullets -> keep as one coherent block.
        if len(bullets) > 2:
            combined = " ".join(bullets)

            # A very long bullet section is split every six fragments so a
            # single list cannot produce an oversized chunk.
            if len(combined) > chunk_size * 1.5:
                for start in range(0, len(bullets), 6):
                    chunks.append(" ".join(bullets[start:start + 6]).strip())
            else:
                chunks.append(combined.strip())

        # Case B: single bullet or normal paragraph -> split by sentence.
        else:
            chunks.extend(_split_by_sentence(section, chunk_size, overlap))

    # --- Step 3: merge small fragments to keep continuity ---
    chunks = _merge_small_chunks(chunks, min_len=200)

    # --- Step 4: ensure overlap continuity between neighbouring chunks ---
    # The tail is taken from the merged (un-prefixed) predecessor so the
    # overlap never compounds from chunk to chunk.
    final_chunks = []
    for index, chunk in enumerate(chunks):
        if index == 0:
            final_chunks.append(chunk)
            continue
        prev_tail = chunks[index - 1][-overlap:] if overlap > 0 else ""
        final_chunks.append((prev_tail + " " + chunk).strip())

    print(f"✅ Final chunks created (continuity-aware): {len(final_chunks)}")
    return final_chunks
|
| 144 |
|
| 145 |
|
| 146 |
# ==========================================================
|