Update src/ingestion.py
Browse files
- src/ingestion.py +1 -1
src/ingestion.py
CHANGED
|
@@ -98,7 +98,7 @@ def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
|
|
| 98 |
|
| 99 |
# --- Step 1️⃣: Split into logical sections by headings or step titles ---
|
| 100 |
# Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
|
| 101 |
-
section_pattern = r"(?=(?:\n?\d+(
|
| 102 |
sections = re.split(section_pattern, text)
|
| 103 |
sections = [s.strip() for s in sections if s.strip()]
|
| 104 |
|
|
|
|
| 98 |
|
| 99 |
# --- Step 1️⃣: Split into logical sections by headings or step titles ---
|
| 100 |
# Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
|
| 101 |
+
section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
|
| 102 |
sections = re.split(section_pattern, text)
|
| 103 |
sections = [s.strip() for s in sections if s.strip()]
|
| 104 |
|