Update src/ingestion.py
Browse files - src/ingestion.py +5 -1
src/ingestion.py
CHANGED
|
@@ -26,6 +26,10 @@ def extract_text_from_pdf(file_path: str) -> str:
|
|
| 26 |
blocks = page.get_text("blocks")
|
| 27 |
page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
|
| 30 |
page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
|
| 31 |
page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
|
|
@@ -100,7 +104,7 @@ def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
|
|
| 100 |
# Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
|
| 101 |
section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
|
| 102 |
sections = re.split(section_pattern, text)
|
| 103 |
-
sections = [s.strip() for s in sections if s.strip()]
|
| 104 |
|
| 105 |
chunks = []
|
| 106 |
|
|
|
|
| 26 |
blocks = page.get_text("blocks")
|
| 27 |
page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
|
| 28 |
|
| 29 |
+
# 🔹 NEW: ensure bullets & numbered sections start on new lines
|
| 30 |
+
page_text = page_text.replace("• ", "\n• ")
|
| 31 |
+
page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
|
| 32 |
+
|
| 33 |
# Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
|
| 34 |
page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
|
| 35 |
page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
|
|
|
|
| 104 |
# Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
|
| 105 |
section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
|
| 106 |
sections = re.split(section_pattern, text)
|
| 107 |
+
sections = [s.strip() for s in sections if s and isinstance(s, str) and s.strip()]
|
| 108 |
|
| 109 |
chunks = []
|
| 110 |
|