Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

Shubham170793 committed on Oct 17

Commit

32f64de

verified ·

1 Parent(s): eaada01

Update src/ingestion.py

Files changed (1) hide show

src/ingestion.py CHANGED Viewed

@@ -26,6 +26,10 @@ def extract_text_from_pdf(file_path: str) -> str:
                     blocks = page.get_text("blocks")
                     page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
                 # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
                 page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                 page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
@@ -100,7 +104,7 @@ def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
     # Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
     section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
     sections = re.split(section_pattern, text)
-    sections = [s.strip() for s in sections if s.strip()]
     chunks = []

                     blocks = page.get_text("blocks")
                     page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
+                # 🔹 NEW: ensure bullets & numbered sections start on new lines
+                page_text = page_text.replace("• ", "\n• ")
+                page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
                 # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
                 page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
                 page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
     # Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
     section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
     sections = re.split(section_pattern, text)
+    sections = [s.strip() for s in sections if s and isinstance(s, str) and s.strip()]
     chunks = []