Shubham170793 committed on
Commit
32f64de
·
verified ·
1 Parent(s): eaada01

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +5 -1
src/ingestion.py CHANGED
@@ -26,6 +26,10 @@ def extract_text_from_pdf(file_path: str) -> str:
26
  blocks = page.get_text("blocks")
27
  page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
28
 
 
 
 
 
29
  # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
30
  page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
31
  page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
@@ -100,7 +104,7 @@ def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
100
  # Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
101
  section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
102
  sections = re.split(section_pattern, text)
103
- sections = [s.strip() for s in sections if s.strip()]
104
 
105
  chunks = []
106
 
 
26
  blocks = page.get_text("blocks")
27
  page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
28
 
29
+ # 🔹 NEW: ensure bullets & numbered sections start on new lines
30
+ page_text = page_text.replace("• ", "\n• ")
31
+ page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
32
+
33
  # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
34
  page_text = re.sub(r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE)
35
  page_text = re.sub(r"(PUBLIC|Confidential|© SAP.*|\bSAP\b\s*\d{4})", "", page_text, flags=re.IGNORECASE)
 
104
  # Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
105
  section_pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
106
  sections = re.split(section_pattern, text)
107
+ sections = [s.strip() for s in sections if s and isinstance(s, str) and s.strip()]
108
 
109
  chunks = []
110