Shubham170793 committed
Commit df1d611 · verified · 1 Parent(s): 8e133cf

Update src/ingestion.py

Files changed (1):
  1. src/ingestion.py +67 -36
src/ingestion.py CHANGED
@@ -3,12 +3,13 @@ import fitz  # PyMuPDF
  import unicodedata
 
  # ==========================================================
- # 1️⃣ TEXT EXTRACTION (Clean + Layout Normalization)
+ # 1️⃣ TEXT EXTRACTION (Clean + TOC Detection)
  # ==========================================================
- def extract_text_from_pdf(file_path: str) -> str:
+ def extract_text_from_pdf(file_path: str):
      """
      Extracts and cleans text from a PDF using PyMuPDF.
-     Handles noisy layout artifacts, page numbers, and TOC dots.
+     Handles layout artifacts, numbered sections, and TOC.
+     Returns both clean text and detected TOC (if any).
      """
      text = ""
      try:
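Note: the line that opens the document falls outside this hunk, so the loop's context is implicit. A minimal standalone sketch of the extraction loop, assuming fitz.open is used as a context manager (names here are illustrative, not the committed code):

    import fitz  # PyMuPDF

    def minimal_extract(path: str) -> str:
        text = ""
        with fitz.open(path) as pdf:  # Document supports the context-manager protocol
            for page in pdf:
                text += page.get_text("text") + "\n"
        return text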
@@ -16,18 +17,18 @@ def extract_text_from_pdf(file_path: str) -> str:
      for page_num, page in enumerate(pdf, start=1):
          page_text = page.get_text("text").strip()
 
-         # Fallback: handle scanned or weirdly structured pages
+         # Fallback: for scanned/weird layouts
          if not page_text:
              blocks = page.get_text("blocks")
              page_text = " ".join(
                  block[4] for block in blocks if isinstance(block[4], str)
              )
 
-         # 🔹 NEW: ensure bullets & numbered sections start on new lines
+         # Ensure bullets & numbered sections start on new lines
          page_text = page_text.replace("• ", "\n• ")
          page_text = re.sub(r"(\d+\.\d+\.\d+)", r"\n\1", page_text)
 
-         # Remove repeating headers/footers (e.g., “PUBLIC”, “Page 5 of 110”)
+         # Remove headers/footers and confidential tags
          page_text = re.sub(
              r"Page\s*\d+\s*(of\s*\d+)?", "", page_text, flags=re.IGNORECASE
          )
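For reference, page.get_text("blocks") yields tuples of the form (x0, y0, x1, y1, text, block_no, block_type), which is why the fallback reads block[4]. A hedged variant that also sorts blocks into reading order before joining (the sort is an illustration, not part of this commit):

    blocks = page.get_text("blocks")
    blocks.sort(key=lambda b: (round(b[1]), b[0]))  # top-to-bottom, then left-to-right
    page_text = " ".join(b[4] for b in blocks if isinstance(b[4], str))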
@@ -45,17 +46,25 @@ def extract_text_from_pdf(file_path: str) -> str:
 
      # --- Cleaning pipeline ---
      text = clean_text(text)
-     return text
+ 
+     # --- TOC extraction ---
+     toc = extract_table_of_contents(text)
+     if toc:
+         print(f"📘 TOC detected with {len(toc)} entries.")
+     else:
+         print("⚠️ No Table of Contents detected.")
+ 
+     return text, toc
 
 
  # ==========================================================
- # 2️⃣ ADVANCED CLEANING PIPELINE (SAP / Enterprise PDFs)
+ # 2️⃣ ADVANCED CLEANING PIPELINE
  # ==========================================================
  def clean_text(text: str) -> str:
-     """Cleans noisy extracted PDF text before chunking and embedding."""
+     """Cleans noisy PDF text before chunking and embedding."""
      text = unicodedata.normalize("NFKD", text)
 
-     # Remove TOC or numbering noise (e.g., 6.3.1 Prerequisites .............. 53)
+     # Remove TOC noise (like "6.3.1 Prerequisites .............. 53")
      text = re.sub(
          r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b", "", text
      )
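A quick check of the TOC-noise pattern against a synthetic dotted-leader line:

    import re

    line = "6.3.1 Prerequisites .............. 53"
    pattern = r"\b\d+(\.\d+){1,}\s+[A-Za-z].{0,40}\.{2,}\s*\d+\b"
    print(repr(re.sub(pattern, "", line)))  # '' — the whole dotted entry is stripped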
@@ -63,41 +72,66 @@ def clean_text(text: str) -> str:
      # Replace bullet symbols and dots with consistent spacing
      text = text.replace("•", "- ").replace("▪", "- ").replace("‣", "- ")
 
-     # Remove excessive dots and hyphenated page wraps
+     # Remove excessive dots, hyphens, headers
      text = re.sub(r"\.{3,}", ". ", text)
      text = re.sub(r"-\s*\n", "", text)
- 
-     # Remove page headers/footers (common in SAP docs)
-     text = re.sub(
-         r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE
-     )
+     text = re.sub(r"\n\s*(PUBLIC|PRIVATE|Confidential)\s*\n", "\n", text, flags=re.IGNORECASE)
      text = re.sub(r"©\s*[A-Z].*?\d{4}", "", text)
 
-     # Normalize newlines paragraph breaks
+     # Normalize newlines and spaces
      text = text.replace("\r", " ")
      text = re.sub(r"\n{2,}", "\n", text)
      text = re.sub(r"\s{2,}", " ", text)
 
-     # Remove leftover special chars / artifacts
+     # Clean leftover special chars
      text = re.sub(r"[^A-Za-z0-9,;:.\-\(\)/&\n\s]", "", text)
- 
-     # Remove multiple section dots from TOC lines
      text = re.sub(r"(\s*\.\s*){3,}", " ", text)
 
      return text.strip()
 
 
  # ==========================================================
- # 3️⃣ SMART CHUNKING (Step-Aware + Auto-Sized)
+ # 3️⃣ TABLE OF CONTENTS DETECTION
  # ==========================================================
- def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
+ def extract_table_of_contents(text: str):
      """
-     Enhanced chunking for structured enterprise PDFs (SAP guides).
-     Auto-selects chunk size based on document length.
-     ✅ Keeps bullet lists, numbered steps, and headings together.
+     Detects Table of Contents (TOC) in PDFs.
+     Returns list of (section_number, section_title).
      """
+     toc_entries = []
+     lines = text.split("\n")
+     toc_started = False
 
-     # --- Auto-tune chunk size based on document length ---
+     for line in lines:
+         # Detect start of TOC
+         if not toc_started and re.search(r"table\s*of\s*contents", line, re.IGNORECASE):
+             toc_started = True
+             continue
+ 
+         if toc_started:
+             # Stop scanning when we reach main content
+             if re.match(r"^\s*(Step\s*\d+|1\.\s*[A-Z])", line):
+                 break
+ 
+             # Match TOC patterns like "3.2 Configure Endpoints ........ 13"
+             match = re.match(r"^\s*(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z0-9\s/&()-]+)", line)
+             if match:
+                 section = match.group(1).strip()
+                 title = match.group(2).strip()
+                 if len(title) > 3:
+                     toc_entries.append((section, title))
+ 
+     return toc_entries
+ 
+ 
+ # ==========================================================
+ # 4️⃣ SMART CHUNKING (Auto-Sized + Continuity-Aware)
+ # ==========================================================
+ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
+     """
+     Enhanced chunking for structured enterprise PDFs.
+     Auto-selects chunk size and keeps procedural context intact.
+     """
      text_length = len(text)
      if chunk_size is None:
          if text_length > 200000:
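A small smoke test of the new detector on synthetic input. Two caveats worth noting: the title group admits digits, so trailing page numbers can survive inside titles, and clean_text has already deleted dotted TOC lines by the time this runs, so the matcher effectively keys on section numbers rather than dot leaders:

    sample = "\n".join([
        "Table of Contents",
        "1 Introduction 1",
        "2.1 Configure Endpoints 13",
        "Step 1: Install the agent",
    ])
    print(extract_table_of_contents(sample))
    # [('1', 'Introduction 1'), ('2.1', 'Configure Endpoints 13')]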
@@ -111,15 +145,12 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
 
      print(f"⚙️ Auto-selected chunk_size={chunk_size}, overlap={overlap} (len={text_length})")
 
-     # Normalize whitespace
      text = re.sub(r"\s+", " ", text.strip())
- 
-     # --- Step 1️⃣: Split into logical sections ---
      section_pattern = (
          r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
      )
      sections = re.split(section_pattern, text)
-     sections = [s.strip() for s in sections if s and isinstance(s, str) and s.strip()]
+     sections = [s.strip() for s in sections if s.strip()]
 
      chunks = []
      for section in sections:
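Because the split pattern is a zero-width lookahead, the heading that triggers a split stays at the start of its section instead of being consumed (Python 3.7+ permits splitting on empty matches):

    import re

    pattern = r"(?=(?:\n?\d+(?:\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
    print(re.split(pattern, "Overview text. Step 1: Configure. Step 2: Validate."))
    # ['Overview text. ', 'Step 1: Configure. ', 'Step 2: Validate.']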
@@ -140,7 +171,7 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
 
      chunks = _merge_small_chunks(chunks, min_len=200)
 
-     # --- Ensure overlap continuity ---
+     # Add continuity overlap
      final_chunks = []
      for i, ch in enumerate(chunks):
          if i == 0:
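The continuity loop is truncated by the hunk. A sketch of the idea it implements, assuming each chunk after the first is prefixed with the tail of its predecessor (the overlap slice is an assumption; the committed body is not fully shown):

    final_chunks = []
    for i, ch in enumerate(chunks):
        if i == 0:
            final_chunks.append(ch)
        else:
            tail = chunks[i - 1][-overlap:]  # assumed: reuse the auto-selected overlap
            final_chunks.append(tail + " " + ch)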
@@ -154,10 +185,9 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
 
 
  # ==========================================================
- # 4️⃣ Helper Functions
+ # 5️⃣ Helper Functions
  # ==========================================================
  def _split_by_sentence(text, chunk_size=800, overlap=80):
-     """Split by sentence punctuation to preserve semantics."""
      sentences = re.split(r"(?<=[.!?])\s+", text)
      chunks, current = [], ""
      for sent in sentences:
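The sentence splitter keys on terminal punctuation followed by whitespace; a quick demonstration:

    import re

    print(re.split(r"(?<=[.!?])\s+", "First step. Second step! Third?"))
    # ['First step.', 'Second step!', 'Third?']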
@@ -174,7 +204,6 @@ def _split_by_sentence(text, chunk_size=800, overlap=80):
 
 
  def _merge_small_chunks(chunks, min_len=150):
-     """Merge undersized chunks with the next one."""
      merged, buffer = [], ""
      for ch in chunks:
          if len(ch) < min_len:
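The merge body is also cut off by the hunk. A sketch of the usual buffer-and-flush approach the visible lines suggest (an assumption, since only the loop head survives in the diff):

    def merge_small_chunks_sketch(chunks, min_len=150):
        merged, buffer = [], ""
        for ch in chunks:
            if len(ch) < min_len:
                buffer += " " + ch  # hold undersized pieces for the next chunk
            else:
                merged.append((buffer + " " + ch).strip())
                buffer = ""
        if buffer:  # flush any trailing remainder
            merged.append(buffer.strip())
        return merged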
@@ -190,11 +219,13 @@ def _merge_small_chunks(chunks, min_len=150):
 
 
  # ==========================================================
- # 5️⃣ DEBUGGING (Manual Run)
+ # 6️⃣ DEBUGGING (Manual Run)
  # ==========================================================
  if __name__ == "__main__":
      pdf_path = "sample.pdf"
-     text = extract_text_from_pdf(pdf_path)
+     text, toc = extract_text_from_pdf(pdf_path)
+     print("\n📚 TOC Preview:", toc[:5])
      chunks = chunk_text(text)
+     print(f"\n✅ {len(chunks)} chunks created.")
      for i, c in enumerate(chunks[:5], 1):
          print(f"\n--- Chunk {i} ---\n{c[:500]}...\n")
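Integration note: extract_text_from_pdf now returns a tuple rather than a plain string, so any downstream caller must unpack it (the caller shown is hypothetical):

    # before this commit:
    # text = extract_text_from_pdf("sample.pdf")
    # after:
    text, toc = extract_text_from_pdf("sample.pdf")
    chunks = chunk_text(text)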
 