Shubham170793 committed on
Commit
00eb202
·
verified ·
1 Parent(s): dd1dffd

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +29 -23
src/ingestion.py CHANGED
@@ -229,7 +229,7 @@ def get_hybrid_toc(text: str):
229
 
230
 
231
  # ==========================================================
232
- # 4️⃣ SMART CHUNKING (same as before)
233
  # ==========================================================
234
  def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
235
  text_length = len(text)
@@ -248,24 +248,44 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
248
  # --- Normalize ---
249
  text = re.sub(r"\s+", " ", text.strip())
250
 
251
- # --- 🧩 Detect procedural sections (new) ---
252
- procedure_blocks = re.split(
253
- r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)", text, flags=re.IGNORECASE
 
 
 
 
254
  )
255
- chunks = []
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  for block in procedure_blocks:
258
  if not block.strip():
259
  continue
260
 
261
- # Keep full procedure blocks together if not too long
262
  if len(block) < chunk_size * 1.5:
263
  chunks.append(block.strip())
264
  else:
265
- # Fallback: split gracefully by sentence
266
  chunks.extend(_split_by_sentence(block, chunk_size, overlap))
267
 
268
- # --- 🧠 Continuity preservation ---
269
  chunks = _merge_small_chunks(chunks, min_len=200)
270
  final_chunks = []
271
  for i, ch in enumerate(chunks):
@@ -275,23 +295,9 @@ def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
275
  prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
276
  final_chunks.append((prev_tail + " " + ch).strip())
277
 
278
- print(f"✅ Final chunks created (procedure-aware): {len(final_chunks)}")
279
  return final_chunks
280
 
281
- def _split_by_sentence(text, chunk_size=800, overlap=80):
282
- sentences = re.split(r"(?<=[.!?])\s+", text)
283
- chunks, current = [], ""
284
- for sent in sentences:
285
- if len(current) + len(sent) + 1 <= chunk_size:
286
- current += " " + sent
287
- else:
288
- if current.strip():
289
- chunks.append(current.strip())
290
- overlap_part = current[-overlap:] if overlap > 0 else ""
291
- current = overlap_part + " " + sent
292
- if current.strip():
293
- chunks.append(current.strip())
294
- return chunks
295
 
296
 
297
  def _merge_small_chunks(chunks, min_len=150):
 
229
 
230
 
231
  # ==========================================================
232
+ # 4️⃣ SMART CHUNKING (hierarchical + procedure-aware)
233
  # ==========================================================
234
  def chunk_text(text: str, chunk_size: int = None, overlap: int = None) -> list:
235
  text_length = len(text)
 
248
  # --- Normalize ---
249
  text = re.sub(r"\s+", " ", text.strip())
250
 
251
+ # ==========================================================
252
+ # 🧩 Step 1: Split by numbered section headers (major anchors)
253
+ # Example: 4.1 Preconditions | 3.2 Restrictions
254
+ # ==========================================================
255
+ section_blocks = re.split(
256
+ r"(?=(?:\s*\n|\s+)\d+(?:\.\d+){1,2}\s+[A-Z][A-Za-z].{0,80})",
257
+ text
258
  )
 
259
 
260
+ # ==========================================================
261
+ # 🧩 Step 2: Within each section, detect procedural subsections
262
+ # ==========================================================
263
+ procedure_blocks = []
264
+ for sec in section_blocks:
265
+ if not sec.strip():
266
+ continue
267
+
268
+ sub_blocks = re.split(
269
+ r"(?=(?:\s*\n|\s+)\d+\.\d+\s+(?:Create|Configure|Set\s*up|Setup|Steps?|Process|Procedure|Integration|Replication|Connection|Mapping|Restrictions?|Limitations?|Prerequisites?|Considerations?|Guidelines?|Notes?|Cautions?|Recommendations?)\b)",
270
+ sec,
271
+ flags=re.IGNORECASE
272
+ )
273
+ procedure_blocks.extend(sub_blocks)
274
+
275
+ # ==========================================================
276
+ # 🧠 Step 3: Build final chunks (preserve continuity + overlap)
277
+ # ==========================================================
278
+ chunks = []
279
  for block in procedure_blocks:
280
  if not block.strip():
281
  continue
282
 
 
283
  if len(block) < chunk_size * 1.5:
284
  chunks.append(block.strip())
285
  else:
 
286
  chunks.extend(_split_by_sentence(block, chunk_size, overlap))
287
 
288
+ # Merge and continuity
289
  chunks = _merge_small_chunks(chunks, min_len=200)
290
  final_chunks = []
291
  for i, ch in enumerate(chunks):
 
295
  prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
296
  final_chunks.append((prev_tail + " " + ch).strip())
297
 
298
+ print(f"✅ Final chunks created (section-aware + procedure-aware): {len(final_chunks)}")
299
  return final_chunks
300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
301
 
302
 
303
  def _merge_small_chunks(chunks, min_len=150):