Shubham170793 committed on
Commit
f2fb7ac
·
verified ·
1 Parent(s): 85242e3

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +44 -21
src/ingestion.py CHANGED
@@ -88,36 +88,59 @@ def clean_text(text: str) -> str:
88
  # ==========================================================
89
  def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
90
  """
91
- Splits text into overlapping, structured chunks.
92
- Detects procedural steps (e.g., 'Step 1:', 'STEP 2.') and keeps them intact.
93
- Falls back to sentence-based chunking for normal paragraphs.
94
  """
95
 
96
- # Normalize whitespace first
97
- text = re.sub(r'\s+', ' ', text.strip())
98
 
99
- # Try to detect “Step” patterns (case-insensitive)
100
- step_splits = re.split(r'(?=(?:Step\s*\d+[:.\s]))', text, flags=re.IGNORECASE)
101
- step_splits = [s.strip() for s in step_splits if s.strip()]
 
 
102
 
103
  chunks = []
104
 
105
- # Case 1️⃣: “Step” sections present
106
- if len(step_splits) > 1:
107
- for step in step_splits:
108
- if len(step) > chunk_size:
109
- chunks.extend(_split_by_sentence(step, chunk_size, overlap))
 
 
 
 
 
 
 
 
 
 
 
110
  else:
111
- chunks.append(step.strip())
112
 
113
- # Case 2️⃣: No “Step” pattern → fallback
114
- else:
115
- chunks.extend(_split_by_sentence(text, chunk_size, overlap))
116
 
117
- # Merge tiny chunks for semantic completeness
118
- chunks = _merge_small_chunks(chunks, min_len=150)
119
- print(f"✅ Final chunks created: {len(chunks)}")
120
- return chunks
 
 
 
 
 
 
 
 
 
 
121
 
122
 
123
  # ==========================================================
 
88
  # ==========================================================
89
  def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
90
  """
91
+ Enhanced chunking for structured enterprise PDFs (SAP guides).
92
+ Keeps bullet lists, numbered steps, and headings together.
93
+ Avoids breaking chunks mid-list or mid-section.
94
  """
95
 
96
+ # Normalize whitespace
97
+ text = re.sub(r"\s+", " ", text.strip())
98
 
99
+ # --- Step 1️⃣: Split into logical sections by headings or step titles ---
100
+ # Detect section headers like "3.1.2 Prerequisites for Commerce Automation", "Step 2:", etc.
101
+ section_pattern = r"(?=(?:\n?\d+(\.\d+){0,3}\s+[A-Z][^\n]{3,100})|(?:Step\s*\d+[:.\s]))"
102
+ sections = re.split(section_pattern, text)
103
+ sections = [s.strip() for s in sections if s.strip()]
104
 
105
  chunks = []
106
 
107
+ for section in sections:
108
+ # --- Step 2️⃣: Merge multi-line bullets ---
109
+ # e.g., "- Ensure that..." or "• Activate the feature..."
110
+ section = re.sub(r"\n\s*[-•▪‣]\s*", " • ", section)
111
+ bullets = re.split(r"(?=\s*[-•▪‣]\s)", section)
112
+ bullets = [b.strip() for b in bullets if b.strip()]
113
+
114
+ # Case A: Multiple bullets (keep as one coherent block)
115
+ if len(bullets) > 2:
116
+ combined = " ".join(bullets)
117
+
118
+ # If the bullet section is very long, split every few bullets
119
+ if len(combined) > chunk_size * 1.5:
120
+ for i in range(0, len(bullets), 6):
121
+ block = " ".join(bullets[i:i+6])
122
+ chunks.append(block.strip())
123
  else:
124
+ chunks.append(combined.strip())
125
 
126
+ # Case B: Single bullet or normal paragraph split by sentence
127
+ else:
128
+ chunks.extend(_split_by_sentence(section, chunk_size, overlap))
129
 
130
+ # --- Step 3️⃣: Merge small fragments to keep continuity ---
131
+ chunks = _merge_small_chunks(chunks, min_len=200)
132
+
133
+ # --- Step 4️⃣: Ensure overlap continuity between neighboring chunks ---
134
+ final_chunks = []
135
+ for i, ch in enumerate(chunks):
136
+ if i == 0:
137
+ final_chunks.append(ch)
138
+ else:
139
+ prev_tail = chunks[i - 1][-overlap:] if overlap > 0 else ""
140
+ final_chunks.append((prev_tail + " " + ch).strip())
141
+
142
+ print(f"✅ Final chunks created (continuity-aware): {len(final_chunks)}")
143
+ return final_chunks
144
 
145
 
146
  # ==========================================================