srilakshu012456 committed on
Commit
62e7962
·
verified ·
1 Parent(s): 316a87a

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +21 -41
services/kb_creation.py CHANGED
@@ -59,14 +59,14 @@ BULLET_RE = re.compile(r"^\s*(?:[-*\u2022]|\d+[.)])\s+", re.IGNORECASE)
59
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
60
  lines: List[str] = []
61
  for p in (paragraphs or []):
62
- p = (p or '').strip()
63
  if not p:
64
  continue
65
- # Keep headings as single lines; otherwise split on sentence boundaries
66
- if STRONG_ACTION_HEADING_RE.match(p) or HEADING_LINE_RE.match(p) or BULLET_RE.match(p):
67
  lines.append(p)
68
  continue
69
- parts = [s.strip() for s in re.split(r'(?<=[.!?])\s+', p) if s.strip()]
 
70
  lines.extend(parts)
71
  return lines
72
 
@@ -119,11 +119,6 @@ ACTION_SYNS = {
119
  "update": {"update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"},
120
  "delete": {"delete", "remove", "cancel", "deletion", "unassign"}
121
  }
122
- # Strong heading detectors for action blocks (generic)
123
- STRONG_ACTION_HEADING_RE = re.compile(r"^(?P<prefix>[A-Za-z \-_/]*?)\b(?P<action>creation|updation|update|deletion|delete|cancel)\b[ A-Za-z\-_/]*$", re.IGNORECASE)
124
- # Treat markdown-style headings and TitleCase lines as boundaries
125
- HEADING_LINE_RE = re.compile(r"^\s*(?:#{1,6}\s+|[A-Z][A-Za-z0-9 \-_/]{2,}$)")
126
-
127
 
128
  STEP_NUM_PATTERNS = [
129
  re.compile(r"^\s*\d+\s*[.)]\s+"), # 1. / 1)
@@ -186,11 +181,8 @@ def is_boundary_to_new_section(prev_intent: str, curr_intent: str, ln: str) -> b
186
  # Dominant intent flips (e.g., steps → errors)
187
  if prev_intent != curr_intent:
188
  return True
189
- # Explicit action headings or generic headings
190
- if STRONG_ACTION_HEADING_RE.match(ln) or HEADING_LINE_RE.match(ln):
191
- return True
192
  # Action heading-like appears (e.g., 'Updation:', 'Deletion:')
193
- if ':' in ln and any(k in low for k in ('updation', 'update', 'deletion', 'delete', 'cancel')):
194
  return True
195
  # Escalation marker
196
  if any(m in low for m in ESCALATION_MARKERS):
@@ -219,35 +211,29 @@ def semantic_sectionize(paragraphs: List[str]) -> List[Tuple[str, List[str], Dic
219
  sections: List[Tuple[str, List[str], Dict[str, str]]] = []
220
  current_block: List[str] = []
221
  current_intent: Optional[str] = None
222
- current_first_line: Optional[str] = None
223
- for ln in [p for p in paragraphs if (p or '').strip()]:
224
- ln_intent = 'steps' if STRONG_ACTION_HEADING_RE.match(ln) else dominant_intent([ln])
225
  block_intent = dominant_intent(current_block + [ln]) if current_block else ln_intent
226
- if current_block and is_boundary_to_new_section(current_intent or 'neutral', block_intent, ln):
 
 
227
  act = infer_action(current_block)
228
- # Preserve heading title when present
229
- first = current_first_line or ''
230
- if first and (STRONG_ACTION_HEADING_RE.match(first) or HEADING_LINE_RE.match(first)):
231
- title = first.strip()
232
- else:
233
- title = synthetic_title(current_intent or 'neutral', act)
234
- sections.append((title, current_block[:], {'intent': current_intent or 'neutral', 'action': act}))
235
  current_block = [ln]
236
  current_intent = ln_intent
237
- current_first_line = ln
238
  else:
239
- if not current_block:
240
- current_first_line = ln
241
  current_block.append(ln)
242
  current_intent = block_intent
 
 
243
  if current_block:
244
  act = infer_action(current_block)
245
- first = current_first_line or ''
246
- if first and (STRONG_ACTION_HEADING_RE.match(first) or HEADING_LINE_RE.match(first)):
247
- title = first.strip()
248
- else:
249
- title = synthetic_title(current_intent or 'neutral', act)
250
- sections.append((title, current_block[:], {'intent': current_intent or 'neutral', 'action': act}))
251
  return sections
252
 
253
  # ----------------------------- Intent/module vocab used by runtime -----------------------------
@@ -344,14 +330,8 @@ def ingest_documents(folder_path: str) -> None:
344
  for s_idx, (section_title, sec_lines, hints) in enumerate(sections):
345
  chunks = _chunk_text_with_context(doc_title, section_title, sec_lines, max_words=160)
346
  total_chunks += len(chunks)
347
- intent_tag_hint = hints.get('intent', 'neutral')
348
- st_low = (section_title or '').lower()
349
- explicit_action = (
350
- 'create' if re.search(r"\bcreation\b|\bcreate\b", st_low) else
351
- 'update' if re.search(r"\bupdation\b|\bupdate\b|\bedit\b|\bchange\b|\breschedule\b", st_low) else
352
- 'delete' if re.search(r"\bdeletion\b|\bdelete\b|\bcancel\b", st_low) else ''
353
- )
354
- action_tag_hint = explicit_action or hints.get('action', '')
355
 
356
  for c_idx, chunk in enumerate(chunks):
357
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
 
59
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
60
  lines: List[str] = []
61
  for p in (paragraphs or []):
62
+ p = (p or "").strip()
63
  if not p:
64
  continue
65
+ if BULLET_RE.match(p):
 
66
  lines.append(p)
67
  continue
68
+ # split on sentence ends
69
+ parts = [s.strip() for s in re.split(r"(?<=[.!?])\s+", p) if s.strip()]
70
  lines.extend(parts)
71
  return lines
72
 
 
119
  "update": {"update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"},
120
  "delete": {"delete", "remove", "cancel", "deletion", "unassign"}
121
  }
 
 
 
 
 
122
 
123
  STEP_NUM_PATTERNS = [
124
  re.compile(r"^\s*\d+\s*[.)]\s+"), # 1. / 1)
 
181
  # Dominant intent flips (e.g., steps → errors)
182
  if prev_intent != curr_intent:
183
  return True
 
 
 
184
  # Action heading-like appears (e.g., 'Updation:', 'Deletion:')
185
+ if ":" in ln and any(k in low for k in ("updation", "update", "deletion", "delete", "cancel")):
186
  return True
187
  # Escalation marker
188
  if any(m in low for m in ESCALATION_MARKERS):
 
211
  sections: List[Tuple[str, List[str], Dict[str, str]]] = []
212
  current_block: List[str] = []
213
  current_intent: Optional[str] = None
214
+
215
+ for ln in [p for p in paragraphs if (p or "").strip()]:
216
+ ln_intent = dominant_intent([ln])
217
  block_intent = dominant_intent(current_block + [ln]) if current_block else ln_intent
218
+
219
+ if current_block and is_boundary_to_new_section(current_intent or "neutral", block_intent, ln):
220
+ # close current section
221
  act = infer_action(current_block)
222
+ title = synthetic_title(current_intent or "neutral", act)
223
+ sections.append((title, current_block[:], {"intent": current_intent or "neutral", "action": act}))
224
+ # start new block
 
 
 
 
225
  current_block = [ln]
226
  current_intent = ln_intent
 
227
  else:
 
 
228
  current_block.append(ln)
229
  current_intent = block_intent
230
+
231
+ # close last
232
  if current_block:
233
  act = infer_action(current_block)
234
+ title = synthetic_title(current_intent or "neutral", act)
235
+ sections.append((title, current_block[:], {"intent": current_intent or "neutral", "action": act}))
236
+
 
 
 
237
  return sections
238
 
239
  # ----------------------------- Intent/module vocab used by runtime -----------------------------
 
330
  for s_idx, (section_title, sec_lines, hints) in enumerate(sections):
331
  chunks = _chunk_text_with_context(doc_title, section_title, sec_lines, max_words=160)
332
  total_chunks += len(chunks)
333
+ intent_tag_hint = hints.get("intent", "neutral")
334
+ action_tag_hint = hints.get("action", "")
 
 
 
 
 
 
335
 
336
  for c_idx, chunk in enumerate(chunks):
337
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)