Spaces:
Sleeping
Sleeping
Update services/kb_creation.py
Browse files- services/kb_creation.py +21 -41
services/kb_creation.py
CHANGED
|
@@ -59,14 +59,14 @@ BULLET_RE = re.compile(r"^\s*(?:[-*\u2022]|\d+[.)])\s+", re.IGNORECASE)
|
|
| 59 |
def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
|
| 60 |
lines: List[str] = []
|
| 61 |
for p in (paragraphs or []):
|
| 62 |
-
p = (p or
|
| 63 |
if not p:
|
| 64 |
continue
|
| 65 |
-
|
| 66 |
-
if STRONG_ACTION_HEADING_RE.match(p) or HEADING_LINE_RE.match(p) or BULLET_RE.match(p):
|
| 67 |
lines.append(p)
|
| 68 |
continue
|
| 69 |
-
|
|
|
|
| 70 |
lines.extend(parts)
|
| 71 |
return lines
|
| 72 |
|
|
@@ -119,11 +119,6 @@ ACTION_SYNS = {
|
|
| 119 |
"update": {"update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"},
|
| 120 |
"delete": {"delete", "remove", "cancel", "deletion", "unassign"}
|
| 121 |
}
|
| 122 |
-
# Strong heading detectors for action blocks (generic)
|
| 123 |
-
STRONG_ACTION_HEADING_RE = re.compile(r"^(?P<prefix>[A-Za-z \-_/]*?)\b(?P<action>creation|updation|update|deletion|delete|cancel)\b[ A-Za-z\-_/]*$", re.IGNORECASE)
|
| 124 |
-
# Treat markdown-style headings and TitleCase lines as boundaries
|
| 125 |
-
HEADING_LINE_RE = re.compile(r"^\s*(?:#{1,6}\s+|[A-Z][A-Za-z0-9 \-_/]{2,}$)")
|
| 126 |
-
|
| 127 |
|
| 128 |
STEP_NUM_PATTERNS = [
|
| 129 |
re.compile(r"^\s*\d+\s*[.)]\s+"), # 1. / 1)
|
|
@@ -186,11 +181,8 @@ def is_boundary_to_new_section(prev_intent: str, curr_intent: str, ln: str) -> b
|
|
| 186 |
# Dominant intent flips (e.g., steps → errors)
|
| 187 |
if prev_intent != curr_intent:
|
| 188 |
return True
|
| 189 |
-
# Explicit action headings or generic headings
|
| 190 |
-
if STRONG_ACTION_HEADING_RE.match(ln) or HEADING_LINE_RE.match(ln):
|
| 191 |
-
return True
|
| 192 |
# Action heading-like appears (e.g., 'Updation:', 'Deletion:')
|
| 193 |
-
if
|
| 194 |
return True
|
| 195 |
# Escalation marker
|
| 196 |
if any(m in low for m in ESCALATION_MARKERS):
|
|
@@ -219,35 +211,29 @@ def semantic_sectionize(paragraphs: List[str]) -> List[Tuple[str, List[str], Dic
|
|
| 219 |
sections: List[Tuple[str, List[str], Dict[str, str]]] = []
|
| 220 |
current_block: List[str] = []
|
| 221 |
current_intent: Optional[str] = None
|
| 222 |
-
|
| 223 |
-
for ln in [p for p in paragraphs if (p or
|
| 224 |
-
ln_intent =
|
| 225 |
block_intent = dominant_intent(current_block + [ln]) if current_block else ln_intent
|
| 226 |
-
|
|
|
|
|
|
|
| 227 |
act = infer_action(current_block)
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
title = first.strip()
|
| 232 |
-
else:
|
| 233 |
-
title = synthetic_title(current_intent or 'neutral', act)
|
| 234 |
-
sections.append((title, current_block[:], {'intent': current_intent or 'neutral', 'action': act}))
|
| 235 |
current_block = [ln]
|
| 236 |
current_intent = ln_intent
|
| 237 |
-
current_first_line = ln
|
| 238 |
else:
|
| 239 |
-
if not current_block:
|
| 240 |
-
current_first_line = ln
|
| 241 |
current_block.append(ln)
|
| 242 |
current_intent = block_intent
|
|
|
|
|
|
|
| 243 |
if current_block:
|
| 244 |
act = infer_action(current_block)
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
else:
|
| 249 |
-
title = synthetic_title(current_intent or 'neutral', act)
|
| 250 |
-
sections.append((title, current_block[:], {'intent': current_intent or 'neutral', 'action': act}))
|
| 251 |
return sections
|
| 252 |
|
| 253 |
# ----------------------------- Intent/module vocab used by runtime -----------------------------
|
|
@@ -344,14 +330,8 @@ def ingest_documents(folder_path: str) -> None:
|
|
| 344 |
for s_idx, (section_title, sec_lines, hints) in enumerate(sections):
|
| 345 |
chunks = _chunk_text_with_context(doc_title, section_title, sec_lines, max_words=160)
|
| 346 |
total_chunks += len(chunks)
|
| 347 |
-
intent_tag_hint = hints.get(
|
| 348 |
-
|
| 349 |
-
explicit_action = (
|
| 350 |
-
'create' if re.search(r"\bcreation\b|\bcreate\b", st_low) else
|
| 351 |
-
'update' if re.search(r"\bupdation\b|\bupdate\b|\bedit\b|\bchange\b|\breschedule\b", st_low) else
|
| 352 |
-
'delete' if re.search(r"\bdeletion\b|\bdelete\b|\bcancel\b", st_low) else ''
|
| 353 |
-
)
|
| 354 |
-
action_tag_hint = explicit_action or hints.get('action', '')
|
| 355 |
|
| 356 |
for c_idx, chunk in enumerate(chunks):
|
| 357 |
derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
|
|
|
|
| 59 |
def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
    """Flatten paragraphs into a list of individual, stripped lines.

    Each paragraph is stripped first; ``None`` and blank entries are dropped.
    A paragraph that matches the module-level ``BULLET_RE`` is kept as one
    line. Any other paragraph is split after sentence-ending punctuation
    (``.``, ``!`` or ``?``) followed by whitespace, and every non-empty
    piece becomes its own line.

    Args:
        paragraphs: Raw paragraph strings; ``None`` or an empty list gives
            an empty result.

    Returns:
        The resulting lines, in their original order.
    """
    flattened: List[str] = []
    for raw in paragraphs or []:
        text = (raw or "").strip()
        if not text:
            continue
        if BULLET_RE.match(text):
            # Bullet items stay intact, even when they hold several sentences.
            flattened.append(text)
        else:
            # The lookbehind keeps the punctuation attached to its sentence.
            for piece in re.split(r"(?<=[.!?])\s+", text):
                piece = piece.strip()
                if piece:
                    flattened.append(piece)
    return flattened
|
| 72 |
|
|
|
|
| 119 |
"update": {"update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"},
|
| 120 |
"delete": {"delete", "remove", "cancel", "deletion", "unassign"}
|
| 121 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
STEP_NUM_PATTERNS = [
|
| 124 |
re.compile(r"^\s*\d+\s*[.)]\s+"), # 1. / 1)
|
|
|
|
| 181 |
# Dominant intent flips (e.g., steps → errors)
|
| 182 |
if prev_intent != curr_intent:
|
| 183 |
return True
|
|
|
|
|
|
|
|
|
|
| 184 |
# Action heading-like appears (e.g., 'Updation:', 'Deletion:')
|
| 185 |
+
if ":" in ln and any(k in low for k in ("updation", "update", "deletion", "delete", "cancel")):
|
| 186 |
return True
|
| 187 |
# Escalation marker
|
| 188 |
if any(m in low for m in ESCALATION_MARKERS):
|
|
|
|
| 211 |
sections: List[Tuple[str, List[str], Dict[str, str]]] = []
|
| 212 |
current_block: List[str] = []
|
| 213 |
current_intent: Optional[str] = None
|
| 214 |
+
|
| 215 |
+
for ln in [p for p in paragraphs if (p or "").strip()]:
|
| 216 |
+
ln_intent = dominant_intent([ln])
|
| 217 |
block_intent = dominant_intent(current_block + [ln]) if current_block else ln_intent
|
| 218 |
+
|
| 219 |
+
if current_block and is_boundary_to_new_section(current_intent or "neutral", block_intent, ln):
|
| 220 |
+
# close current section
|
| 221 |
act = infer_action(current_block)
|
| 222 |
+
title = synthetic_title(current_intent or "neutral", act)
|
| 223 |
+
sections.append((title, current_block[:], {"intent": current_intent or "neutral", "action": act}))
|
| 224 |
+
# start new block
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
current_block = [ln]
|
| 226 |
current_intent = ln_intent
|
|
|
|
| 227 |
else:
|
|
|
|
|
|
|
| 228 |
current_block.append(ln)
|
| 229 |
current_intent = block_intent
|
| 230 |
+
|
| 231 |
+
# close last
|
| 232 |
if current_block:
|
| 233 |
act = infer_action(current_block)
|
| 234 |
+
title = synthetic_title(current_intent or "neutral", act)
|
| 235 |
+
sections.append((title, current_block[:], {"intent": current_intent or "neutral", "action": act}))
|
| 236 |
+
|
|
|
|
|
|
|
|
|
|
| 237 |
return sections
|
| 238 |
|
| 239 |
# ----------------------------- Intent/module vocab used by runtime -----------------------------
|
|
|
|
| 330 |
for s_idx, (section_title, sec_lines, hints) in enumerate(sections):
|
| 331 |
chunks = _chunk_text_with_context(doc_title, section_title, sec_lines, max_words=160)
|
| 332 |
total_chunks += len(chunks)
|
| 333 |
+
intent_tag_hint = hints.get("intent", "neutral")
|
| 334 |
+
action_tag_hint = hints.get("action", "")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
|
| 336 |
for c_idx, chunk in enumerate(chunks):
|
| 337 |
derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
|