Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Community

srilakshu012456 committed on Jan 7

Commit

62e7962

verified ·

1 Parent(s): 316a87a

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +21 -41

services/kb_creation.py CHANGED Viewed

@@ -59,14 +59,14 @@ BULLET_RE = re.compile(r"^\s*(?:[-*\u2022]|\d+[.)])\s+", re.IGNORECASE)
 def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
     lines: List[str] = []
     for p in (paragraphs or []):
-        p = (p or '').strip()
         if not p:
             continue
-        # Keep headings as single lines; otherwise split on sentence boundaries
-        if STRONG_ACTION_HEADING_RE.match(p) or HEADING_LINE_RE.match(p) or BULLET_RE.match(p):
             lines.append(p)
             continue
-        parts = [s.strip() for s in re.split(r'(?<=[.!?])\s+', p) if s.strip()]
         lines.extend(parts)
     return lines
@@ -119,11 +119,6 @@ ACTION_SYNS = {
     "update": {"update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"},
     "delete": {"delete", "remove", "cancel", "deletion", "unassign"}
 }
-# Strong heading detectors for action blocks (generic)
-STRONG_ACTION_HEADING_RE = re.compile(r"^(?P<prefix>[A-Za-z \-_/]*?)\b(?P<action>creation|updation|update|deletion|delete|cancel)\b[ A-Za-z\-_/]*$", re.IGNORECASE)
-# Treat markdown-style headings and TitleCase lines as boundaries
-HEADING_LINE_RE = re.compile(r"^\s*(?:#{1,6}\s+|[A-Z][A-Za-z0-9 \-_/]{2,}$)")
 STEP_NUM_PATTERNS = [
     re.compile(r"^\s*\d+\s*[.)]\s+"),     # 1. / 1)
@@ -186,11 +181,8 @@ def is_boundary_to_new_section(prev_intent: str, curr_intent: str, ln: str) -> b
     # Dominant intent flips (e.g., steps → errors)
     if prev_intent != curr_intent:
         return True
-    # Explicit action headings or generic headings
-    if STRONG_ACTION_HEADING_RE.match(ln) or HEADING_LINE_RE.match(ln):
-        return True
     # Action heading-like appears (e.g., 'Updation:', 'Deletion:')
-    if ':' in ln and any(k in low for k in ('updation', 'update', 'deletion', 'delete', 'cancel')):
         return True
     # Escalation marker
     if any(m in low for m in ESCALATION_MARKERS):
@@ -219,35 +211,29 @@ def semantic_sectionize(paragraphs: List[str]) -> List[Tuple[str, List[str], Dic
     sections: List[Tuple[str, List[str], Dict[str, str]]] = []
     current_block: List[str] = []
     current_intent: Optional[str] = None
-    current_first_line: Optional[str] = None
-    for ln in [p for p in paragraphs if (p or '').strip()]:
-        ln_intent = 'steps' if STRONG_ACTION_HEADING_RE.match(ln) else dominant_intent([ln])
         block_intent = dominant_intent(current_block + [ln]) if current_block else ln_intent
-        if current_block and is_boundary_to_new_section(current_intent or 'neutral', block_intent, ln):
             act = infer_action(current_block)
-            # Preserve heading title when present
-            first = current_first_line or ''
-            if first and (STRONG_ACTION_HEADING_RE.match(first) or HEADING_LINE_RE.match(first)):
-                title = first.strip()
-            else:
-                title = synthetic_title(current_intent or 'neutral', act)
-            sections.append((title, current_block[:], {'intent': current_intent or 'neutral', 'action': act}))
             current_block = [ln]
             current_intent = ln_intent
-            current_first_line = ln
         else:
-            if not current_block:
-                current_first_line = ln
             current_block.append(ln)
             current_intent = block_intent
     if current_block:
         act = infer_action(current_block)
-        first = current_first_line or ''
-        if first and (STRONG_ACTION_HEADING_RE.match(first) or HEADING_LINE_RE.match(first)):
-            title = first.strip()
-        else:
-            title = synthetic_title(current_intent or 'neutral', act)
-        sections.append((title, current_block[:], {'intent': current_intent or 'neutral', 'action': act}))
     return sections
 # ----------------------------- Intent/module vocab used by runtime -----------------------------
@@ -344,14 +330,8 @@ def ingest_documents(folder_path: str) -> None:
         for s_idx, (section_title, sec_lines, hints) in enumerate(sections):
             chunks = _chunk_text_with_context(doc_title, section_title, sec_lines, max_words=160)
             total_chunks += len(chunks)
-            intent_tag_hint = hints.get('intent', 'neutral')
-            st_low = (section_title or '').lower()
-            explicit_action = (
-                'create' if re.search(r"\bcreation\b|\bcreate\b", st_low) else
-                'update' if re.search(r"\bupdation\b|\bupdate\b|\bedit\b|\bchange\b|\breschedule\b", st_low) else
-                'delete' if re.search(r"\bdeletion\b|\bdelete\b|\bcancel\b", st_low) else ''
-            )
-            action_tag_hint = explicit_action or hints.get('action', '')
             for c_idx, chunk in enumerate(chunks):
                 derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)

 def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
     lines: List[str] = []
     for p in (paragraphs or []):
+        p = (p or "").strip()
         if not p:
             continue
+        if BULLET_RE.match(p):
             lines.append(p)
             continue
+        # split on sentence ends
+        parts = [s.strip() for s in re.split(r"(?<=[.!?])\s+", p) if s.strip()]
         lines.extend(parts)
     return lines
     "update": {"update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"},
     "delete": {"delete", "remove", "cancel", "deletion", "unassign"}
 }
 STEP_NUM_PATTERNS = [
     re.compile(r"^\s*\d+\s*[.)]\s+"),     # 1. / 1)
     # Dominant intent flips (e.g., steps → errors)
     if prev_intent != curr_intent:
         return True
     # Action heading-like appears (e.g., 'Updation:', 'Deletion:')
+    if ":" in ln and any(k in low for k in ("updation", "update", "deletion", "delete", "cancel")):
         return True
     # Escalation marker
     if any(m in low for m in ESCALATION_MARKERS):
     sections: List[Tuple[str, List[str], Dict[str, str]]] = []
     current_block: List[str] = []
     current_intent: Optional[str] = None
+    for ln in [p for p in paragraphs if (p or "").strip()]:
+        ln_intent = dominant_intent([ln])
         block_intent = dominant_intent(current_block + [ln]) if current_block else ln_intent
+        if current_block and is_boundary_to_new_section(current_intent or "neutral", block_intent, ln):
+            # close current section
             act = infer_action(current_block)
+            title = synthetic_title(current_intent or "neutral", act)
+            sections.append((title, current_block[:], {"intent": current_intent or "neutral", "action": act}))
+            # start new block
             current_block = [ln]
             current_intent = ln_intent
         else:
             current_block.append(ln)
             current_intent = block_intent
+    # close last
     if current_block:
         act = infer_action(current_block)
+        title = synthetic_title(current_intent or "neutral", act)
+        sections.append((title, current_block[:], {"intent": current_intent or "neutral", "action": act}))
     return sections
 # ----------------------------- Intent/module vocab used by runtime -----------------------------
         for s_idx, (section_title, sec_lines, hints) in enumerate(sections):
             chunks = _chunk_text_with_context(doc_title, section_title, sec_lines, max_words=160)
             total_chunks += len(chunks)
+            intent_tag_hint = hints.get("intent", "neutral")
+            action_tag_hint = hints.get("action", "")
             for c_idx, chunk in enumerate(chunks):
                 derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)