Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Files Community

srilakshu012456 committed on Jan 6

Commit

b2d44cf

verified ·

1 Parent(s): a93f688

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +58 -6

services/kb_creation.py CHANGED Viewed

@@ -27,6 +27,20 @@ BM25_K1 = 1.5
 BM25_B = 0.75
 # ------------------------------ Utilities ------------------------------
 def _tokenize(text: str) -> List[str]:
     if not text:
         return []
@@ -83,32 +97,64 @@ def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
     return lines
 def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
-    """Smaller chunks (~160 words), bullet-aware."""
     lines = _paragraphs_to_lines(paragraphs)
     chunks: List[str] = []
     current: List[str] = []
     current_len = 0
     for ln in lines:
-        w = ln.split()
-        if current_len + len(w) > max_words or (BULLET_RE.match(ln) and current):
             chunk = " ".join(current).strip()
             if chunk:
                 chunks.append(chunk)
             current = [ln]
-            current_len = len(w)
         else:
             current.append(ln)
-            current_len += len(w)
     if current:
         chunk = " ".join(current).strip()
         if chunk:
             chunks.append(chunk)
     if not chunks:
         body = " ".join(lines).strip()
         if body:
             chunks = [body]
-    return chunks
 # ------------------------------ Intent & Module tagging ------------------------------
 SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
 SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
@@ -211,6 +257,11 @@ def ingest_documents(folder_path: str) -> None:
                 elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
                     final_intent = derived_intent
                 module_tags = _derive_module_tags(chunk, file, section_title)
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"
                 meta = {
@@ -222,6 +273,7 @@ def ingest_documents(folder_path: str) -> None:
                     "intent_tag": final_intent,
                     "topic_tags": ", ".join(topic_tags) if topic_tags else "",
                     "module_tags": ", ".join(module_tags) if module_tags else "",
                 }
                 try:
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])

 BM25_B = 0.75
 # ------------------------------ Utilities ------------------------------
+# --- Action detection helper (generic; reuses ACTION_SYNONYMS) ---
+def _line_action_tag(text: str) -> Optional[str]:
+    """
+    Return 'create'|'update'|'delete'|'navigate' if the line contains any action synonym,
+    else None. This is used to split chunks by action so creation/update/delete don't bleed
+    into one another within a single chunk.
+    """
+    low = (text or "").lower()
+    for act, syns in ACTION_SYNONYMS.items():
+        if any(s in low for s in syns):
+            return act
+    return None
 def _tokenize(text: str) -> List[str]:
     if not text:
         return []
     return lines
 def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
+    """
+    Smaller chunks (~160 words), bullet-aware, and now action-aware.
+    We start a new chunk when:
+      - Adding the next line would exceed max_words, OR
+      - The next line starts a different action topic (create/update/delete/navigate), OR
+      - The next line matches BULLET_RE while the current chunk is non-empty.
+    This prevents a 'create' chunk from also containing 'update'/'delete' sentences.
+    """
     lines = _paragraphs_to_lines(paragraphs)
     chunks: List[str] = []
     current: List[str] = []
     current_len = 0
+    # Track the dominant action inside the current chunk (None until detected)
+    current_action: Optional[str] = None
     for ln in lines:
+        ln_words = ln.split()
+        ln_action = _line_action_tag(ln)  # detect line action
+        # Flag an action switch only when the current chunk already has an action and the
+        # new line carries a different one; lines with no detected action never trigger it.
+        switch_action = (
+            (current_action is not None and ln_action is not None and ln_action != current_action)
+        )
+        # Hard break triggers:
+        # - size limit,
+        # - switching to a different action topic,
+        # - starting a new bullet/number while current is non-empty (keeps bullets compact).
+        if (current_len + len(ln_words) > max_words) or (switch_action) or (BULLET_RE.match(ln) and current):
             chunk = " ".join(current).strip()
             if chunk:
                 chunks.append(chunk)
+            # reset current
             current = [ln]
+            current_len = len(ln_words)
+            current_action = ln_action or None
         else:
+            # Continue current chunk
             current.append(ln)
+            current_len += len(ln_words)
+            # Set the current action if not already set
+            if current_action is None and ln_action is not None:
+                current_action = ln_action
+    # Flush remainder
     if current:
         chunk = " ".join(current).strip()
         if chunk:
             chunks.append(chunk)
+    # Fallback: if nothing formed, collapse all lines into one chunk
     if not chunks:
         body = " ".join(lines).strip()
         if body:
             chunks = [body]
+    return chunks
 # ------------------------------ Intent & Module tagging ------------------------------
 SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
 SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
                 elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
                     final_intent = derived_intent
                 module_tags = _derive_module_tags(chunk, file, section_title)
+                # Fallback: appointment chunks marked as steps when neutral (existing patch)
+                if final_intent == "neutral" and ("appointments" in module_tags):
+                   final_intent = "steps"
+                # NEW: annotate chunk with action tags (create/update/delete/navigate)
+                actions_here = _extract_actions(chunk)  # reuse ACTION_SYNONYMS
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"
                 meta = {
                     "intent_tag": final_intent,
                     "topic_tags": ", ".join(topic_tags) if topic_tags else "",
                     "module_tags": ", ".join(module_tags) if module_tags else "",
+                    "action_tags": ", ".join(actions_here) if actions_here else "",
                 }
                 try:
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])