srilakshu012456 committed on
Commit
b2d44cf
·
verified ·
1 Parent(s): a93f688

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +58 -6
services/kb_creation.py CHANGED
@@ -27,6 +27,20 @@ BM25_K1 = 1.5
27
  BM25_B = 0.75
28
 
29
  # ------------------------------ Utilities ------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def _tokenize(text: str) -> List[str]:
31
  if not text:
32
  return []
@@ -83,32 +97,64 @@ def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
83
  return lines
84
 
85
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
86
- """Smaller chunks (~160 words), bullet-aware."""
 
 
 
 
 
 
87
  lines = _paragraphs_to_lines(paragraphs)
88
  chunks: List[str] = []
89
  current: List[str] = []
90
  current_len = 0
 
 
 
 
91
  for ln in lines:
92
- w = ln.split()
93
- if current_len + len(w) > max_words or (BULLET_RE.match(ln) and current):
 
 
 
 
 
 
 
 
 
 
 
 
94
  chunk = " ".join(current).strip()
95
  if chunk:
96
  chunks.append(chunk)
 
97
  current = [ln]
98
- current_len = len(w)
 
99
  else:
 
100
  current.append(ln)
101
- current_len += len(w)
 
 
 
 
 
102
  if current:
103
  chunk = " ".join(current).strip()
104
  if chunk:
105
  chunks.append(chunk)
 
 
106
  if not chunks:
107
  body = " ".join(lines).strip()
108
  if body:
109
  chunks = [body]
110
- return chunks
111
 
 
112
  # ------------------------------ Intent & Module tagging ------------------------------
113
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
114
  SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
@@ -211,6 +257,11 @@ def ingest_documents(folder_path: str) -> None:
211
  elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
212
  final_intent = derived_intent
213
  module_tags = _derive_module_tags(chunk, file, section_title)
 
 
 
 
 
214
  embedding = model.encode(chunk).tolist()
215
  doc_id = f"{file}:{s_idx}:{c_idx}"
216
  meta = {
@@ -222,6 +273,7 @@ def ingest_documents(folder_path: str) -> None:
222
  "intent_tag": final_intent,
223
  "topic_tags": ", ".join(topic_tags) if topic_tags else "",
224
  "module_tags": ", ".join(module_tags) if module_tags else "",
 
225
  }
226
  try:
227
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
27
  BM25_B = 0.75
28
 
29
  # ------------------------------ Utilities ------------------------------
30
+
31
+ # --- Action detection helper (generic; reuses ACTION_SYNONYMS) ---
32
def _line_action_tag(text: str) -> Optional[str]:
    """Classify a line by the first action whose synonym it contains.

    The text is lowercased (a falsy value such as ``None`` or ``""`` is
    treated as the empty string) and every entry of ``ACTION_SYNONYMS`` is
    checked in the mapping's iteration order using plain substring
    containment. The chunker uses the result to decide when consecutive
    lines switch from one action topic to another.

    Args:
        text: A single line of document text.

    Returns:
        The key of the first ``ACTION_SYNONYMS`` entry that has at least one
        synonym occurring in the lowercased text, or ``None`` when no synonym
        matches. (The keys are presumably 'create', 'update', 'delete' and
        'navigate' -- ACTION_SYNONYMS is defined elsewhere; confirm there.)
    """
    lowered = text.lower() if text else ""
    # First match wins, so the mapping's insertion order decides ties when a
    # line mentions synonyms of more than one action.
    return next(
        (
            action
            for action, synonyms in ACTION_SYNONYMS.items()
            if any(synonym in lowered for synonym in synonyms)
        ),
        None,
    )
43
+
44
  def _tokenize(text: str) -> List[str]:
45
  if not text:
46
  return []
 
97
  return lines
98
 
99
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
100
+ """
101
+ Smaller chunks (~160 words), bullet-aware, and NOW action-aware.
102
+ We start a new chunk when:
103
+ - Adding the next line would exceed max_words, OR
104
+ - The next line starts a different action topic (create/update/delete/navigate).
105
+ This prevents a 'create' chunk from also containing 'update'/'delete' sentences.
106
+ """
107
  lines = _paragraphs_to_lines(paragraphs)
108
  chunks: List[str] = []
109
  current: List[str] = []
110
  current_len = 0
111
+
112
+ # Track the dominant action inside the current chunk (None until detected)
113
+ current_action: Optional[str] = None
114
+
115
  for ln in lines:
116
+ ln_words = ln.split()
117
+ ln_action = _line_action_tag(ln) # detect line action
118
+
119
+ # If we already have an action in the current chunk and the new line switches action,
120
+ # or the line is a bullet heading for a different action, flush the current chunk first.
121
+ switch_action = (
122
+ (current_action is not None and ln_action is not None and ln_action != current_action)
123
+ )
124
+
125
+ # Hard break triggers:
126
+ # - size limit,
127
+ # - switching to a different action topic,
128
+ # - starting a new bullet/number while current is non-empty (keeps bullets compact).
129
+ if (current_len + len(ln_words) > max_words) or (switch_action) or (BULLET_RE.match(ln) and current):
130
  chunk = " ".join(current).strip()
131
  if chunk:
132
  chunks.append(chunk)
133
+ # reset current
134
  current = [ln]
135
+ current_len = len(ln_words)
136
+ current_action = ln_action or None
137
  else:
138
+ # Continue current chunk
139
  current.append(ln)
140
+ current_len += len(ln_words)
141
+ # Set the current action if not already set
142
+ if current_action is None and ln_action is not None:
143
+ current_action = ln_action
144
+
145
+ # Flush remainder
146
  if current:
147
  chunk = " ".join(current).strip()
148
  if chunk:
149
  chunks.append(chunk)
150
+
151
+ # Fallback: if nothing formed, collapse all lines into one chunk
152
  if not chunks:
153
  body = " ".join(lines).strip()
154
  if body:
155
  chunks = [body]
 
156
 
157
+ return chunks
158
  # ------------------------------ Intent & Module tagging ------------------------------
159
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
160
  SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
 
257
  elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
258
  final_intent = derived_intent
259
  module_tags = _derive_module_tags(chunk, file, section_title)
260
+ # Fallback: appointment chunks marked as steps when neutral (existing patch)
261
+ if final_intent == "neutral" and ("appointments" in module_tags):
262
+ final_intent = "steps"
263
+ # >>> NEW: annotate chunk with action tags (create/update/delete/navigate)
264
+ actions_here = _extract_actions(chunk) # reuse ACTION_SYNONYMS
265
  embedding = model.encode(chunk).tolist()
266
  doc_id = f"{file}:{s_idx}:{c_idx}"
267
  meta = {
 
273
  "intent_tag": final_intent,
274
  "topic_tags": ", ".join(topic_tags) if topic_tags else "",
275
  "module_tags": ", ".join(module_tags) if module_tags else "",
276
+ "action_tags": ", ".join(actions_here) if actions_here else "",
277
  }
278
  try:
279
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])