srilakshu012456 committed on
Commit
3fefb15
·
verified ·
1 Parent(s): 7463f64

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +163 -257
services/kb_creation.py CHANGED
@@ -1,33 +1,22 @@
1
 
2
- # services/kb_creation.py (DROP-IN REPLACEMENT)
3
- # Generic (heading-agnostic) sectionizing + robust intent/action tagging,
4
- # with hybrid retrieval tuned for SOPs that mix headings or omit them.
5
-
6
  import os
7
  import re
8
  import pickle
9
- from typing import TYPE_CHECKING, Any, List, Dict, Tuple, Optional
10
-
11
- # Word parsing
12
- if TYPE_CHECKING:
13
- from docx import Document as DocxDocument
14
- else:
15
- DocxDocument = Any
16
- try:
17
- from docx import Document # runtime import
18
- except Exception:
19
- Document = None # type: ignore
20
-
21
- # Vector DB / embeddings
22
- import chromadb # type: ignore
23
- from sentence_transformers import SentenceTransformer # type: ignore
24
 
 
25
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
26
  client = chromadb.PersistentClient(path=CHROMA_PATH)
27
  collection = client.get_or_create_collection(name="knowledge_base")
 
 
28
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
29
 
30
- # ----------------------------- BM25 structures -----------------------------
31
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
32
  bm25_docs: List[Dict[str, Any]] = []
33
  bm25_inverted: Dict[str, List[int]] = {}
@@ -37,7 +26,7 @@ bm25_ready: bool = False
37
  BM25_K1 = 1.5
38
  BM25_B = 0.75
39
 
40
- # ----------------------------- Token utilities -----------------------------
41
  def _tokenize(text: str) -> List[str]:
42
  if not text:
43
  return []
@@ -53,10 +42,34 @@ def _normalize_query(q: str) -> str:
53
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
54
  return _tokenize(val or "")
55
 
56
- # ----------------------------- Line splitting / chunking -----------------------------
57
- BULLET_RE = re.compile(r"^\s*(?:[-*\u2022]|\d+[.)])\s+", re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
 
60
  lines: List[str] = []
61
  for p in (paragraphs or []):
62
  p = (p or "").strip()
@@ -65,21 +78,19 @@ def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
65
  if BULLET_RE.match(p):
66
  lines.append(p)
67
  continue
68
- # split on sentence ends
69
  parts = [s.strip() for s in re.split(r"(?<=[.!?])\s+", p) if s.strip()]
70
  lines.extend(parts)
71
  return lines
72
 
73
- def _chunk_text_with_context(doc_title: str, section_title: str, lines_or_paras: List[str], max_words: int = 160) -> List[str]:
74
- """Chunk a list of lines or paragraphs into ~max_words, keeping bullets and step starts intact."""
75
- lines = _paragraphs_to_lines(lines_or_paras)
76
  chunks: List[str] = []
77
  current: List[str] = []
78
  current_len = 0
79
  for ln in lines:
80
  w = ln.split()
81
- # boundary when adding would exceed or bullet encounters and we already have something
82
- if current and (current_len + len(w) > max_words or BULLET_RE.match(ln)):
83
  chunk = " ".join(current).strip()
84
  if chunk:
85
  chunks.append(chunk)
@@ -98,160 +109,16 @@ def _chunk_text_with_context(doc_title: str, section_title: str, lines_or_paras:
98
  chunks = [body]
99
  return chunks
100
 
101
- # ----------------------------- Generic intent & action inference -----------------------------
102
- PROCEDURE_VERBS = {
103
- "navigate", "select", "scan", "verify", "confirm", "print", "move", "complete",
104
- "click", "open", "choose", "enter", "update", "save", "delete", "create",
105
- "attach", "assign", "reschedule", "edit", "change", "cancel", "remove", "tag"
106
- }
107
- ERROR_TERMS = {
108
- "error", "fail", "failure", "not working", "cannot", "can't", "mismatch",
109
- "locked", "permission", "permissions", "access denied", "not authorized",
110
- "authorised", "insufficient", "no access", "timeout", "short pick",
111
- "not found", "missing", "unavailable"
112
- }
113
- PREREQ_TERMS = {"pre-requisite", "prerequisites", "requirements", "pre requirements", "pre requisites"}
114
- PURPOSE_TERMS = {"purpose", "overview", "introduction"}
115
- ESCALATION_MARKERS = {"escalation", "escalation path", "→", "->"}
116
-
117
- ACTION_SYNS = {
118
- "create": {"create", "creation", "add", "new", "generate", "tag"},
119
- "update": {"update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"},
120
- "delete": {"delete", "remove", "cancel", "deletion", "unassign"}
121
- }
122
-
123
- STEP_NUM_PATTERNS = [
124
- re.compile(r"^\s*\d+\s*[.)]\s+"), # 1. / 1)
125
- re.compile(r"^\s*[\u2460-\u2473]\s+"),# ① ② ...
126
- re.compile(r"^\s*[-*\u2022]\s+"), # - * •
127
- re.compile(r"(?i)\bstep\s*\d+\s*:") # Step 1:
128
- ]
129
-
130
- def _norm(s: str) -> str:
131
- s = (s or "").lower()
132
- s = re.sub(r"[^\w\s→\-]", " ", s) # keep arrows
133
- s = re.sub(r"\s+", " ", s).strip()
134
- return s
135
-
136
- def line_signals(line: str) -> Dict[str, float]:
137
- """Return a score dictionary for intent signals on a single line."""
138
- low = _norm(line)
139
- sig = {"steps": 0.0, "errors": 0.0, "prereqs": 0.0, "purpose": 0.0, "escalation": 0.0}
140
- # Steps signal
141
- if any(p.search(line) for p in STEP_NUM_PATTERNS): sig["steps"] += 1.2
142
- if any(v in low for v in PROCEDURE_VERBS): sig["steps"] += 0.8
143
- # Errors signal
144
- if any(t in low for t in ERROR_TERMS): sig["errors"] += 1.1
145
- if ":" in line and len(line.split(":")[0]) <= 80: sig["errors"] += 0.2 # 'Heading: details'
146
- # Prereqs
147
- if any(t in low for t in PREREQ_TERMS): sig["prereqs"] += 1.1
148
- # Purpose
149
- if any(t in low for t in PURPOSE_TERMS): sig["purpose"] += 0.9
150
- # Escalation
151
- if any(m in low for m in ESCALATION_MARKERS): sig["escalation"] += 1.0
152
- return sig
153
-
154
- def dominant_intent(block: List[str]) -> str:
155
- """Return the dominant intent tag for a block of lines."""
156
- agg = {"steps": 0.0, "errors": 0.0, "prereqs": 0.0, "purpose": 0.0, "escalation": 0.0}
157
- for ln in block:
158
- s = line_signals(ln)
159
- for k, v in s.items():
160
- agg[k] += v
161
- order = ["steps", "errors", "prereqs", "escalation", "purpose"]
162
- best = max(order, key=lambda k: agg[k])
163
- if max(agg.values()) < 0.25:
164
- return "neutral"
165
- return best
166
-
167
- def infer_action(block: List[str]) -> str:
168
- """Infer action_tag (create/update/delete) from content of the block."""
169
- counts = {"create": 0, "update": 0, "delete": 0}
170
- for ln in block:
171
- low = _norm(ln)
172
- for act, syns in ACTION_SYNS.items():
173
- if any(s in low for s in syns):
174
- counts[act] += 1
175
- act = max(counts, key=lambda k: counts[k])
176
- return act if counts[act] > 0 else ""
177
-
178
- def is_boundary_to_new_section(prev_intent: str, curr_intent: str, ln: str) -> bool:
179
- """Signal to start a new synthetic section."""
180
- low = _norm(ln)
181
- # Dominant intent flips (e.g., steps → errors)
182
- if prev_intent != curr_intent:
183
- return True
184
- # Action heading-like appears (e.g., 'Updation:', 'Deletion:')
185
- if ":" in ln and any(k in low for k in ("updation", "update", "deletion", "delete", "cancel")):
186
- return True
187
- # Escalation marker
188
- if any(m in low for m in ESCALATION_MARKERS):
189
- return True
190
- return False
191
-
192
- def synthetic_title(intent: str, action: str) -> str:
193
- """Create a synthetic section title when headings are missing."""
194
- base = {
195
- "steps": "Process Steps",
196
- "errors": "Common Errors & Resolution",
197
- "prereqs": "Pre-Requisites",
198
- "purpose": "Purpose",
199
- "escalation": "Escalation Path",
200
- "neutral": "Section"
201
- }.get(intent, "Section")
202
- if intent == "steps" and action:
203
- return f"{base} — {action.capitalize()}"
204
- return base
205
-
206
- def semantic_sectionize(paragraphs: List[str]) -> List[Tuple[str, List[str], Dict[str, str]]]:
207
- """
208
- Split the document into synthetic sections based on changing intent signals.
209
- Returns: (section_title, lines, {'intent': ..., 'action': ...})
210
- """
211
- sections: List[Tuple[str, List[str], Dict[str, str]]] = []
212
- current_block: List[str] = []
213
- current_intent: Optional[str] = None
214
-
215
- for ln in [p for p in paragraphs if (p or "").strip()]:
216
- ln_intent = dominant_intent([ln])
217
- block_intent = dominant_intent(current_block + [ln]) if current_block else ln_intent
218
-
219
- if current_block and is_boundary_to_new_section(current_intent or "neutral", block_intent, ln):
220
- # close current section
221
- act = infer_action(current_block)
222
- title = synthetic_title(current_intent or "neutral", act)
223
- sections.append((title, current_block[:], {"intent": current_intent or "neutral", "action": act}))
224
- # start new block
225
- current_block = [ln]
226
- current_intent = ln_intent
227
- else:
228
- current_block.append(ln)
229
- current_intent = block_intent
230
-
231
- # close last
232
- if current_block:
233
- act = infer_action(current_block)
234
- title = synthetic_title(current_intent or "neutral", act)
235
- sections.append((title, current_block[:], {"intent": current_intent or "neutral", "action": act}))
236
-
237
- return sections
238
-
239
- # ----------------------------- Intent/module vocab used by runtime -----------------------------
240
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
241
- SECTION_ERRORS_HINTS = [
242
- "common errors", "resolution", "troubleshooting", "known issues",
243
- "common issues", "escalation", "permissions", "access"
244
- ]
245
  PERMISSION_TERMS = [
246
  "permission", "permissions", "access", "access right", "authorization", "authorisation",
247
  "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
248
  "not allowed", "not authorized", "denied", "restrict"
249
  ]
250
- ERROR_TERMS_R = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
251
- STEP_VERBS = [
252
- "navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open",
253
- "choose", "enter", "update", "save", "delete", "create", "attach", "assign", "reschedule", "edit", "change", "cancel", "remove", "tag"
254
- ]
255
  MODULE_VOCAB = {
256
  "receiving": [
257
  "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
@@ -259,8 +126,7 @@ MODULE_VOCAB = {
259
  ],
260
  "appointments": [
261
  "appointment", "appointments", "schedule", "scheduling", "slot", "dock door",
262
- "appointment creation", "appointment details", "appointment schedule", "reschedule",
263
- "updation", "update appointment", "cancel appointment", "delete appointment"
264
  ],
265
  "picking": ["pick", "picking", "pick release", "wave", "allocation"],
266
  "putaway": ["putaway", "staging", "put away", "location assignment"],
@@ -268,12 +134,22 @@ MODULE_VOCAB = {
268
  "inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
269
  "replenishment": ["replenishment", "replenish"],
270
  }
271
- ACTION_SYNONYMS = {
272
- "create": ["create", "creation", "add", "new", "generate", "tag"],
273
- "update": ["update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"],
274
- "delete": ["delete", "remove", "cancel", "deletion", "unassign"],
275
- "navigate": ["navigate", "go to", "open"],
276
- }
 
 
 
 
 
 
 
 
 
 
277
 
278
  def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
279
  t = (text or "").lower()
@@ -281,9 +157,9 @@ def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
281
  intent = "neutral"
282
  if any(term in t for term in PERMISSION_TERMS):
283
  intent = "errors"; tags.append("permissions")
284
- if "role" in t: tags.append("role_access")
285
- if "security" in t: tags.append("security")
286
- if intent == "neutral" and any(term in t for term in ERROR_TERMS_R):
287
  intent = "errors"; tags.append("errors")
288
  if intent == "neutral" and any(v in t for v in STEP_VERBS):
289
  intent = "steps"; tags.append("procedure")
@@ -304,10 +180,10 @@ def _derive_module_tags(text: str, filename: str, section_title: str) -> List[st
304
  found = ["appointments"]
305
  return list(sorted(set(found)))
306
 
307
- # ----------------------------- Ingestion -----------------------------
308
  def ingest_documents(folder_path: str) -> None:
309
  print(f"[KB] Checking folder: {folder_path}")
310
- files = [f for f in os.listdir(folder_path) if f.lower().endswith(".docx")]
311
  print(f"[KB] Found {len(files)} Word files: {files}")
312
  if not files:
313
  print("[KB] WARNING: No .docx files found. Please check the folder path.")
@@ -321,23 +197,20 @@ def ingest_documents(folder_path: str) -> None:
321
  file_path = os.path.join(folder_path, file)
322
  doc_title = os.path.splitext(file)[0]
323
  doc = Document(file_path)
324
- # Build raw paragraphs
325
- paragraphs = [p.text.strip() for p in getattr(doc, "paragraphs", []) if p.text and p.text.strip()]
326
- # Generic (heading-agnostic) sectionizer
327
- sections = semantic_sectionize(paragraphs)
328
  total_chunks = 0
329
-
330
- for s_idx, (section_title, sec_lines, hints) in enumerate(sections):
331
- chunks = _chunk_text_with_context(doc_title, section_title, sec_lines, max_words=160)
332
  total_chunks += len(chunks)
333
- intent_tag_hint = hints.get("intent", "neutral")
334
- action_tag_hint = hints.get("action", "")
335
-
336
  for c_idx, chunk in enumerate(chunks):
337
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
338
- final_intent = intent_tag_hint if intent_tag_hint != "neutral" else derived_intent
 
 
 
 
339
  module_tags = _derive_module_tags(chunk, file, section_title)
340
-
341
  embedding = model.encode(chunk).tolist()
342
  doc_id = f"{file}:{s_idx}:{c_idx}"
343
  meta = {
@@ -349,9 +222,7 @@ def ingest_documents(folder_path: str) -> None:
349
  "intent_tag": final_intent,
350
  "topic_tags": ", ".join(topic_tags) if topic_tags else "",
351
  "module_tags": ", ".join(module_tags) if module_tags else "",
352
- "action_tag": action_tag_hint,
353
  }
354
-
355
  try:
356
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
357
  except Exception:
@@ -361,7 +232,6 @@ def ingest_documents(folder_path: str) -> None:
361
  except Exception as e2:
362
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
363
 
364
- # BM25 updates
365
  tokens = _tokenize(chunk)
366
  tf: Dict[str, int] = {}
367
  for tkn in tokens:
@@ -375,15 +245,13 @@ def ingest_documents(folder_path: str) -> None:
375
  "length": len(tokens),
376
  "meta": meta,
377
  })
378
- seen_terms = set()
379
  for term in tf.keys():
380
  bm25_inverted.setdefault(term, []).append(idx)
381
- if term not in seen_terms:
382
  bm25_df[term] = bm25_df.get(term, 0) + 1
383
- seen_terms.add(term)
384
-
385
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
386
-
387
  N = len(bm25_docs)
388
  if N > 0:
389
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
@@ -402,7 +270,7 @@ def ingest_documents(folder_path: str) -> None:
402
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
403
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
404
 
405
- # ----------------------------- BM25 load -----------------------------
406
  def _load_bm25_index() -> None:
407
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
408
  if not os.path.exists(BM25_INDEX_FILE):
@@ -422,7 +290,7 @@ def _load_bm25_index() -> None:
422
 
423
  _load_bm25_index()
424
 
425
- # ----------------------------- BM25 search -----------------------------
426
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
427
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
428
  return 0.0
@@ -437,9 +305,10 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
437
  if tf == 0:
438
  continue
439
  N = len(bm25_docs)
 
440
  try:
441
  import math
442
- idf = math.log(((N - df + 0.5) / (df + 0.5)) + 1.0)
443
  except Exception:
444
  idf = 1.0
445
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
@@ -467,17 +336,18 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
467
  scored.sort(key=lambda x: x[1], reverse=True)
468
  return scored[:top_k]
469
 
470
- # ----------------------------- Semantic-only -----------------------------
471
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
472
  query_embedding = model.encode(query).tolist()
473
  res = collection.query(
474
  query_embeddings=[query_embedding],
475
  n_results=top_k,
476
- include=['documents', 'metadatas', 'distances']
477
  )
478
- documents = (res.get("documents", [[""]]) or [[""]])[0]
479
- metadatas = (res.get("metadatas", [[{}]]) or [[{}]])[0]
480
- distances = (res.get("distances", [[None]]) or [[None]])[0]
 
481
  ids: List[str] = []
482
  if documents:
483
  synthesized = []
@@ -495,15 +365,23 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
495
  "ids": ids,
496
  }
497
 
498
- # ----------------------------- Hybrid search -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
499
  def _detect_user_intent(query: str) -> str:
500
  q = (query or "").lower()
501
- if any(k in q for k in [
502
- "error", "issue", "fail", "not working", "resolution", "fix",
503
- "permission", "permissions", "access", "no access", "authorization", "authorisation",
504
- "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
505
- "escalation", "escalation path", "access right", "mismatch", "locked", "wrong"
506
- ]):
507
  return "errors"
508
  if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform"]):
509
  return "steps"
@@ -572,7 +450,7 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
572
  or ("permissions" in topic_list)
573
  ):
574
  return 1.10
575
- if user_intent == "steps" and any(k in st for k in ["process steps", "procedure", "instructions", "workflow", "creation", "updation", "deletion"]):
576
  return 0.75
577
  return -0.2
578
 
@@ -582,8 +460,7 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
582
  section_tokens = _tokenize_meta_value(meta.get("section"))
583
  topic_tokens = _tokenize_meta_value((meta.get("topic_tags") or ""))
584
  module_tokens = _tokenize_meta_value((meta.get("module_tags") or ""))
585
- action_token = _tokenize_meta_value((meta.get("action_tag") or ""))
586
- meta_tokens = set(fn_tokens + title_tokens + section_tokens + topic_tokens + module_tokens + action_token)
587
  if not meta_tokens or not q_terms:
588
  return 0.0
589
  qset = set(q_terms)
@@ -623,12 +500,17 @@ def _literal_query_match_boost(text: str, query_norm: str) -> float:
623
  return min(boost, 1.6)
624
 
625
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
 
 
 
 
626
  norm_query = _normalize_query(query)
627
  q_terms = _tokenize(norm_query)
628
  user_intent = _detect_user_intent(query)
629
  actions = _extract_actions(query)
630
  user_modules = _extract_modules_from_query(query)
631
 
 
632
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 40))
633
  sem_docs = sem_res.get("documents", [])
634
  sem_metas = sem_res.get("metadatas", [])
@@ -642,8 +524,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
642
  return 1.0 / (1.0 + float(d))
643
  except Exception:
644
  return 0.0
645
-
646
  sem_sims = [dist_to_sim(d) for d in sem_dists]
 
 
647
  bm25_hits = bm25_search(norm_query, top_k=max(80, top_k * 6))
648
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
649
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
@@ -654,23 +537,22 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
654
  bm25_id_to_text[d["id"]] = d["text"]
655
  bm25_id_to_meta[d["id"]] = d["meta"]
656
 
 
657
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
658
- gamma = 0.30
659
- delta = 0.55
660
- epsilon = 0.35
661
- zeta = 0.65
662
- eta = 0.50
663
- iota = 0.60
664
-
665
- def _action_meta_bonus(meta: Dict[str, Any], actions: List[str]) -> float:
666
- if not actions:
667
- return 0.0
668
- act = (meta or {}).get("action_tag", "") or ""
669
- return 0.6 if act and act in actions else 0.0
670
 
671
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
 
 
 
 
 
 
 
 
 
672
 
673
  for cid in union_ids:
 
674
  if cid in sem_ids:
675
  pos = sem_ids.index(cid)
676
  sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
@@ -690,7 +572,6 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
690
  m_overlap = _meta_overlap(meta, q_terms)
691
  intent_boost = _intent_weight(meta, user_intent)
692
  act_wt = _action_weight(text, actions)
693
- act_meta = _action_meta_bonus(meta, actions)
694
  mod_wt = _module_weight(meta, user_modules)
695
  phrase_wt = _phrase_boost_score(text, q_terms)
696
  literal_wt = _literal_query_match_boost(text, norm_query)
@@ -700,48 +581,67 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
700
  + beta * bm25_sim
701
  + gamma * m_overlap
702
  + delta * intent_boost
703
- + epsilon * (act_wt + act_meta)
704
  + zeta * mod_wt
705
  + eta * phrase_wt
 
706
  + iota * literal_wt
707
  )
708
  combined_records_ext.append(
709
  (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta,
710
- m_overlap, intent_boost, act_wt + act_meta, mod_wt, phrase_wt, literal_wt)
711
  )
712
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
  from collections import defaultdict as _dd
714
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]] = _dd(list)
715
  for rec in combined_records_ext:
716
  meta = rec[4] or {}
717
  fn = meta.get("filename", "unknown")
718
  doc_groups[fn].append(rec)
719
 
720
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]) -> float:
721
  total_score = sum(r[1] for r in recs)
722
  total_overlap = sum(r[5] for r in recs)
723
  total_intent = sum(max(0.0, r[6]) for r in recs)
724
  total_action = sum(max(0.0, r[7]) for r in recs)
725
  total_module = sum(r[8] for r in recs)
726
  total_phrase = sum(r[9] for r in recs)
727
- total_literal = sum(r[10] for r in recs)
 
728
  errors_section_bonus = 0.0
729
- if any(
730
- "error" in ((r[4] or {}).get("section", "")).lower()
731
- or "known issues" in ((r[4] or {}).get("section", "")).lower()
732
- or "common issues" in ((r[4] or {}).get("section", "")).lower()
733
- for r in recs
734
- ):
735
  errors_section_bonus = 0.5
736
  return (
737
  total_score
738
  + 0.4 * total_overlap
739
  + 0.7 * total_intent
740
- + 0.55 * total_action
741
  + 0.8 * total_module
742
  + 0.6 * total_phrase
743
  + 0.7 * total_literal
744
  + errors_section_bonus
 
745
  )
746
 
747
  best_doc, best_doc_prior = None, -1.0
@@ -751,7 +651,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
751
  best_doc_prior, best_doc = p, fn
752
 
753
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
754
- other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
755
  for fn, recs in doc_groups.items():
756
  if fn == best_doc:
757
  continue
@@ -778,7 +678,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
778
  "actions": actions,
779
  }
780
 
781
- # ----------------------------- Section fetch helpers -----------------------------
782
  def get_section_text(filename: str, section: str) -> str:
783
  texts: List[str] = []
784
  for d in bm25_docs:
@@ -814,6 +714,7 @@ def get_best_errors_section_text(filename: str) -> str:
814
  or "access" in sec
815
  or "known issues" in sec
816
  or "common issues" in sec
 
817
  or ("permissions" in topic_list)
818
  ):
819
  t = (d.get("text") or "").strip()
@@ -822,6 +723,10 @@ def get_best_errors_section_text(filename: str) -> str:
822
  return "\n\n".join(texts).strip()
823
 
824
  def get_escalation_text(filename: str) -> str:
 
 
 
 
825
  texts: List[str] = []
826
  for d in bm25_docs:
827
  m = d.get("meta", {})
@@ -833,6 +738,7 @@ def get_escalation_text(filename: str) -> str:
833
  texts.append(t)
834
  return "\n\n".join(texts).strip()
835
 
 
836
  def get_kb_runtime_info() -> Dict[str, Any]:
837
  return {
838
  "chroma_path": CHROMA_PATH,
 
1
 
2
+ # services/kb_creation.py
 
 
 
3
  import os
4
  import re
5
  import pickle
6
+ from typing import List, Dict, Any, Tuple, Optional
7
+ from docx import Document
8
+ from sentence_transformers import SentenceTransformer
9
+ import chromadb
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ # ------------------------------ ChromaDB setup ------------------------------
12
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
13
  client = chromadb.PersistentClient(path=CHROMA_PATH)
14
  collection = client.get_or_create_collection(name="knowledge_base")
15
+
16
+ # ------------------------------ Embedding model ------------------------------
17
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
18
 
19
+ # ------------------------------ BM25 (lightweight) ------------------------------
20
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
21
  bm25_docs: List[Dict[str, Any]] = []
22
  bm25_inverted: Dict[str, List[int]] = {}
 
26
  BM25_K1 = 1.5
27
  BM25_B = 0.75
28
 
29
+ # ------------------------------ Utilities ------------------------------
30
  def _tokenize(text: str) -> List[str]:
31
  if not text:
32
  return []
 
42
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
43
  return _tokenize(val or "")
44
 
45
+ # ------------------------------ DOCX parsing & chunking ------------------------------
46
+ BULLET_RE = re.compile(r"^\s*(?:[\-\*\u2022]|\d+[.)])\s+", re.IGNORECASE)
47
+
48
+ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
49
+ sections: List[Tuple[str, List[str]]] = []
50
+ current_title = None
51
+ current_paras: List[str] = []
52
+ for para in doc.paragraphs:
53
+ text = (para.text or "").strip()
54
+ style_name = (para.style.name if para.style else "") or ""
55
+ is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
56
+ if is_heading and text:
57
+ if current_title or current_paras:
58
+ sections.append((current_title or "Untitled Section", current_paras))
59
+ current_title = text
60
+ current_paras = []
61
+ else:
62
+ if text:
63
+ current_paras.append(text)
64
+ if current_title or current_paras:
65
+ sections.append((current_title or "Untitled Section", current_paras))
66
+ if not sections:
67
+ all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
68
+ sections = [("Document", all_text)]
69
+ return sections
70
 
71
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
72
+ """Preserve bullets/numbered list lines; split long paragraphs by sentence boundaries."""
73
  lines: List[str] = []
74
  for p in (paragraphs or []):
75
  p = (p or "").strip()
 
78
  if BULLET_RE.match(p):
79
  lines.append(p)
80
  continue
 
81
  parts = [s.strip() for s in re.split(r"(?<=[.!?])\s+", p) if s.strip()]
82
  lines.extend(parts)
83
  return lines
84
 
85
+ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
86
+ """Smaller chunks (~160 words), bullet-aware."""
87
+ lines = _paragraphs_to_lines(paragraphs)
88
  chunks: List[str] = []
89
  current: List[str] = []
90
  current_len = 0
91
  for ln in lines:
92
  w = ln.split()
93
+ if current_len + len(w) > max_words or (BULLET_RE.match(ln) and current):
 
94
  chunk = " ".join(current).strip()
95
  if chunk:
96
  chunks.append(chunk)
 
109
  chunks = [body]
110
  return chunks
111
 
112
+ # ------------------------------ Intent & Module tagging ------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
114
+ SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
 
 
 
115
  PERMISSION_TERMS = [
116
  "permission", "permissions", "access", "access right", "authorization", "authorisation",
117
  "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
118
  "not allowed", "not authorized", "denied", "restrict"
119
  ]
120
+ ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
121
+ STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
 
 
 
122
  MODULE_VOCAB = {
123
  "receiving": [
124
  "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
 
126
  ],
127
  "appointments": [
128
  "appointment", "appointments", "schedule", "scheduling", "slot", "dock door",
129
+ "appointment creation", "appointment details"
 
130
  ],
131
  "picking": ["pick", "picking", "pick release", "wave", "allocation"],
132
  "putaway": ["putaway", "staging", "put away", "location assignment"],
 
134
  "inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
135
  "replenishment": ["replenishment", "replenish"],
136
  }
137
+
138
+ def _infer_intent_tag(section_title: str) -> str:
139
+ st = (section_title or "").lower()
140
+ if any(k in st for k in SECTION_STEPS_HINTS):
141
+ return "steps"
142
+ if any(k in st for k in SECTION_ERRORS_HINTS):
143
+ return "errors"
144
+ if "pre" in st and "requisite" in st:
145
+ return "prereqs"
146
+ if any(k in st for k in ["purpose", "overview", "introduction"]):
147
+ return "purpose"
148
+ if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
149
+ return "steps"
150
+ if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
151
+ return "steps"
152
+ return "neutral"
153
 
154
  def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
155
  t = (text or "").lower()
 
157
  intent = "neutral"
158
  if any(term in t for term in PERMISSION_TERMS):
159
  intent = "errors"; tags.append("permissions")
160
+ if "role" in t: tags.append("role_access")
161
+ if "security" in t: tags.append("security")
162
+ if intent == "neutral" and any(term in t for term in ERROR_TERMS):
163
  intent = "errors"; tags.append("errors")
164
  if intent == "neutral" and any(v in t for v in STEP_VERBS):
165
  intent = "steps"; tags.append("procedure")
 
180
  found = ["appointments"]
181
  return list(sorted(set(found)))
182
 
183
+ # ------------------------------ Ingestion ------------------------------
184
  def ingest_documents(folder_path: str) -> None:
185
  print(f"[KB] Checking folder: {folder_path}")
186
+ files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
187
  print(f"[KB] Found {len(files)} Word files: {files}")
188
  if not files:
189
  print("[KB] WARNING: No .docx files found. Please check the folder path.")
 
197
  file_path = os.path.join(folder_path, file)
198
  doc_title = os.path.splitext(file)[0]
199
  doc = Document(file_path)
200
+ sections = _split_by_sections(doc)
 
 
 
201
  total_chunks = 0
202
+ for s_idx, (section_title, paras) in enumerate(sections):
203
+ chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=160)
 
204
  total_chunks += len(chunks)
205
+ base_intent = _infer_intent_tag(section_title)
 
 
206
  for c_idx, chunk in enumerate(chunks):
207
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
208
+ final_intent = base_intent
209
+ if derived_intent == "errors":
210
+ final_intent = "errors"
211
+ elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
212
+ final_intent = derived_intent
213
  module_tags = _derive_module_tags(chunk, file, section_title)
 
214
  embedding = model.encode(chunk).tolist()
215
  doc_id = f"{file}:{s_idx}:{c_idx}"
216
  meta = {
 
222
  "intent_tag": final_intent,
223
  "topic_tags": ", ".join(topic_tags) if topic_tags else "",
224
  "module_tags": ", ".join(module_tags) if module_tags else "",
 
225
  }
 
226
  try:
227
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
228
  except Exception:
 
232
  except Exception as e2:
233
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
234
 
 
235
  tokens = _tokenize(chunk)
236
  tf: Dict[str, int] = {}
237
  for tkn in tokens:
 
245
  "length": len(tokens),
246
  "meta": meta,
247
  })
248
+ seen = set()
249
  for term in tf.keys():
250
  bm25_inverted.setdefault(term, []).append(idx)
251
+ if term not in seen:
252
  bm25_df[term] = bm25_df.get(term, 0) + 1
253
+ seen.add(term)
 
254
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
 
255
  N = len(bm25_docs)
256
  if N > 0:
257
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
 
270
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
271
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
272
 
273
+ # ------------------------------ BM25 load ------------------------------
274
  def _load_bm25_index() -> None:
275
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
276
  if not os.path.exists(BM25_INDEX_FILE):
 
290
 
291
  _load_bm25_index()
292
 
293
+ # ------------------------------ BM25 search ------------------------------
294
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
295
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
296
  return 0.0
 
305
  if tf == 0:
306
  continue
307
  N = len(bm25_docs)
308
+ idf_ratio = ((N - df + 0.5) / (df + 0.5))
309
  try:
310
  import math
311
+ idf = math.log(idf_ratio + 1.0)
312
  except Exception:
313
  idf = 1.0
314
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
 
336
  scored.sort(key=lambda x: x[1], reverse=True)
337
  return scored[:top_k]
338
 
339
+ # ------------------------------ Semantic-only ------------------------------
340
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
341
  query_embedding = model.encode(query).tolist()
342
  res = collection.query(
343
  query_embeddings=[query_embedding],
344
  n_results=top_k,
345
+ include=['documents', 'metadatas', 'distances'] # no 'ids'
346
  )
347
+ documents = (res.get("documents", [[]]) or [[]])[0]
348
+ metadatas = (res.get("metadatas", [[]]) or [[]])[0]
349
+ distances = (res.get("distances", [[]]) or [[]])[0]
350
+ # Synthesize IDs from metadata (filename:section:chunk_index)
351
  ids: List[str] = []
352
  if documents:
353
  synthesized = []
 
365
  "ids": ids,
366
  }
367
 
368
+ # ------------------------------ Hybrid search (generic + intent-aware) ------------------------------
369
# Synonym lists used to detect the user's requested action from a query.
# Keys are canonical action names; values are surface forms matched as
# lowercase substrings (e.g. in _extract_actions).
ACTION_SYNONYMS = {
    "create": ["create", "creation", "add", "new", "generate"],
    "update": ["update", "modify", "change", "edit"],
    "delete": ["delete", "remove"],
    "navigate": ["navigate", "go to", "open"],
}
# Terms whose presence in a query signals an error/troubleshooting intent
# (failures, permissions/access problems, escalation) — consumed by
# _detect_user_intent to route the query to error-focused sections.
ERROR_INTENT_TERMS = [
    "error", "issue", "fail", "not working", "resolution", "fix",
    "permission", "permissions", "access", "no access", "authorization", "authorisation",
    "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
    "escalation", "escalation path", "access right", "mismatch", "locked", "wrong"
]
381
+
382
  def _detect_user_intent(query: str) -> str:
383
  q = (query or "").lower()
384
+ if any(k in q for k in ERROR_INTENT_TERMS):
 
 
 
 
 
385
  return "errors"
386
  if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform"]):
387
  return "steps"
 
450
  or ("permissions" in topic_list)
451
  ):
452
  return 1.10
453
+ if user_intent == "steps" and any(k in st for k in ["process steps", "procedure", "instructions", "workflow"]):
454
  return 0.75
455
  return -0.2
456
 
 
460
  section_tokens = _tokenize_meta_value(meta.get("section"))
461
  topic_tokens = _tokenize_meta_value((meta.get("topic_tags") or ""))
462
  module_tokens = _tokenize_meta_value((meta.get("module_tags") or ""))
463
+ meta_tokens = set(fn_tokens + title_tokens + section_tokens + topic_tokens + module_tokens)
 
464
  if not meta_tokens or not q_terms:
465
  return 0.0
466
  qset = set(q_terms)
 
500
  return min(boost, 1.6)
501
 
502
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
503
+ """
504
+ Hybrid retrieval (embeddings + BM25) with intent-, action-, module-, and phrase-aware reranking.
505
+ Returns top items plus doc-level prior and intent for downstream formatting.
506
+ """
507
  norm_query = _normalize_query(query)
508
  q_terms = _tokenize(norm_query)
509
  user_intent = _detect_user_intent(query)
510
  actions = _extract_actions(query)
511
  user_modules = _extract_modules_from_query(query)
512
 
513
+ # semantic (embeddings) search via Chroma
514
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 40))
515
  sem_docs = sem_res.get("documents", [])
516
  sem_metas = sem_res.get("metadatas", [])
 
524
  return 1.0 / (1.0 + float(d))
525
  except Exception:
526
  return 0.0
 
527
  sem_sims = [dist_to_sim(d) for d in sem_dists]
528
+
529
+ # BM25 search
530
  bm25_hits = bm25_search(norm_query, top_k=max(80, top_k * 6))
531
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
532
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
 
537
  bm25_id_to_text[d["id"]] = d["text"]
538
  bm25_id_to_meta[d["id"]] = d["meta"]
539
 
540
+ # union of candidate IDs (semantic + bm25)
541
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
 
 
 
 
 
 
 
 
 
 
 
 
542
 
543
+ # weights
544
+ gamma = 0.30 # meta overlap
545
+ delta = 0.55 # intent boost
546
+ epsilon = 0.30 # action weight
547
+ zeta = 0.65 # module weight
548
+ eta = 0.50 # phrase-level boost
549
+ theta = 0.00 # optional heading alignment bonus not used
550
+ iota = 0.60 # literal query match boost
551
+
552
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
553
 
554
  for cid in union_ids:
555
+ # pick semantic fields if present; fallback to bm25
556
  if cid in sem_ids:
557
  pos = sem_ids.index(cid)
558
  sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
 
572
  m_overlap = _meta_overlap(meta, q_terms)
573
  intent_boost = _intent_weight(meta, user_intent)
574
  act_wt = _action_weight(text, actions)
 
575
  mod_wt = _module_weight(meta, user_modules)
576
  phrase_wt = _phrase_boost_score(text, q_terms)
577
  literal_wt = _literal_query_match_boost(text, norm_query)
 
581
  + beta * bm25_sim
582
  + gamma * m_overlap
583
  + delta * intent_boost
584
+ + epsilon * act_wt
585
  + zeta * mod_wt
586
  + eta * phrase_wt
587
+ + theta * 0.0
588
  + iota * literal_wt
589
  )
590
  combined_records_ext.append(
591
  (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta,
592
+ m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, 0.0, literal_wt)
593
  )
594
 
595
+ # exact-match rerank for errors (push lines containing query phrases)
596
+ if user_intent == "errors":
597
+ exact_hits = []
598
+ toks = [tok for tok in norm_query.split() if len(tok) > 2]
599
+ bigrams = _make_ngrams(toks, 2)
600
+ for rec in combined_records_ext:
601
+ text_lower = (rec[3] or "").lower()
602
+ if norm_query and norm_query in text_lower:
603
+ exact_hits.append(rec)
604
+ continue
605
+ if any(bg in text_lower for bg in bigrams):
606
+ exact_hits.append(rec)
607
+ if exact_hits:
608
+ rest = [r for r in combined_records_ext if r not in exact_hits]
609
+ exact_hits.sort(key=lambda x: x[1], reverse=True)
610
+ rest.sort(key=lambda x: x[1], reverse=True)
611
+ combined_records_ext = exact_hits + rest
612
+
613
+ # doc-level prior: prefer docs with more aligned chunks
614
  from collections import defaultdict as _dd
615
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = _dd(list)
616
  for rec in combined_records_ext:
617
  meta = rec[4] or {}
618
  fn = meta.get("filename", "unknown")
619
  doc_groups[fn].append(rec)
620
 
621
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
622
  total_score = sum(r[1] for r in recs)
623
  total_overlap = sum(r[5] for r in recs)
624
  total_intent = sum(max(0.0, r[6]) for r in recs)
625
  total_action = sum(max(0.0, r[7]) for r in recs)
626
  total_module = sum(r[8] for r in recs)
627
  total_phrase = sum(r[9] for r in recs)
628
+ total_literal = sum(r[11] for r in recs)
629
+ total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
630
  errors_section_bonus = 0.0
631
+ if any("error" in ((r[4] or {}).get("section", "")).lower() or
632
+ "known issues" in ((r[4] or {}).get("section", "")).lower() or
633
+ "common issues" in ((r[4] or {}).get("section", "")).lower() for r in recs):
 
 
 
634
  errors_section_bonus = 0.5
635
  return (
636
  total_score
637
  + 0.4 * total_overlap
638
  + 0.7 * total_intent
639
+ + 0.5 * total_action
640
  + 0.8 * total_module
641
  + 0.6 * total_phrase
642
  + 0.7 * total_literal
643
  + errors_section_bonus
644
+ + 0.3 * total_penalty
645
  )
646
 
647
  best_doc, best_doc_prior = None, -1.0
 
651
  best_doc_prior, best_doc = p, fn
652
 
653
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
654
+ other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
655
  for fn, recs in doc_groups.items():
656
  if fn == best_doc:
657
  continue
 
678
  "actions": actions,
679
  }
680
 
681
+ # ------------------------------ Section fetch helpers ------------------------------
682
  def get_section_text(filename: str, section: str) -> str:
683
  texts: List[str] = []
684
  for d in bm25_docs:
 
714
  or "access" in sec
715
  or "known issues" in sec
716
  or "common issues" in sec
717
+ or "errors" in sec
718
  or ("permissions" in topic_list)
719
  ):
720
  t = (d.get("text") or "").strip()
 
723
  return "\n\n".join(texts).strip()
724
 
725
  def get_escalation_text(filename: str) -> str:
726
+ """
727
+ Return concatenated text from any 'Escalation' section in the given SOP file.
728
+ Works across future SOPs—only relies on the heading name containing 'escalation'.
729
+ """
730
  texts: List[str] = []
731
  for d in bm25_docs:
732
  m = d.get("meta", {})
 
738
  texts.append(t)
739
  return "\n\n".join(texts).strip()
740
 
741
+ # ------------------------------ Admin helpers ------------------------------
742
  def get_kb_runtime_info() -> Dict[str, Any]:
743
  return {
744
  "chroma_path": CHROMA_PATH,