srilakshu012456 committed on
Commit
f390ba5
·
verified ·
1 Parent(s): a2205f6

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +148 -161
services/kb_creation.py CHANGED
@@ -1,22 +1,37 @@
1
 
2
- # services/kb_creation.py
 
 
 
 
3
  import os
4
  import re
5
  import pickle
6
- from typing import List, Dict, Any, Tuple, Optional
7
- from docx import Document
8
- from sentence_transformers import SentenceTransformer
9
- import chromadb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
- # ------------------------------ ChromaDB setup ------------------------------
12
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
13
  client = chromadb.PersistentClient(path=CHROMA_PATH)
14
  collection = client.get_or_create_collection(name="knowledge_base")
15
 
16
- # ------------------------------ Embedding model ------------------------------
17
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
18
 
19
- # ------------------------------ BM25 (lightweight) ------------------------------
20
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
21
  bm25_docs: List[Dict[str, Any]] = []
22
  bm25_inverted: Dict[str, List[int]] = {}
@@ -26,21 +41,7 @@ bm25_ready: bool = False
26
  BM25_K1 = 1.5
27
  BM25_B = 0.75
28
 
29
- # ------------------------------ Utilities ------------------------------
30
-
31
- # --- Action detection helper (generic; reuses ACTION_SYNONYMS) ---
32
def _line_action_tag(text: str) -> Optional[str]:
    """
    Return 'create'|'update'|'delete'|'navigate' if the line contains any action synonym,
    else None. Used to split chunks by action so creation/update/delete material
    does not bleed into one another within a single chunk.
    """
    lowered = (text or "").lower()
    # First action (in ACTION_SYNONYMS insertion order) whose synonym occurs in the line.
    hits = (
        action
        for action, synonyms in ACTION_SYNONYMS.items()
        if any(term in lowered for term in synonyms)
    )
    return next(hits, None)
43
-
44
  def _tokenize(text: str) -> List[str]:
45
  if not text:
46
  return []
@@ -56,34 +57,63 @@ def _normalize_query(q: str) -> str:
56
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
57
  return _tokenize(val or "")
58
 
59
- # ------------------------------ DOCX parsing & chunking ------------------------------
60
- BULLET_RE = re.compile(r"^\s*(?:[\-\*\u2022]|\d+[.)])\s+", re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
    """Group a docx document's paragraphs under their nearest 'Heading N'-styled title.

    Returns a list of (section_title, paragraph_texts) pairs. Paragraphs that
    appear before the first heading go into an "Untitled Section"; if the
    document has no headings at all, a single ("Document", all_text) section
    is returned.
    """
    sections: List[Tuple[str, List[str]]] = []
    title = None
    body: List[str] = []
    heading_re = re.compile(r"Heading\s*\d+", flags=re.IGNORECASE)

    for para in doc.paragraphs:
        text = (para.text or "").strip()
        style_name = (para.style.name if para.style else "") or ""
        if text and heading_re.match(style_name):
            # Close out the running section before starting a new one.
            if title or body:
                sections.append((title or "Untitled Section", body))
            title = text
            body = []
        elif text:
            body.append(text)

    if title or body:
        sections.append((title or "Untitled Section", body))

    # No headings at all: fall back to one section holding every paragraph.
    if not sections:
        everything = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
        sections = [("Document", everything)]
    return sections
84
 
85
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
86
- """Preserve bullets/numbered list lines; split long paragraphs by sentence boundaries."""
87
  lines: List[str] = []
88
  for p in (paragraphs or []):
89
  p = (p or "").strip()
@@ -97,74 +127,49 @@ def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
97
  return lines
98
 
99
def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
    """
    Build ~max_words chunks that are bullet-aware and action-aware.

    A new chunk starts when:
      - adding the next line would exceed max_words, or
      - the next line belongs to a different action topic (create/update/delete/navigate), or
      - a new bullet/numbered line begins while the current chunk is non-empty.
    This keeps 'create' material from bleeding into 'update'/'delete' chunks.
    doc_title/section_title are accepted for interface compatibility.
    """
    source = _paragraphs_to_lines(paragraphs)
    chunks: List[str] = []
    pending: List[str] = []
    pending_words = 0
    pending_action: Optional[str] = None  # dominant action of the pending chunk

    def _flush() -> None:
        # Emit the pending lines as one chunk, skipping empty joins.
        text = " ".join(pending).strip()
        if text:
            chunks.append(text)

    for line in source:
        words = line.split()
        line_action = _line_action_tag(line)
        crosses_action = (
            pending_action is not None
            and line_action is not None
            and line_action != pending_action
        )
        too_big = pending_words + len(words) > max_words
        bullet_break = bool(BULLET_RE.match(line)) and bool(pending)

        if too_big or crosses_action or bullet_break:
            _flush()
            pending = [line]
            pending_words = len(words)
            pending_action = line_action or None
        else:
            pending.append(line)
            pending_words += len(words)
            # Latch the first detected action for the chunk.
            if pending_action is None and line_action is not None:
                pending_action = line_action

    if pending:
        _flush()

    # Fallback: if nothing formed, collapse all lines into one chunk.
    if not chunks:
        merged = " ".join(source).strip()
        if merged:
            chunks = [merged]
    return chunks
158
- # ------------------------------ Intent & Module tagging ------------------------------
 
159
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
160
- SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
 
 
 
 
161
  PERMISSION_TERMS = [
162
  "permission", "permissions", "access", "access right", "authorization", "authorisation",
163
  "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
164
  "not allowed", "not authorized", "denied", "restrict"
165
  ]
166
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
167
- STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
 
 
 
 
168
  MODULE_VOCAB = {
169
  "receiving": [
170
  "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
@@ -172,7 +177,8 @@ MODULE_VOCAB = {
172
  ],
173
  "appointments": [
174
  "appointment", "appointments", "schedule", "scheduling", "slot", "dock door",
175
- "appointment creation", "appointment details"
 
176
  ],
177
  "picking": ["pick", "picking", "pick release", "wave", "allocation"],
178
  "putaway": ["putaway", "staging", "put away", "location assignment"],
@@ -181,6 +187,19 @@ MODULE_VOCAB = {
181
  "replenishment": ["replenishment", "replenish"],
182
  }
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  def _infer_intent_tag(section_title: str) -> str:
185
  st = (section_title or "").lower()
186
  if any(k in st for k in SECTION_STEPS_HINTS):
@@ -191,9 +210,7 @@ def _infer_intent_tag(section_title: str) -> str:
191
  return "prereqs"
192
  if any(k in st for k in ["purpose", "overview", "introduction"]):
193
  return "purpose"
194
- if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
195
- return "steps"
196
- if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
197
  return "steps"
198
  return "neutral"
199
 
@@ -203,8 +220,8 @@ def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
203
  intent = "neutral"
204
  if any(term in t for term in PERMISSION_TERMS):
205
  intent = "errors"; tags.append("permissions")
206
- if "role" in t: tags.append("role_access")
207
- if "security" in t: tags.append("security")
208
  if intent == "neutral" and any(term in t for term in ERROR_TERMS):
209
  intent = "errors"; tags.append("errors")
210
  if intent == "neutral" and any(v in t for v in STEP_VERBS):
@@ -226,7 +243,7 @@ def _derive_module_tags(text: str, filename: str, section_title: str) -> List[st
226
  found = ["appointments"]
227
  return list(sorted(set(found)))
228
 
229
- # ------------------------------ Ingestion ------------------------------
230
  def ingest_documents(folder_path: str) -> None:
231
  print(f"[KB] Checking folder: {folder_path}")
232
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
@@ -242,13 +259,16 @@ def ingest_documents(folder_path: str) -> None:
242
  for file in files:
243
  file_path = os.path.join(folder_path, file)
244
  doc_title = os.path.splitext(file)[0]
245
- doc = Document(file_path)
246
  sections = _split_by_sections(doc)
247
  total_chunks = 0
 
248
  for s_idx, (section_title, paras) in enumerate(sections):
249
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=160)
250
  total_chunks += len(chunks)
251
  base_intent = _infer_intent_tag(section_title)
 
 
252
  for c_idx, chunk in enumerate(chunks):
253
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
254
  final_intent = base_intent
@@ -256,12 +276,8 @@ def ingest_documents(folder_path: str) -> None:
256
  final_intent = "errors"
257
  elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
258
  final_intent = derived_intent
 
259
  module_tags = _derive_module_tags(chunk, file, section_title)
260
- # Fallback: appointment chunks marked as steps when neutral (existing patch)
261
- if final_intent == "neutral" and ("appointments" in module_tags):
262
- final_intent = "steps"
263
- # >>> NEW: annotate chunk with action tags (create/update/delete/navigate)
264
- actions_here = _extract_actions(chunk) # reuse ACTION_SYNONYMS
265
  embedding = model.encode(chunk).tolist()
266
  doc_id = f"{file}:{s_idx}:{c_idx}"
267
  meta = {
@@ -273,7 +289,7 @@ def ingest_documents(folder_path: str) -> None:
273
  "intent_tag": final_intent,
274
  "topic_tags": ", ".join(topic_tags) if topic_tags else "",
275
  "module_tags": ", ".join(module_tags) if module_tags else "",
276
- "action_tags": ", ".join(actions_here) if actions_here else "",
277
  }
278
  try:
279
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -303,11 +319,14 @@ def ingest_documents(folder_path: str) -> None:
303
  if term not in seen:
304
  bm25_df[term] = bm25_df.get(term, 0) + 1
305
  seen.add(term)
 
306
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
 
307
  N = len(bm25_docs)
308
  if N > 0:
309
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
310
  bm25_ready = True
 
311
  payload = {
312
  "bm25_docs": bm25_docs,
313
  "bm25_inverted": bm25_inverted,
@@ -322,7 +341,7 @@ def ingest_documents(folder_path: str) -> None:
322
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
323
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
324
 
325
- # ------------------------------ BM25 load ------------------------------
326
  def _load_bm25_index() -> None:
327
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
328
  if not os.path.exists(BM25_INDEX_FILE):
@@ -342,7 +361,7 @@ def _load_bm25_index() -> None:
342
 
343
  _load_bm25_index()
344
 
345
- # ------------------------------ BM25 search ------------------------------
346
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
347
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
348
  return 0.0
@@ -388,18 +407,18 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
388
  scored.sort(key=lambda x: x[1], reverse=True)
389
  return scored[:top_k]
390
 
391
- # ------------------------------ Semantic-only ------------------------------
392
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
393
  query_embedding = model.encode(query).tolist()
394
  res = collection.query(
395
  query_embeddings=[query_embedding],
396
  n_results=top_k,
397
- include=['documents', 'metadatas', 'distances'] # no 'ids'
398
  )
399
- documents = (res.get("documents", [[]]) or [[]])[0]
400
- metadatas = (res.get("metadatas", [[]]) or [[]])[0]
401
- distances = (res.get("distances", [[]]) or [[]])[0]
402
- # Synthesize IDs from metadata (filename:section:chunk_index)
403
  ids: List[str] = []
404
  if documents:
405
  synthesized = []
@@ -417,13 +436,14 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
417
  "ids": ids,
418
  }
419
 
420
- # ------------------------------ Hybrid search (generic + intent-aware) ------------------------------
421
  ACTION_SYNONYMS = {
422
- "create": ["create", "creation", "add", "new", "generate"],
423
- "update": ["update", "modify", "change", "edit"],
424
- "delete": ["delete", "remove"],
425
  "navigate": ["navigate", "go to", "open"],
426
  }
 
427
  ERROR_INTENT_TERMS = [
428
  "error", "issue", "fail", "not working", "resolution", "fix",
429
  "permission", "permissions", "access", "no access", "authorization", "authorisation",
@@ -498,11 +518,11 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
498
  topics = (meta or {}).get("topic_tags", "") or ""
499
  topic_list = [t.strip() for t in topics.split(",") if t.strip()]
500
  if user_intent == "errors" and (
501
- any(k in st for k in ["common errors", "known issues", "common issues", "errors", "escalation", "permissions", "access"])
502
- or ("permissions" in topic_list)
503
  ):
504
  return 1.10
505
- if user_intent == "steps" and any(k in st for k in ["process steps", "procedure", "instructions", "workflow"]):
506
  return 0.75
507
  return -0.2
508
 
@@ -512,7 +532,8 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
512
  section_tokens = _tokenize_meta_value(meta.get("section"))
513
  topic_tokens = _tokenize_meta_value((meta.get("topic_tags") or ""))
514
  module_tokens = _tokenize_meta_value((meta.get("module_tags") or ""))
515
- meta_tokens = set(fn_tokens + title_tokens + section_tokens + topic_tokens + module_tokens)
 
516
  if not meta_tokens or not q_terms:
517
  return 0.0
518
  qset = set(q_terms)
@@ -552,17 +573,12 @@ def _literal_query_match_boost(text: str, query_norm: str) -> float:
552
  return min(boost, 1.6)
553
 
554
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
555
- """
556
- Hybrid retrieval (embeddings + BM25) with intent-, action-, module-, and phrase-aware reranking.
557
- Returns top items plus doc-level prior and intent for downstream formatting.
558
- """
559
  norm_query = _normalize_query(query)
560
  q_terms = _tokenize(norm_query)
561
  user_intent = _detect_user_intent(query)
562
  actions = _extract_actions(query)
563
  user_modules = _extract_modules_from_query(query)
564
 
565
- # semantic (embeddings) search via Chroma
566
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 40))
567
  sem_docs = sem_res.get("documents", [])
568
  sem_metas = sem_res.get("metadatas", [])
@@ -576,9 +592,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
576
  return 1.0 / (1.0 + float(d))
577
  except Exception:
578
  return 0.0
 
579
  sem_sims = [dist_to_sim(d) for d in sem_dists]
580
 
581
- # BM25 search
582
  bm25_hits = bm25_search(norm_query, top_k=max(80, top_k * 6))
583
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
584
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
@@ -589,22 +605,23 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
589
  bm25_id_to_text[d["id"]] = d["text"]
590
  bm25_id_to_meta[d["id"]] = d["meta"]
591
 
592
- # union of candidate IDs (semantic + bm25)
593
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
594
 
595
- # weights
596
- gamma = 0.30 # meta overlap
597
- delta = 0.55 # intent boost
598
- epsilon = 0.30 # action weight
599
- zeta = 0.65 # module weight
600
- eta = 0.50 # phrase-level boost
601
- theta = 0.00 # optional heading alignment bonus not used
602
- iota = 0.60 # literal query match boost
603
 
604
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
 
 
 
 
605
 
 
606
  for cid in union_ids:
607
- # pick semantic fields if present; fallback to bm25
608
  if cid in sem_ids:
609
  pos = sem_ids.index(cid)
610
  sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
@@ -613,17 +630,16 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
613
  sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
614
  else:
615
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
616
-
617
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
618
  bm25_text = bm25_id_to_text.get(cid, "")
619
  bm25_meta = bm25_id_to_meta.get(cid, {})
620
-
621
  text = sem_text if sem_text else bm25_text
622
  meta = sem_meta if sem_meta else bm25_meta
623
 
624
  m_overlap = _meta_overlap(meta, q_terms)
625
  intent_boost = _intent_weight(meta, user_intent)
626
  act_wt = _action_weight(text, actions)
 
627
  mod_wt = _module_weight(meta, user_modules)
628
  phrase_wt = _phrase_boost_score(text, q_terms)
629
  literal_wt = _literal_query_match_boost(text, norm_query)
@@ -633,52 +649,30 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
633
  + beta * bm25_sim
634
  + gamma * m_overlap
635
  + delta * intent_boost
636
- + epsilon * act_wt
637
  + zeta * mod_wt
638
  + eta * phrase_wt
639
- + theta * 0.0
640
  + iota * literal_wt
641
  )
642
  combined_records_ext.append(
643
- (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta,
644
- m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, 0.0, literal_wt)
645
  )
646
 
647
- # exact-match rerank for errors (push lines containing query phrases)
648
- if user_intent == "errors":
649
- exact_hits = []
650
- toks = [tok for tok in norm_query.split() if len(tok) > 2]
651
- bigrams = _make_ngrams(toks, 2)
652
- for rec in combined_records_ext:
653
- text_lower = (rec[3] or "").lower()
654
- if norm_query and norm_query in text_lower:
655
- exact_hits.append(rec)
656
- continue
657
- if any(bg in text_lower for bg in bigrams):
658
- exact_hits.append(rec)
659
- if exact_hits:
660
- rest = [r for r in combined_records_ext if r not in exact_hits]
661
- exact_hits.sort(key=lambda x: x[1], reverse=True)
662
- rest.sort(key=lambda x: x[1], reverse=True)
663
- combined_records_ext = exact_hits + rest
664
-
665
- # doc-level prior: prefer docs with more aligned chunks
666
  from collections import defaultdict as _dd
667
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = _dd(list)
668
  for rec in combined_records_ext:
669
  meta = rec[4] or {}
670
  fn = meta.get("filename", "unknown")
671
  doc_groups[fn].append(rec)
672
 
673
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
674
  total_score = sum(r[1] for r in recs)
675
  total_overlap = sum(r[5] for r in recs)
676
  total_intent = sum(max(0.0, r[6]) for r in recs)
677
  total_action = sum(max(0.0, r[7]) for r in recs)
678
  total_module = sum(r[8] for r in recs)
679
  total_phrase = sum(r[9] for r in recs)
680
- total_literal = sum(r[11] for r in recs)
681
- total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
682
  errors_section_bonus = 0.0
683
  if any("error" in ((r[4] or {}).get("section", "")).lower() or
684
  "known issues" in ((r[4] or {}).get("section", "")).lower() or
@@ -688,12 +682,11 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
688
  total_score
689
  + 0.4 * total_overlap
690
  + 0.7 * total_intent
691
- + 0.5 * total_action
692
  + 0.8 * total_module
693
  + 0.6 * total_phrase
694
  + 0.7 * total_literal
695
  + errors_section_bonus
696
- + 0.3 * total_penalty
697
  )
698
 
699
  best_doc, best_doc_prior = None, -1.0
@@ -703,7 +696,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
703
  best_doc_prior, best_doc = p, fn
704
 
705
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
706
- other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
707
  for fn, recs in doc_groups.items():
708
  if fn == best_doc:
709
  continue
@@ -730,7 +723,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
730
  "actions": actions,
731
  }
732
 
733
- # ------------------------------ Section fetch helpers ------------------------------
734
  def get_section_text(filename: str, section: str) -> str:
735
  texts: List[str] = []
736
  for d in bm25_docs:
@@ -766,7 +759,6 @@ def get_best_errors_section_text(filename: str) -> str:
766
  or "access" in sec
767
  or "known issues" in sec
768
  or "common issues" in sec
769
- or "errors" in sec
770
  or ("permissions" in topic_list)
771
  ):
772
  t = (d.get("text") or "").strip()
@@ -775,10 +767,6 @@ def get_best_errors_section_text(filename: str) -> str:
775
  return "\n\n".join(texts).strip()
776
 
777
  def get_escalation_text(filename: str) -> str:
778
- """
779
- Return concatenated text from any 'Escalation' section in the given SOP file.
780
- Works across future SOPs—only relies on the heading name containing 'escalation'.
781
- """
782
  texts: List[str] = []
783
  for d in bm25_docs:
784
  m = d.get("meta", {})
@@ -790,7 +778,6 @@ def get_escalation_text(filename: str) -> str:
790
  texts.append(t)
791
  return "\n\n".join(texts).strip()
792
 
793
- # ------------------------------ Admin helpers ------------------------------
794
  def get_kb_runtime_info() -> Dict[str, Any]:
795
  return {
796
  "chroma_path": CHROMA_PATH,
 
1
 
2
# services/kb_creation.py (DROP-IN REPLACEMENT)
# Heading-agnostic sectioning, robust intent & module/action tagging,
# and hybrid retrieval tuned for SOPs with variant headings (e.g.,
# "Appointment schedule updation", "Appointment deletion").

import os
import re
import pickle
# Fix: `Any` was imported twice in the original line.
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

# During type checking (Pylance/mypy) we import a real type.
# At runtime (or if docx is missing), we fall back to Any to avoid import issues.
if TYPE_CHECKING:
    from docx import Document as DocxDocument
else:
    DocxDocument = Any

# Runtime import used for actual Document parsing (no type hints on this name).
try:
    from docx import Document  # noqa: F401
except Exception:
    Document = None  # type: ignore

# These external libs are assumed to be available in your runtime.
import chromadb  # type: ignore
from sentence_transformers import SentenceTransformer  # type: ignore
28
 
 
29
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
30
  client = chromadb.PersistentClient(path=CHROMA_PATH)
31
  collection = client.get_or_create_collection(name="knowledge_base")
32
 
 
33
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
34
 
 
35
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
36
  bm25_docs: List[Dict[str, Any]] = []
37
  bm25_inverted: Dict[str, List[int]] = {}
 
41
  BM25_K1 = 1.5
42
  BM25_B = 0.75
43
 
44
+ # --------------------------- token utilities ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  def _tokenize(text: str) -> List[str]:
46
  if not text:
47
  return []
 
57
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
58
  return _tokenize(val or "")
59
 
60
+ # --------------------------- DOCX parsing & chunking ---------------------------
61
+ BULLET_RE = re.compile(r"^\s*(?:[\-\*\u2022]|\d+[\.\)])\s+", re.IGNORECASE)
62
+
63
+ SECTION_KEYWORDS = (
64
+ "overview", "introduction", "purpose",
65
+ "process", "procedure", "steps", "workflow",
66
+ "creation", "create", "updation", "update", "edit", "change", "reschedule",
67
+ "delete", "deletion", "cancel", "remove",
68
+ "prereq", "pre-requisite", "prerequisite", "requirements",
69
+ "errors", "known issues", "common issues", "troubleshooting", "escalation", "permissions", "access"
70
+ )
71
+
72
+ HEADING_PATTERNS = [
73
+ re.compile(r"^\s*#{1,6}\s+.+$"), # markdown-like
74
+ re.compile(r"^\s*[A-Z][A-Za-z0-9\s/&\-]+$"), # Title-cased single-line
75
+ re.compile(r"^\s*.+:\s*$"), # ends with colon
76
+ ]
77
 
78
+ def _is_heading_text(line: str) -> bool:
79
+ s = (line or "").strip()
80
+ if not s:
81
+ return False
82
+ if (len(s) <= 140) and (not re.search(r"[.!?]$", s)):
83
+ low = s.lower()
84
+ if any(k in low for k in SECTION_KEYWORDS):
85
+ return True
86
+ return any(p.match(s) for p in HEADING_PATTERNS)
87
+
88
def _split_by_sections(doc: DocxDocument) -> List[Tuple[str, List[str]]]:
    """Split a docx document into (section_title, paragraph_texts) pairs.

    A paragraph counts as a heading when its Word style is "Heading N" OR its
    text passes the _is_heading_text heuristic (handles SOPs whose authors
    never used heading styles). Falls back to a single ("Document", all_text)
    section when nothing heading-like is found.
    """
    sections: List[Tuple[str, List[str]]] = []
    title: Optional[str] = None
    body: List[str] = []

    paragraphs = doc.paragraphs if hasattr(doc, "paragraphs") else []
    for para in paragraphs:
        text = (para.text or "").strip()
        style = getattr(para, "style", None)
        style_name = (style.name if style else "") or ""
        heading_by_style = re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE) is not None

        if text and (heading_by_style or _is_heading_text(text)):
            if title or body:
                sections.append((title or "Untitled Section", body))
            # Trailing colon in a textual heading is presentation, not content.
            title = text.rstrip(":")
            body = []
        elif text:
            body.append(text)

    if title or body:
        sections.append((title or "Untitled Section", body))

    if not sections:
        every_line = [p.text.strip() for p in getattr(doc, "paragraphs", []) if p.text and p.text.strip()]
        sections = [("Document", every_line)]
    return sections
115
 
116
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
 
117
  lines: List[str] = []
118
  for p in (paragraphs or []):
119
  p = (p or "").strip()
 
127
  return lines
128
 
129
def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
    """Group the section's lines into chunks of roughly ``max_words`` words.

    A new chunk starts when adding the next line would exceed the word budget,
    or when a bullet/numbered line begins while the current chunk is non-empty
    (keeps individual bullets compact). ``doc_title``/``section_title`` are
    accepted for interface compatibility but are not embedded in the chunks.
    """
    source_lines = _paragraphs_to_lines(paragraphs)
    chunks: List[str] = []
    pending: List[str] = []
    pending_words = 0

    for line in source_lines:
        word_count = len(line.split())
        starts_new_bullet = bool(pending) and BULLET_RE.match(line) is not None
        if pending_words + word_count > max_words or starts_new_bullet:
            flushed = " ".join(pending).strip()
            if flushed:
                chunks.append(flushed)
            pending = [line]
            pending_words = word_count
        else:
            pending.append(line)
            pending_words += word_count

    # Flush whatever is left after the loop.
    if pending:
        flushed = " ".join(pending).strip()
        if flushed:
            chunks.append(flushed)

    # Fallback: collapse everything into one chunk if nothing formed.
    if not chunks:
        collapsed = " ".join(source_lines).strip()
        if collapsed:
            chunks = [collapsed]
    return chunks
154
+
155
+ # --------------------------- Intent & Module tagging ---------------------------
156
  SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
157
+ SECTION_ERRORS_HINTS = [
158
+ "common errors", "resolution", "troubleshooting", "known issues",
159
+ "common issues", "escalation", "permissions", "access"
160
+ ]
161
+
162
  PERMISSION_TERMS = [
163
  "permission", "permissions", "access", "access right", "authorization", "authorisation",
164
  "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
165
  "not allowed", "not authorized", "denied", "restrict"
166
  ]
167
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
168
+ STEP_VERBS = [
169
+ "navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open",
170
+ "choose", "enter", "update", "save", "delete", "create", "attach", "assign", "reschedule", "edit", "change", "cancel", "remove"
171
+ ]
172
+
173
  MODULE_VOCAB = {
174
  "receiving": [
175
  "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
 
177
  ],
178
  "appointments": [
179
  "appointment", "appointments", "schedule", "scheduling", "slot", "dock door",
180
+ "appointment creation", "appointment details", "appointment schedule", "reschedule",
181
+ "updation", "update appointment", "cancel appointment", "delete appointment"
182
  ],
183
  "picking": ["pick", "picking", "pick release", "wave", "allocation"],
184
  "putaway": ["putaway", "staging", "put away", "location assignment"],
 
187
  "replenishment": ["replenishment", "replenish"],
188
  }
189
 
190
+ ACTION_TERMS = {
191
+ "create": ["create", "creation", "add", "new", "generate", "tag"],
192
+ "update": ["update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"],
193
+ "delete": ["delete", "remove", "cancel", "deletion", "unassign"],
194
+ }
195
+
196
+ def _action_from_text(text: str) -> Optional[str]:
197
+ low = (text or "").lower()
198
+ for act, syns in ACTION_TERMS.items():
199
+ if any(s in low for s in syns):
200
+ return act
201
+ return None
202
+
203
  def _infer_intent_tag(section_title: str) -> str:
204
  st = (section_title or "").lower()
205
  if any(k in st for k in SECTION_STEPS_HINTS):
 
210
  return "prereqs"
211
  if any(k in st for k in ["purpose", "overview", "introduction"]):
212
  return "purpose"
213
+ if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn", "appointment", "schedule", "scheduling"]):
 
 
214
  return "steps"
215
  return "neutral"
216
 
 
220
  intent = "neutral"
221
  if any(term in t for term in PERMISSION_TERMS):
222
  intent = "errors"; tags.append("permissions")
223
+ if "role" in t: tags.append("role_access")
224
+ if "security" in t: tags.append("security")
225
  if intent == "neutral" and any(term in t for term in ERROR_TERMS):
226
  intent = "errors"; tags.append("errors")
227
  if intent == "neutral" and any(v in t for v in STEP_VERBS):
 
243
  found = ["appointments"]
244
  return list(sorted(set(found)))
245
 
246
+ # --------------------------- Ingestion ---------------------------
247
  def ingest_documents(folder_path: str) -> None:
248
  print(f"[KB] Checking folder: {folder_path}")
249
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
 
259
  for file in files:
260
  file_path = os.path.join(folder_path, file)
261
  doc_title = os.path.splitext(file)[0]
262
+ doc = Document(file_path) # runtime use; type hints use DocxDocument alias
263
  sections = _split_by_sections(doc)
264
  total_chunks = 0
265
+
266
  for s_idx, (section_title, paras) in enumerate(sections):
267
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=160)
268
  total_chunks += len(chunks)
269
  base_intent = _infer_intent_tag(section_title)
270
+ section_action = _action_from_text(section_title)
271
+
272
  for c_idx, chunk in enumerate(chunks):
273
  derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
274
  final_intent = base_intent
 
276
  final_intent = "errors"
277
  elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
278
  final_intent = derived_intent
279
+
280
  module_tags = _derive_module_tags(chunk, file, section_title)
 
 
 
 
 
281
  embedding = model.encode(chunk).tolist()
282
  doc_id = f"{file}:{s_idx}:{c_idx}"
283
  meta = {
 
289
  "intent_tag": final_intent,
290
  "topic_tags": ", ".join(topic_tags) if topic_tags else "",
291
  "module_tags": ", ".join(module_tags) if module_tags else "",
292
+ "action_tag": section_action or _action_from_text(chunk) or "",
293
  }
294
  try:
295
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
319
  if term not in seen:
320
  bm25_df[term] = bm25_df.get(term, 0) + 1
321
  seen.add(term)
322
+
323
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
324
+
325
  N = len(bm25_docs)
326
  if N > 0:
327
  bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
328
  bm25_ready = True
329
+
330
  payload = {
331
  "bm25_docs": bm25_docs,
332
  "bm25_inverted": bm25_inverted,
 
341
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
342
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
343
 
344
+ # --------------------------- BM25 load ---------------------------
345
  def _load_bm25_index() -> None:
346
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
347
  if not os.path.exists(BM25_INDEX_FILE):
 
361
 
362
  _load_bm25_index()
363
 
364
+ # --------------------------- BM25 search ---------------------------
365
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
366
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
367
  return 0.0
 
407
  scored.sort(key=lambda x: x[1], reverse=True)
408
  return scored[:top_k]
409
 
410
+ # --------------------------- Semantic-only ---------------------------
411
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
412
  query_embedding = model.encode(query).tolist()
413
  res = collection.query(
414
  query_embeddings=[query_embedding],
415
  n_results=top_k,
416
+ include=['documents', 'metadatas', 'distances']
417
  )
418
+ documents = (res.get("documents", [[""]]) or [[""]])[0]
419
+ metadatas = (res.get("metadatas", [[{}]]) or [[{}]])[0]
420
+ distances = (res.get("distances", [[None]]) or [[None]])[0]
421
+
422
  ids: List[str] = []
423
  if documents:
424
  synthesized = []
 
436
  "ids": ids,
437
  }
438
 
439
+ # --------------------------- Hybrid search (generic + intent-aware) -----------
440
# Synonym lists used to tag query/chunk text with a CRUD-style action.
# Matching helpers scan these values with substring checks, so keep entries
# lowercase. NOTE(review): helpers that return the *first* matching action
# iterate this dict in insertion order — do not reorder keys casually.
ACTION_SYNONYMS = {
    "create": ["create", "creation", "add", "new", "generate", "tag"],
    "update": ["update", "modify", "change", "edit", "reschedule", "re-schedule", "updation"],
    "delete": ["delete", "remove", "cancel", "deletion", "unassign"],
    "navigate": ["navigate", "go to", "open"],
}
446
+
447
  ERROR_INTENT_TERMS = [
448
  "error", "issue", "fail", "not working", "resolution", "fix",
449
  "permission", "permissions", "access", "no access", "authorization", "authorisation",
 
518
  topics = (meta or {}).get("topic_tags", "") or ""
519
  topic_list = [t.strip() for t in topics.split(",") if t.strip()]
520
  if user_intent == "errors" and (
521
+ any(k in st for k in ["common errors", "known issues", "common issues", "errors", "escalation", "permissions", "access"]) or
522
+ ("permissions" in topic_list)
523
  ):
524
  return 1.10
525
+ if user_intent == "steps" and any(k in st for k in ["process steps", "procedure", "instructions", "workflow", "creation", "updation", "deletion"]):
526
  return 0.75
527
  return -0.2
528
 
 
532
  section_tokens = _tokenize_meta_value(meta.get("section"))
533
  topic_tokens = _tokenize_meta_value((meta.get("topic_tags") or ""))
534
  module_tokens = _tokenize_meta_value((meta.get("module_tags") or ""))
535
+ action_token = _tokenize_meta_value((meta.get("action_tag") or ""))
536
+ meta_tokens = set(fn_tokens + title_tokens + section_tokens + topic_tokens + module_tokens + action_token)
537
  if not meta_tokens or not q_terms:
538
  return 0.0
539
  qset = set(q_terms)
 
573
  return min(boost, 1.6)
574
 
575
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
 
 
 
 
576
  norm_query = _normalize_query(query)
577
  q_terms = _tokenize(norm_query)
578
  user_intent = _detect_user_intent(query)
579
  actions = _extract_actions(query)
580
  user_modules = _extract_modules_from_query(query)
581
 
 
582
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 40))
583
  sem_docs = sem_res.get("documents", [])
584
  sem_metas = sem_res.get("metadatas", [])
 
592
  return 1.0 / (1.0 + float(d))
593
  except Exception:
594
  return 0.0
595
+
596
  sem_sims = [dist_to_sim(d) for d in sem_dists]
597
 
 
598
  bm25_hits = bm25_search(norm_query, top_k=max(80, top_k * 6))
599
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
600
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
 
605
  bm25_id_to_text[d["id"]] = d["text"]
606
  bm25_id_to_meta[d["id"]] = d["meta"]
607
 
 
608
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
609
 
610
    # Mixing weights for the hybrid relevance score (each multiplies the
    # correspondingly-named signal in the final_score sum below).
    gamma = 0.30    # metadata-token overlap with the query (_meta_overlap)
    delta = 0.55    # intent alignment boost (_intent_weight)
    epsilon = 0.35  # action match: text-derived + action_tag metadata bonus
    zeta = 0.65     # module-tag match (_module_weight)
    eta = 0.50      # exact-phrase boost (_phrase_boost_score)
    iota = 0.60     # literal query-substring match (_literal_query_match_boost)
 
 
616
 
617
+ def _action_meta_bonus(meta: Dict[str, Any], actions: List[str]) -> float:
618
+ if not actions:
619
+ return 0.0
620
+ act = (meta or {}).get("action_tag", "") or ""
621
+ return 0.6 if act and act in actions else 0.0
622
 
623
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
624
  for cid in union_ids:
 
625
  if cid in sem_ids:
626
  pos = sem_ids.index(cid)
627
  sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
 
630
  sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
631
  else:
632
  sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
 
633
  bm25_sim = bm25_id_to_norm.get(cid, 0.0)
634
  bm25_text = bm25_id_to_text.get(cid, "")
635
  bm25_meta = bm25_id_to_meta.get(cid, {})
 
636
  text = sem_text if sem_text else bm25_text
637
  meta = sem_meta if sem_meta else bm25_meta
638
 
639
  m_overlap = _meta_overlap(meta, q_terms)
640
  intent_boost = _intent_weight(meta, user_intent)
641
  act_wt = _action_weight(text, actions)
642
+ act_meta = _action_meta_bonus(meta, actions)
643
  mod_wt = _module_weight(meta, user_modules)
644
  phrase_wt = _phrase_boost_score(text, q_terms)
645
  literal_wt = _literal_query_match_boost(text, norm_query)
 
649
  + beta * bm25_sim
650
  + gamma * m_overlap
651
  + delta * intent_boost
652
+ + epsilon * (act_wt + act_meta)
653
  + zeta * mod_wt
654
  + eta * phrase_wt
 
655
  + iota * literal_wt
656
  )
657
  combined_records_ext.append(
658
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt + act_meta, mod_wt, phrase_wt, literal_wt)
 
659
  )
660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
  from collections import defaultdict as _dd
662
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]] = _dd(list)
663
  for rec in combined_records_ext:
664
  meta = rec[4] or {}
665
  fn = meta.get("filename", "unknown")
666
  doc_groups[fn].append(rec)
667
 
668
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]) -> float:
669
  total_score = sum(r[1] for r in recs)
670
  total_overlap = sum(r[5] for r in recs)
671
  total_intent = sum(max(0.0, r[6]) for r in recs)
672
  total_action = sum(max(0.0, r[7]) for r in recs)
673
  total_module = sum(r[8] for r in recs)
674
  total_phrase = sum(r[9] for r in recs)
675
+ total_literal = sum(r[10] for r in recs)
 
676
  errors_section_bonus = 0.0
677
  if any("error" in ((r[4] or {}).get("section", "")).lower() or
678
  "known issues" in ((r[4] or {}).get("section", "")).lower() or
 
682
  total_score
683
  + 0.4 * total_overlap
684
  + 0.7 * total_intent
685
+ + 0.55 * total_action
686
  + 0.8 * total_module
687
  + 0.6 * total_phrase
688
  + 0.7 * total_literal
689
  + errors_section_bonus
 
690
  )
691
 
692
  best_doc, best_doc_prior = None, -1.0
 
696
  best_doc_prior, best_doc = p, fn
697
 
698
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
699
+ other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
700
  for fn, recs in doc_groups.items():
701
  if fn == best_doc:
702
  continue
 
723
  "actions": actions,
724
  }
725
 
726
+ # --------------------------- Section fetch helpers ---------------------------
727
  def get_section_text(filename: str, section: str) -> str:
728
  texts: List[str] = []
729
  for d in bm25_docs:
 
759
  or "access" in sec
760
  or "known issues" in sec
761
  or "common issues" in sec
 
762
  or ("permissions" in topic_list)
763
  ):
764
  t = (d.get("text") or "").strip()
 
767
  return "\n\n".join(texts).strip()
768
 
769
  def get_escalation_text(filename: str) -> str:
 
 
 
 
770
  texts: List[str] = []
771
  for d in bm25_docs:
772
  m = d.get("meta", {})
 
778
  texts.append(t)
779
  return "\n\n".join(texts).strip()
780
 
 
781
  def get_kb_runtime_info() -> Dict[str, Any]:
782
  return {
783
  "chroma_path": CHROMA_PATH,