srilakshu012456 committed on
Commit
d7af615
·
verified ·
1 Parent(s): af2ca53

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +181 -401
services/kb_creation.py CHANGED
@@ -1,22 +1,31 @@
1
 
2
- # services/kb_creation.py
 
 
 
 
 
 
 
 
3
  import os
4
  import re
5
  import pickle
6
  from typing import List, Dict, Any, Tuple, Optional
 
7
  from docx import Document
8
  from sentence_transformers import SentenceTransformer
9
  import chromadb
10
 
11
- # --------------------------- ChromaDB setup ---------------------------
12
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
13
  client = chromadb.PersistentClient(path=CHROMA_PATH)
14
  collection = client.get_or_create_collection(name="knowledge_base")
15
 
16
- # --------------------------- Embedding model ---------------------------
17
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
18
 
19
- # --------------------------- BM25 (lightweight) ---------------------------
20
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
21
  bm25_docs: List[Dict[str, Any]] = []
22
  bm25_inverted: Dict[str, List[int]] = {}
@@ -26,25 +35,51 @@ bm25_ready: bool = False
26
  BM25_K1 = 1.5
27
  BM25_B = 0.75
28
 
29
- # --------------------------- Utilities ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  def _tokenize(text: str) -> List[str]:
31
  if not text:
32
  return []
33
  text = text.lower()
34
  return re.findall(r"[a-z0-9]+", text)
35
 
 
36
  def _normalize_query(q: str) -> str:
37
  q = (q or "").strip().lower()
38
  q = re.sub(r"[^\w\s]", " ", q)
39
  q = re.sub(r"\s+", " ", q).strip()
40
  return q
41
 
42
- def _tokenize_meta_value(val: Optional[str]) -> List[str]:
43
- return _tokenize(val or "")
44
 
45
- # --------------------------- DOCX parsing & chunking ---------------------------
46
- BULLET_RE = re.compile(r"^\s*(?:[\-\*\u2022]|\d+[.\)]\s+)", re.IGNORECASE)
 
47
 
 
48
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
49
  sections: List[Tuple[str, List[str]]] = []
50
  current_title = None
@@ -68,12 +103,8 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
68
  sections = [("Document", all_text)]
69
  return sections
70
 
 
71
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
72
- """
73
- Split paragraphs into bullet-aware lines:
74
- - Preserve bullets/numbered list lines as separate atomic lines.
75
- - Split long paragraphs by sentence boundaries.
76
- """
77
  lines: List[str] = []
78
  for p in (paragraphs or []):
79
  p = (p or "").strip()
@@ -86,8 +117,8 @@ def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
86
  lines.extend(parts)
87
  return lines
88
 
89
- def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 160) -> List[str]:
90
- """Smaller chunks (~160 words), bullet-aware for better recall."""
91
  lines = _paragraphs_to_lines(paragraphs)
92
  chunks: List[str] = []
93
  current: List[str] = []
@@ -113,83 +144,60 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
113
  chunks = [body]
114
  return chunks
115
 
116
- # --------------------------- Intent & Module tagging ---------------------------
117
- SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
118
- SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
119
- PERMISSION_TERMS = [
120
- "permission", "permissions", "access", "access right", "authorization", "authorisation",
121
- "role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
122
- "not allowed", "not authorized", "denied", "restrict"
123
- ]
124
- ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
125
- STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
126
- MODULE_VOCAB = {
127
- "receiving": [
128
- "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
129
- "asn receiving", "unload", "check-in", "dock check-in"
130
- ],
131
- "appointments": [
132
- "appointment", "appointments", "schedule", "scheduling", "slot", "dock door",
133
- "appointment creation", "appointment details"
134
- ],
135
- "picking": ["pick", "picking", "pick release", "wave", "allocation"],
136
- "putaway": ["putaway", "staging", "put away", "location assignment"],
137
- "shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
138
- "inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
139
- "replenishment": ["replenishment", "replenish"],
140
- }
141
-
142
- def _infer_intent_tag(section_title: str) -> str:
143
- st = (section_title or "").lower()
144
- if any(k in st for k in SECTION_STEPS_HINTS):
145
- return "steps"
146
- if any(k in st for k in SECTION_ERRORS_HINTS):
147
- return "errors"
148
- if "pre" in st and "requisite" in st:
149
- return "prereqs"
150
- if any(k in st for k in ["purpose", "overview", "introduction"]):
151
- return "purpose"
152
- if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
153
- return "steps"
154
- if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
155
- return "steps"
156
  return "neutral"
157
 
158
- def _derive_semantic_intent_from_text(text: str) -> Tuple[str, List[str]]:
159
- t = (text or "").lower()
160
- tags: List[str] = []
161
- intent = "neutral"
162
- if any(term in t for term in PERMISSION_TERMS):
163
- intent = "errors"
164
- tags.append("permissions")
165
- if "role" in t:
166
- tags.append("role_access")
167
- if "security" in t:
168
- tags.append("security")
169
- if intent == "neutral" and any(term in t for term in ERROR_TERMS):
170
- intent = "errors"
171
- tags.append("errors")
172
- if intent == "neutral" and any(v in t for v in STEP_VERBS):
173
- intent = "steps"
174
- tags.append("procedure")
175
- return intent, list(set(tags))
176
 
177
  def _derive_module_tags(text: str, filename: str, section_title: str) -> List[str]:
178
  tokens = " ".join([filename or "", section_title or "", text or ""]).lower()
179
  found = []
180
- for mod, syns in MODULE_VOCAB.items():
181
- if any(s in tokens for s in syns):
182
- found.append(mod)
183
- if not found:
184
- if "inventory" in tokens or "adjust" in tokens or "uom" in tokens or "cycle" in tokens:
185
- found = ["inventory"]
186
- elif "receive" in tokens or "inbound" in tokens or "goods receipt" in tokens or "grn" in tokens:
187
- found = ["receiving"]
188
- elif "appointment" in tokens or "schedule" in tokens or "dock" in tokens:
189
- found = ["appointments"]
190
  return list(sorted(set(found)))
191
 
192
- # --------------------------- Ingestion ---------------------------
 
 
 
 
 
 
 
 
 
 
 
193
  def ingest_documents(folder_path: str) -> None:
194
  print(f"[KB] Checking folder: {folder_path}")
195
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
@@ -208,20 +216,12 @@ def ingest_documents(folder_path: str) -> None:
208
  doc = Document(file_path)
209
  sections = _split_by_sections(doc)
210
  total_chunks = 0
211
-
212
  for s_idx, (section_title, paras) in enumerate(sections):
213
- chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=160)
214
  total_chunks += len(chunks)
215
- base_intent = _infer_intent_tag(section_title)
216
-
217
  for c_idx, chunk in enumerate(chunks):
218
- derived_intent, topic_tags = _derive_semantic_intent_from_text(chunk)
219
- final_intent = base_intent
220
- if derived_intent == "errors":
221
- final_intent = "errors"
222
- elif base_intent == "neutral" and derived_intent in ("steps", "prereqs"):
223
- final_intent = derived_intent
224
-
225
  module_tags = _derive_module_tags(chunk, file, section_title)
226
  embedding = model.encode(chunk).tolist()
227
  doc_id = f"{file}:{s_idx}:{c_idx}"
@@ -231,11 +231,10 @@ def ingest_documents(folder_path: str) -> None:
231
  "chunk_index": c_idx,
232
  "title": doc_title,
233
  "collection": "SOP",
234
- "intent_tag": final_intent,
235
- "topic_tags": ", ".join(topic_tags) if topic_tags else "",
236
  "module_tags": ", ".join(module_tags) if module_tags else "",
237
  }
238
-
239
  try:
240
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
241
  except Exception:
@@ -244,12 +243,10 @@ def ingest_documents(folder_path: str) -> None:
244
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
245
  except Exception as e2:
246
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
247
-
248
  tokens = _tokenize(chunk)
249
  tf: Dict[str, int] = {}
250
  for tkn in tokens:
251
  tf[tkn] = tf.get(tkn, 0) + 1
252
-
253
  idx = len(bm25_docs)
254
  bm25_docs.append({
255
  "id": doc_id,
@@ -259,14 +256,12 @@ def ingest_documents(folder_path: str) -> None:
259
  "length": len(tokens),
260
  "meta": meta,
261
  })
262
-
263
  seen = set()
264
  for term in tf.keys():
265
  bm25_inverted.setdefault(term, []).append(idx)
266
  if term not in seen:
267
  bm25_df[term] = bm25_df.get(term, 0) + 1
268
  seen.add(term)
269
-
270
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
271
 
272
  N = len(bm25_docs)
@@ -287,7 +282,7 @@ def ingest_documents(folder_path: str) -> None:
287
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
288
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
289
 
290
- # --------------------------- BM25 load ---------------------------
291
  def _load_bm25_index() -> None:
292
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
293
  if not os.path.exists(BM25_INDEX_FILE):
@@ -307,7 +302,7 @@ def _load_bm25_index() -> None:
307
 
308
  _load_bm25_index()
309
 
310
- # --------------------------- BM25 search ---------------------------
311
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
312
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
313
  return 0.0
@@ -322,31 +317,28 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
322
  if tf == 0:
323
  continue
324
  N = len(bm25_docs)
325
- idf_ratio = ((N - df + 0.5) / (df + 0.5))
326
  try:
327
  import math
328
- idf = math.log(idf_ratio + 1.0)
329
  except Exception:
330
  idf = 1.0
331
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
332
  score += idf * (((tf * (BM25_K1 + 1)) / (denom or 1.0)))
333
  return score
334
 
 
335
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
336
  if not bm25_ready:
337
  return []
338
- norm = _normalize_query(query)
339
- q_terms = _tokenize(norm)
340
  if not q_terms:
341
  return []
342
-
343
  candidates = set()
344
  for t in q_terms:
345
  for idx in bm25_inverted.get(t, []):
346
  candidates.add(idx)
347
  if not candidates:
348
  candidates = set(range(len(bm25_docs)))
349
-
350
  scored = []
351
  for idx in candidates:
352
  s = _bm25_score_for_doc(q_terms, idx)
@@ -355,19 +347,17 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
355
  scored.sort(key=lambda x: x[1], reverse=True)
356
  return scored[:top_k]
357
 
358
- # --------------------------- Semantic-only ---------------------------
359
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
360
  query_embedding = model.encode(query).tolist()
361
  res = collection.query(
362
  query_embeddings=[query_embedding],
363
  n_results=top_k,
364
- include=['documents', 'metadatas', 'distances'] # no 'ids'
365
  )
366
  documents = (res.get("documents", [[]]) or [[]])[0]
367
  metadatas = (res.get("metadatas", [[]]) or [[]])[0]
368
  distances = (res.get("distances", [[]]) or [[]])[0]
369
-
370
- # Synthesize IDs from metadata (filename:section:chunk_index)
371
  ids: List[str] = []
372
  if documents:
373
  synthesized = []
@@ -377,8 +367,6 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
377
  idx = (m or {}).get("chunk_index", i)
378
  synthesized.append(f"{fn}:{sec}:{idx}")
379
  ids = synthesized
380
-
381
- print(f"[KB] search → {len(documents)} docs (top_k={top_k}); first distance: {distances[0] if distances else 'n/a'}; ids synthesized={len(ids)}")
382
  return {
383
  "documents": documents,
384
  "metadatas": metadatas,
@@ -386,158 +374,47 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
386
  "ids": ids,
387
  }
388
 
389
- # --------------------------- Hybrid search (improved + exact-match rerank) ---------------------------
390
- ACTION_SYNONYMS = {
391
- "create": ["create", "creation", "add", "new", "generate", "book", "schedule", "set up"],
392
- "update": ["update", "modify", "change", "edit", "reschedule", "adjust", "move"],
393
- "delete": ["delete", "remove"],
394
- "navigate": ["navigate", "go to", "open"],
395
- }
396
-
397
- ERROR_INTENT_TERMS = [
398
- "error", "issue", "fail", "not working", "resolution", "fix",
399
- "permission", "permissions", "access", "no access", "authorization", "authorisation",
400
- "role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
401
- "escalation", "escalation path", "access right", "mismatch", "locked", "wrong"
402
- ]
403
-
404
- def _detect_user_intent(query: str) -> str:
405
- q = (query or "").lower()
406
- if any(k in q for k in ERROR_INTENT_TERMS):
407
- return "errors"
408
- if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform", "receiving"]):
409
- return "steps"
410
- if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
411
- return "prereqs"
412
- if any(k in q for k in ["purpose", "overview", "introduction"]):
413
- return "purpose"
414
- return "neutral"
415
-
416
- def _extract_actions(query: str) -> List[str]:
417
  q = (query or "").lower()
418
  found = []
419
  for act, syns in ACTION_SYNONYMS.items():
420
  if any(s in q for s in syns):
421
  found.append(act)
422
- if any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
423
- found.append("navigate")
424
- return list(sorted(set(found))) or []
425
-
426
- def _extract_modules_from_query(query: str) -> List[str]:
427
- q = (query or "").lower()
428
- found = []
429
- for mod, syns in MODULE_VOCAB.items():
430
- if any(s in q for s in syns):
431
- found.append(mod)
432
- if not found and any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
433
- found = ["receiving"]
434
- if "receiving" in found and "appointments" in found:
435
- return ["receiving"]
436
  return list(sorted(set(found)))
437
 
438
- def _action_weight(text: str, actions: List[str]) -> float:
439
- if not actions:
440
- return 0.0
441
- t = (text or "").lower()
442
- score = 0.0
443
- for act in actions:
444
- for syn in ACTION_SYNONYMS.get(act, [act]):
445
- if syn in t:
446
- score += 1.0
447
- # conflict matrix: penalize mismatched operations (e.g., user wants update but chunk talks about create)
448
- conflicts = {"create": ["delete"], "delete": ["create"], "update": ["create", "delete"], "navigate": []}
449
- for act in actions:
450
- for bad in conflicts.get(act, []):
451
- for syn in ACTION_SYNONYMS.get(bad, [bad]):
452
- if syn in t:
453
- score -= 0.8
454
- return score
455
 
456
- def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
457
- if not user_modules:
458
- return 0.0
459
- raw = (meta or {}).get("module_tags", "") or ""
460
- doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
461
- overlap = len(set(user_modules) & set(doc_modules))
462
- if overlap == 0:
463
- return -0.8
464
- return 0.7 * overlap
465
-
466
- def _intent_weight(meta: dict, user_intent: str) -> float:
467
- tag = (meta or {}).get("intent_tag", "neutral")
468
- if user_intent == "neutral":
469
- return 0.0
470
- if tag == user_intent:
471
- return 1.0
472
- if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
473
- return -0.6
474
- st = ((meta or {}).get("section", "") or "").lower()
475
- topics = (meta or {}).get("topic_tags", "") or ""
476
- topic_list = [t.strip() for t in topics.split(",") if t.strip()]
477
- # Prefer errors sections strongly
478
- if user_intent == "errors" and (
479
- any(k in st for k in ["common errors", "known issues", "common issues", "errors", "escalation", "permissions", "access"])
480
- or ("permissions" in topic_list)
481
- ):
482
- return 1.10
483
- if user_intent == "steps" and any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
484
- return 0.75
485
- return -0.2
486
 
487
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
488
- fn_tokens = _tokenize_meta_value(meta.get("filename"))
489
- title_tokens = _tokenize_meta_value(meta.get("title"))
490
- section_tokens = _tokenize_meta_value(meta.get("section"))
491
- topic_tokens = _tokenize_meta_value((meta.get("topic_tags") or ""))
492
- module_tokens = _tokenize_meta_value((meta.get("module_tags") or ""))
493
- meta_tokens = set(fn_tokens + title_tokens + section_tokens + topic_tokens + module_tokens)
494
  if not meta_tokens or not q_terms:
495
  return 0.0
496
- qset = set(q_terms)
497
- inter = len(meta_tokens & qset)
498
- return inter / max(1, len(qset))
499
 
500
- def _make_ngrams(tokens: List[str], n: int) -> List[str]:
501
- return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
502
 
503
- def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
504
- if not text or not q_terms:
505
- return 0.0
506
- low = (text or "").lower()
507
- bigrams = _make_ngrams(q_terms, 2)
508
- trigrams = _make_ngrams(q_terms, 3)
509
- score = 0.0
510
- for bg in bigrams:
511
- if bg and bg in low:
512
- score += 0.40
513
- for tg in trigrams:
514
- if tg and tg in low:
515
- score += 0.70
516
- return min(score, 2.0)
517
-
518
- def _literal_query_match_boost(text: str, query_norm: str) -> float:
519
- """Extra boost if exact normalized query substring or bigrams appear."""
520
- t = (text or "").lower()
521
- q = (query_norm or "").lower()
522
- boost = 0.0
523
- if q and q in t:
524
- boost += 0.8
525
- toks = [tok for tok in q.split() if len(tok) > 2]
526
- bigrams = _make_ngrams(toks, 2)
527
- for bg in bigrams:
528
- if bg in t:
529
- boost += 0.8
530
- break
531
- return min(boost, 1.6)
532
-
533
- def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
534
- norm_query = _normalize_query(query)
535
- q_terms = _tokenize(norm_query)
536
- user_intent = _detect_user_intent(query)
537
- actions = _extract_actions(query)
538
- user_modules = _extract_modules_from_query(query)
539
-
540
- sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 40))
541
  sem_docs = sem_res.get("documents", [])
542
  sem_metas = sem_res.get("metadatas", [])
543
  sem_dists = sem_res.get("distances", [])
@@ -553,9 +430,10 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
553
 
554
  sem_sims = [dist_to_sim(d) for d in sem_dists]
555
 
556
- bm25_hits = bm25_search(norm_query, top_k=max(80, top_k * 6))
557
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
558
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
 
559
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
560
  for idx, nscore in bm25_norm_pairs:
561
  d = bm25_docs[idx]
@@ -565,15 +443,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
565
 
566
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
567
 
568
- gamma = 0.30 # meta overlap
569
- delta = 0.55 # intent boost (stronger)
570
- epsilon = 0.30 # action weight
571
- zeta = 0.65 # module weight
572
- eta = 0.50 # phrase-level boost (stronger)
573
- theta = 0.40 # heading alignment bonus
574
- iota = 0.60 # literal query match boost (stronger)
575
-
576
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
577
  for cid in union_ids:
578
  if cid in sem_ids:
579
  pos = sem_ids.index(cid)
@@ -591,128 +461,48 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
591
  text = sem_text if sem_text else bm25_text
592
  meta = sem_meta if sem_meta else bm25_meta
593
 
594
- m_overlap = _meta_overlap(meta, q_terms)
595
- intent_boost = _intent_weight(meta, user_intent)
596
- act_wt = _action_weight(text, actions)
597
- mod_wt = _module_weight(meta, user_modules)
598
- phrase_wt = _phrase_boost_score(text, q_terms)
599
- literal_wt = _literal_query_match_boost(text, norm_query)
600
-
601
- sec_low = ((meta or {}).get("section", "") or "").lower()
602
- title_low = ((meta or {}).get("title", "") or "").lower()
603
- heading_bonus = 0.0
604
- if any(root in sec_low for root in ["receiving", "inbound receiving", "goods receipt", "grn"]) and any(w in norm_query for w in ["receive", "receiving", "inbound", "grn", "goods receipt"]):
605
- heading_bonus += 0.40
606
- if any(root in title_low for root in ["receiving", "inbound receiving", "goods receipt", "grn"]) and any(w in norm_query for w in ["receive", "receiving", "inbound", "grn", "goods receipt"]):
607
- heading_bonus += 0.40
608
- if any(root in sec_low for root in ["appointment", "appointments", "schedule"]) and "receiv" in norm_query:
609
- heading_bonus -= 0.35
610
-
611
- final_score = (
612
- alpha * sem_sim
613
- + beta * bm25_sim
614
- + gamma * m_overlap
615
- + delta * intent_boost
616
- + epsilon * act_wt
617
- + zeta * mod_wt
618
- + eta * phrase_wt
619
- + theta * heading_bonus
620
- + iota * literal_wt
621
- )
622
-
623
- combined_records_ext.append(
624
- (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus, literal_wt)
625
- )
626
-
627
- # ---- Exact-match rerank for errors ----
628
- if user_intent == "errors":
629
- exact_hits = []
630
- for rec in combined_records_ext:
631
- text_lower = (rec[3] or "").lower()
632
- if any(phrase in text_lower for phrase in [
633
- norm_query,
634
- *(_make_ngrams([tok for tok in norm_query.split() if len(tok) > 2], 2))
635
- ]):
636
- exact_hits.append(rec)
637
- if exact_hits:
638
- # Move exact hits to front and keep order by current final_score
639
- rest = [r for r in combined_records_ext if r not in exact_hits]
640
- exact_hits.sort(key=lambda x: x[1], reverse=True)
641
- rest.sort(key=lambda x: x[1], reverse=True)
642
- combined_records_ext = exact_hits + rest
643
-
644
- from collections import defaultdict
645
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = defaultdict(list)
646
- for rec in combined_records_ext:
647
- meta = rec[4] or {}
648
- fn = meta.get("filename", "unknown")
649
- doc_groups[fn].append(rec)
650
-
651
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
652
- total_score = sum(r[1] for r in recs)
653
- total_overlap = sum(r[5] for r in recs)
654
- total_intent = sum(max(0.0, r[6]) for r in recs)
655
- total_action = sum(max(0.0, r[7]) for r in recs)
656
- total_module = sum(r[8] for r in recs)
657
- total_phrase = sum(r[9] for r in recs)
658
- total_heading = sum(r[10] for r in recs)
659
- total_literal = sum(r[11] for r in recs)
660
- total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
661
-
662
- # Errors doc prior: bonus for errors/known issues sections
663
- errors_section_bonus = 0.0
664
- if any("error" in ((r[4] or {}).get("section", "")).lower() or "known issues" in ((r[4] or {}).get("section", "")).lower()
665
- or "common issues" in ((r[4] or {}).get("section", "")).lower() for r in recs):
666
- errors_section_bonus = 0.5
667
-
668
- return (
669
- total_score
670
- + 0.4 * total_overlap
671
- + 0.7 * total_intent
672
- + 0.5 * total_action
673
- + 0.8 * total_module
674
- + 0.6 * total_phrase
675
- + 0.6 * total_heading
676
- + 0.7 * total_literal
677
- + errors_section_bonus
678
- + 0.3 * total_penalty
679
- )
680
-
681
- best_doc, best_doc_prior = None, -1.0
682
- for fn, recs in doc_groups.items():
683
- p = doc_prior(recs)
684
- if p > best_doc_prior:
685
- best_doc_prior, best_doc = p, fn
686
-
687
- best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
688
- other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
689
- for fn, recs in doc_groups.items():
690
- if fn == best_doc:
691
- continue
692
- other_recs.extend(recs)
693
- other_recs.sort(key=lambda x: x[1], reverse=True)
694
 
695
- reordered = best_recs + other_recs
696
- top = reordered[:top_k]
697
- documents = [t[3] for t in top]
698
- metadatas = [t[4] for t in top]
699
- distances = [t[2] for t in top]
700
- ids = [t[0] for t in top]
701
- combined_scores = [t[1] for t in top]
 
 
 
 
 
702
 
703
  return {
704
- "documents": documents,
705
- "metadatas": metadatas,
706
- "distances": distances,
707
- "ids": ids,
708
- "combined_scores": combined_scores,
709
- "best_doc": best_doc,
710
- "best_doc_prior": best_doc_prior,
711
- "user_intent": user_intent,
712
- "actions": actions,
713
  }
714
 
715
- # --------------------------- Section fetch helpers ---------------------------
716
  def get_section_text(filename: str, section: str) -> str:
717
  texts: List[str] = []
718
  for d in bm25_docs:
@@ -721,7 +511,8 @@ def get_section_text(filename: str, section: str) -> str:
721
  t = (d.get("text") or "").strip()
722
  if t:
723
  texts.append(t)
724
- return "\n\n".join(texts).strip()
 
725
 
726
  def get_best_steps_section_text(filename: str) -> str:
727
  texts: List[str] = []
@@ -731,32 +522,22 @@ def get_best_steps_section_text(filename: str) -> str:
731
  t = (d.get("text") or "").strip()
732
  if t:
733
  texts.append(t)
734
- return "\n\n".join(texts).strip()
 
735
 
736
  def get_best_errors_section_text(filename: str) -> str:
737
  texts: List[str] = []
738
  for d in bm25_docs:
739
  m = d.get("meta", {})
740
  sec = (m.get("section") or "").lower()
741
- topics = (m.get("topic_tags") or "")
742
- topic_list = [t.strip() for t in topics.split(",") if t.strip()]
743
- if m.get("filename") == filename and (
744
- m.get("intent_tag") == "errors"
745
- or "error" in sec
746
- or "escalation" in sec
747
- or "permission" in sec
748
- or "access" in sec
749
- or "known issues" in sec
750
- or "common issues" in sec
751
- or "errors" in sec
752
- or ("permissions" in topic_list)
753
- ):
754
  t = (d.get("text") or "").strip()
755
  if t:
756
  texts.append(t)
757
- return "\n\n".join(texts).strip()
758
 
759
- # --------------------------- Admin helpers ---------------------------
760
  def get_kb_runtime_info() -> Dict[str, Any]:
761
  return {
762
  "chroma_path": CHROMA_PATH,
@@ -767,6 +548,7 @@ def get_kb_runtime_info() -> Dict[str, Any]:
767
  "bm25_ready": bm25_ready,
768
  }
769
 
 
770
  def reset_kb(folder_path: str) -> Dict[str, Any]:
771
  result = {"status": "OK", "message": "KB reset and re-ingested"}
772
  try:
@@ -776,13 +558,11 @@ def reset_kb(folder_path: str) -> Dict[str, Any]:
776
  pass
777
  global collection
778
  collection = client.get_or_create_collection(name="knowledge_base")
779
-
780
  try:
781
  if os.path.isfile(BM25_INDEX_FILE):
782
  os.remove(BM25_INDEX_FILE)
783
  except Exception as e:
784
  result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
785
-
786
  os.makedirs(CHROMA_PATH, exist_ok=True)
787
  ingest_documents(folder_path)
788
  result["info"] = get_kb_runtime_info()
 
1
 
2
+ # kb_creation.py (single file)
3
+ # ---------------------------------------------------------------
4
+ # Action-aware KB ingestion + hybrid search for SOP documents.
5
+ # Tags each chunk with intent (steps/errors), module (appointments,
6
+ # receiving, etc.), and action (create/update/delete). Hybrid ranking
7
+ # rewards action alignment and penalizes conflicts so "update
8
+ # appointment" returns update/reschedule steps—NOT creation.
9
+ # ---------------------------------------------------------------
10
+
11
  import os
12
  import re
13
  import pickle
14
  from typing import List, Dict, Any, Tuple, Optional
15
+
16
  from docx import Document
17
  from sentence_transformers import SentenceTransformer
18
  import chromadb
19
 
20
+ # -------------------------- ChromaDB setup --------------------------
21
  CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
22
  client = chromadb.PersistentClient(path=CHROMA_PATH)
23
  collection = client.get_or_create_collection(name="knowledge_base")
24
 
25
+ # -------------------------- Embedding model -------------------------
26
  model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
27
 
28
+ # -------------------------- BM25 (lightweight) ----------------------
29
  BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
30
  bm25_docs: List[Dict[str, Any]] = []
31
  bm25_inverted: Dict[str, List[int]] = {}
 
35
  BM25_K1 = 1.5
36
  BM25_B = 0.75
37
 
38
# -------------------------- Vocab & Heuristics ----------------------
# Subject words that identify the "appointments" module in text.
APPT_WORDS = ["appointment", "appointments", "schedule", "scheduling", "dock door", "slot"]
# Synonym lists for the three CRUD-style actions we distinguish.
CREATE_WORDS = ["create", "creation", "new", "add", "generate"]
UPDATE_WORDS = ["update", "modify", "change", "edit", "reschedule", "re-schedule", "revise"]
DELETE_WORDS = ["delete", "remove", "cancel", "void"]

# Canonical action -> synonym list, used for query intent detection.
ACTION_SYNONYMS = {
    "create": CREATE_WORDS,
    "update": UPDATE_WORDS,
    "delete": DELETE_WORDS,
    "navigate": ["navigate", "go to", "open"],
}

# For each user action, the chunk actions that should be penalized during
# ranking (e.g. an "update" query must not surface "create" chunks).
ACTION_CONFLICTS = {
    "update": ["create", "delete"],
    "create": ["update", "delete"],
    "delete": ["create", "update"],
}

# Section-title hints used to classify chunks as "steps" vs "errors".
SECTION_STEPS_HINTS = ["process steps", "procedure", "how to", "workflow", "instructions", "steps"]
SECTION_ERRORS_HINTS = ["common errors", "resolution", "troubleshooting", "known issues", "common issues", "escalation", "escalation path", "permissions", "access"]
# Body-text terms that mark a chunk as error/troubleshooting content.
ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]

# Matches a leading list marker: "-", "*", "•", or "1." / "1)".
BULLET_RE = re.compile(r"^\s*(?:[\-\*•]|\d+[\.)])\s+", re.IGNORECASE)
62
+
63
+ # -------------------------- Utils ----------------------------------
64
  def _tokenize(text: str) -> List[str]:
65
  if not text:
66
  return []
67
  text = text.lower()
68
  return re.findall(r"[a-z0-9]+", text)
69
 
70
+
71
  def _normalize_query(q: str) -> str:
72
  q = (q or "").strip().lower()
73
  q = re.sub(r"[^\w\s]", " ", q)
74
  q = re.sub(r"\s+", " ", q).strip()
75
  return q
76
 
 
 
77
 
78
+ def _contains_any(text: str, words: List[str]) -> bool:
79
+ low = (text or "").lower()
80
+ return any(w in low for w in words)
81
 
82
+ # -------------------------- DOCX parsing ----------------------------
83
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
84
  sections: List[Tuple[str, List[str]]] = []
85
  current_title = None
 
103
  sections = [("Document", all_text)]
104
  return sections
105
 
106
+
107
  def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
 
 
 
 
 
108
  lines: List[str] = []
109
  for p in (paragraphs or []):
110
  p = (p or "").strip()
 
117
  lines.extend(parts)
118
  return lines
119
 
120
+
121
+ def _chunk_text_with_context(paragraphs: List[str], max_words: int = 140) -> List[str]:
122
  lines = _paragraphs_to_lines(paragraphs)
123
  chunks: List[str] = []
124
  current: List[str] = []
 
144
  chunks = [body]
145
  return chunks
146
 
147
+ # -------------------------- Tagging ---------------------------------
148
def _nearest_action_to_subject(text: str, subject_words: List[str]) -> Optional[str]:
    """Pick action based on proximity to subject tokens (e.g., appointment).

    Scans every occurrence of each subject word and inspects a text window
    around it for create/update/delete synonyms.  Returns the action found
    at the earliest subject occurrence, or None when no window matches.
    """
    low = (text or "").lower()
    best = None
    best_pos = 10**9  # sentinel: farther than any real match position
    for subj in subject_words:
        for m in re.finditer(re.escape(subj), low):
            pos = m.start()
            # Window of ~80 chars before and ~120 after the subject hit.
            window = low[max(0, pos-80): pos+120]
            # NOTE: iteration order is load-bearing — at the same position,
            # "update" wins over "create", which wins over "delete", because
            # the strict `pos < best_pos` comparison keeps the first match.
            for act, syns in [("update", UPDATE_WORDS), ("create", CREATE_WORDS), ("delete", DELETE_WORDS)]:
                if any(s in window for s in syns):
                    if pos < best_pos:
                        best, best_pos = act, pos
    return best
162
+
163
+
164
def _classify_action(text: str, filename: str, section: str) -> str:
    """Classify a chunk as "create"/"update"/"delete", or "neutral" if no signal.

    Proximity of an action word to an appointment-subject word is the
    strongest signal; otherwise fall back to plain keyword presence with
    priority update > create > delete.
    """
    blob = " ".join([filename or "", section or "", text or ""]).lower()
    proximity_hit = _nearest_action_to_subject(blob, APPT_WORDS)
    if proximity_hit:
        return proximity_hit
    for label, vocab in (("update", UPDATE_WORDS), ("create", CREATE_WORDS), ("delete", DELETE_WORDS)):
        if _contains_any(blob, vocab):
            return label
    return "neutral"
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
def _derive_module_tags(text: str, filename: str, section_title: str) -> List[str]:
    """Tag a chunk with the product modules its text/filename/section mention."""
    blob = " ".join([filename or "", section_title or "", text or ""]).lower()
    tags: List[str] = []
    if any(w in blob for w in APPT_WORDS):
        tags.append("appointments")
    if any(w in blob for w in ["receive", "receiving", "inbound", "goods receipt", "grn"]):
        tags.append("receiving")
    # Dock/door language defaults to appointments when nothing else matched.
    if not tags and ("dock" in blob or "door" in blob):
        tags.append("appointments")
    return sorted(set(tags))
188
 
189
+
190
def _infer_intent_tag(section_title: str, text: str) -> str:
    """Label a chunk "steps" or "errors" from its section title, then body text.

    Title hints take precedence (steps before errors); error terms in the
    body are the fallback; everything else defaults to "steps".
    """
    title = (section_title or "").lower()
    if any(hint in title for hint in SECTION_STEPS_HINTS):
        return "steps"
    if any(hint in title for hint in SECTION_ERRORS_HINTS):
        return "errors"
    body = (text or "").lower()
    if any(term in body for term in ERROR_TERMS):
        return "errors"
    return "steps"
199
+
200
+ # -------------------------- Ingestion -------------------------------
201
  def ingest_documents(folder_path: str) -> None:
202
  print(f"[KB] Checking folder: {folder_path}")
203
  files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
 
216
  doc = Document(file_path)
217
  sections = _split_by_sections(doc)
218
  total_chunks = 0
 
219
  for s_idx, (section_title, paras) in enumerate(sections):
220
+ chunks = _chunk_text_with_context(paras, max_words=140)
221
  total_chunks += len(chunks)
 
 
222
  for c_idx, chunk in enumerate(chunks):
223
+ action_tag = _classify_action(chunk, file, section_title)
224
+ intent_tag = _infer_intent_tag(section_title, chunk)
 
 
 
 
 
225
  module_tags = _derive_module_tags(chunk, file, section_title)
226
  embedding = model.encode(chunk).tolist()
227
  doc_id = f"{file}:{s_idx}:{c_idx}"
 
231
  "chunk_index": c_idx,
232
  "title": doc_title,
233
  "collection": "SOP",
234
+ "intent_tag": intent_tag,
235
+ "action_tag": action_tag,
236
  "module_tags": ", ".join(module_tags) if module_tags else "",
237
  }
 
238
  try:
239
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
240
  except Exception:
 
243
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
244
  except Exception as e2:
245
  print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
 
246
  tokens = _tokenize(chunk)
247
  tf: Dict[str, int] = {}
248
  for tkn in tokens:
249
  tf[tkn] = tf.get(tkn, 0) + 1
 
250
  idx = len(bm25_docs)
251
  bm25_docs.append({
252
  "id": doc_id,
 
256
  "length": len(tokens),
257
  "meta": meta,
258
  })
 
259
  seen = set()
260
  for term in tf.keys():
261
  bm25_inverted.setdefault(term, []).append(idx)
262
  if term not in seen:
263
  bm25_df[term] = bm25_df.get(term, 0) + 1
264
  seen.add(term)
 
265
  print(f"[KB] Ingested {file} → {total_chunks} chunks")
266
 
267
  N = len(bm25_docs)
 
282
  print(f"[KB] BM25 index saved: {BM25_INDEX_FILE}")
283
  print(f"[KB] Documents ingested. Total entries in Chroma: {collection.count()}")
284
 
285
+ # -------------------------- BM25 load/search ------------------------
286
  def _load_bm25_index() -> None:
287
  global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
288
  if not os.path.exists(BM25_INDEX_FILE):
 
302
 
303
  _load_bm25_index()
304
 
305
+
306
  def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
307
  if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
308
  return 0.0
 
317
  if tf == 0:
318
  continue
319
  N = len(bm25_docs)
 
320
  try:
321
  import math
322
+ idf = math.log(((N - df + 0.5) / (df + 0.5)) + 1.0)
323
  except Exception:
324
  idf = 1.0
325
  denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
326
  score += idf * (((tf * (BM25_K1 + 1)) / (denom or 1.0)))
327
  return score
328
 
329
+
330
  def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
331
  if not bm25_ready:
332
  return []
333
+ q_terms = _tokenize(_normalize_query(query))
 
334
  if not q_terms:
335
  return []
 
336
  candidates = set()
337
  for t in q_terms:
338
  for idx in bm25_inverted.get(t, []):
339
  candidates.add(idx)
340
  if not candidates:
341
  candidates = set(range(len(bm25_docs)))
 
342
  scored = []
343
  for idx in candidates:
344
  s = _bm25_score_for_doc(q_terms, idx)
 
347
  scored.sort(key=lambda x: x[1], reverse=True)
348
  return scored[:top_k]
349
 
350
+ # -------------------------- Semantic search -------------------------
351
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
352
  query_embedding = model.encode(query).tolist()
353
  res = collection.query(
354
  query_embeddings=[query_embedding],
355
  n_results=top_k,
356
+ include=['documents', 'metadatas', 'distances']
357
  )
358
  documents = (res.get("documents", [[]]) or [[]])[0]
359
  metadatas = (res.get("metadatas", [[]]) or [[]])[0]
360
  distances = (res.get("distances", [[]]) or [[]])[0]
 
 
361
  ids: List[str] = []
362
  if documents:
363
  synthesized = []
 
367
  idx = (m or {}).get("chunk_index", i)
368
  synthesized.append(f"{fn}:{sec}:{idx}")
369
  ids = synthesized
 
 
370
  return {
371
  "documents": documents,
372
  "metadatas": metadatas,
 
374
  "ids": ids,
375
  }
376
 
377
+ # -------------------------- Hybrid ranking --------------------------
378
def _detect_user_action(query: str) -> List[str]:
    """Return the sorted set of action intents (create/update/delete/navigate) in *query*."""
    lowered = (query or "").lower()
    detected = {
        act
        for act, syns in ACTION_SYNONYMS.items()
        if any(s in lowered for s in syns)
    }
    # Rescheduling is an update even if the synonym table were to miss it.
    if "reschedule" in lowered or "re-schedule" in lowered:
        detected.add("update")
    return sorted(detected)
387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
 
389
def _detect_user_modules(query: str) -> List[str]:
    """Return the sorted set of product modules referenced by *query*."""
    lowered = (query or "").lower()
    modules = set()
    if any(w in lowered for w in APPT_WORDS):
        modules.add("appointments")
    if any(w in lowered for w in ["receive", "receiving", "inbound", "goods receipt", "grn"]):
        modules.add("receiving")
    return sorted(modules)
397
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
 
399
def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
    """Fraction of query terms that also appear in the chunk's metadata fields.

    Tokens are drawn from filename, section, title, and module_tags; returns
    0.0 when either side is empty.
    """
    md = meta or {}
    meta_tokens = set()
    for field in ("filename", "section", "title", "module_tags"):
        meta_tokens.update(_tokenize(md.get(field, "")))
    if not meta_tokens or not q_terms:
        return 0.0
    inter = len(meta_tokens & set(q_terms))
    return inter / max(1, len(q_terms))
 
409
 
 
 
410
 
411
+ def hybrid_search_knowledge_base(query: str, top_k: int = 10) -> dict:
412
+ norm_q = _normalize_query(query)
413
+ q_terms = _tokenize(norm_q)
414
+ user_actions = _detect_user_action(query)
415
+ user_modules = _detect_user_modules(query)
416
+
417
+ sem_res = search_knowledge_base(norm_q, top_k=max(top_k, 40))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  sem_docs = sem_res.get("documents", [])
419
  sem_metas = sem_res.get("metadatas", [])
420
  sem_dists = sem_res.get("distances", [])
 
430
 
431
  sem_sims = [dist_to_sim(d) for d in sem_dists]
432
 
433
+ bm25_hits = bm25_search(norm_q, top_k=max(80, top_k * 6))
434
  bm25_max = max([s for _, s in bm25_hits], default=1.0)
435
  bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
436
+
437
  bm25_id_to_norm, bm25_id_to_text, bm25_id_to_meta = {}, {}, {}
438
  for idx, nscore in bm25_norm_pairs:
439
  d = bm25_docs[idx]
 
443
 
444
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
445
 
446
+ records: List[Tuple[str, float, float, str, Dict[str, Any]]] = []
 
 
 
 
 
 
 
 
447
  for cid in union_ids:
448
  if cid in sem_ids:
449
  pos = sem_ids.index(cid)
 
461
  text = sem_text if sem_text else bm25_text
462
  meta = sem_meta if sem_meta else bm25_meta
463
 
464
+ base = 0.55 * sem_sim + 0.45 * bm25_sim
465
+ overlap = 0.30 * _meta_overlap(meta, q_terms)
466
+
467
+ doc_mods = [m.strip() for m in (meta.get("module_tags") or "").split(",") if m.strip()]
468
+ mod_overlap = len(set(doc_mods) & set(user_modules))
469
+ mod_bonus = 0.60 * mod_overlap if mod_overlap else -0.50
470
+
471
+ doc_action = (meta.get("action_tag") or "neutral").lower()
472
+ action_bonus = 0.0
473
+ if user_actions:
474
+ if doc_action in user_actions:
475
+ action_bonus += 1.40
476
+ for ua in user_actions:
477
+ for bad in ACTION_CONFLICTS.get(ua, []):
478
+ if doc_action == bad:
479
+ action_bonus -= 1.40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
 
481
+ sec_low = (meta.get("section") or "").lower()
482
+ title_low = (meta.get("title") or "").lower()
483
+ head_bonus = 0.0
484
+ if any(w in sec_low for w in APPT_WORDS) or any(w in title_low for w in APPT_WORDS):
485
+ if "appointments" in user_modules:
486
+ head_bonus += 0.40
487
+
488
+ final = base + overlap + mod_bonus + action_bonus + head_bonus
489
+ records.append((cid, final, (sem_dist if sem_dist is not None else 999.0), text, meta))
490
+
491
+ records.sort(key=lambda x: x[1], reverse=True)
492
+ top = records[:top_k]
493
 
494
  return {
495
+ "documents": [t[3] for t in top],
496
+ "metadatas": [t[4] for t in top],
497
+ "distances": [t[2] for t in top],
498
+ "ids": [t[0] for t in top],
499
+ "combined_scores": [t[1] for t in top],
500
+ "best_doc": (top[0][4].get("filename") if top else None),
501
+ "user_actions": user_actions,
502
+ "user_modules": user_modules,
 
503
  }
504
 
505
+ # -------------------------- Section helpers -------------------------
506
  def get_section_text(filename: str, section: str) -> str:
507
  texts: List[str] = []
508
  for d in bm25_docs:
 
511
  t = (d.get("text") or "").strip()
512
  if t:
513
  texts.append(t)
514
+ return "".join(texts).strip()
515
+
516
 
517
  def get_best_steps_section_text(filename: str) -> str:
518
  texts: List[str] = []
 
522
  t = (d.get("text") or "").strip()
523
  if t:
524
  texts.append(t)
525
+ return "".join(texts).strip()
526
+
527
 
528
def get_best_errors_section_text(filename: str) -> str:
    """Return all error/troubleshooting chunk text for *filename*.

    A chunk qualifies when it was tagged with intent "errors" at ingestion
    time, or when its section title mentions "error".  Matching chunks are
    joined with blank lines; returns "" when nothing matches.
    """
    texts: List[str] = []
    for d in bm25_docs:
        m = d.get("meta", {})
        sec = (m.get("section") or "").lower()
        if m.get("filename") == filename and (m.get("intent_tag") == "errors" or "error" in sec):
            t = (d.get("text") or "").strip()
            if t:
                texts.append(t)
    # Bug fix: join with "\n\n" (as the pre-refactor version did) so separate
    # chunks stay readable instead of running together with no separator.
    # Also dropped the unused `topics` local.
    return "\n\n".join(texts).strip()
539
 
540
+ # -------------------------- Admin helpers ---------------------------
541
  def get_kb_runtime_info() -> Dict[str, Any]:
542
  return {
543
  "chroma_path": CHROMA_PATH,
 
548
  "bm25_ready": bm25_ready,
549
  }
550
 
551
+
552
  def reset_kb(folder_path: str) -> Dict[str, Any]:
553
  result = {"status": "OK", "message": "KB reset and re-ingested"}
554
  try:
 
558
  pass
559
  global collection
560
  collection = client.get_or_create_collection(name="knowledge_base")
 
561
  try:
562
  if os.path.isfile(BM25_INDEX_FILE):
563
  os.remove(BM25_INDEX_FILE)
564
  except Exception as e:
565
  result.setdefault("warnings", []).append(f"bm25 index delete: {e}")
 
566
  os.makedirs(CHROMA_PATH, exist_ok=True)
567
  ingest_documents(folder_path)
568
  result["info"] = get_kb_runtime_info()