srilakshu012456 committed on
Commit
32e50bb
·
verified ·
1 Parent(s): afd54bd

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +117 -40
services/kb_creation.py CHANGED
@@ -1,4 +1,3 @@
1
- #updated
2
 
3
  # services/kb_creation.py
4
  import os
@@ -96,12 +95,19 @@ PERMISSION_TERMS = [
96
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
97
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
98
 
 
99
  MODULE_VOCAB = {
 
 
 
 
 
 
 
100
  "picking": ["pick", "picking", "pick release", "wave", "allocation"],
101
- "receiving": ["receive", "receiving", "inbound", "asn", "appointment"],
102
- "inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
103
  "putaway": ["putaway", "staging", "put away", "location assignment"],
104
  "shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
 
105
  "replenishment": ["replenishment", "replenish"],
106
  }
107
 
@@ -116,6 +122,11 @@ def _infer_intent_tag(section_title: str) -> str:
116
  return "prereqs"
117
  if any(k in st for k in ["purpose", "overview", "introduction"]):
118
  return "purpose"
 
 
 
 
 
119
  return "neutral"
120
 
121
 
@@ -145,9 +156,14 @@ def _derive_module_tags(text: str, filename: str, section_title: str) -> List[st
145
  for mod, syns in MODULE_VOCAB.items():
146
  if any(s in tokens for s in syns):
147
  found.append(mod)
 
148
  if not found:
149
  if "inventory" in tokens or "adjust" in tokens or "uom" in tokens or "cycle" in tokens:
150
  found = ["inventory"]
 
 
 
 
151
  return list(sorted(set(found)))
152
 
153
  # ---------------------------- Ingestion ----------------------------
@@ -193,13 +209,13 @@ def ingest_documents(folder_path: str) -> None:
193
  "chunk_index": c_idx,
194
  "title": doc_title,
195
  "collection": "SOP",
196
- "intent_tag": final_intent, # str
197
- "topic_tags": ", ".join(topic_tags) if topic_tags else "", # str (NOT list)
198
- "module_tags": ", ".join(module_tags) if module_tags else "", # str (NOT list)
199
  }
200
  try:
201
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
202
- except Exception as e1:
203
  try:
204
  collection.delete(ids=[doc_id])
205
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -319,11 +335,11 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
319
  # ---------------------------- Semantic-only ----------------------------
320
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
321
  query_embedding = model.encode(query).tolist()
322
- # Some Chroma client versions do not support "ids" in include.
323
  res = collection.query(
324
  query_embeddings=[query_embedding],
325
  n_results=top_k,
326
- include=['documents', 'metadatas', 'distances'] # no 'ids' here
327
  )
328
  documents = (res.get("documents", [[]]) or [[]])[0]
329
  metadatas = (res.get("metadatas", [[]]) or [[]])[0]
@@ -348,7 +364,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
348
  "ids": ids,
349
  }
350
 
351
- # ---------------------------- Hybrid search (intent + module + action) ----------------------------
352
  ACTION_SYNONYMS = {
353
  "create": ["create", "creation", "add", "new", "generate"],
354
  "update": ["update", "modify", "change", "edit"],
@@ -367,7 +383,7 @@ def _detect_user_intent(query: str) -> str:
367
  q = (query or "").lower()
368
  if any(k in q for k in ERROR_INTENT_TERMS):
369
  return "errors"
370
- if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform"]):
371
  return "steps"
372
  if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
373
  return "prereqs"
@@ -382,7 +398,10 @@ def _extract_actions(query: str) -> List[str]:
382
  for act, syns in ACTION_SYNONYMS.items():
383
  if any(s in q for s in syns):
384
  found.append(act)
385
- return found or []
 
 
 
386
 
387
 
388
  def _extract_modules_from_query(query: str) -> List[str]:
@@ -391,8 +410,13 @@ def _extract_modules_from_query(query: str) -> List[str]:
391
  for mod, syns in MODULE_VOCAB.items():
392
  if any(s in q for s in syns):
393
  found.append(mod)
394
- if not found and ("inventory" in q or "adjust" in q):
395
- found = ["inventory"]
 
 
 
 
 
396
  return list(sorted(set(found)))
397
 
398
 
@@ -410,6 +434,9 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
410
  # Strongly prefer errors/escalation/permissions when the user intent is errors
411
  if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues", "common issues"]) or ("permissions" in topic_list)):
412
  return 0.95
 
 
 
413
  return -0.2
414
 
415
 
@@ -420,8 +447,10 @@ def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
420
  doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
421
  overlap = len(set(user_modules) & set(doc_modules))
422
  if overlap == 0:
423
- return -0.4 # demote different modules to avoid wrong SOP
424
- return 0.6 * overlap
 
 
425
 
426
 
427
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
@@ -438,22 +467,29 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
438
  return inter / max(1, len(qset))
439
 
440
 
441
- def _action_weight(text: str, actions: List[str]) -> float:
442
- if not actions:
 
 
 
 
 
 
 
 
443
  return 0.0
444
- t = (text or "").lower()
 
 
445
  score = 0.0
446
- for act in actions:
447
- for syn in ACTION_SYNONYMS.get(act, [act]):
448
- if syn in t:
449
- score += 1.0
450
- conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
451
- for act in actions:
452
- for bad in conflicts.get(act, []):
453
- for syn in ACTION_SYNONYMS.get(bad, [bad]):
454
- if syn in t:
455
- score -= 0.8
456
- return score
457
 
458
 
459
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
@@ -492,12 +528,18 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
492
  # Union of IDs from semantic and BM25
493
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
494
 
 
 
 
 
495
  gamma = 0.30 # meta overlap
496
- delta = 0.45 # intent boost (stronger for errors)
497
  epsilon = 0.30 # action weight
498
- zeta = 0.50 # module weight (new)
 
 
499
 
500
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
501
  for cid in union_ids:
502
  if cid in sem_ids:
503
  pos = sem_ids.index(cid)
@@ -518,30 +560,65 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
518
  m_overlap = _meta_overlap(meta, q_terms)
519
  intent_boost = _intent_weight(meta, user_intent)
520
  act_wt = _action_weight(text, actions)
521
- mod_wt = _module_weight(meta, user_modules)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
522
 
523
- final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt + zeta * mod_wt
524
  combined_records_ext.append(
525
- (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt)
526
  )
527
 
528
  from collections import defaultdict
529
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]] = defaultdict(list)
530
  for rec in combined_records_ext:
531
  meta = rec[4] or {}
532
  fn = meta.get("filename", "unknown")
533
  doc_groups[fn].append(rec)
534
 
535
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]) -> float:
536
  total_score = sum(r[1] for r in recs)
537
  total_overlap = sum(r[5] for r in recs)
538
  total_intent = sum(max(0.0, r[6]) for r in recs)
539
  total_action = sum(max(0.0, r[7]) for r in recs)
540
  total_module = sum(r[8] for r in recs)
 
 
541
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
542
  esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
543
  perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
544
- return total_score + 0.4 * total_overlap + 0.7 * total_intent + 0.5 * total_action + 0.6 * total_module + 0.3 * total_penalty + esc_weight + perm_weight
 
 
 
 
 
 
 
 
 
 
545
 
546
  best_doc, best_doc_prior = None, -1.0
547
  for fn, recs in doc_groups.items():
@@ -550,7 +627,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
550
  best_doc_prior, best_doc = p, fn
551
 
552
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
553
- other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
554
  for fn, recs in doc_groups.items():
555
  if fn == best_doc:
556
  continue
 
 
1
 
2
  # services/kb_creation.py
3
  import os
 
95
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
96
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
97
 
98
+ # Expanded module vocabulary: split Receiving vs Appointments
99
  MODULE_VOCAB = {
100
+ "receiving": [
101
+ "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
102
+ "asn receiving", "unload", "check-in", "dock check-in"
103
+ ],
104
+ "appointments": [
105
+ "appointment", "appointments", "schedule", "scheduling", "slot", "dock door", "appointment creation", "appointment details"
106
+ ],
107
  "picking": ["pick", "picking", "pick release", "wave", "allocation"],
 
 
108
  "putaway": ["putaway", "staging", "put away", "location assignment"],
109
  "shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
110
+ "inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
111
  "replenishment": ["replenishment", "replenish"],
112
  }
113
 
 
122
  return "prereqs"
123
  if any(k in st for k in ["purpose", "overview", "introduction"]):
124
  return "purpose"
125
+ # Heading hints (e.g., "Inbound Receiving", "Appointment Creation")
126
+ if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
127
+ return "steps"
128
+ if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
129
+ return "steps"
130
  return "neutral"
131
 
132
 
 
156
  for mod, syns in MODULE_VOCAB.items():
157
  if any(s in tokens for s in syns):
158
  found.append(mod)
159
+ # defaulting rule if none found
160
  if not found:
161
  if "inventory" in tokens or "adjust" in tokens or "uom" in tokens or "cycle" in tokens:
162
  found = ["inventory"]
163
+ elif "receive" in tokens or "inbound" in tokens or "goods receipt" in tokens or "grn" in tokens:
164
+ found = ["receiving"]
165
+ elif "appointment" in tokens or "schedule" in tokens or "dock" in tokens:
166
+ found = ["appointments"]
167
  return list(sorted(set(found)))
168
 
169
  # ---------------------------- Ingestion ----------------------------
 
209
  "chunk_index": c_idx,
210
  "title": doc_title,
211
  "collection": "SOP",
212
+ "intent_tag": final_intent,
213
+ "topic_tags": ", ".join(topic_tags) if topic_tags else "",
214
+ "module_tags": ", ".join(module_tags) if module_tags else "",
215
  }
216
  try:
217
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
218
+ except Exception:
219
  try:
220
  collection.delete(ids=[doc_id])
221
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
335
  # ---------------------------- Semantic-only ----------------------------
336
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
337
  query_embedding = model.encode(query).tolist()
338
+ # Request supported fields only, synthesize ids
339
  res = collection.query(
340
  query_embeddings=[query_embedding],
341
  n_results=top_k,
342
+ include=['documents', 'metadatas', 'distances']
343
  )
344
  documents = (res.get("documents", [[]]) or [[]])[0]
345
  metadatas = (res.get("metadatas", [[]]) or [[]])[0]
 
364
  "ids": ids,
365
  }
366
 
367
+ # ---------------------------- Hybrid search (intent + module + action + phrases) ----------------------------
368
  ACTION_SYNONYMS = {
369
  "create": ["create", "creation", "add", "new", "generate"],
370
  "update": ["update", "modify", "change", "edit"],
 
383
  q = (query or "").lower()
384
  if any(k in q for k in ERROR_INTENT_TERMS):
385
  return "errors"
386
+ if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform", "receiving"]):
387
  return "steps"
388
  if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
389
  return "prereqs"
 
398
  for act, syns in ACTION_SYNONYMS.items():
399
  if any(s in q for s in syns):
400
  found.append(act)
401
+ # receiving verbs hint
402
+ if any(w in q for w in ["receive", "receiving", "grn", "goods receipt"]):
403
+ found.append("navigate") # safe generic
404
+ return list(sorted(set(found))) or []
405
 
406
 
407
  def _extract_modules_from_query(query: str) -> List[str]:
 
410
  for mod, syns in MODULE_VOCAB.items():
411
  if any(s in q for s in syns):
412
  found.append(mod)
413
+ # Default if none found
414
+ if not found:
415
+ if "receive" in q or "receiving" in q or "grn" in q or "goods receipt" in q or "inbound" in q:
416
+ found = ["receiving"]
417
+ # Prefer 'receiving' over 'appointments' when both present (generic rule)
418
+ if "receiving" in found and "appointments" in found:
419
+ return ["receiving"]
420
  return list(sorted(set(found)))
421
 
422
 
 
434
  # Strongly prefer errors/escalation/permissions when the user intent is errors
435
  if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues", "common issues"]) or ("permissions" in topic_list)):
436
  return 0.95
437
+ # Prefer receiving headings for receiving queries
438
+ if user_intent == "steps" and any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
439
+ return 0.75
440
  return -0.2
441
 
442
 
 
447
  doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
448
  overlap = len(set(user_modules) & set(doc_modules))
449
  if overlap == 0:
450
+ # Stronger generic penalty for mismatched modules
451
+ return -0.8
452
+ # Slight boost per overlapping module
453
+ return 0.7 * overlap
454
 
455
 
456
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
 
467
  return inter / max(1, len(qset))
468
 
469
 
470
+ def _make_ngrams(tokens: List[str], n: int) -> List[str]:
471
+ return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
472
+
473
+
474
def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
    """Score how many query bigrams and trigrams appear verbatim in ``text``.

    Each query bigram found as a substring of the lower-cased text adds 0.35
    and each trigram adds 0.60. The total is capped at 1.5 so phrase matches
    cannot dominate the other ranking signals. The scoring is generic: nothing
    is keyed to specific document names.

    Args:
        text: Chunk text to search. Empty or ``None`` yields 0.0.
        q_terms: Query tokens in query order. Empty or ``None`` yields 0.0.

    Returns:
        A boost between 0.0 and 1.5 inclusive.
    """
    if not text or not q_terms:
        return 0.0

    # Blank tokens would join into whitespace-only phrases such as " ", which
    # are truthy and match almost any text. Drop them before building phrases.
    # Terms are lower-cased because the text is lower-cased below.
    terms = [t.strip().lower() for t in q_terms if t and t.strip()]
    if len(terms) < 2:
        return 0.0

    low = text.lower()
    score = 0.0
    for size, weight in ((2, 0.35), (3, 0.60)):
        for phrase in _make_ngrams(terms, size):
            if phrase in low:
                score += weight
    # cap to avoid over-weighting
    return min(score, 1.5)
 
 
 
493
 
494
 
495
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
 
528
  # Union of IDs from semantic and BM25
529
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
530
 
531
+ # Optional light gating: if we know user's primary module, keep union but strengthen penalties later.
532
+ primary_user_modules = user_modules if user_modules else []
533
+
534
+ # Weights
535
  gamma = 0.30 # meta overlap
536
+ delta = 0.50 # intent boost (stronger for steps/errors now)
537
  epsilon = 0.30 # action weight
538
+ zeta = 0.65 # module weight (stronger to avoid wrong SOP)
539
+ eta = 0.45 # phrase-level boost
540
+ theta = 0.40 # heading alignment bonus
541
 
542
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
543
  for cid in union_ids:
544
  if cid in sem_ids:
545
  pos = sem_ids.index(cid)
 
560
  m_overlap = _meta_overlap(meta, q_terms)
561
  intent_boost = _intent_weight(meta, user_intent)
562
  act_wt = _action_weight(text, actions)
563
+ mod_wt = _module_weight(meta, primary_user_modules)
564
+ phrase_wt = _phrase_boost_score(text, q_terms)
565
+
566
+ # Heading alignment: bonus if section/title contains key query term roots
567
+ sec_low = ((meta or {}).get("section", "") or "").lower()
568
+ title_low = ((meta or {}).get("title", "") or "").lower()
569
+ heading_bonus = 0.0
570
+ if any(root in sec_low for root in ["receiving", "inbound receiving", "goods receipt", "grn"]) and any(w in norm_query for w in ["receive", "receiving", "inbound", "grn", "goods receipt"]):
571
+ heading_bonus += 0.40
572
+ if any(root in title_low for root in ["receiving", "inbound receiving", "goods receipt", "grn"]) and any(w in norm_query for w in ["receive", "receiving", "inbound", "grn", "goods receipt"]):
573
+ heading_bonus += 0.40
574
+ if any(root in sec_low for root in ["appointment", "appointments", "schedule"]) and "receiv" in norm_query:
575
+ # mild demotion for appointment sections when user asks receiving
576
+ heading_bonus -= 0.35
577
+
578
+ final_score = (
579
+ alpha * sem_sim
580
+ + beta * bm25_sim
581
+ + gamma * m_overlap
582
+ + delta * intent_boost
583
+ + epsilon * act_wt
584
+ + zeta * mod_wt
585
+ + eta * phrase_wt
586
+ + theta * heading_bonus
587
+ )
588
 
 
589
  combined_records_ext.append(
590
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus)
591
  )
592
 
593
  from collections import defaultdict
594
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]] = defaultdict(list)
595
  for rec in combined_records_ext:
596
  meta = rec[4] or {}
597
  fn = meta.get("filename", "unknown")
598
  doc_groups[fn].append(rec)
599
 
600
+ def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]) -> float:
601
  total_score = sum(r[1] for r in recs)
602
  total_overlap = sum(r[5] for r in recs)
603
  total_intent = sum(max(0.0, r[6]) for r in recs)
604
  total_action = sum(max(0.0, r[7]) for r in recs)
605
  total_module = sum(r[8] for r in recs)
606
+ total_phrase = sum(r[9] for r in recs)
607
+ total_heading = sum(r[10] for r in recs)
608
  total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
609
  esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
610
  perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
611
+ return (
612
+ total_score
613
+ + 0.4 * total_overlap
614
+ + 0.7 * total_intent
615
+ + 0.5 * total_action
616
+ + 0.8 * total_module # stronger module prior
617
+ + 0.6 * total_phrase # phrase prior
618
+ + 0.6 * total_heading # heading prior
619
+ + 0.3 * total_penalty
620
+ + esc_weight + perm_weight
621
+ )
622
 
623
  best_doc, best_doc_prior = None, -1.0
624
  for fn, recs in doc_groups.items():
 
627
  best_doc_prior, best_doc = p, fn
628
 
629
  best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
630
+ other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
631
  for fn, recs in doc_groups.items():
632
  if fn == best_doc:
633
  continue