srilakshu012456 committed on
Commit
073fd3d
·
verified ·
1 Parent(s): 19b804e

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +26 -16
services/kb_creation.py CHANGED
@@ -92,7 +92,6 @@ PERMISSION_TERMS = [
92
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
93
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
94
 
95
- # Expanded module vocabulary: split Receiving vs Appointments (generic, non-hardcoded)
96
  MODULE_VOCAB = {
97
  "receiving": [
98
  "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
@@ -119,7 +118,6 @@ def _infer_intent_tag(section_title: str) -> str:
119
  return "prereqs"
120
  if any(k in st for k in ["purpose", "overview", "introduction"]):
121
  return "purpose"
122
- # Heading hints (generic)
123
  if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
124
  return "steps"
125
  if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
@@ -327,11 +325,10 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
327
  # ---------------------------- Semantic-only ----------------------------
328
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
329
  query_embedding = model.encode(query).tolist()
330
- # Request supported fields only; synthesize ids later
331
  res = collection.query(
332
  query_embeddings=[query_embedding],
333
  n_results=top_k,
334
- include=['documents', 'metadatas', 'distances']
335
  )
336
  documents = (res.get("documents", [[]]) or [[]])[0]
337
  metadatas = (res.get("metadatas", [[]]) or [[]])[0]
@@ -356,7 +353,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
356
  "ids": ids,
357
  }
358
 
359
- # ---------------------------- Hybrid search (intent + module + action + phrases) ----------------------------
360
  ACTION_SYNONYMS = {
361
  "create": ["create", "creation", "add", "new", "generate"],
362
  "update": ["update", "modify", "change", "edit"],
@@ -388,14 +385,23 @@ def _extract_actions(query: str) -> List[str]:
388
  for act, syns in ACTION_SYNONYMS.items():
389
  if any(s in q for s in syns):
390
  found.append(act)
391
- # receiving verbs hint (generic)
392
  if any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
393
  found.append("navigate")
394
- found = list(sorted(set(found)))
395
- return found or []
 
 
 
 
 
 
 
 
 
 
 
396
 
397
  def _action_weight(text: str, actions: List[str]) -> float:
398
- """Score based on presence of action synonyms in the text."""
399
  if not actions:
400
  return 0.0
401
  t = (text or "").lower()
@@ -419,7 +425,7 @@ def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
419
  doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
420
  overlap = len(set(user_modules) & set(doc_modules))
421
  if overlap == 0:
422
- return -0.8 # stronger penalty to avoid wrong SOP
423
  return 0.7 * overlap
424
 
425
  def _intent_weight(meta: dict, user_intent: str) -> float:
@@ -462,7 +468,6 @@ def _make_ngrams(tokens: List[str], n: int) -> List[str]:
462
  return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
463
 
464
  def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
465
- """Phrase-level scoring: boosts exact bigram/trigram matches."""
466
  if not text or not q_terms:
467
  return 0.0
468
  low = (text or "").lower()
@@ -481,8 +486,16 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
481
  norm_query = _normalize_query(query)
482
  q_terms = _tokenize(norm_query)
483
  user_intent = _detect_user_intent(query)
484
- actions = _extract_actions(query)
485
- user_modules = _extract_modules_from_query(query)
 
 
 
 
 
 
 
 
486
 
487
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
488
  sem_docs = sem_res.get("documents", [])
@@ -510,10 +523,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
510
  bm25_id_to_text[d["id"]] = d["text"]
511
  bm25_id_to_meta[d["id"]] = d["meta"]
512
 
513
- # Union of IDs from semantic and BM25
514
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
515
 
516
- # Weights
517
  gamma = 0.30 # meta overlap
518
  delta = 0.50 # intent boost
519
  epsilon = 0.30 # action weight
@@ -545,7 +556,6 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
545
  mod_wt = _module_weight(meta, user_modules)
546
  phrase_wt = _phrase_boost_score(text, q_terms)
547
 
548
- # Heading alignment bonus / demotion
549
  sec_low = ((meta or {}).get("section", "") or "").lower()
550
  title_low = ((meta or {}).get("title", "") or "").lower()
551
  heading_bonus = 0.0
 
92
  ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
93
  STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
94
 
 
95
  MODULE_VOCAB = {
96
  "receiving": [
97
  "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
 
118
  return "prereqs"
119
  if any(k in st for k in ["purpose", "overview", "introduction"]):
120
  return "purpose"
 
121
  if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
122
  return "steps"
123
  if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
 
325
  # ---------------------------- Semantic-only ----------------------------
326
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
327
  query_embedding = model.encode(query).tolist()
 
328
  res = collection.query(
329
  query_embeddings=[query_embedding],
330
  n_results=top_k,
331
+ include=['documents', 'metadatas', 'distances'] # no 'ids'
332
  )
333
  documents = (res.get("documents", [[]]) or [[]])[0]
334
  metadatas = (res.get("metadatas", [[]]) or [[]])[0]
 
353
  "ids": ids,
354
  }
355
 
356
+ # ---------------------------- Hybrid search (robust) ----------------------------
357
  ACTION_SYNONYMS = {
358
  "create": ["create", "creation", "add", "new", "generate"],
359
  "update": ["update", "modify", "change", "edit"],
 
385
  for act, syns in ACTION_SYNONYMS.items():
386
  if any(s in q for s in syns):
387
  found.append(act)
 
388
  if any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
389
  found.append("navigate")
390
+ return list(sorted(set(found))) or []
391
+
392
+ def _extract_modules_from_query(query: str) -> List[str]:
393
+ q = (query or "").lower()
394
+ found = []
395
+ for mod, syns in MODULE_VOCAB.items():
396
+ if any(s in q for s in syns):
397
+ found.append(mod)
398
+ if not found and any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
399
+ found = ["receiving"]
400
+ if "receiving" in found and "appointments" in found:
401
+ return ["receiving"]
402
+ return list(sorted(set(found)))
403
 
404
  def _action_weight(text: str, actions: List[str]) -> float:
 
405
  if not actions:
406
  return 0.0
407
  t = (text or "").lower()
 
425
  doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
426
  overlap = len(set(user_modules) & set(doc_modules))
427
  if overlap == 0:
428
+ return -0.8
429
  return 0.7 * overlap
430
 
431
  def _intent_weight(meta: dict, user_intent: str) -> float:
 
468
  return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
469
 
470
  def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
 
471
  if not text or not q_terms:
472
  return 0.0
473
  low = (text or "").lower()
 
486
  norm_query = _normalize_query(query)
487
  q_terms = _tokenize(norm_query)
488
  user_intent = _detect_user_intent(query)
489
+
490
+ # Robust guards so missing helpers can’t crash
491
+ try:
492
+ actions = _extract_actions(query)
493
+ except Exception:
494
+ actions = []
495
+ try:
496
+ user_modules = _extract_modules_from_query(query)
497
+ except Exception:
498
+ user_modules = []
499
 
500
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
501
  sem_docs = sem_res.get("documents", [])
 
523
  bm25_id_to_text[d["id"]] = d["text"]
524
  bm25_id_to_meta[d["id"]] = d["meta"]
525
 
 
526
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
527
 
 
528
  gamma = 0.30 # meta overlap
529
  delta = 0.50 # intent boost
530
  epsilon = 0.30 # action weight
 
556
  mod_wt = _module_weight(meta, user_modules)
557
  phrase_wt = _phrase_boost_score(text, q_terms)
558
 
 
559
  sec_low = ((meta or {}).get("section", "") or "").lower()
560
  title_low = ((meta or {}).get("title", "") or "").lower()
561
  heading_bonus = 0.0