Spaces:
Sleeping
Sleeping
Update services/kb_creation.py
Browse files - services/kb_creation.py +26 -16
services/kb_creation.py
CHANGED
|
@@ -92,7 +92,6 @@ PERMISSION_TERMS = [
|
|
| 92 |
ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
|
| 93 |
STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
|
| 94 |
|
| 95 |
-
# Expanded module vocabulary: split Receiving vs Appointments (generic, non-hardcoded)
|
| 96 |
MODULE_VOCAB = {
|
| 97 |
"receiving": [
|
| 98 |
"receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
|
|
@@ -119,7 +118,6 @@ def _infer_intent_tag(section_title: str) -> str:
|
|
| 119 |
return "prereqs"
|
| 120 |
if any(k in st for k in ["purpose", "overview", "introduction"]):
|
| 121 |
return "purpose"
|
| 122 |
-
# Heading hints (generic)
|
| 123 |
if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
|
| 124 |
return "steps"
|
| 125 |
if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
|
|
@@ -327,11 +325,10 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
|
|
| 327 |
# ---------------------------- Semantic-only ----------------------------
|
| 328 |
def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
| 329 |
query_embedding = model.encode(query).tolist()
|
| 330 |
-
# Request supported fields only; synthesize ids later
|
| 331 |
res = collection.query(
|
| 332 |
query_embeddings=[query_embedding],
|
| 333 |
n_results=top_k,
|
| 334 |
-
include=['documents', 'metadatas', 'distances']
|
| 335 |
)
|
| 336 |
documents = (res.get("documents", [[]]) or [[]])[0]
|
| 337 |
metadatas = (res.get("metadatas", [[]]) or [[]])[0]
|
|
@@ -356,7 +353,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
|
| 356 |
"ids": ids,
|
| 357 |
}
|
| 358 |
|
| 359 |
-
# ---------------------------- Hybrid search (
|
| 360 |
ACTION_SYNONYMS = {
|
| 361 |
"create": ["create", "creation", "add", "new", "generate"],
|
| 362 |
"update": ["update", "modify", "change", "edit"],
|
|
@@ -388,14 +385,23 @@ def _extract_actions(query: str) -> List[str]:
|
|
| 388 |
for act, syns in ACTION_SYNONYMS.items():
|
| 389 |
if any(s in q for s in syns):
|
| 390 |
found.append(act)
|
| 391 |
-
# receiving verbs hint (generic)
|
| 392 |
if any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
|
| 393 |
found.append("navigate")
|
| 394 |
-
|
| 395 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
|
| 397 |
def _action_weight(text: str, actions: List[str]) -> float:
|
| 398 |
-
"""Score based on presence of action synonyms in the text."""
|
| 399 |
if not actions:
|
| 400 |
return 0.0
|
| 401 |
t = (text or "").lower()
|
|
@@ -419,7 +425,7 @@ def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
|
|
| 419 |
doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
|
| 420 |
overlap = len(set(user_modules) & set(doc_modules))
|
| 421 |
if overlap == 0:
|
| 422 |
-
return -0.8
|
| 423 |
return 0.7 * overlap
|
| 424 |
|
| 425 |
def _intent_weight(meta: dict, user_intent: str) -> float:
|
|
@@ -462,7 +468,6 @@ def _make_ngrams(tokens: List[str], n: int) -> List[str]:
|
|
| 462 |
return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
|
| 463 |
|
| 464 |
def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
|
| 465 |
-
"""Phrase-level scoring: boosts exact bigram/trigram matches."""
|
| 466 |
if not text or not q_terms:
|
| 467 |
return 0.0
|
| 468 |
low = (text or "").lower()
|
|
@@ -481,8 +486,16 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 481 |
norm_query = _normalize_query(query)
|
| 482 |
q_terms = _tokenize(norm_query)
|
| 483 |
user_intent = _detect_user_intent(query)
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
|
| 487 |
sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
|
| 488 |
sem_docs = sem_res.get("documents", [])
|
|
@@ -510,10 +523,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 510 |
bm25_id_to_text[d["id"]] = d["text"]
|
| 511 |
bm25_id_to_meta[d["id"]] = d["meta"]
|
| 512 |
|
| 513 |
-
# Union of IDs from semantic and BM25
|
| 514 |
union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
|
| 515 |
|
| 516 |
-
# Weights
|
| 517 |
gamma = 0.30 # meta overlap
|
| 518 |
delta = 0.50 # intent boost
|
| 519 |
epsilon = 0.30 # action weight
|
|
@@ -545,7 +556,6 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 545 |
mod_wt = _module_weight(meta, user_modules)
|
| 546 |
phrase_wt = _phrase_boost_score(text, q_terms)
|
| 547 |
|
| 548 |
-
# Heading alignment bonus / demotion
|
| 549 |
sec_low = ((meta or {}).get("section", "") or "").lower()
|
| 550 |
title_low = ((meta or {}).get("title", "") or "").lower()
|
| 551 |
heading_bonus = 0.0
|
|
|
|
| 92 |
ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
|
| 93 |
STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
|
| 94 |
|
|
|
|
| 95 |
MODULE_VOCAB = {
|
| 96 |
"receiving": [
|
| 97 |
"receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
|
|
|
|
| 118 |
return "prereqs"
|
| 119 |
if any(k in st for k in ["purpose", "overview", "introduction"]):
|
| 120 |
return "purpose"
|
|
|
|
| 121 |
if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
|
| 122 |
return "steps"
|
| 123 |
if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
|
|
|
|
| 325 |
# ---------------------------- Semantic-only ----------------------------
|
| 326 |
def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
| 327 |
query_embedding = model.encode(query).tolist()
|
|
|
|
| 328 |
res = collection.query(
|
| 329 |
query_embeddings=[query_embedding],
|
| 330 |
n_results=top_k,
|
| 331 |
+
include=['documents', 'metadatas', 'distances'] # no 'ids'
|
| 332 |
)
|
| 333 |
documents = (res.get("documents", [[]]) or [[]])[0]
|
| 334 |
metadatas = (res.get("metadatas", [[]]) or [[]])[0]
|
|
|
|
| 353 |
"ids": ids,
|
| 354 |
}
|
| 355 |
|
| 356 |
+
# ---------------------------- Hybrid search (robust) ----------------------------
|
| 357 |
ACTION_SYNONYMS = {
|
| 358 |
"create": ["create", "creation", "add", "new", "generate"],
|
| 359 |
"update": ["update", "modify", "change", "edit"],
|
|
|
|
| 385 |
for act, syns in ACTION_SYNONYMS.items():
|
| 386 |
if any(s in q for s in syns):
|
| 387 |
found.append(act)
|
|
|
|
| 388 |
if any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
|
| 389 |
found.append("navigate")
|
| 390 |
+
return list(sorted(set(found))) or []
|
| 391 |
+
|
| 392 |
+
def _extract_modules_from_query(query: str) -> List[str]:
    """Return the module names whose vocabulary terms occur in ``query``.

    The query is lower-cased (``None``/empty is treated as ``""``) and every
    entry of ``MODULE_VOCAB`` is tested by plain substring containment.

    Resolution rules, in order:
      1. If no module matched but the query contains a receiving-style term,
         fall back to ``["receiving"]``.
      2. If both "receiving" and "appointments" matched, return only
         ``["receiving"]``.
      3. Otherwise return the matched module names, de-duplicated and sorted.

    Args:
        query: Raw user query text.

    Returns:
        A sorted list of module names; empty when nothing matched.
    """
    lowered = (query or "").lower()
    receiving_hints = ("receive", "receiving", "grn", "goods receipt", "inbound")

    # Collect every module that has at least one vocabulary term in the query.
    matched = {
        name
        for name, terms in MODULE_VOCAB.items()
        if any(term in lowered for term in terms)
    }

    # Fallback: receiving-style wording with no vocabulary hit at all.
    if not matched and any(hint in lowered for hint in receiving_hints):
        matched = {"receiving"}

    # When both modules are detected, receiving takes precedence.
    if {"receiving", "appointments"} <= matched:
        return ["receiving"]
    return sorted(matched)
|
| 403 |
|
| 404 |
def _action_weight(text: str, actions: List[str]) -> float:
|
|
|
|
| 405 |
if not actions:
|
| 406 |
return 0.0
|
| 407 |
t = (text or "").lower()
|
|
|
|
| 425 |
doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
|
| 426 |
overlap = len(set(user_modules) & set(doc_modules))
|
| 427 |
if overlap == 0:
|
| 428 |
+
return -0.8
|
| 429 |
return 0.7 * overlap
|
| 430 |
|
| 431 |
def _intent_weight(meta: dict, user_intent: str) -> float:
|
|
|
|
| 468 |
return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
|
| 469 |
|
| 470 |
def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
|
|
|
|
| 471 |
if not text or not q_terms:
|
| 472 |
return 0.0
|
| 473 |
low = (text or "").lower()
|
|
|
|
| 486 |
norm_query = _normalize_query(query)
|
| 487 |
q_terms = _tokenize(norm_query)
|
| 488 |
user_intent = _detect_user_intent(query)
|
| 489 |
+
|
| 490 |
+
# Robust guards so missing helpers can’t crash
|
| 491 |
+
try:
|
| 492 |
+
actions = _extract_actions(query)
|
| 493 |
+
except Exception:
|
| 494 |
+
actions = []
|
| 495 |
+
try:
|
| 496 |
+
user_modules = _extract_modules_from_query(query)
|
| 497 |
+
except Exception:
|
| 498 |
+
user_modules = []
|
| 499 |
|
| 500 |
sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
|
| 501 |
sem_docs = sem_res.get("documents", [])
|
|
|
|
| 523 |
bm25_id_to_text[d["id"]] = d["text"]
|
| 524 |
bm25_id_to_meta[d["id"]] = d["meta"]
|
| 525 |
|
|
|
|
| 526 |
union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
|
| 527 |
|
|
|
|
| 528 |
gamma = 0.30 # meta overlap
|
| 529 |
delta = 0.50 # intent boost
|
| 530 |
epsilon = 0.30 # action weight
|
|
|
|
| 556 |
mod_wt = _module_weight(meta, user_modules)
|
| 557 |
phrase_wt = _phrase_boost_score(text, q_terms)
|
| 558 |
|
|
|
|
| 559 |
sec_low = ((meta or {}).get("section", "") or "").lower()
|
| 560 |
title_low = ((meta or {}).get("title", "") or "").lower()
|
| 561 |
heading_bonus = 0.0
|