Spaces:
Sleeping
Sleeping
Update services/kb_creation.py
Browse files- services/kb_creation.py +87 -35
services/kb_creation.py
CHANGED
|
@@ -43,6 +43,8 @@ def _tokenize_meta_value(val: Optional[str]) -> List[str]:
|
|
| 43 |
return _tokenize(val or "")
|
| 44 |
|
| 45 |
# ---------------------------- DOCX parsing & chunking ----------------------------
|
|
|
|
|
|
|
| 46 |
def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
|
| 47 |
sections: List[Tuple[str, List[str]]] = []
|
| 48 |
current_title = None
|
|
@@ -66,18 +68,57 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
|
|
| 66 |
sections = [("Document", all_text)]
|
| 67 |
return sections
|
| 68 |
|
| 69 |
-
def
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
chunks: List[str] = []
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
if not chunks:
|
| 80 |
-
|
|
|
|
|
|
|
| 81 |
return chunks
|
| 82 |
|
| 83 |
# ---------------------------- Intent & Module tagging ----------------------------
|
|
@@ -89,7 +130,7 @@ PERMISSION_TERMS = [
|
|
| 89 |
"role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
|
| 90 |
"not allowed", "not authorized", "denied", "restrict"
|
| 91 |
]
|
| 92 |
-
ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
|
| 93 |
STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
|
| 94 |
|
| 95 |
MODULE_VOCAB = {
|
|
@@ -179,7 +220,7 @@ def ingest_documents(folder_path: str) -> None:
|
|
| 179 |
total_chunks = 0
|
| 180 |
|
| 181 |
for s_idx, (section_title, paras) in enumerate(sections):
|
| 182 |
-
chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=
|
| 183 |
total_chunks += len(chunks)
|
| 184 |
|
| 185 |
base_intent = _infer_intent_tag(section_title)
|
|
@@ -214,6 +255,7 @@ def ingest_documents(folder_path: str) -> None:
|
|
| 214 |
except Exception as e2:
|
| 215 |
print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
|
| 216 |
|
|
|
|
| 217 |
tokens = _tokenize(chunk)
|
| 218 |
tf: Dict[str, int] = {}
|
| 219 |
for tkn in tokens:
|
|
@@ -353,7 +395,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
|
| 353 |
"ids": ids,
|
| 354 |
}
|
| 355 |
|
| 356 |
-
# ---------------------------- Hybrid search (
|
| 357 |
ACTION_SYNONYMS = {
|
| 358 |
"create": ["create", "creation", "add", "new", "generate"],
|
| 359 |
"update": ["update", "modify", "change", "edit"],
|
|
@@ -364,7 +406,7 @@ ERROR_INTENT_TERMS = [
|
|
| 364 |
"error", "issue", "fail", "not working", "resolution", "fix",
|
| 365 |
"permission", "permissions", "access", "no access", "authorization", "authorisation",
|
| 366 |
"role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
|
| 367 |
-
"escalation", "escalation path", "access right"
|
| 368 |
]
|
| 369 |
|
| 370 |
def _detect_user_intent(query: str) -> str:
|
|
@@ -445,12 +487,6 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
|
|
| 445 |
return 0.75
|
| 446 |
return -0.2
|
| 447 |
|
| 448 |
-
def _normalize_for_match(text: str) -> str:
|
| 449 |
-
t = (text or "").lower()
|
| 450 |
-
t = re.sub(r"[^\w\s]", " ", t)
|
| 451 |
-
t = re.sub(r"\s+", " ", t).strip()
|
| 452 |
-
return t
|
| 453 |
-
|
| 454 |
def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
|
| 455 |
fn_tokens = _tokenize_meta_value(meta.get("filename"))
|
| 456 |
title_tokens = _tokenize_meta_value(meta.get("title"))
|
|
@@ -482,20 +518,31 @@ def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
|
|
| 482 |
score += 0.60
|
| 483 |
return min(score, 1.5)
|
| 484 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
|
| 486 |
norm_query = _normalize_query(query)
|
| 487 |
q_terms = _tokenize(norm_query)
|
| 488 |
user_intent = _detect_user_intent(query)
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
try:
|
| 492 |
-
actions = _extract_actions(query)
|
| 493 |
-
except Exception:
|
| 494 |
-
actions = []
|
| 495 |
-
try:
|
| 496 |
-
user_modules = _extract_modules_from_query(query)
|
| 497 |
-
except Exception:
|
| 498 |
-
user_modules = []
|
| 499 |
|
| 500 |
sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
|
| 501 |
sem_docs = sem_res.get("documents", [])
|
|
@@ -531,8 +578,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 531 |
zeta = 0.65 # module weight
|
| 532 |
eta = 0.45 # phrase-level boost
|
| 533 |
theta = 0.40 # heading alignment bonus
|
|
|
|
| 534 |
|
| 535 |
-
combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
|
| 536 |
for cid in union_ids:
|
| 537 |
if cid in sem_ids:
|
| 538 |
pos = sem_ids.index(cid)
|
|
@@ -555,6 +603,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 555 |
act_wt = _action_weight(text, actions)
|
| 556 |
mod_wt = _module_weight(meta, user_modules)
|
| 557 |
phrase_wt = _phrase_boost_score(text, q_terms)
|
|
|
|
| 558 |
|
| 559 |
sec_low = ((meta or {}).get("section", "") or "").lower()
|
| 560 |
title_low = ((meta or {}).get("title", "") or "").lower()
|
|
@@ -575,20 +624,21 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 575 |
+ zeta * mod_wt
|
| 576 |
+ eta * phrase_wt
|
| 577 |
+ theta * heading_bonus
|
|
|
|
| 578 |
)
|
| 579 |
|
| 580 |
combined_records_ext.append(
|
| 581 |
-
(cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus)
|
| 582 |
)
|
| 583 |
|
| 584 |
from collections import defaultdict
|
| 585 |
-
doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]] = defaultdict(list)
|
| 586 |
for rec in combined_records_ext:
|
| 587 |
meta = rec[4] or {}
|
| 588 |
fn = meta.get("filename", "unknown")
|
| 589 |
doc_groups[fn].append(rec)
|
| 590 |
|
| 591 |
-
def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]) -> float:
|
| 592 |
total_score = sum(r[1] for r in recs)
|
| 593 |
total_overlap = sum(r[5] for r in recs)
|
| 594 |
total_intent = sum(max(0.0, r[6]) for r in recs)
|
|
@@ -596,6 +646,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 596 |
total_module = sum(r[8] for r in recs)
|
| 597 |
total_phrase = sum(r[9] for r in recs)
|
| 598 |
total_heading = sum(r[10] for r in recs)
|
|
|
|
| 599 |
total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
|
| 600 |
esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
|
| 601 |
perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
|
|
@@ -607,6 +658,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 607 |
+ 0.8 * total_module
|
| 608 |
+ 0.6 * total_phrase
|
| 609 |
+ 0.6 * total_heading
|
|
|
|
| 610 |
+ 0.3 * total_penalty
|
| 611 |
+ esc_weight + perm_weight
|
| 612 |
)
|
|
@@ -618,7 +670,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 618 |
best_doc_prior, best_doc = p, fn
|
| 619 |
|
| 620 |
best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
|
| 621 |
-
other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
|
| 622 |
for fn, recs in doc_groups.items():
|
| 623 |
if fn == best_doc:
|
| 624 |
continue
|
|
|
|
| 43 |
return _tokenize(val or "")
|
| 44 |
|
| 45 |
# ---------------------------- DOCX parsing & chunking ----------------------------
|
| 46 |
+
BULLET_RE = re.compile(r"^\s*(?:[\-\*\u2022]|\d+[.)])\s+", re.IGNORECASE)
|
| 47 |
+
|
| 48 |
def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
|
| 49 |
sections: List[Tuple[str, List[str]]] = []
|
| 50 |
current_title = None
|
|
|
|
| 68 |
sections = [("Document", all_text)]
|
| 69 |
return sections
|
| 70 |
|
| 71 |
+
def _paragraphs_to_lines(paragraphs: List[str]) -> List[str]:
    """
    Flatten paragraphs into a list of bullet-aware lines.

    - Blank (or None) paragraphs are dropped.
    - A paragraph that starts with a bullet or list number (per BULLET_RE)
      is kept whole as one atomic line.
    - Every other paragraph is split into sentences at whitespace that
      follows '.', '!' or '?'; empty pieces are discarded.

    Returns the resulting non-empty, stripped lines in input order.
    """
    out: List[str] = []
    for raw in paragraphs or []:
        para = (raw or "").strip()
        if not para:
            continue
        if BULLET_RE.match(para):
            # List items stay atomic so a step is never cut mid-item.
            out.append(para)
        else:
            # Sentence boundary = whitespace preceded by ., ! or ?
            for sentence in re.split(r"(?<=[.!?])\s+", para):
                sentence = sentence.strip()
                if sentence:
                    out.append(sentence)
    return out
|
| 90 |
+
|
| 91 |
+
def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 300) -> List[str]:
    """
    Pack a section's paragraphs into small, bullet-aware text chunks.

    Paragraphs are flattened with _paragraphs_to_lines(). Lines are then
    accumulated into the current chunk until adding the next one would push
    it past ``max_words``; a bullet / numbered line always closes the chunk
    in progress and starts a new one. A single line that is itself longer
    than ``max_words`` is cut at word boundaries so that no chunk exceeds
    the limit.

    Args:
        doc_title: Accepted for call compatibility; not used by the current
            implementation.
        section_title: Accepted for call compatibility; not used by the
            current implementation.
        paragraphs: Raw paragraph strings of one section (may be empty/None).
        max_words: Upper bound on whitespace-separated words per chunk.
            With a value below 1 every line becomes its own chunk and
            oversized lines are left unsplit.

    Returns:
        Non-empty chunk strings in document order; an empty list when the
        section contains no text.
    """
    chunks: List[str] = []
    current: List[str] = []
    current_len = 0

    for ln in _paragraphs_to_lines(paragraphs):
        is_bullet = bool(BULLET_RE.match(ln))
        words = ln.split()

        # A line that alone exceeds the budget is cut into max_words-sized
        # pieces; otherwise the line is kept verbatim.
        if max_words > 0 and len(words) > max_words:
            pieces = [
                " ".join(words[start:start + max_words])
                for start in range(0, len(words), max_words)
            ]
        else:
            pieces = [ln]

        for idx, piece in enumerate(pieces):
            piece_len = len(piece.split())
            # Only the first piece of a bullet line forces a chunk break;
            # continuation pieces break solely on the word budget.
            starts_new_chunk = (
                current_len + piece_len > max_words
                or (is_bullet and idx == 0 and bool(current))
            )
            if starts_new_chunk:
                chunk = " ".join(current).strip()
                if chunk:
                    chunks.append(chunk)
                current = [piece]
                current_len = piece_len
            else:
                current.append(piece)
                current_len += piece_len

    # Flush whatever is left. No further fallback is needed: if nothing was
    # emitted by now, every line was blank and there is no text to return.
    tail = " ".join(current).strip()
    if tail:
        chunks.append(tail)
    return chunks
|
| 123 |
|
| 124 |
# ---------------------------- Intent & Module tagging ----------------------------
|
|
|
|
| 130 |
"role", "role access", "role mapping", "security", "security profile", "privilege", "insufficient",
|
| 131 |
"not allowed", "not authorized", "denied", "restrict"
|
| 132 |
]
|
| 133 |
+
ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't", "mismatch", "locked", "wrong", "denied"]
|
| 134 |
STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
|
| 135 |
|
| 136 |
MODULE_VOCAB = {
|
|
|
|
| 220 |
total_chunks = 0
|
| 221 |
|
| 222 |
for s_idx, (section_title, paras) in enumerate(sections):
|
| 223 |
+
chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=300)
|
| 224 |
total_chunks += len(chunks)
|
| 225 |
|
| 226 |
base_intent = _infer_intent_tag(section_title)
|
|
|
|
| 255 |
except Exception as e2:
|
| 256 |
print(f"[KB] ERROR: Upsert failed for {doc_id}: {e2}")
|
| 257 |
|
| 258 |
+
# Build BM25 index entries
|
| 259 |
tokens = _tokenize(chunk)
|
| 260 |
tf: Dict[str, int] = {}
|
| 261 |
for tkn in tokens:
|
|
|
|
| 395 |
"ids": ids,
|
| 396 |
}
|
| 397 |
|
| 398 |
+
# ---------------------------- Hybrid search (improved) ----------------------------
|
| 399 |
ACTION_SYNONYMS = {
|
| 400 |
"create": ["create", "creation", "add", "new", "generate"],
|
| 401 |
"update": ["update", "modify", "change", "edit"],
|
|
|
|
| 406 |
"error", "issue", "fail", "not working", "resolution", "fix",
|
| 407 |
"permission", "permissions", "access", "no access", "authorization", "authorisation",
|
| 408 |
"role", "role mapping", "not authorized", "permission denied", "insufficient privileges",
|
| 409 |
+
"escalation", "escalation path", "access right", "mismatch", "locked", "wrong"
|
| 410 |
]
|
| 411 |
|
| 412 |
def _detect_user_intent(query: str) -> str:
|
|
|
|
| 487 |
return 0.75
|
| 488 |
return -0.2
|
| 489 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
|
| 491 |
fn_tokens = _tokenize_meta_value(meta.get("filename"))
|
| 492 |
title_tokens = _tokenize_meta_value(meta.get("title"))
|
|
|
|
| 518 |
score += 0.60
|
| 519 |
return min(score, 1.5)
|
| 520 |
|
| 521 |
+
def _literal_query_match_boost(text: str, query_norm: str) -> float:
    """
    Extra boost when the query appears literally in the chunk.

    Helps errors like 'item mismatch' pick the right KB line.

    Both strings are lower-cased and have runs of whitespace collapsed to a
    single space before matching, so line breaks or double spaces in the
    chunk do not defeat the comparison.

    Scoring:
        +0.6 if the whole normalized query is a substring of the chunk.
        +0.6 if any bigram built by _make_ngrams() from the query tokens
             longer than 2 characters is a substring of the chunk.

    Args:
        text: Chunk text (None/empty is treated as "").
        query_norm: Normalized user query (None/empty is treated as "").

    Returns:
        0.0, 0.6 or 1.2.
    """
    t = " ".join((text or "").lower().split())
    q = " ".join((query_norm or "").lower().split())
    if not q:
        # Empty / whitespace-only query: nothing literal to match.
        return 0.0

    exact = 0.6 if q in t else 0.0

    # Short tokens (<= 2 chars) are dropped BEFORE pairing, so a bigram may
    # bridge a skipped word in the original query.
    toks = [tok for tok in q.split() if len(tok) > 2]
    # _make_ngrams is defined elsewhere in this module; its items are used
    # as substrings here (presumably space-joined tokens -- verify).
    partial = 0.6 if any(bg in t for bg in _make_ngrams(toks, 2)) else 0.0

    return exact + partial
|
| 539 |
+
|
| 540 |
def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
|
| 541 |
norm_query = _normalize_query(query)
|
| 542 |
q_terms = _tokenize(norm_query)
|
| 543 |
user_intent = _detect_user_intent(query)
|
| 544 |
+
actions = _extract_actions(query)
|
| 545 |
+
user_modules = _extract_modules_from_query(query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 546 |
|
| 547 |
sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
|
| 548 |
sem_docs = sem_res.get("documents", [])
|
|
|
|
| 578 |
zeta = 0.65 # module weight
|
| 579 |
eta = 0.45 # phrase-level boost
|
| 580 |
theta = 0.40 # heading alignment bonus
|
| 581 |
+
iota = 0.40 # literal query match boost
|
| 582 |
|
| 583 |
+
combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
|
| 584 |
for cid in union_ids:
|
| 585 |
if cid in sem_ids:
|
| 586 |
pos = sem_ids.index(cid)
|
|
|
|
| 603 |
act_wt = _action_weight(text, actions)
|
| 604 |
mod_wt = _module_weight(meta, user_modules)
|
| 605 |
phrase_wt = _phrase_boost_score(text, q_terms)
|
| 606 |
+
literal_wt = _literal_query_match_boost(text, norm_query)
|
| 607 |
|
| 608 |
sec_low = ((meta or {}).get("section", "") or "").lower()
|
| 609 |
title_low = ((meta or {}).get("title", "") or "").lower()
|
|
|
|
| 624 |
+ zeta * mod_wt
|
| 625 |
+ eta * phrase_wt
|
| 626 |
+ theta * heading_bonus
|
| 627 |
+
+ iota * literal_wt
|
| 628 |
)
|
| 629 |
|
| 630 |
combined_records_ext.append(
|
| 631 |
+
(cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus, literal_wt)
|
| 632 |
)
|
| 633 |
|
| 634 |
from collections import defaultdict
|
| 635 |
+
doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]] = defaultdict(list)
|
| 636 |
for rec in combined_records_ext:
|
| 637 |
meta = rec[4] or {}
|
| 638 |
fn = meta.get("filename", "unknown")
|
| 639 |
doc_groups[fn].append(rec)
|
| 640 |
|
| 641 |
+
def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]]) -> float:
|
| 642 |
total_score = sum(r[1] for r in recs)
|
| 643 |
total_overlap = sum(r[5] for r in recs)
|
| 644 |
total_intent = sum(max(0.0, r[6]) for r in recs)
|
|
|
|
| 646 |
total_module = sum(r[8] for r in recs)
|
| 647 |
total_phrase = sum(r[9] for r in recs)
|
| 648 |
total_heading = sum(r[10] for r in recs)
|
| 649 |
+
total_literal = sum(r[11] for r in recs)
|
| 650 |
total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
|
| 651 |
esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
|
| 652 |
perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
|
|
|
|
| 658 |
+ 0.8 * total_module
|
| 659 |
+ 0.6 * total_phrase
|
| 660 |
+ 0.6 * total_heading
|
| 661 |
+
+ 0.6 * total_literal
|
| 662 |
+ 0.3 * total_penalty
|
| 663 |
+ esc_weight + perm_weight
|
| 664 |
)
|
|
|
|
| 670 |
best_doc_prior, best_doc = p, fn
|
| 671 |
|
| 672 |
best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
|
| 673 |
+
other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float, float]] = []
|
| 674 |
for fn, recs in doc_groups.items():
|
| 675 |
if fn == best_doc:
|
| 676 |
continue
|