Spaces:
Sleeping
Sleeping
Update services/kb_creation.py
Browse files- services/kb_creation.py +117 -40
services/kb_creation.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
#updated
|
| 2 |
|
| 3 |
# services/kb_creation.py
|
| 4 |
import os
|
|
@@ -96,12 +95,19 @@ PERMISSION_TERMS = [
|
|
| 96 |
ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
|
| 97 |
STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
|
| 98 |
|
|
|
|
| 99 |
MODULE_VOCAB = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
"picking": ["pick", "picking", "pick release", "wave", "allocation"],
|
| 101 |
-
"receiving": ["receive", "receiving", "inbound", "asn", "appointment"],
|
| 102 |
-
"inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
|
| 103 |
"putaway": ["putaway", "staging", "put away", "location assignment"],
|
| 104 |
"shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
|
|
|
|
| 105 |
"replenishment": ["replenishment", "replenish"],
|
| 106 |
}
|
| 107 |
|
|
@@ -116,6 +122,11 @@ def _infer_intent_tag(section_title: str) -> str:
|
|
| 116 |
return "prereqs"
|
| 117 |
if any(k in st for k in ["purpose", "overview", "introduction"]):
|
| 118 |
return "purpose"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
return "neutral"
|
| 120 |
|
| 121 |
|
|
@@ -145,9 +156,14 @@ def _derive_module_tags(text: str, filename: str, section_title: str) -> List[st
|
|
| 145 |
for mod, syns in MODULE_VOCAB.items():
|
| 146 |
if any(s in tokens for s in syns):
|
| 147 |
found.append(mod)
|
|
|
|
| 148 |
if not found:
|
| 149 |
if "inventory" in tokens or "adjust" in tokens or "uom" in tokens or "cycle" in tokens:
|
| 150 |
found = ["inventory"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
return list(sorted(set(found)))
|
| 152 |
|
| 153 |
# ---------------------------- Ingestion ----------------------------
|
|
@@ -193,13 +209,13 @@ def ingest_documents(folder_path: str) -> None:
|
|
| 193 |
"chunk_index": c_idx,
|
| 194 |
"title": doc_title,
|
| 195 |
"collection": "SOP",
|
| 196 |
-
"intent_tag": final_intent,
|
| 197 |
-
"topic_tags": ", ".join(topic_tags) if topic_tags else "",
|
| 198 |
-
"module_tags": ", ".join(module_tags) if module_tags else "",
|
| 199 |
}
|
| 200 |
try:
|
| 201 |
collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
|
| 202 |
-
except Exception
|
| 203 |
try:
|
| 204 |
collection.delete(ids=[doc_id])
|
| 205 |
collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
|
|
@@ -319,11 +335,11 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
|
|
| 319 |
# ---------------------------- Semantic-only ----------------------------
|
| 320 |
def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
| 321 |
query_embedding = model.encode(query).tolist()
|
| 322 |
-
#
|
| 323 |
res = collection.query(
|
| 324 |
query_embeddings=[query_embedding],
|
| 325 |
n_results=top_k,
|
| 326 |
-
include=['documents', 'metadatas', 'distances']
|
| 327 |
)
|
| 328 |
documents = (res.get("documents", [[]]) or [[]])[0]
|
| 329 |
metadatas = (res.get("metadatas", [[]]) or [[]])[0]
|
|
@@ -348,7 +364,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
|
| 348 |
"ids": ids,
|
| 349 |
}
|
| 350 |
|
| 351 |
-
# ---------------------------- Hybrid search (intent + module + action) ----------------------------
|
| 352 |
ACTION_SYNONYMS = {
|
| 353 |
"create": ["create", "creation", "add", "new", "generate"],
|
| 354 |
"update": ["update", "modify", "change", "edit"],
|
|
@@ -367,7 +383,7 @@ def _detect_user_intent(query: str) -> str:
|
|
| 367 |
q = (query or "").lower()
|
| 368 |
if any(k in q for k in ERROR_INTENT_TERMS):
|
| 369 |
return "errors"
|
| 370 |
-
if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform"]):
|
| 371 |
return "steps"
|
| 372 |
if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
|
| 373 |
return "prereqs"
|
|
@@ -382,7 +398,10 @@ def _extract_actions(query: str) -> List[str]:
|
|
| 382 |
for act, syns in ACTION_SYNONYMS.items():
|
| 383 |
if any(s in q for s in syns):
|
| 384 |
found.append(act)
|
| 385 |
-
|
|
|
|
|
|
|
|
|
|
| 386 |
|
| 387 |
|
| 388 |
def _extract_modules_from_query(query: str) -> List[str]:
|
|
@@ -391,8 +410,13 @@ def _extract_modules_from_query(query: str) -> List[str]:
|
|
| 391 |
for mod, syns in MODULE_VOCAB.items():
|
| 392 |
if any(s in q for s in syns):
|
| 393 |
found.append(mod)
|
| 394 |
-
|
| 395 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
return list(sorted(set(found)))
|
| 397 |
|
| 398 |
|
|
@@ -410,6 +434,9 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
|
|
| 410 |
# Strongly prefer errors/escalation/permissions when the user intent is errors
|
| 411 |
if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues", "common issues"]) or ("permissions" in topic_list)):
|
| 412 |
return 0.95
|
|
|
|
|
|
|
|
|
|
| 413 |
return -0.2
|
| 414 |
|
| 415 |
|
|
@@ -420,8 +447,10 @@ def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
|
|
| 420 |
doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
|
| 421 |
overlap = len(set(user_modules) & set(doc_modules))
|
| 422 |
if overlap == 0:
|
| 423 |
-
|
| 424 |
-
|
|
|
|
|
|
|
| 425 |
|
| 426 |
|
| 427 |
def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
|
|
@@ -438,22 +467,29 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
|
|
| 438 |
return inter / max(1, len(qset))
|
| 439 |
|
| 440 |
|
| 441 |
-
def
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
return 0.0
|
| 444 |
-
|
|
|
|
|
|
|
| 445 |
score = 0.0
|
| 446 |
-
for
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
if syn in t:
|
| 455 |
-
score -= 0.8
|
| 456 |
-
return score
|
| 457 |
|
| 458 |
|
| 459 |
def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
|
|
@@ -492,12 +528,18 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 492 |
# Union of IDs from semantic and BM25
|
| 493 |
union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
|
| 494 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
gamma = 0.30 # meta overlap
|
| 496 |
-
delta = 0.
|
| 497 |
epsilon = 0.30 # action weight
|
| 498 |
-
zeta = 0.
|
|
|
|
|
|
|
| 499 |
|
| 500 |
-
combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
|
| 501 |
for cid in union_ids:
|
| 502 |
if cid in sem_ids:
|
| 503 |
pos = sem_ids.index(cid)
|
|
@@ -518,30 +560,65 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 518 |
m_overlap = _meta_overlap(meta, q_terms)
|
| 519 |
intent_boost = _intent_weight(meta, user_intent)
|
| 520 |
act_wt = _action_weight(text, actions)
|
| 521 |
-
mod_wt = _module_weight(meta,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
|
| 523 |
-
final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt + zeta * mod_wt
|
| 524 |
combined_records_ext.append(
|
| 525 |
-
(cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt)
|
| 526 |
)
|
| 527 |
|
| 528 |
from collections import defaultdict
|
| 529 |
-
doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]] = defaultdict(list)
|
| 530 |
for rec in combined_records_ext:
|
| 531 |
meta = rec[4] or {}
|
| 532 |
fn = meta.get("filename", "unknown")
|
| 533 |
doc_groups[fn].append(rec)
|
| 534 |
|
| 535 |
-
def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]) -> float:
|
| 536 |
total_score = sum(r[1] for r in recs)
|
| 537 |
total_overlap = sum(r[5] for r in recs)
|
| 538 |
total_intent = sum(max(0.0, r[6]) for r in recs)
|
| 539 |
total_action = sum(max(0.0, r[7]) for r in recs)
|
| 540 |
total_module = sum(r[8] for r in recs)
|
|
|
|
|
|
|
| 541 |
total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
|
| 542 |
esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
|
| 543 |
perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
|
| 544 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 545 |
|
| 546 |
best_doc, best_doc_prior = None, -1.0
|
| 547 |
for fn, recs in doc_groups.items():
|
|
@@ -550,7 +627,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 550 |
best_doc_prior, best_doc = p, fn
|
| 551 |
|
| 552 |
best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
|
| 553 |
-
other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
|
| 554 |
for fn, recs in doc_groups.items():
|
| 555 |
if fn == best_doc:
|
| 556 |
continue
|
|
|
|
|
|
|
| 1 |
|
| 2 |
# services/kb_creation.py
|
| 3 |
import os
|
|
|
|
| 95 |
ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
|
| 96 |
STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
|
| 97 |
|
| 98 |
+
# Expanded module vocabulary: split Receiving vs Appointments
|
| 99 |
MODULE_VOCAB = {
|
| 100 |
+
"receiving": [
|
| 101 |
+
"receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
|
| 102 |
+
"asn receiving", "unload", "check-in", "dock check-in"
|
| 103 |
+
],
|
| 104 |
+
"appointments": [
|
| 105 |
+
"appointment", "appointments", "schedule", "scheduling", "slot", "dock door", "appointment creation", "appointment details"
|
| 106 |
+
],
|
| 107 |
"picking": ["pick", "picking", "pick release", "wave", "allocation"],
|
|
|
|
|
|
|
| 108 |
"putaway": ["putaway", "staging", "put away", "location assignment"],
|
| 109 |
"shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
|
| 110 |
+
"inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
|
| 111 |
"replenishment": ["replenishment", "replenish"],
|
| 112 |
}
|
| 113 |
|
|
|
|
| 122 |
return "prereqs"
|
| 123 |
if any(k in st for k in ["purpose", "overview", "introduction"]):
|
| 124 |
return "purpose"
|
| 125 |
+
# Heading hints (e.g., "Inbound Receiving", "Appointment Creation")
|
| 126 |
+
if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
|
| 127 |
+
return "steps"
|
| 128 |
+
if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
|
| 129 |
+
return "steps"
|
| 130 |
return "neutral"
|
| 131 |
|
| 132 |
|
|
|
|
| 156 |
for mod, syns in MODULE_VOCAB.items():
|
| 157 |
if any(s in tokens for s in syns):
|
| 158 |
found.append(mod)
|
| 159 |
+
# defaulting rule if none found
|
| 160 |
if not found:
|
| 161 |
if "inventory" in tokens or "adjust" in tokens or "uom" in tokens or "cycle" in tokens:
|
| 162 |
found = ["inventory"]
|
| 163 |
+
elif "receive" in tokens or "inbound" in tokens or "goods receipt" in tokens or "grn" in tokens:
|
| 164 |
+
found = ["receiving"]
|
| 165 |
+
elif "appointment" in tokens or "schedule" in tokens or "dock" in tokens:
|
| 166 |
+
found = ["appointments"]
|
| 167 |
return list(sorted(set(found)))
|
| 168 |
|
| 169 |
# ---------------------------- Ingestion ----------------------------
|
|
|
|
| 209 |
"chunk_index": c_idx,
|
| 210 |
"title": doc_title,
|
| 211 |
"collection": "SOP",
|
| 212 |
+
"intent_tag": final_intent,
|
| 213 |
+
"topic_tags": ", ".join(topic_tags) if topic_tags else "",
|
| 214 |
+
"module_tags": ", ".join(module_tags) if module_tags else "",
|
| 215 |
}
|
| 216 |
try:
|
| 217 |
collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
|
| 218 |
+
except Exception:
|
| 219 |
try:
|
| 220 |
collection.delete(ids=[doc_id])
|
| 221 |
collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
|
|
|
|
| 335 |
# ---------------------------- Semantic-only ----------------------------
|
| 336 |
def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
| 337 |
query_embedding = model.encode(query).tolist()
|
| 338 |
+
# Request supported fields only, synthesize ids
|
| 339 |
res = collection.query(
|
| 340 |
query_embeddings=[query_embedding],
|
| 341 |
n_results=top_k,
|
| 342 |
+
include=['documents', 'metadatas', 'distances']
|
| 343 |
)
|
| 344 |
documents = (res.get("documents", [[]]) or [[]])[0]
|
| 345 |
metadatas = (res.get("metadatas", [[]]) or [[]])[0]
|
|
|
|
| 364 |
"ids": ids,
|
| 365 |
}
|
| 366 |
|
| 367 |
+
# ---------------------------- Hybrid search (intent + module + action + phrases) ----------------------------
|
| 368 |
ACTION_SYNONYMS = {
|
| 369 |
"create": ["create", "creation", "add", "new", "generate"],
|
| 370 |
"update": ["update", "modify", "change", "edit"],
|
|
|
|
| 383 |
q = (query or "").lower()
|
| 384 |
if any(k in q for k in ERROR_INTENT_TERMS):
|
| 385 |
return "errors"
|
| 386 |
+
if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform", "receiving"]):
|
| 387 |
return "steps"
|
| 388 |
if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
|
| 389 |
return "prereqs"
|
|
|
|
| 398 |
for act, syns in ACTION_SYNONYMS.items():
|
| 399 |
if any(s in q for s in syns):
|
| 400 |
found.append(act)
|
| 401 |
+
# receiving verbs hint
|
| 402 |
+
if any(w in q for w in ["receive", "receiving", "grn", "goods receipt"]):
|
| 403 |
+
found.append("navigate") # safe generic
|
| 404 |
+
return list(sorted(set(found))) or []
|
| 405 |
|
| 406 |
|
| 407 |
def _extract_modules_from_query(query: str) -> List[str]:
|
|
|
|
| 410 |
for mod, syns in MODULE_VOCAB.items():
|
| 411 |
if any(s in q for s in syns):
|
| 412 |
found.append(mod)
|
| 413 |
+
# Default if none found
|
| 414 |
+
if not found:
|
| 415 |
+
if "receive" in q or "receiving" in q or "grn" in q or "goods receipt" in q or "inbound" in q:
|
| 416 |
+
found = ["receiving"]
|
| 417 |
+
# Prefer 'receiving' over 'appointments' when both present (generic rule)
|
| 418 |
+
if "receiving" in found and "appointments" in found:
|
| 419 |
+
return ["receiving"]
|
| 420 |
return list(sorted(set(found)))
|
| 421 |
|
| 422 |
|
|
|
|
| 434 |
# Strongly prefer errors/escalation/permissions when the user intent is errors
|
| 435 |
if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues", "common issues"]) or ("permissions" in topic_list)):
|
| 436 |
return 0.95
|
| 437 |
+
# Prefer receiving headings for receiving queries
|
| 438 |
+
if user_intent == "steps" and any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
|
| 439 |
+
return 0.75
|
| 440 |
return -0.2
|
| 441 |
|
| 442 |
|
|
|
|
| 447 |
doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
|
| 448 |
overlap = len(set(user_modules) & set(doc_modules))
|
| 449 |
if overlap == 0:
|
| 450 |
+
# Stronger generic penalty for mismatched modules
|
| 451 |
+
return -0.8
|
| 452 |
+
# Slight boost per overlapping module
|
| 453 |
+
return 0.7 * overlap
|
| 454 |
|
| 455 |
|
| 456 |
def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
|
|
|
|
| 467 |
return inter / max(1, len(qset))
|
| 468 |
|
| 469 |
|
| 470 |
+
def _make_ngrams(tokens: List[str], n: int) -> List[str]:
|
| 471 |
+
return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
    """Score how many query bigrams/trigrams occur verbatim in ``text``.

    Generic phrase-level scoring; nothing here is tied to document names.

    Args:
        text: Chunk text to search in.
        q_terms: Query tokens, in query order.

    Returns:
        0.35 per distinct matching bigram plus 0.60 per distinct matching
        trigram, capped at 1.5. Returns 0.0 when either input is empty.
    """
    if not text or not q_terms:
        return 0.0

    # Collapse whitespace runs so a phrase split across a line break or
    # repeated spaces still matches the single-space-joined n-gram.
    haystack = " ".join(text.lower().split())
    # The text is lowercased, so the terms must be too or mixed-case query
    # tokens can never match. Empty tokens are dropped.
    terms = [t.lower() for t in q_terms if t]

    score = 0.0
    for size, weight in ((2, 0.35), (3, 0.60)):
        # dict.fromkeys de-duplicates while keeping query order, so a phrase
        # repeated in the query is only rewarded once.
        for phrase in dict.fromkeys(_make_ngrams(terms, size)):
            if phrase and phrase in haystack:
                score += weight

    # Cap to avoid over-weighting long queries with many matching phrases.
    return min(score, 1.5)
|
|
|
|
|
|
|
|
|
|
| 493 |
|
| 494 |
|
| 495 |
def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
|
|
|
|
| 528 |
# Union of IDs from semantic and BM25
|
| 529 |
union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
|
| 530 |
|
| 531 |
+
# Optional light gating: if we know user's primary module, keep union but strengthen penalties later.
|
| 532 |
+
primary_user_modules = user_modules if user_modules else []
|
| 533 |
+
|
| 534 |
+
# Weights
|
| 535 |
gamma = 0.30 # meta overlap
|
| 536 |
+
delta = 0.50 # intent boost (stronger for steps/errors now)
|
| 537 |
epsilon = 0.30 # action weight
|
| 538 |
+
zeta = 0.65 # module weight (stronger to avoid wrong SOP)
|
| 539 |
+
eta = 0.45 # phrase-level boost
|
| 540 |
+
theta = 0.40 # heading alignment bonus
|
| 541 |
|
| 542 |
+
combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
|
| 543 |
for cid in union_ids:
|
| 544 |
if cid in sem_ids:
|
| 545 |
pos = sem_ids.index(cid)
|
|
|
|
| 560 |
m_overlap = _meta_overlap(meta, q_terms)
|
| 561 |
intent_boost = _intent_weight(meta, user_intent)
|
| 562 |
act_wt = _action_weight(text, actions)
|
| 563 |
+
mod_wt = _module_weight(meta, primary_user_modules)
|
| 564 |
+
phrase_wt = _phrase_boost_score(text, q_terms)
|
| 565 |
+
|
| 566 |
+
# Heading alignment: bonus if section/title contains key query term roots
|
| 567 |
+
sec_low = ((meta or {}).get("section", "") or "").lower()
|
| 568 |
+
title_low = ((meta or {}).get("title", "") or "").lower()
|
| 569 |
+
heading_bonus = 0.0
|
| 570 |
+
if any(root in sec_low for root in ["receiving", "inbound receiving", "goods receipt", "grn"]) and any(w in norm_query for w in ["receive", "receiving", "inbound", "grn", "goods receipt"]):
|
| 571 |
+
heading_bonus += 0.40
|
| 572 |
+
if any(root in title_low for root in ["receiving", "inbound receiving", "goods receipt", "grn"]) and any(w in norm_query for w in ["receive", "receiving", "inbound", "grn", "goods receipt"]):
|
| 573 |
+
heading_bonus += 0.40
|
| 574 |
+
if any(root in sec_low for root in ["appointment", "appointments", "schedule"]) and "receiv" in norm_query:
|
| 575 |
+
# mild demotion for appointment sections when user asks receiving
|
| 576 |
+
heading_bonus -= 0.35
|
| 577 |
+
|
| 578 |
+
final_score = (
|
| 579 |
+
alpha * sem_sim
|
| 580 |
+
+ beta * bm25_sim
|
| 581 |
+
+ gamma * m_overlap
|
| 582 |
+
+ delta * intent_boost
|
| 583 |
+
+ epsilon * act_wt
|
| 584 |
+
+ zeta * mod_wt
|
| 585 |
+
+ eta * phrase_wt
|
| 586 |
+
+ theta * heading_bonus
|
| 587 |
+
)
|
| 588 |
|
|
|
|
| 589 |
combined_records_ext.append(
|
| 590 |
+
(cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus)
|
| 591 |
)
|
| 592 |
|
| 593 |
from collections import defaultdict
|
| 594 |
+
doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]] = defaultdict(list)
|
| 595 |
for rec in combined_records_ext:
|
| 596 |
meta = rec[4] or {}
|
| 597 |
fn = meta.get("filename", "unknown")
|
| 598 |
doc_groups[fn].append(rec)
|
| 599 |
|
| 600 |
+
def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]) -> float:
    """Combine the per-chunk signals of one document into a single prior.

    Each record is the tuple appended to ``combined_records_ext``:
    ``(id, final_score, distance, text, meta, meta_overlap, intent_boost,
    action_weight, module_weight, phrase_weight, heading_bonus)``.

    Args:
        recs: All scored chunk records that belong to the same document.

    Returns:
        Weighted sum of the summed chunk signals, plus a flat 0.3 bonus if
        any chunk sits in an "escalation" section and another 0.3 if any
        chunk's ``topic_tags`` contains "permissions".
    """
    # Metadata may be missing on a record; treat that as an empty dict.
    metas = [(r[4] or {}) for r in recs]

    total_score = sum(r[1] for r in recs)
    total_overlap = sum(r[5] for r in recs)
    # Intent and action weights can be negative; positive parts are rewarded
    # here and negative parts are collected separately as a penalty.
    total_intent = sum(max(0.0, r[6]) for r in recs)
    total_action = sum(max(0.0, r[7]) for r in recs)
    total_module = sum(r[8] for r in recs)
    total_phrase = sum(r[9] for r in recs)
    total_heading = sum(r[10] for r in recs)
    total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)

    # `or ""` guards against a "section" key that is present but None, which
    # would otherwise raise AttributeError on .lower().
    esc_weight = 0.3 if any("escalation" in (m.get("section", "") or "").lower() for m in metas) else 0.0
    perm_weight = 0.3 if any("permissions" in (m.get("topic_tags") or []) for m in metas) else 0.0

    return (
        total_score
        + 0.4 * total_overlap
        + 0.7 * total_intent
        + 0.5 * total_action
        + 0.8 * total_module  # stronger module prior
        + 0.6 * total_phrase  # phrase prior
        + 0.6 * total_heading  # heading prior
        + 0.3 * total_penalty  # total_penalty is <= 0, so this subtracts
        + esc_weight + perm_weight
    )
|
| 622 |
|
| 623 |
best_doc, best_doc_prior = None, -1.0
|
| 624 |
for fn, recs in doc_groups.items():
|
|
|
|
| 627 |
best_doc_prior, best_doc = p, fn
|
| 628 |
|
| 629 |
best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
|
| 630 |
+
other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
|
| 631 |
for fn, recs in doc_groups.items():
|
| 632 |
if fn == best_doc:
|
| 633 |
continue
|