Spaces:
Sleeping
Sleeping
Update services/kb_creation.py
Browse files- services/kb_creation.py +29 -8
services/kb_creation.py
CHANGED
|
@@ -227,14 +227,12 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
|
|
| 227 |
q_terms = _tokenize(norm)
|
| 228 |
if not q_terms:
|
| 229 |
return []
|
| 230 |
-
|
| 231 |
candidates = set()
|
| 232 |
for t in q_terms:
|
| 233 |
for idx in bm25_inverted.get(t, []):
|
| 234 |
candidates.add(idx)
|
| 235 |
if not candidates:
|
| 236 |
candidates = set(range(len(bm25_docs)))
|
| 237 |
-
|
| 238 |
scored = []
|
| 239 |
for idx in candidates:
|
| 240 |
s = _bm25_score_for_doc(q_terms, idx)
|
|
@@ -285,7 +283,7 @@ ACTION_SYNONYMS = {
|
|
| 285 |
"update": ["update", "modify", "change", "edit"],
|
| 286 |
"delete": ["delete", "remove"],
|
| 287 |
"navigate": ["navigate", "go to", "open"],
|
| 288 |
-
|
| 289 |
}
|
| 290 |
|
| 291 |
def _detect_user_intent(query: str) -> str:
|
|
@@ -338,7 +336,7 @@ def _action_weight(text: str, actions: List[str]) -> float:
|
|
| 338 |
for syn in ACTION_SYNONYMS.get(act, [act]):
|
| 339 |
if syn in t:
|
| 340 |
score += 1.0
|
| 341 |
-
conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []
|
| 342 |
for act in actions:
|
| 343 |
for bad in conflicts.get(act, []):
|
| 344 |
for syn in ACTION_SYNONYMS.get(bad, [bad]):
|
|
@@ -381,9 +379,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 381 |
|
| 382 |
union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
|
| 383 |
|
| 384 |
-
gamma = 0.25
|
| 385 |
-
delta = 0.35
|
| 386 |
-
epsilon = 0.30
|
| 387 |
|
| 388 |
combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
|
| 389 |
for cid in union_ids:
|
|
@@ -463,7 +461,30 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
|
|
| 463 |
"actions": actions,
|
| 464 |
}
|
| 465 |
|
| 466 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
def get_kb_runtime_info() -> Dict[str, Any]:
|
| 468 |
return {
|
| 469 |
"chroma_path": CHROMA_PATH,
|
|
|
|
| 227 |
q_terms = _tokenize(norm)
|
| 228 |
if not q_terms:
|
| 229 |
return []
|
|
|
|
| 230 |
candidates = set()
|
| 231 |
for t in q_terms:
|
| 232 |
for idx in bm25_inverted.get(t, []):
|
| 233 |
candidates.add(idx)
|
| 234 |
if not candidates:
|
| 235 |
candidates = set(range(len(bm25_docs)))
|
|
|
|
| 236 |
scored = []
|
| 237 |
for idx in candidates:
|
| 238 |
s = _bm25_score_for_doc(q_terms, idx)
|
|
|
|
| 283 |
"update": ["update", "modify", "change", "edit"],
|
| 284 |
"delete": ["delete", "remove"],
|
| 285 |
"navigate": ["navigate", "go to", "open"],
|
| 286 |
+
# NOTE: 'perform' was removed from the action synonyms: synonyms are matched as substrings, so it wrongly boosted text such as Appointment "performed...".
|
| 287 |
}
|
| 288 |
|
| 289 |
def _detect_user_intent(query: str) -> str:
|
|
|
|
| 336 |
for syn in ACTION_SYNONYMS.get(act, [act]):
|
| 337 |
if syn in t:
|
| 338 |
score += 1.0
|
| 339 |
+
conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
|
| 340 |
for act in actions:
|
| 341 |
for bad in conflicts.get(act, []):
|
| 342 |
for syn in ACTION_SYNONYMS.get(bad, [bad]):
|
|
|
|
| 379 |
|
| 380 |
union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
|
| 381 |
|
| 382 |
+
gamma = 0.25 # meta overlap
|
| 383 |
+
delta = 0.35 # intent boost
|
| 384 |
+
epsilon = 0.30 # action weight
|
| 385 |
|
| 386 |
combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
|
| 387 |
for cid in union_ids:
|
|
|
|
| 461 |
"actions": actions,
|
| 462 |
}
|
| 463 |
|
| 464 |
+
# --------------------------- Section fetch helpers (for full output) ---------------------------
|
| 465 |
+
def get_section_text(filename: str, section: str) -> str:
    """Concatenate all chunk texts for a given filename+section.

    Walks ``bm25_docs`` in order and keeps the stripped ``text`` of every
    entry whose ``meta`` carries a matching ``filename`` and ``section``.

    Args:
        filename: Value compared against ``meta["filename"]``.
        section: Value compared against ``meta["section"]``.

    Returns:
        The matching non-empty chunk texts joined by a blank line, or ``""``
        when no chunk matches.
    """
    texts: List[str] = []
    for d in bm25_docs:
        # ``meta`` can be absent or explicitly None; handle both the same way
        # the ``text`` lookup below already does, instead of crashing on None.
        m = d.get("meta") or {}
        if m.get("filename") != filename or m.get("section") != section:
            continue
        t = (d.get("text") or "").strip()
        if t:
            texts.append(t)
    # Every piece is already stripped and non-empty, so no outer strip is needed.
    return "\n\n".join(texts)
|
| 475 |
+
|
| 476 |
+
def get_best_steps_section_text(filename: str) -> str:
    """Return combined text of all 'steps' sections in the given SOP (filename).

    Walks ``bm25_docs`` in order and keeps the stripped ``text`` of every
    entry whose ``meta`` has a matching ``filename`` and an ``intent_tag``
    equal to ``"steps"``.

    Args:
        filename: Value compared against ``meta["filename"]``.

    Returns:
        The matching non-empty chunk texts joined by a blank line, or ``""``
        when no chunk matches.
    """
    texts: List[str] = []
    for d in bm25_docs:
        # ``meta`` can be absent or explicitly None; handle both the same way
        # the ``text`` lookup below already does, instead of crashing on None.
        m = d.get("meta") or {}
        if m.get("filename") != filename or m.get("intent_tag") != "steps":
            continue
        t = (d.get("text") or "").strip()
        if t:
            texts.append(t)
    # Every piece is already stripped and non-empty, so no outer strip is needed.
    return "\n\n".join(texts)
|
| 486 |
+
|
| 487 |
+
# --- Admin helpers (optional; unchanged) ---
|
| 488 |
def get_kb_runtime_info() -> Dict[str, Any]:
|
| 489 |
return {
|
| 490 |
"chroma_path": CHROMA_PATH,
|