Spaces:
Sleeping
Sleeping
Update services/kb_creation.py
Browse files - services/kb_creation.py +8 -5
services/kb_creation.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
|
| 2 |
# services/kb_creation.py
|
| 3 |
import os
|
|
@@ -318,17 +319,19 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
|
|
| 318 |
# ---------------------------- Semantic-only ----------------------------
|
| 319 |
def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
| 320 |
query_embedding = model.encode(query).tolist()
|
|
|
|
| 321 |
res = collection.query(
|
| 322 |
query_embeddings=[query_embedding],
|
| 323 |
n_results=top_k,
|
| 324 |
-
include=['documents', 'metadatas', 'distances', 'ids']
|
| 325 |
)
|
| 326 |
documents = (res.get("documents", [[]]) or [[]])[0]
|
| 327 |
metadatas = (res.get("metadatas", [[]]) or [[]])[0]
|
| 328 |
distances = (res.get("distances", [[]]) or [[]])[0]
|
| 329 |
-
ids = (res.get("ids", [[]]) or [[]])[0]
|
| 330 |
|
| 331 |
-
|
|
|
|
|
|
|
| 332 |
synthesized = []
|
| 333 |
for i, m in enumerate(metadatas):
|
| 334 |
fn = (m or {}).get("filename", "unknown")
|
|
@@ -337,7 +340,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
|
| 337 |
synthesized.append(f"{fn}:{sec}:{idx}")
|
| 338 |
ids = synthesized
|
| 339 |
|
| 340 |
-
print(f"[KB] search → {len(documents)} docs (top_k={top_k}); first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
|
| 341 |
return {
|
| 342 |
"documents": documents,
|
| 343 |
"metadatas": metadatas,
|
|
@@ -653,4 +656,4 @@ def reset_kb(folder_path: str) -> Dict[str, Any]:
|
|
| 653 |
result["info"] = get_kb_runtime_info()
|
| 654 |
return result
|
| 655 |
except Exception as e:
|
| 656 |
-
return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}
|
|
|
|
| 1 |
+
#updated
|
| 2 |
|
| 3 |
# services/kb_creation.py
|
| 4 |
import os
|
|
|
|
| 319 |
# ---------------------------- Semantic-only ----------------------------
|
| 320 |
def search_knowledge_base(query: str, top_k: int = 10) -> dict:
|
| 321 |
query_embedding = model.encode(query).tolist()
|
| 322 |
+
# Some Chroma client versions do not support "ids" in include.
|
| 323 |
res = collection.query(
|
| 324 |
query_embeddings=[query_embedding],
|
| 325 |
n_results=top_k,
|
| 326 |
+
include=['documents', 'metadatas', 'distances'] # no 'ids' here
|
| 327 |
)
|
| 328 |
documents = (res.get("documents", [[]]) or [[]])[0]
|
| 329 |
metadatas = (res.get("metadatas", [[]]) or [[]])[0]
|
| 330 |
distances = (res.get("distances", [[]]) or [[]])[0]
|
|
|
|
| 331 |
|
| 332 |
+
# Synthesize IDs from metadata (filename:section:chunk_index)
|
| 333 |
+
ids: List[str] = []
|
| 334 |
+
if documents:
|
| 335 |
synthesized = []
|
| 336 |
for i, m in enumerate(metadatas):
|
| 337 |
fn = (m or {}).get("filename", "unknown")
|
|
|
|
| 340 |
synthesized.append(f"{fn}:{sec}:{idx}")
|
| 341 |
ids = synthesized
|
| 342 |
|
| 343 |
+
print(f"[KB] search → {len(documents)} docs (top_k={top_k}); first distance: {distances[0] if distances else 'n/a'}; ids synthesized={len(ids)}")
|
| 344 |
return {
|
| 345 |
"documents": documents,
|
| 346 |
"metadatas": metadatas,
|
|
|
|
| 656 |
result["info"] = get_kb_runtime_info()
|
| 657 |
return result
|
| 658 |
except Exception as e:
|
| 659 |
+
return {"status": "ERROR", "error": f"{e}", "info": get_kb_runtime_info()}
|