|
|
""" |
|
|
UAE Knowledge System - Backend Services |
|
|
Handles knowledge base and retriever initialization |
|
|
""" |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent)) |
|
|
|
|
|
from ir.retriever import EntityRetriever, RetrievalOutput |
|
|
from ir.knowledge_base import KnowledgeBase |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Lazily-initialised module singletons: each stays None until its accessor
# (get_knowledge_base / get_retriever) creates and stores the instance.
_knowledge_base = None
_retriever = None

# Parent of the directory containing this file; the dense index cache is
# stored beneath it at ir/cache/dense_index.
PROJECT_ROOT = Path(__file__).parent.parent
INDEX_CACHE_PATH = PROJECT_ROOT.joinpath("ir", "cache", "dense_index")
|
|
|
|
|
|
|
|
def get_knowledge_base() -> KnowledgeBase:
    """Return the shared KnowledgeBase, constructing it on the first call."""
    global _knowledge_base
    if _knowledge_base is not None:
        return _knowledge_base

    # First access: announce the load and keep the instance for later calls.
    print("Loading knowledge base...")
    _knowledge_base = KnowledgeBase(debug=False)
    return _knowledge_base
|
|
|
|
|
|
|
|
def get_retriever():
    """Return the shared dense retriever, creating and indexing it on first use.

    The first call constructs a ``DenseRetriever`` and then either loads the
    index cached at ``INDEX_CACHE_PATH`` or builds it from the knowledge base
    and saves it to that path. Later calls return the stored instance.
    """
    global _retriever
    if _retriever is None:
        # Function-scope import: the dense retriever module is only pulled in
        # when a retriever actually has to be created.
        from ir.retrievers.dense import DenseRetriever

        print("Loading dense retriever...")
        dense = DenseRetriever(model_name="bge-m3", debug=False)
        knowledge_base = get_knowledge_base()
        cache_location = str(INDEX_CACHE_PATH)

        if not INDEX_CACHE_PATH.exists():
            # No cache on disk yet: build, make sure the cache directory
            # exists, then persist the index.
            print("Building dense index (this may take a while)...")
            dense.build_index_from_knowledge_base(knowledge_base)
            INDEX_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
            dense.save_index(cache_location)
            print("Index built and cached!")
        else:
            print(f"Loading cached index from {INDEX_CACHE_PATH}...")
            if dense.load_index(cache_location):
                print("Cached index loaded!")
            else:
                # The cache path exists but could not be loaded: rebuild and
                # overwrite it.
                print("Cache load failed, building index...")
                dense.build_index_from_knowledge_base(knowledge_base)
                dense.save_index(cache_location)

        # Only publish the retriever once its index is ready.
        _retriever = dense

    return _retriever
|
|
|
|
|
|
|
|
def search_knowledge_base(query: str, top_k: int = 5) -> list:
    """Search the knowledge base and return formatted results.

    Args:
        query: Text passed to the retriever's ``search`` method.
        top_k: Number of results requested from the retriever.

    Returns:
        A list of dicts, one per retriever hit, each with the keys
        ``entity_id``, ``entity_name``, ``score``, ``chunk_type``,
        ``subcategory``, ``emirate``, ``is_royal``, ``summary`` and
        ``must_answer``. When the knowledge base returns raw data for the
        hit's entity, the last five fields are filled from it (at most five
        ``must_answer`` entries) and ``full_entity`` holds the raw record;
        otherwise they keep their empty defaults.
    """
    retriever = get_retriever()
    kb = get_knowledge_base()

    formatted_results = []
    for metadata, score in retriever.search(query, top_k=top_k):
        entity_id = metadata.get("entity_id", "")
        entity_name = metadata.get("entity_name", "Unknown")

        # Hits without an entity id cannot be looked up in the knowledge base.
        raw_data = kb.get_raw_entity(entity_id) if entity_id else None

        result = {
            "entity_id": entity_id,
            "entity_name": entity_name,
            "score": score,
            "chunk_type": metadata.get("chunk_type", ""),
            "subcategory": "",
            "emirate": "",
            "is_royal": False,
            "summary": "",
            "must_answer": [],
        }

        if raw_data:
            # dict.get() only uses its default when the key is absent; a key
            # stored with a None value would make the lookups and the slice
            # below raise, so fall back to an empty container explicitly.
            facts_data = raw_data.get("facts") or {}
            metadata_kb = raw_data.get("metadata") or {}

            result["subcategory"] = raw_data.get("subcategory", "")
            result["emirate"] = metadata_kb.get("emirate", "")
            result["is_royal"] = metadata_kb.get("is_royal", False)
            result["summary"] = facts_data.get("summary_paragraph", "")

            # Keep at most five facts. Dict entries contribute their "fact"
            # value (or the dict itself when that key is missing); anything
            # else is stringified.
            must_answer = facts_data.get("must_answer") or []
            result["must_answer"] = [
                fact.get("fact", fact) if isinstance(fact, dict) else str(fact)
                for fact in must_answer[:5]
            ]

            # NOTE(review): the source's indentation was lost; this assumes
            # full_entity is attached only when entity data was found -
            # confirm against the original file.
            result["full_entity"] = raw_data

        formatted_results.append(result)

    return formatted_results
|
|
|
|
|
|
|
|
def get_stats() -> dict:
    """Return knowledge base statistics.

    Returns:
        A dict with ``entities`` (the length of the knowledge base's
        ``entities``), ``categories`` and ``version``. If loading the
        knowledge base or counting its entities raises, ``entities`` is 0 and
        an additional ``error`` key carries the exception message; the
        failure is reported in the payload rather than raised.
    """
    # Both outcomes start from the same base so the payload always has the
    # same core keys, including "version" on the error path.
    stats = {"entities": 0, "categories": 8, "version": "2.3.0"}
    try:
        stats["entities"] = len(get_knowledge_base().entities)
    except Exception as e:
        # Deliberate best-effort: report the failure instead of propagating.
        stats["error"] = str(e)
    return stats