|
|
"""
Semantic Validator Module

Uses NurseEmbed-300M to provide intelligent SNOMED/FHIR code suggestions.
"""
|
|
|
import json
|
|
|
import logging
|
|
|
from typing import List, Dict, Any
|
|
|
import numpy as np
|
|
|
from sentence_transformers import SentenceTransformer
|
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
|
|
# Lazily-populated engine state. All three are filled in by
# load_semantic_engine() and read by find_code_match().
_model = _kb_embeddings = None  # embedding model / matrix of knowledge-base embeddings
_knowledge_base = []            # entries parsed from knowledge_base.json

# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def load_semantic_engine():
    """Load the NurseEmbed model and knowledge base if not already loaded.

    Loads the ``NurseCitizenDeveloper/NurseEmbed-300M`` sentence-transformer,
    reads ``knowledge_base.json`` (resolved against the current working
    directory) and embeds one ``"<abbrev> <full>"`` string per entry. The
    model, the parsed entries and their embeddings are cached in the
    module-level ``_model``, ``_knowledge_base`` and ``_kb_embeddings``.

    The three globals are only assigned once every step has succeeded, so a
    failure part-way through never leaves the engine looking "loaded" with
    missing embeddings; the next call simply retries.

    Returns:
        bool: True if the engine is ready to use, False if loading failed
        (the error is logged, not raised).
    """
    global _model, _knowledge_base, _kb_embeddings

    # Ready only when both the model and the embeddings are present.
    if _model is not None and _kb_embeddings is not None:
        return True

    try:
        logger.info("Loading NurseEmbed-300M model...")
        model = SentenceTransformer("NurseCitizenDeveloper/NurseEmbed-300M")

        logger.info("Loading knowledge base...")
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open("knowledge_base.json", "r", encoding="utf-8") as f:
            knowledge_base = json.load(f)

        logger.info("Computing knowledge base embeddings...")
        # Embed abbreviation and expansion together so a query can match either.
        kb_texts = [f"{item['abbrev']} {item['full']}" for item in knowledge_base]
        kb_embeddings = model.encode(kb_texts)
    except Exception as e:
        # Best-effort loader: report failure to the caller instead of raising.
        logger.exception("Failed to load semantic engine: %s", e)
        return False

    # Commit all state together now that nothing else can fail.
    _model, _knowledge_base, _kb_embeddings = model, knowledge_base, kb_embeddings
    logger.info(f"Semantic engine loaded with {len(_knowledge_base)} terms.")
    return True
|
|
|
|
|
|
def find_code_match(
    text: str, threshold: float = 0.35, top_k: int = 5
) -> List[Dict[str, Any]]:
    """Find knowledge-base entries that are semantically similar to ``text``.

    The text is embedded with the loaded model and compared to every
    knowledge-base embedding by cosine similarity. Entries scoring strictly
    above ``threshold`` are returned, best first.

    Args:
        text: Free text to match. Empty or whitespace-only text yields ``[]``.
        threshold: Minimum cosine similarity (exclusive) for a match.
        top_k: Maximum number of matches to return; values <= 0 yield ``[]``.

    Returns:
        A list of at most ``top_k`` dicts with the keys ``term``,
        ``abbreviation``, ``category``, ``confidence``, ``system`` and
        ``code``, sorted by descending ``confidence``. Returns ``[]`` when the
        engine cannot be loaded, the knowledge base is empty, or matching
        fails (the error is logged, not raised).
    """
    if not text or not text.strip():
        return []
    if top_k <= 0:
        return []

    # Load lazily on first use; a half-initialised engine also counts as not loaded.
    if (_model is None or _kb_embeddings is None) and not load_semantic_engine():
        return []
    # Nothing to compare against: avoid an exception inside cosine_similarity.
    if _kb_embeddings is None or not _knowledge_base:
        return []

    try:
        text_embedding = _model.encode([text])
        similarities = cosine_similarity(text_embedding, _kb_embeddings)[0]

        # Rank indices first so result dicts are built only for the kept
        # entries. sorted() is stable, so ties keep knowledge-base order.
        ranked = sorted(
            (idx for idx, sim in enumerate(similarities) if sim > threshold),
            key=lambda idx: similarities[idx],
            reverse=True,
        )[:top_k]

        matches = []
        for idx in ranked:
            item = _knowledge_base[idx]
            matches.append({
                "term": item["full"],
                "abbreviation": item["abbrev"],
                "category": item["category"],
                "confidence": float(similarities[idx]),
                # NOTE(review): labels the entry "SNOMED-CT" unless its full
                # text contains "SNOMED" -- confirm this is the intended rule.
                "system": "SNOMED-CT" if "SNOMED" not in item.get("full", "") else "Other",
                # Placeholder derived from the entry's position in the
                # knowledge base; not a real terminology code.
                "code": f"UNKNOWN-{idx}",
            })
        return matches

    except Exception as e:
        # Best-effort API: callers get an empty result instead of an exception.
        logger.exception("Error in semantic matching: %s", e)
        return []
|
|
|
|