# Open-Nursing-Validator / core / semantic_validator.py
# NurseCitizenDeveloper's picture
# Deploy Open Nursing Validator (Docker)
# 6d12932 verified
"""
Semantic Validator Module
Uses NurseEmbed-300M to provide intelligent SNOMED/FHIR code suggestions.
"""
import json
import logging
from typing import List, Dict, Any
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Module-scoped logger, named after this module.
logger = logging.getLogger(__name__)

# Lazily populated, module-wide state (filled in by load_semantic_engine):
#   _model          -- the loaded embedding model; None until loaded
#   _kb_embeddings  -- embeddings of the knowledge-base entries; None until computed
#   _knowledge_base -- entries parsed from knowledge_base.json; empty until loaded
_model, _kb_embeddings = None, None
_knowledge_base = []
def load_semantic_engine():
    """Load the NurseEmbed model and knowledge base if not already loaded.

    The model is created with ``SentenceTransformer`` and the knowledge base
    is read from ``knowledge_base.json`` (a relative path, so it is resolved
    against the current working directory). Every knowledge-base entry is
    embedded as the text ``"<abbrev> <full>"`` so that a query can match
    either the abbreviation or the full term.

    The engine only counts as loaded once BOTH the model and the
    knowledge-base embeddings exist. A model that loaded successfully is kept
    across a failed attempt, so a retry only repeats the knowledge-base steps.
    The knowledge base and its embeddings are published together, so they can
    never get out of sync with each other.

    Returns:
        bool: True when the engine is ready for matching, False when any
        loading step failed (the failure is logged with its traceback).
    """
    global _model, _knowledge_base, _kb_embeddings

    # Readiness requires the embeddings too; checking the model alone would
    # report success after a failed knowledge-base load.
    if _model is not None and _kb_embeddings is not None:
        return True

    try:
        if _model is None:
            logger.info("Loading NurseEmbed-300M model...")
            # The identifier is resolved by SentenceTransformer.
            _model = SentenceTransformer("NurseCitizenDeveloper/NurseEmbed-300M")

        logger.info("Loading knowledge base...")
        # Explicit encoding: do not depend on the process locale for JSON.
        with open("knowledge_base.json", "r", encoding="utf-8") as f:
            knowledge_base = json.load(f)

        logger.info("Computing knowledge base embeddings...")
        # Embed abbreviation and full term together for more robust matching.
        kb_texts = [f"{item['abbrev']} {item['full']}" for item in knowledge_base]
        kb_embeddings = _model.encode(kb_texts)
    except Exception as e:
        # Boundary handler: callers rely on a boolean result, not an exception.
        logger.exception("Failed to load semantic engine: %s", e)
        return False

    # Commit both pieces of knowledge-base state only after every step worked.
    _knowledge_base = knowledge_base
    _kb_embeddings = kb_embeddings
    logger.info("Semantic engine loaded with %d terms.", len(_knowledge_base))
    return True
def find_code_match(
    text: str,
    threshold: float = 0.35,
    top_k: int = 5,
) -> List[Dict[str, Any]]:
    """Find knowledge-base terms that semantically match the given text.

    The text is embedded with the loaded model and compared, by cosine
    similarity, against the precomputed knowledge-base embeddings. Entries
    whose similarity is strictly greater than ``threshold`` are ranked from
    most to least similar and the best ``top_k`` are returned.

    Args:
        text: Free text to match. Empty or whitespace-only text yields ``[]``.
        threshold: Minimum (exclusive) cosine similarity for a match.
        top_k: Maximum number of matches to return (default 5). A value of
            zero or less yields ``[]``.

    Returns:
        A list of dicts, best match first, each with the keys ``term``,
        ``abbreviation``, ``category``, ``confidence`` (similarity as a
        float), ``system`` and ``code``. ``code`` is a placeholder of the
        form ``UNKNOWN-<index>`` because the knowledge base carries no real
        codes. An empty list is returned when the engine cannot be loaded,
        the knowledge base is empty, or matching fails (failures are logged).
    """
    if not text or not text.strip():
        return []
    if top_k <= 0:
        return []

    # Ensure loaded. The embeddings are checked as well as the model, because
    # a partially loaded engine cannot be used for matching.
    if _model is None or _kb_embeddings is None:
        if not load_semantic_engine():
            return []
    # Re-check after loading: nothing to compare against means no matches.
    if _model is None or _kb_embeddings is None or not _knowledge_base:
        return []

    try:
        # Encode the input as a one-row batch and score it against every entry.
        query_embedding = _model.encode([text])
        scores = cosine_similarity(query_embedding, _kb_embeddings)[0]

        # Rank the (index, score) pairs first so that result dicts are only
        # built for the entries that are actually returned. The sort is
        # stable, so ties keep knowledge-base order.
        ranked = [(idx, score) for idx, score in enumerate(scores) if score > threshold]
        ranked.sort(key=lambda pair: float(pair[1]), reverse=True)

        matches = []
        for idx, score in ranked[:top_k]:
            item = _knowledge_base[idx]
            matches.append({
                "term": item["full"],
                "abbreviation": item["abbrev"],
                "category": item["category"],
                "confidence": float(score),
                # Labelled SNOMED-CT unless the full term itself mentions
                # "SNOMED", in which case it is reported as "Other".
                "system": "SNOMED-CT" if "SNOMED" not in item.get("full", "") else "Other",
                # Placeholder: the knowledge base has no codes yet.
                "code": f"UNKNOWN-{idx}",
            })
        return matches
    except Exception as e:
        # Boundary handler: matching is best-effort and must not raise.
        logger.exception("Error in semantic matching: %s", e)
        return []