""" Semantic Validator Module Uses NurseEmbed-300M to provide intelligent SNOMED/FHIR code suggestions. """ import json import logging from typing import List, Dict, Any import numpy as np from sentence_transformers import SentenceTransformer from sklearn.metrics.pairwise import cosine_similarity # Configure logger logger = logging.getLogger(__name__) # Singleton instances _model = None _knowledge_base = [] _kb_embeddings = None def load_semantic_engine(): """Load NurseEmbed model and knowledge base if not already loaded.""" global _model, _knowledge_base, _kb_embeddings if _model is not None: return True try: logger.info("Loading NurseEmbed-300M model...") # Use local or huggingface model _model = SentenceTransformer("NurseCitizenDeveloper/NurseEmbed-300M") logger.info("Loading knowledge base...") with open("knowledge_base.json", "r") as f: _knowledge_base = json.load(f) logger.info("Computing knowledge base embeddings...") kb_texts = [item["abbrev"] for item in _knowledge_base] # Also include descriptions for better matching? # For now, matching abbreviations to input text. # Actually, let's embed BOTH abbrev and full term for robust matching kb_texts_robust = [f"{item['abbrev']} {item['full']}" for item in _knowledge_base] _kb_embeddings = _model.encode(kb_texts_robust) logger.info(f"Semantic engine loaded with {len(_knowledge_base)} terms.") return True except Exception as e: logger.error(f"Failed to load semantic engine: {e}") return False def find_code_match(text: str, threshold: float = 0.35) -> List[Dict[str, Any]]: """ Find matching clinical codes for the given text. """ if not text or not text.strip(): return [] # Ensure loaded if _model is None: if not load_semantic_engine(): return [] try: # Encode input text_embedding = _model.encode([text]) # Compute cosine similarity similarities = cosine_similarity(text_embedding, _kb_embeddings)[0] matches = [] for idx, sim in enumerate(similarities): if sim > threshold: item = _knowledge_base[idx] matches.append({ "term": item["full"], "abbreviation": item["abbrev"], "category": item["category"], "confidence": float(sim), # Since KB doesn't have codes yet, we synthesize/lookup or just suggest the term # User requested "Suggested Codes", but KB is just text mappings. # We will output the TERM which is mapped to a code in the validator logic "system": "SNOMED-CT" if "SNOMED" not in item.get("full", "") else "Other", "code": f"UNKNOWN-{idx}" # Placeholder as KB.json doesn't have codes }) # Sort by confidence matches.sort(key=lambda x: x["confidence"], reverse=True) return matches[:5] except Exception as e: logger.error(f"Error in semantic matching: {e}") return []