File size: 3,406 Bytes
6d12932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""

Semantic Validator Module

Uses NurseEmbed-300M to provide intelligent SNOMED/FHIR code suggestions.

"""
import json
import logging
from typing import List, Dict, Any
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Configure logger
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# Singleton instances
# Lazily populated by load_semantic_engine(); None/empty until first load.
_model = None           # SentenceTransformer instance (NurseEmbed-300M)
_knowledge_base = []    # list of dicts loaded from knowledge_base.json
_kb_embeddings = None   # numpy array of embeddings, row-aligned with _knowledge_base

def load_semantic_engine():
    """Load the NurseEmbed model and knowledge base if not already loaded.

    Populates the module-level singletons ``_model``, ``_knowledge_base``
    and ``_kb_embeddings``. Idempotent: once the engine is loaded,
    subsequent calls return immediately.

    Returns:
        bool: True if the engine is ready, False if loading failed.
    """
    global _model, _knowledge_base, _kb_embeddings

    if _model is not None:
        return True

    try:
        logger.info("Loading NurseEmbed-300M model...")
        # Resolved from the local HF cache or downloaded from Hugging Face.
        _model = SentenceTransformer("NurseCitizenDeveloper/NurseEmbed-300M")

        logger.info("Loading knowledge base...")
        # NOTE(review): path is relative to the process CWD — confirm callers
        # run from the expected directory.
        with open("knowledge_base.json", "r", encoding="utf-8") as f:
            _knowledge_base = json.load(f)

        logger.info("Computing knowledge base embeddings...")
        # Embed "<abbrev> <full term>" for each entry so a query can match
        # on either the abbreviation or the expanded term.
        kb_texts = [f"{item['abbrev']} {item['full']}" for item in _knowledge_base]
        _kb_embeddings = _model.encode(kb_texts)

        # Lazy %-style args: formatting is skipped if INFO is disabled.
        logger.info("Semantic engine loaded with %d terms.", len(_knowledge_base))
        return True
    except Exception:
        # Broad catch is deliberate: loading is best-effort and callers fall
        # back to "no matches". logger.exception keeps the traceback that
        # logger.error(f"...{e}") was dropping.
        logger.exception("Failed to load semantic engine")
        # Reset any partially-initialized state so the "_model is not None"
        # fast path can't report a half-loaded engine as ready, and a later
        # call retries cleanly.
        _model = None
        _kb_embeddings = None
        return False

def find_code_match(text: str, threshold: float = 0.35) -> List[Dict[str, Any]]:
    """Find matching clinical codes for the given text.

    Args:
        text: Free-text clinical phrase to match against the knowledge base.
        threshold: Minimum cosine similarity for a candidate to be kept.

    Returns:
        Up to five match dicts, sorted by descending confidence. Empty list
        for blank input or when the semantic engine is unavailable.
    """
    if not text or not text.strip():
        return []

    # Lazily load the engine on first use; bail out if loading fails.
    if _model is None and not load_semantic_engine():
        return []
    # Guard against a partially-initialized engine (model present but
    # embeddings missing) — cosine_similarity would crash on None below.
    if _kb_embeddings is None:
        return []

    try:
        # Encode the query and score it against every knowledge-base entry.
        text_embedding = _model.encode([text])
        similarities = cosine_similarity(text_embedding, _kb_embeddings)[0]

        matches = []
        for idx, sim in enumerate(similarities):
            if sim > threshold:
                item = _knowledge_base[idx]
                matches.append({
                    "term": item["full"],
                    "abbreviation": item["abbrev"],
                    "category": item["category"],
                    "confidence": float(sim),  # numpy scalar -> JSON-safe float
                    # The KB has no real codes yet; the validator logic maps
                    # the suggested term to a code downstream.
                    "system": "SNOMED-CT" if "SNOMED" not in item.get("full", "") else "Other",
                    "code": f"UNKNOWN-{idx}",  # placeholder: KB lacks codes
                })

        # Highest-confidence suggestions first, capped at five.
        matches.sort(key=lambda x: x["confidence"], reverse=True)
        return matches[:5]

    except Exception:
        # Best-effort matching: log the full traceback (logger.error(f"{e}")
        # was discarding it) and degrade gracefully.
        logger.exception("Error in semantic matching")
        return []