Spaces:

NurseCitizenDeveloper
/

Nursing-Language-Translator

Sleeping

File size: 10,992 Bytes

"""
🏥 Nursing Language Translator
Translates NHS clinical shorthand to formal language using NurseEmbed-300M
"""
import gradio as gr
import json
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the NurseEmbed sentence-embedding model once, at import time; the
# matching helpers below reuse this single module-level instance.
print("Loading NurseEmbed-300M...")
model = SentenceTransformer("NurseCitizenDeveloper/NurseEmbed-300M")
print("✅ Model loaded!")

# Load the knowledge base. Entries are read elsewhere in this file through the
# keys "abbrev", "full" and "category".
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale-dependent default encoding.
with open("knowledge_base.json", "r", encoding="utf-8") as f:
    KNOWLEDGE_BASE = json.load(f)

# Pre-compute embeddings for all abbreviations so each lookup only has to
# embed the query text. KB_EMBEDDINGS[i] corresponds to KNOWLEDGE_BASE[i].
print("Computing knowledge base embeddings...")
KB_TEXTS = [item["abbrev"] for item in KNOWLEDGE_BASE]
KB_EMBEDDINGS = model.encode(KB_TEXTS)
print(f"✅ {len(KB_TEXTS)} abbreviations indexed!")

# NEWS2 interpretation table: inclusive (low, high) score band ->
# (risk label, recommended action, indicator emoji).
NEWS2_THRESHOLDS = {
    (0, 0): ("Low risk", "Routine monitoring", "🟢"),
    (1, 4): ("Low-medium risk", "Increased monitoring frequency", "🟡"),
    (5, 6): ("Medium risk", "Urgent response - inform senior nurse/doctor", "🟠"),
    (7, 20): ("High risk", "Emergency response - immediate senior review, consider critical care", "🔴")
}


def interpret_news2(score):
    """Interpret a NEWS2 score and return the matching clinical action.

    Args:
        score: The score as an int, or anything ``int()`` can convert
            (for example the digit string captured by a regex).

    Returns:
        A Markdown string with the risk band and recommended action when the
        score falls inside one of the NEWS2_THRESHOLDS bands; an "Invalid
        score" warning string when it converts to an integer outside every
        band; ``None`` when the value cannot be converted to an integer.
    """
    # Only conversion failures mean "not a score". A bare ``except`` here
    # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
    try:
        score = int(score)
    except (TypeError, ValueError, OverflowError):
        return None

    for (low, high), (risk, action, emoji) in NEWS2_THRESHOLDS.items():
        if low <= score <= high:
            return f"{emoji} **NEWS2 {score}**: {risk}\n   → {action}"
    return f"⚠️ NEWS2 {score}: Invalid score (should be 0-20)"

def find_abbreviation_match(text, threshold=0.3):
    """Return up to five knowledge-base entries most similar to ``text``.

    The text is embedded with the module-level model and compared, by cosine
    similarity, against the pre-computed knowledge-base embeddings.

    Args:
        text: Query string; blank or whitespace-only input yields ``[]``.
        threshold: Entries must score strictly above this value to be kept.

    Returns:
        A list of dicts with keys "abbrev", "full", "category" and
        "similarity" (a float), ordered from most to least similar and
        truncated to five items.
    """
    if text.strip() == "":
        return []

    # Embed the query as a one-item batch and take the single result row.
    query_vector = model.encode([text])
    scores = cosine_similarity(query_vector, KB_EMBEDDINGS)[0]

    # Pair each qualifying score with the knowledge-base entry at its index.
    hits = (
        (KNOWLEDGE_BASE[position], score)
        for position, score in enumerate(scores)
        if score > threshold
    )
    candidates = [
        {
            "abbrev": entry["abbrev"],
            "full": entry["full"],
            "category": entry["category"],
            "similarity": float(score),
        }
        for entry, score in hits
    ]

    # Highest similarity first; keep only the top five.
    ranked = sorted(candidates, key=lambda hit: hit["similarity"], reverse=True)
    return ranked[:5]

def extract_demographics(text):
    """Extract the patient's age and gender from free-text clinical shorthand.

    Age is taken from the first pattern in the list below that matches
    (patterns are tried in order, so "72 y/o" wins over a later "85F").
    Gender is detected from whole words ("male", "woman", ...), from a
    standalone "M"/"F" letter, or from the compact age+sex form "72M"/"85F".
    Male indicators are checked before female ones.

    Args:
        text: The clinical note to scan.

    Returns:
        A Markdown fragment with an "**Age**" and/or "**Gender**" line (each
        newline-terminated), or ``None`` when neither could be found.
    """
    age_patterns = (
        r'(\d+)\s*[yY]/[oO]',  # 72 y/o
        r'(\d+)\s*[yY][oO]',   # 72yo
        r'(\d+)\s*[yY]ear',    # 72 year
        r'(\d+)\s*[mM]ale',    # 72 male
        r'(\d+)\s*[fF]emale',  # 72 female
        r'(\d+)\s*[MF]\b',     # 72M or 72F
    )

    age = None
    for pattern in age_patterns:
        match = re.search(pattern, text)
        if match:
            age = match.group(1)
            break

    # "\b[mM]\b" alone cannot match "72M": a digit and a letter are both word
    # characters, so there is no word boundary between them. The explicit
    # "\d+\s*M" / "\d+\s*F" alternatives cover that compact form (uppercase
    # only, mirroring the age pattern above). Gender words are matched
    # case-insensitively so "Lady" or "Man" at the start of a sentence count.
    male_pattern = r'(?i:\b(?:male|man|gentleman)\b)|\b[mM]\b|\b\d+\s*M\b'
    female_pattern = r'(?i:\b(?:female|woman|lady)\b)|\b[fF]\b|\b\d+\s*F\b'

    gender = None
    if re.search(male_pattern, text):
        gender = "Male"
    elif re.search(female_pattern, text):
        gender = "Female"

    lines = []
    if age:
        lines.append(f"**Age**: {age} years old\n")
    if gender:
        lines.append(f"**Gender**: {gender}\n")

    return "".join(lines) if lines else None

def _apply_translations(text, terms):
    """Return ``text`` with the first occurrence of each term made formal.

    All terms are substituted in a single left-to-right pass, so text inserted
    for one term is never re-scanned for another. A term only matches when it
    is not glued to neighbouring letters ("ST" does not hit "Started"), and
    longer terms are tried before shorter ones at the same position.
    """
    replacements = {}
    for term in terms:
        key = term["original"].lower()
        if key:
            replacements.setdefault(key, term["translation"])
    if not replacements:
        return text

    letter = r"[^\W\d_]"  # any letter: a word character that is not a digit or "_"

    def _bounded(fragment):
        # Guard only the edges that are letters; terms such as "?PE" or
        # "NOF #" start or end with punctuation and need no guard there.
        escaped = re.escape(fragment)
        if fragment[0].isalpha():
            escaped = f"(?<!{letter})" + escaped
        if fragment[-1].isalpha():
            escaped = escaped + f"(?!{letter})"
        return escaped

    ordered = sorted(replacements, key=len, reverse=True)
    pattern = re.compile("|".join(_bounded(key) for key in ordered), re.IGNORECASE)

    used = set()

    def _swap(match):
        key = match.group(0).lower()
        translation = replacements.get(key)
        # Leave later occurrences (and any case-folding oddities) untouched.
        if translation is None or key in used:
            return match.group(0)
        used.add(key)
        # Returned from a function, so backslashes in the translation are
        # inserted literally rather than parsed as a replacement template.
        return f"**{translation}**"

    return pattern.sub(_swap, text)


def translate_nursing_text(input_text):
    """Build a Markdown translation report for a clinical shorthand note.

    The report contains, in order: the original text, any demographics found
    by ``extract_demographics``, an interpretation of a NEWS2 score if one is
    mentioned, a table of recognised terms with their best knowledge-base
    match, and the note rewritten with those terms expanded.

    Args:
        input_text: The clinical note entered by the user.

    Returns:
        The report as a single Markdown string, or a short prompt when the
        input is blank.
    """
    if not input_text.strip():
        return "Please enter clinical text to translate."

    output = [
        "# 📋 Translation Report\n",
        f"**Original**: _{input_text}_\n",
        "---\n",
    ]

    # Extract demographics
    demographics = extract_demographics(input_text)
    if demographics:
        output.append("## 👤 Patient Demographics\n")
        output.append(demographics)
        output.append("")

    # Check for NEWS2 scores
    news_match = re.search(r'NEWS2?\s*(?:score\s*)?(?:is\s*|of\s*|=\s*)?(\d+)', input_text, re.IGNORECASE)
    if news_match:
        interpretation = interpret_news2(news_match.group(1))
        # interpret_news2 returns None for values it cannot convert; a None in
        # ``output`` would make the final join raise, so skip the section.
        if interpretation:
            output.append("## ⚠️ Early Warning Score\n")
            output.append(interpretation)
            output.append("")

    # Tokenize: runs of word characters and "/" (e.g. "c/o"), plus tokens
    # introduced by "?" or "#" (e.g. "?PE").
    words = re.findall(r'\b[\w/]+\b|[?#][\w]*', input_text)

    found_terms = []
    seen = set()

    def _record(candidate):
        # Keep the single best knowledge-base match for this candidate, if any.
        matches = find_abbreviation_match(candidate, threshold=0.4)
        if matches:
            best_match = matches[0]
            found_terms.append({
                "original": candidate,
                "translation": best_match["full"],
                "category": best_match["category"],
                "confidence": best_match["similarity"]
            })
        return bool(matches)

    for word in words:
        lowered = word.lower()
        if lowered in seen or len(word) < 2:
            continue
        seen.add(lowered)
        _record(word)

    # Also check phrases the tokenizer can split or miss ("U&E", "NOF #").
    phrases_to_check = [
        "c/o", "y/o", "O/E", "U&E", "?PE", "NOF #"
    ]
    lowered_input = input_text.lower()
    for phrase in phrases_to_check:
        lowered = phrase.lower()
        # Check ``seen`` first so already-handled phrases are not embedded again.
        if lowered not in lowered_input or lowered in seen:
            continue
        if _record(phrase):
            seen.add(lowered)

    # Sort by confidence
    found_terms.sort(key=lambda x: x["confidence"], reverse=True)

    if found_terms:
        output.append("## 📖 Clinical Terms Identified\n")
        output.append("| Term | Translation | Category | Confidence |")
        output.append("|------|-------------|----------|------------|")
        for term in found_terms:
            conf_bar = "🟢" if term["confidence"] > 0.7 else ("🟡" if term["confidence"] > 0.5 else "🟠")
            output.append(f"| `{term['original']}` | {term['translation']} | {term['category']} | {conf_bar} {term['confidence']:.0%} |")
        output.append("")

    # Generate formal translation
    output.append("## ✅ Formal Translation\n")
    formal_text = _apply_translations(input_text, found_terms)
    output.append(f"> {formal_text}\n")

    return "\n".join(output)


def get_abbreviation_list():
    """Return the knowledge base as a Markdown reference grouped by category.

    Categories are listed alphabetically under "##" headings; within a
    category, entries keep their knowledge-base order.

    Returns:
        A Markdown string with one bullet per abbreviation, formatted as
        "`abbrev` → full form".
    """
    categories = {}
    for item in KNOWLEDGE_BASE:
        categories.setdefault(item["category"], []).append(
            f"`{item['abbrev']}` → {item['full']}"
        )

    output = ["# 📚 NHS Abbreviation Reference\n"]
    for cat in sorted(categories):
        output.append(f"## {cat}\n")
        # Emit a bullet list: Markdown treats a bare newline as a soft break,
        # so plain newline-joined entries would render as one run-on paragraph.
        output.append("\n".join(f"- {entry}" for entry in categories[cat]))
        output.append("")

    return "\n".join(output)


# Build the Gradio interface: a page header, three tabs (Translate, Reference,
# About) and a footer with project links. The resulting Blocks object is bound
# to ``app`` and launched from the ``__main__`` guard at the bottom.
with gr.Blocks(
    title="🏥 Nursing Language Translator",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="cyan")
) as app:
    # Page header shown above the tabs.
    gr.Markdown("""
    # 🏥 Nursing Language Translator
    
    **Powered by NurseEmbed-300M** — A clinical embedding model trained on NHS nursing terminology.
    
    Translates clinical shorthand, abbreviations, and NEWS2 scores into formal language.
    """)
    
    with gr.Tabs():
        # Tab 1: free-text input on the left, rendered report on the right.
        with gr.Tab("🔄 Translate"):
            with gr.Row():
                with gr.Column(scale=1):
                    input_text = gr.Textbox(
                        label="Clinical Shorthand Input",
                        placeholder="e.g., 72M, c/o SOB, NEWS2=7, PMH: COPD, ?PE, started LMWH",
                        lines=4
                    )
                    translate_btn = gr.Button("🔄 Translate", variant="primary")
                    
                    # Sample notes offered as examples for the input box.
                    gr.Examples(
                        examples=[
                            ["72M c/o SOB, NEWS2 score is 7, PMH: COPD, AF. Started on Salbutamol NEB and LMWH."],
                            ["Pt admitted via A&E with ?PE. CXR NAD. ABG shows type 1 resp failure. For CT PA."],
                            ["85F NOF # post-op day 2. Increasing confusion, Temp 38.2. ?UTI vs ?SSI. Sent MSU."],
                            ["54M NSTEMI. ECG: ST depression V3-V6. Troponin elevated. For ECHO and cardiology review."],
                            ["NEWS2 9 - patient deteriorating. RR 28, O2 sats 88% on 4L, HR 120, BP 90/60."]
                        ],
                        inputs=input_text,
                        label="Example Clinical Notes"
                    )
                
                with gr.Column(scale=1):
                    output_text = gr.Markdown(label="Translation")
            
            # Clicking the button runs translate_nursing_text on the textbox
            # content and renders the returned Markdown in the output pane.
            translate_btn.click(
                fn=translate_nursing_text,
                inputs=input_text,
                outputs=output_text
            )
        
        # Tab 2: static reference. get_abbreviation_list() is called once,
        # while the UI is being built, not on each visit to the tab.
        with gr.Tab("📚 Reference"):
            gr.Markdown(get_abbreviation_list())
        
        # Tab 3: static project information and disclaimer.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
            ## About This Tool
            
            The **Nursing Language Translator** uses **NurseEmbed-300M**, a clinical embedding model 
            fine-tuned on NHS nursing terminology.
            
            ### How It Works
            1. **Semantic Matching**: Uses vector embeddings to match abbreviations to their meanings
            2. **NEWS2 Interpretation**: Automatically interprets Early Warning Scores
            3. **Context-Aware**: Understands clinical context, not just string matching
            
            ### Model Details
            - **Base Model**: EmbeddingGemma-300M
            - **Training Data**: 10,000 medical Q&A pairs + 200 NHS nursing abbreviations
            - **Accuracy**: 81.3% Accuracy@1 on medical retrieval
            
            ### Author
            Created by **Lincoln Gombedza** ([@NurseCitizenDeveloper](https://huggingface.co/NurseCitizenDeveloper))
            
            Part of the **Nursing Citizen Development** movement and **OpenEnv Challenge** submission.
            
            ---
            
            **Disclaimer**: This tool is for educational and assistive purposes only. 
            Always verify clinical information and follow local trust policies.
            """)
    
    # Footer with links to the model card and source repository.
    gr.Markdown("""
    ---
    <center>
    🩺 Built with ❤️ for NHS Nurses | 
    <a href="https://huggingface.co/NurseCitizenDeveloper/NurseEmbed-300M">Model</a> | 
    <a href="https://github.com/Clinical-Quality-Artifical-Intelligence/nursing-language-translator">GitHub</a>
    </center>
    """)

# Start the web server only when this file is run as a script. The server is
# asked to listen on address 0.0.0.0, port 7860.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)