File size: 5,008 Bytes
6d12932
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""

Evaluation Script for Fine-Tuned Nursing LLM

Uses Azure OpenAI (GPT-4o) as an "Expert Judge" to score model responses.

"""
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import HumanMessage
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

load_dotenv()

# Configuration
HF_MODEL_ID = "NurseCitizenDeveloper/nursing-llama-3-8b-fons"
BASE_MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"

def load_model_from_hf():
    """Fetch the fine-tuned nursing model and its tokenizer from the Hugging Face Hub.

    Returns:
        tuple: (model, tokenizer) — the causal LM placed via ``device_map="auto"``
        in fp16, and its matching tokenizer.

    NOTE(review): despite the file importing ``PeftModel``, this loads
    ``HF_MODEL_ID`` directly with ``AutoModelForCausalLM`` — presumably the
    repo contains merged weights rather than a bare LoRA adapter; confirm.
    """
    print(f"πŸ”„ Loading model from Hugging Face: {HF_MODEL_ID}...")

    # Tokenizer first — it is small and fails fast on a bad repo id.
    tok = AutoTokenizer.from_pretrained(HF_MODEL_ID)

    # fp16 + automatic device placement keeps the 8B model within a single GPU.
    lm = AutoModelForCausalLM.from_pretrained(
        HF_MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )

    print("βœ… Model loaded successfully!")
    return lm, tok

def get_azure_judge():
    """Build the Azure OpenAI chat client used as the expert judge.

    All connection settings come from environment variables (loaded from
    ``.env`` at import time via ``load_dotenv()``).
    """
    deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT")
    api_version = os.getenv("AZURE_OPENAI_API_VERSION")
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_API_KEY")
    return AzureChatOpenAI(
        azure_deployment=deployment,
        openai_api_version=api_version,
        azure_endpoint=endpoint,
        api_key=key,
    )

def evaluate_response(llm, instruction, input_text, model_output):
    """Score one model answer with the Azure GPT-4o judge.

    Args:
        llm: A chat model exposing ``invoke`` (the Azure judge).
        instruction: The task given to the nursing model.
        input_text: The clinical context supplied with the task.
        model_output: The nursing model's generated answer.

    Returns:
        str: The judge's free-text scores and rationale.
    """
    # The rubric below is a runtime string and must stay exactly as authored.
    prompt = f"""You are an expert nursing educator evaluating an AI assistant trained on FONS (Foundation of Nursing Studies) principles.



Evaluate the following response on a scale of 1-10 for each criterion:

1. **Clinical Accuracy** (1-10): Is the information clinically correct?

2. **Person-Centred Language** (1-10): Does it use respectful, dignified language?

3. **FONS Alignment** (1-10): Does it reflect FONS principles (relational care, practice development)?



**Instruction Given**: {instruction}

**Context**: {input_text}

**Model Response**: {model_output}



Provide scores and a brief rationale for each, then an overall recommendation."""

    judge_reply = llm.invoke([HumanMessage(content=prompt)])
    return judge_reply.content

def run_evaluation():
    """Main evaluation loop.

    Loads the fine-tuned model from the Hub, generates an answer for each
    test case, and prints the Azure GPT-4o judge's scores. Results are
    printed only — nothing is returned or persisted.
    """
    # Test cases covering key nursing domains
    test_cases = [
        {
            "instruction": "Summarize the key nursing interventions for a patient with delirium.",
            "input": "Patient is an 85-year-old male with acute confusion, fluctuating consciousness, and visual hallucinations. History of dementia."
        },
        {
            "instruction": "What are the FONS principles for person-centred care?",
            "input": "A nurse is documenting care for a patient with dementia."
        },
        {
            "instruction": "Explain why skin tone documentation is important in pressure ulcer risk assessment.",
            "input": "Using the Braden Scale for a patient with darker skin."
        },
        {
            "instruction": "How should a nurse communicate bad news to a family member?",
            "input": "The patient's condition has deteriorated significantly overnight."
        },
    ]

    # Alpaca-style prompt template the model was fine-tuned on; the blank
    # lines are part of the trained format and must not be changed.
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.



### Instruction:

{}



### Input:

{}



### Response:

{}"""

    # Load model and judge
    model, tokenizer = load_model_from_hf()
    llm = get_azure_judge()

    print("\n" + "="*60)
    print("🏁 Relational Ai for Nursing Evaluation")
    print("="*60)

    response_marker = "### Response:"

    for i, case in enumerate(test_cases, 1):
        print(f"\n--- Test Case {i}/{len(test_cases)} ---")
        print(f"πŸ“ Instruction: {case['instruction']}")

        # Generate response (empty third slot: the model fills in the answer).
        prompt = alpaca_prompt.format(case["instruction"], case["input"], "")
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Inference only — no gradients needed.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

        # The decoded text echoes the whole prompt; keep only what follows
        # the final response marker when it is present.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if response_marker in response:
            response = response.split(response_marker)[-1].strip()

        print(f"πŸ€– Model Response: {response[:300]}...")

        # Evaluate with Azure Judge
        evaluation = evaluate_response(llm, case["instruction"], case["input"], response)
        print(f"\nβš–οΈ Expert Evaluation:\n{evaluation}")
        print("-" * 50)

    # NOTE: the original version kept a `total_scores` dict here that was
    # never updated or reported; it was dead code and has been removed.
    # Aggregating numeric scores would require parsing the judge's free text.
    print("\n" + "="*60)
    print("βœ… Evaluation Complete!")
    print("="*60)

# Script entry point: run the full evaluation when executed directly.
if __name__ == "__main__":
    run_evaluation()