# Open-Nursing-Validator/scripts/evaluate_model.py
"""
Evaluation Script for Fine-Tuned Nursing LLM
Uses Azure OpenAI (GPT-4o) as an "Expert Judge" to score model responses.
"""
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import HumanMessage
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
load_dotenv()
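
# Required environment variables (read via os.getenv() in get_azure_judge below);
# they can live in the environment itself or in a local .env file:
#   AZURE_OPENAI_DEPLOYMENT   - name of the GPT-4o deployment
#   AZURE_OPENAI_API_VERSION  - Azure OpenAI API version string
#   AZURE_OPENAI_ENDPOINT     - https://<resource>.openai.azure.com/
#   AZURE_OPENAI_API_KEY      - access key for the resource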
# Configuration
HF_MODEL_ID = "NurseCitizenDeveloper/nursing-llama-3-8b-fons"
BASE_MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"

def load_model_from_hf():
    """Load the fine-tuned LoRA adapter from Hugging Face."""
    print(f"πŸ”„ Loading model from Hugging Face: {HF_MODEL_ID}...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID)

    # Load the 4-bit base model, then attach the LoRA adapter on top.
    # (The repo holds an adapter, not a standalone model, so it is applied
    # via PeftModel rather than loaded directly with AutoModelForCausalLM.)
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    model = PeftModel.from_pretrained(base_model, HF_MODEL_ID)

    print("βœ… Model loaded successfully!")
    return model, tokenizer
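
# Note (optional, not part of the original flow): for repeated evaluation runs
# the adapter can be merged into the base weights with PEFT's
# model.merge_and_unload(), which removes the adapter indirection at inference
# time. Merging into a 4-bit quantized base may require loading the base in
# fp16 first, so the model is left unmerged here.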

def get_azure_judge():
    """Initialize Azure OpenAI as the expert judge."""
    return AzureChatOpenAI(
        azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    )
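
# Optional fail-fast guard (a sketch, not part of the original script): verify
# the Azure settings exist before the slow model download starts. Call it at
# the top of run_evaluation() if desired.
def check_azure_config():
    """Raise early if any required Azure OpenAI environment variable is unset."""
    required = [
        "AZURE_OPENAI_DEPLOYMENT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_KEY",
    ]
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        raise RuntimeError(f"Missing Azure OpenAI settings: {', '.join(missing)}")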

def evaluate_response(llm, instruction, input_text, model_output):
    """Use Azure GPT-4o to evaluate the model's response."""
    prompt = f"""You are an expert nursing educator evaluating an AI assistant trained on FONS (Foundation of Nursing Studies) principles.

Evaluate the following response on a scale of 1-10 for each criterion:

1. **Clinical Accuracy** (1-10): Is the information clinically correct?
2. **Person-Centred Language** (1-10): Does it use respectful, dignified language?
3. **FONS Alignment** (1-10): Does it reflect FONS principles (relational care, practice development)?

**Instruction Given**: {instruction}
**Context**: {input_text}
**Model Response**: {model_output}

Provide scores and a brief rationale for each, then an overall recommendation."""

    response = llm.invoke([HumanMessage(content=prompt)])
    return response.content
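
# The judge replies in free text, so numeric aggregation needs parsing. A
# minimal regex sketch (illustrative, not part of the original script); it
# assumes the judge writes each score as, e.g., "Clinical Accuracy: 8".
def parse_scores(evaluation_text):
    """Extract per-criterion 1-10 scores from the judge's reply, where present."""
    import re  # local import keeps this optional helper self-contained

    pattern = r"(Clinical Accuracy|Person-Centred Language|FONS Alignment)\D{0,5}(10|\d)"
    return {name: int(score) for name, score in re.findall(pattern, evaluation_text)}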

def run_evaluation():
    """Main evaluation loop."""
    # Test cases covering key nursing domains
    test_cases = [
        {
            "instruction": "Summarize the key nursing interventions for a patient with delirium.",
            "input": "Patient is an 85-year-old male with acute confusion, fluctuating consciousness, and visual hallucinations. History of dementia.",
        },
        {
            "instruction": "What are the FONS principles for person-centred care?",
            "input": "A nurse is documenting care for a patient with dementia.",
        },
        {
            "instruction": "Explain why skin tone documentation is important in pressure ulcer risk assessment.",
            "input": "Using the Braden Scale for a patient with darker skin.",
        },
        {
            "instruction": "How should a nurse communicate bad news to a family member?",
            "input": "The patient's condition has deteriorated significantly overnight.",
        },
    ]
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
    # Load model and judge
    model, tokenizer = load_model_from_hf()
    llm = get_azure_judge()

    print("\n" + "=" * 60)
    print("🏁 Relational AI for Nursing Evaluation")
    print("=" * 60)

    # Scores are reported per test case in the judge's free-text reply rather
    # than aggregated numerically.
    for i, case in enumerate(test_cases, 1):
        print(f"\n--- Test Case {i}/{len(test_cases)} ---")
        print(f"πŸ“ Instruction: {case['instruction']}")

        # Generate response (leave the Response section empty for the model to fill)
        prompt = alpaca_prompt.format(case["instruction"], case["input"], "")
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The decoded text includes the prompt; keep only what follows "### Response:"
        if "### Response:" in response:
            response = response.split("### Response:")[-1].strip()

        print(f"πŸ€– Model Response: {response[:300]}...")

        # Evaluate with the Azure judge
        evaluation = evaluate_response(llm, case["instruction"], case["input"], response)
        print(f"\nβš–οΈ Expert Evaluation:\n{evaluation}")
        print("-" * 50)
print("\n" + "="*60)
print("βœ… Evaluation Complete!")
print("="*60)
if __name__ == "__main__":
run_evaluation()
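
# Usage (from the repo root, with the environment variables populated as
# described near the top of this file):
#   python scripts/evaluate_model.py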