# NOTE(review): the three lines below were extraction artifacts (file size,
# commit hash, and a line-number index) that are not valid Python; they have
# been converted to comments so the module can be imported.
# File size: 5,008 Bytes | commit 6d12932
"""
Evaluation Script for Fine-Tuned Nursing LLM
Uses Azure OpenAI (GPT-4o) as an "Expert Judge" to score model responses.
"""
import os
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import HumanMessage
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
load_dotenv()
# Configuration
HF_MODEL_ID = "NurseCitizenDeveloper/nursing-llama-3-8b-fons"
BASE_MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"
def load_model_from_hf():
    """Load the fine-tuned model (LoRA adapter) from Hugging Face.

    Returns:
        tuple: ``(model, tokenizer)`` ready for generation. The model is
        loaded in float16 with ``device_map="auto"`` so weights are placed
        automatically across available devices.

    Note:
        ``HF_MODEL_ID`` is first loaded directly; if the repo only hosts a
        LoRA adapter (no full checkpoint), we fall back to loading
        ``BASE_MODEL_ID`` and attaching the adapter with ``PeftModel``.
    """
    print(f"🚀 Loading model from Hugging Face: {HF_MODEL_ID}...")
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID)
    try:
        # Preferred path: the repo hosts a full (merged) causal-LM checkpoint.
        model = AutoModelForCausalLM.from_pretrained(
            HF_MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
    except (OSError, ValueError):
        # Fallback: the repo only hosts a LoRA adapter — load the base model
        # and attach the adapter on top (this is why PeftModel is imported).
        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        model = PeftModel.from_pretrained(base_model, HF_MODEL_ID)
    print("✅ Model loaded successfully!")
    return model, tokenizer
def get_azure_judge():
    """Initialize Azure OpenAI (GPT-4o) as the expert judge.

    Reads connection settings from environment variables and fails fast with
    a clear error if any are missing — otherwise ``AzureChatOpenAI`` would
    receive ``None`` values and fail later with a far less helpful message.

    Returns:
        AzureChatOpenAI: configured LangChain chat client.

    Raises:
        EnvironmentError: if a required environment variable is unset/empty.
    """
    required = (
        "AZURE_OPENAI_DEPLOYMENT",
        "AZURE_OPENAI_API_VERSION",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_KEY",
    )
    missing = [name for name in required if not os.getenv(name)]
    if missing:
        raise EnvironmentError(
            f"Missing required environment variables: {', '.join(missing)}"
        )
    return AzureChatOpenAI(
        azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
        openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    )
def evaluate_response(llm, instruction, input_text, model_output):
    """Score one model response with the Azure GPT-4o expert judge.

    Args:
        llm: LangChain chat model used as the judge.
        instruction: The task instruction that was given to the model.
        input_text: The clinical context paired with the instruction.
        model_output: The fine-tuned model's generated answer.

    Returns:
        str: The judge's free-text scores and rationale.
    """
    rubric = f"""You are an expert nursing educator evaluating an AI assistant trained on FONS (Foundation of Nursing Studies) principles.
Evaluate the following response on a scale of 1-10 for each criterion:
1. **Clinical Accuracy** (1-10): Is the information clinically correct?
2. **Person-Centred Language** (1-10): Does it use respectful, dignified language?
3. **FONS Alignment** (1-10): Does it reflect FONS principles (relational care, practice development)?
**Instruction Given**: {instruction}
**Context**: {input_text}
**Model Response**: {model_output}
Provide scores and a brief rationale for each, then an overall recommendation."""
    messages = [HumanMessage(content=rubric)]
    return llm.invoke(messages).content
def _generate_response(model, tokenizer, prompt):
    """Generate one completion for a fully-formatted Alpaca prompt.

    Returns only the text after the "### Response:" marker (the model's
    answer), or the full decoded output if the marker is absent.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs, max_new_tokens=200, do_sample=True, temperature=0.7
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if "### Response:" in response:
        response = response.split("### Response:")[-1].strip()
    return response


def run_evaluation():
    """Main evaluation loop.

    Generates a response from the fine-tuned model for each test case and
    has the Azure GPT-4o judge score it; results are printed to stdout.
    """
    # Test cases covering key nursing domains
    test_cases = [
        {
            "instruction": "Summarize the key nursing interventions for a patient with delirium.",
            "input": "Patient is an 85-year-old male with acute confusion, fluctuating consciousness, and visual hallucinations. History of dementia."
        },
        {
            "instruction": "What are the FONS principles for person-centred care?",
            "input": "A nurse is documenting care for a patient with dementia."
        },
        {
            "instruction": "Explain why skin tone documentation is important in pressure ulcer risk assessment.",
            "input": "Using the Braden Scale for a patient with darker skin."
        },
        {
            "instruction": "How should a nurse communicate bad news to a family member?",
            "input": "The patient's condition has deteriorated significantly overnight."
        },
    ]
    # Alpaca-style prompt template the model was fine-tuned on; the empty
    # third slot is where the model writes its response.
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
    # Load model and judge
    model, tokenizer = load_model_from_hf()
    llm = get_azure_judge()
    print("\n" + "=" * 60)
    print("🏥 Relational Ai for Nursing Evaluation")
    print("=" * 60)
    # NOTE: the original initialized a `total_scores` dict here but never
    # updated or reported it; that dead code has been removed.
    for i, case in enumerate(test_cases, 1):
        print(f"\n--- Test Case {i}/{len(test_cases)} ---")
        print(f"📋 Instruction: {case['instruction']}")
        # Generate response
        prompt = alpaca_prompt.format(case["instruction"], case["input"], "")
        response = _generate_response(model, tokenizer, prompt)
        print(f"🤖 Model Response: {response[:300]}...")
        # Evaluate with Azure Judge
        evaluation = evaluate_response(llm, case["instruction"], case["input"], response)
        print(f"\n⚖️ Expert Evaluation:\n{evaluation}")
        print("-" * 50)
    print("\n" + "=" * 60)
    print("✅ Evaluation Complete!")
    print("=" * 60)
# Script entry point: run the full evaluation suite when executed directly.
if __name__ == "__main__":
    run_evaluation()
|