Spaces:

NurseCitizenDeveloper
/

Open-Nursing-Validator

Sleeping

App Files Files Community

Open-Nursing-Validator / scripts /evaluate_model.py

NurseCitizenDeveloper

Deploy Open Nursing Validator (Docker)

6d12932 verified 5 days ago

raw

history blame contribute delete

5.01 kB

	"""
	Evaluation Script for Fine-Tuned Nursing LLM
	Uses Azure OpenAI (GPT-4o) as an "Expert Judge" to score model responses.
	"""
	import os
	from dotenv import load_dotenv
	from langchain_openai import AzureChatOpenAI
	from langchain_core.messages import HumanMessage
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from peft import PeftModel
	import torch

	# Load variables from a local .env file into the process environment so the
	# os.getenv() lookups in get_azure_judge() can see the Azure OpenAI settings.
	load_dotenv()

	# Configuration
	# Hugging Face repo id that load_model_from_hf() pulls the tokenizer and model from.
	HF_MODEL_ID = "NurseCitizenDeveloper/nursing-llama-3-8b-fons"
	# NOTE(review): not referenced anywhere in the visible code - presumably the
	# base checkpoint the fine-tune started from; confirm it is still needed.
	BASE_MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"

	def load_model_from_hf():
	    """Fetch the tokenizer and causal LM published under ``HF_MODEL_ID``.

	    Both objects are loaded with the ``transformers`` Auto classes straight
	    from the ``HF_MODEL_ID`` repo; the weights are requested in float16 and
	    placed with ``device_map="auto"``.

	    NOTE(review): ``BASE_MODEL_ID`` and ``PeftModel`` are not used here, so
	    if the repo only holds a LoRA adapter this relies on ``transformers``
	    resolving the base model by itself - confirm.

	    Returns:
	        tuple: ``(model, tokenizer)``.
	    """
	    print(f"🔄 Loading model from Hugging Face: {HF_MODEL_ID}...")

	    # Tokenizer first, then the weights.
	    tok = AutoTokenizer.from_pretrained(HF_MODEL_ID)

	    load_options = {
	        "torch_dtype": torch.float16,
	        "device_map": "auto",
	        # Allows custom modelling code shipped in the repo to be executed.
	        "trust_remote_code": True,
	    }
	    lm = AutoModelForCausalLM.from_pretrained(HF_MODEL_ID, **load_options)

	    print("✅ Model loaded successfully!")
	    return lm, tok

	def get_azure_judge():
	    """Build the Azure OpenAI chat client that acts as the expert judge.

	    Each constructor argument is read from an environment variable; a
	    variable that is not set is passed through as ``None``.

	    Returns:
	        AzureChatOpenAI: client configured from the ``AZURE_OPENAI_*``
	        environment variables.
	    """
	    # Constructor keyword -> environment variable that supplies its value.
	    env_by_kwarg = {
	        "azure_deployment": "AZURE_OPENAI_DEPLOYMENT",
	        "openai_api_version": "AZURE_OPENAI_API_VERSION",
	        "azure_endpoint": "AZURE_OPENAI_ENDPOINT",
	        "api_key": "AZURE_OPENAI_API_KEY",
	    }
	    settings = {
	        kwarg: os.getenv(env_name) for kwarg, env_name in env_by_kwarg.items()
	    }
	    return AzureChatOpenAI(**settings)

	def evaluate_response(llm, instruction, input_text, model_output):
	    """Ask the judge model to score one generated answer.

	    The instruction, its context and the model's answer are embedded in a
	    fixed rubric prompt (clinical accuracy, person-centred language, FONS
	    alignment, each on a 1-10 scale) and sent as a single human message.

	    Args:
	        llm: chat model exposing ``invoke``; the reply must have ``.content``.
	        instruction: task text that was given to the model under test.
	        input_text: context that accompanied the instruction.
	        model_output: answer produced by the model under test.

	    Returns:
	        The ``content`` of the judge's reply (free-text scores and rationale).
	    """
	    judge_prompt = f"""You are an expert nursing educator evaluating an AI assistant trained on FONS (Foundation of Nursing Studies) principles.

	Evaluate the following response on a scale of 1-10 for each criterion:
	1. Clinical Accuracy (1-10): Is the information clinically correct?
	2. Person-Centred Language (1-10): Does it use respectful, dignified language?
	3. FONS Alignment (1-10): Does it reflect FONS principles (relational care, practice development)?

	Instruction Given: {instruction}
	Context: {input_text}
	Model Response: {model_output}

	Provide scores and a brief rationale for each, then an overall recommendation."""

	    # The whole rubric travels as one human turn; no system message is used.
	    message = HumanMessage(content=judge_prompt)
	    verdict = llm.invoke([message])
	    return verdict.content

	def run_evaluation():
	    """Generate an answer for every built-in test case and have the judge review it.

	    Loads the model and tokenizer with ``load_model_from_hf()`` and the judge
	    with ``get_azure_judge()``. For each test case it formats an Alpaca-style
	    prompt, samples up to 200 new tokens (temperature 0.7), prints a preview
	    of at most 300 characters of the answer, then prints the judge's
	    free-text evaluation.

	    Returns:
	        None. All results are written to stdout.
	    """
	    # Test cases covering key nursing domains
	    test_cases = [
	        {
	            "instruction": "Summarize the key nursing interventions for a patient with delirium.",
	            "input": "Patient is an 85-year-old male with acute confusion, fluctuating consciousness, and visual hallucinations. History of dementia."
	        },
	        {
	            "instruction": "What are the FONS principles for person-centred care?",
	            "input": "A nurse is documenting care for a patient with dementia."
	        },
	        {
	            "instruction": "Explain why skin tone documentation is important in pressure ulcer risk assessment.",
	            "input": "Using the Braden Scale for a patient with darker skin."
	        },
	        {
	            "instruction": "How should a nurse communicate bad news to a family member?",
	            "input": "The patient's condition has deteriorated significantly overnight."
	        },
	    ]

	    # The trailing placeholder is filled with "" so the model writes the response.
	    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

	### Instruction:
	{}

	### Input:
	{}

	### Response:
	{}"""

	    # Load model and judge
	    model, tokenizer = load_model_from_hf()
	    llm = get_azure_judge()

	    print("\n" + "="*60)
	    print("🏁 Relational Ai for Nursing Evaluation")
	    print("="*60)

	    for i, case in enumerate(test_cases, 1):
	        print(f"\n--- Test Case {i}/{len(test_cases)} ---")
	        print(f"📝 Instruction: {case['instruction']}")

	        # Generate response
	        prompt = alpaca_prompt.format(case["instruction"], case["input"], "")
	        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
	        prompt_len = inputs["input_ids"].shape[-1]

	        with torch.no_grad():
	            outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)

	        # generate() returns the prompt tokens followed by the continuation.
	        # Decode only the continuation instead of splitting the full text on
	        # "### Response:", which would drop the real answer whenever the
	        # model emits that marker again.
	        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

	        # Only show an ellipsis when the preview is actually truncated.
	        preview = response[:300] + ("..." if len(response) > 300 else "")
	        print(f"🤖 Model Response: {preview}")

	        # Evaluate with Azure Judge (the judge sees the full response, not the preview)
	        evaluation = evaluate_response(llm, case["instruction"], case["input"], response)
	        print(f"\n⚖️ Expert Evaluation:\n{evaluation}")
	        print("-" * 50)

	    print("\n" + "="*60)
	    print("✅ Evaluation Complete!")
	    print("="*60)

	# Run the evaluation only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	run_evaluation()