{
    "cells": [
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# 🧪 Relational AI for Nursing Evaluation Notebook\n",
                "\n",
                "This notebook evaluates the fine-tuned nursing model using Azure GPT-4o as an \"Expert Judge\".\n",
                "\n",
                "**Model:** `NurseCitizenDeveloper/nursing-llama-3-8b-fons`"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 1. Install Dependencies (Run this first, then restart runtime)\n",
                "# `%pip` (rather than `!pip`) installs into the environment of the running kernel,\n",
                "# so the upgraded packages are the ones this notebook imports after the restart.\n",
                "%pip install -U bitsandbytes transformers accelerate langchain-openai -q\n",
                "print(\"✅ Installed! Now go to Runtime → Restart runtime, then run Cell 2\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 2. Load Model from Hugging Face\n",
                "import torch\n",
                "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
                "\n",
                "# Hub repository id of the fine-tuned nursing model under evaluation.\n",
                "HF_MODEL = \"NurseCitizenDeveloper/nursing-llama-3-8b-fons\"\n",
                "print(f\"🔄 Loading model: {HF_MODEL}\")\n",
                "\n",
                "# float16 weights; device placement is delegated to `device_map=\"auto\"`.\n",
                "model_load_options = {\"device_map\": \"auto\", \"torch_dtype\": torch.float16}\n",
                "\n",
                "tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)\n",
                "model = AutoModelForCausalLM.from_pretrained(HF_MODEL, **model_load_options)\n",
                "print(\"✅ Model loaded successfully!\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 3. Setup Azure OpenAI Judge\n",
                "import os\n",
                "from getpass import getpass\n",
                "\n",
                "from langchain_openai import AzureChatOpenAI\n",
                "from langchain_core.messages import HumanMessage\n",
                "\n",
                "# Non-secret connection settings. `setdefault` keeps any value that is already\n",
                "# present in the environment, so another resource can be targeted without\n",
                "# editing this cell.\n",
                "os.environ.setdefault(\"AZURE_OPENAI_ENDPOINT\", \"https://nursing-brain-uk-685.openai.azure.com/\")\n",
                "os.environ.setdefault(\"AZURE_OPENAI_DEPLOYMENT\", \"gpt-4o\")\n",
                "os.environ.setdefault(\"AZURE_OPENAI_API_VERSION\", \"2024-08-01-preview\")\n",
                "\n",
                "# The key is never written into the notebook: an AZURE_OPENAI_API_KEY that is\n",
                "# already set is reused; otherwise (unset, empty, or still the old placeholder)\n",
                "# it is requested interactively without echoing the input.\n",
                "KEY_PLACEHOLDER = \"YOUR_AZURE_KEY\"\n",
                "if os.environ.get(\"AZURE_OPENAI_API_KEY\", KEY_PLACEHOLDER) in (\"\", KEY_PLACEHOLDER):\n",
                "    os.environ[\"AZURE_OPENAI_API_KEY\"] = getpass(\"Azure OpenAI API key: \").strip()\n",
                "if not os.environ[\"AZURE_OPENAI_API_KEY\"]:\n",
                "    raise ValueError(\"AZURE_OPENAI_API_KEY is required to run the judge.\")\n",
                "\n",
                "llm = AzureChatOpenAI(\n",
                "    azure_deployment=os.environ[\"AZURE_OPENAI_DEPLOYMENT\"],\n",
                "    openai_api_version=os.environ[\"AZURE_OPENAI_API_VERSION\"],\n",
                "    azure_endpoint=os.environ[\"AZURE_OPENAI_ENDPOINT\"],\n",
                "    api_key=os.environ[\"AZURE_OPENAI_API_KEY\"],\n",
                "    temperature=0,  # limit run-to-run variation in the judge's scores\n",
                ")\n",
                "print(\"✅ Azure GPT-4o Judge ready!\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 4. Define Test Cases\n",
                "# Alpaca-style template with three positional slots: instruction, input, response.\n",
                "alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
                "\n",
                "### Instruction:\n",
                "{}\n",
                "\n",
                "### Input:\n",
                "{}\n",
                "\n",
                "### Response:\n",
                "{}\"\"\"\n",
                "\n",
                "# Each case pairs a task (\"instruction\") with the context it applies to (\"input\").\n",
                "test_cases = [\n",
                "    dict(\n",
                "        instruction=\"Summarize the key nursing interventions for a patient with delirium.\",\n",
                "        input=\"Patient is an 85-year-old male with acute confusion, fluctuating consciousness, and visual hallucinations.\",\n",
                "    ),\n",
                "    dict(\n",
                "        instruction=\"What are the FONS principles for person-centred care?\",\n",
                "        input=\"A nurse is documenting care for a patient with dementia.\",\n",
                "    ),\n",
                "    dict(\n",
                "        instruction=\"Explain why skin tone documentation is important in pressure ulcer risk assessment.\",\n",
                "        input=\"Using the Braden Scale for a patient with darker skin.\",\n",
                "    ),\n",
                "    dict(\n",
                "        instruction=\"How should a nurse communicate using person-centred language?\",\n",
                "        input=\"Writing clinical notes about a patient with mental health needs.\",\n",
                "    ),\n",
                "    dict(\n",
                "        instruction=\"Describe the ADPIE nursing process.\",\n",
                "        input=\"Training a new nursing student on documentation.\",\n",
                "    ),\n",
                "]\n",
                "print(f\"📋 {len(test_cases)} test cases loaded\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 5. Run Evaluation\n",
                "# For every test case: generate an answer with the fine-tuned model, ask the\n",
                "# Azure judge to score it, and append both to `results`.\n",
                "print(\"\\n\" + \"=\"*60)\n",
                "print(\"🏁 Relational Ai for Nursing EVALUATION\")\n",
                "print(\"=\"*60)\n",
                "\n",
                "# Generation samples tokens (do_sample=True); seeding torch keeps repeated runs\n",
                "# on the same setup comparable.\n",
                "torch.manual_seed(42)\n",
                "\n",
                "# Section headers of the Alpaca template. If the model keeps writing past its\n",
                "# answer and opens a new section, everything from that header on is discarded.\n",
                "TEMPLATE_MARKERS = (\"### Instruction:\", \"### Input:\", \"### Response:\")\n",
                "\n",
                "results = []\n",
                "\n",
                "for i, case in enumerate(test_cases, 1):\n",
                "    print(f\"\\n--- Test {i}/{len(test_cases)} ---\")\n",
                "    print(f\"📝 Instruction: {case['instruction']}\")\n",
                "\n",
                "    # Generate response (the template's response slot is left empty)\n",
                "    prompt = alpaca_prompt.format(case[\"instruction\"], case[\"input\"], \"\")\n",
                "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
                "    prompt_length = inputs[\"input_ids\"].shape[1]\n",
                "\n",
                "    with torch.no_grad():\n",
                "        outputs = model.generate(\n",
                "            **inputs,\n",
                "            max_new_tokens=200,\n",
                "            do_sample=True,\n",
                "            temperature=0.7,\n",
                "            top_p=0.9,\n",
                "        )\n",
                "\n",
                "    # The generated sequence starts with the prompt tokens. Decode only what\n",
                "    # follows them instead of splitting the full text on \"### Response:\", which\n",
                "    # selects the wrong segment whenever the continuation repeats that header.\n",
                "    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)\n",
                "    for marker in TEMPLATE_MARKERS:\n",
                "        response = response.split(marker)[0]\n",
                "    response = response.strip()\n",
                "\n",
                "    print(f\"🤖 Model Response: {response[:300]}...\")\n",
                "\n",
                "    # Azure Judge Evaluation\n",
                "    eval_prompt = f\"\"\"You are an expert nursing educator. Evaluate this AI response on a scale of 1-10:\n",
                "\n",
                "1. **Clinical Accuracy** (1-10): Is the information clinically correct?\n",
                "2. **Person-Centred Language** (1-10): Does it use respectful, dignified language?\n",
                "3. **FONS Alignment** (1-10): Does it reflect FONS principles (relational care, practice development)?\n",
                "\n",
                "**Instruction:** {case['instruction']}\n",
                "**Context:** {case['input']}\n",
                "**Model Response:** {response}\n",
                "\n",
                "Provide scores and brief rationale for each:\"\"\"\n",
                "\n",
                "    evaluation = llm.invoke([HumanMessage(content=eval_prompt)])\n",
                "    print(f\"\\n⚖️ Expert Evaluation:\\n{evaluation.content}\")\n",
                "    print(\"-\" * 50)\n",
                "\n",
                "    # Keep the full (untruncated) answer and judgement for the summary cell.\n",
                "    results.append({\n",
                "        \"test\": case[\"instruction\"],\n",
                "        \"response\": response,\n",
                "        \"evaluation\": evaluation.content\n",
                "    })\n",
                "\n",
                "print(\"\\n\" + \"=\"*60)\n",
                "print(\"✅ EVALUATION COMPLETE\")\n",
                "print(\"=\"*60)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 6. Summary Report\n",
                "# One short entry per evaluated case: the instruction and the start of the answer.\n",
                "print(\"\\n📊 EVALUATION SUMMARY\")\n",
                "print(\"=\" * 40)\n",
                "for test_number, result in enumerate(results, start=1):\n",
                "    instruction_preview = result[\"test\"][:50]\n",
                "    response_preview = result[\"response\"][:100]\n",
                "    print(f\"\\nTest {test_number}: {instruction_preview}...\")\n",
                "    print(f\"Response preview: {response_preview}...\")"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "name": "python",
            "version": "3.10.12"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 4
}