{
    "cells": [
        {
            "cell_type": "markdown",
            "metadata": {},
            "source": [
                "# 🧪 Relational Ai for Nursing Multi-Judge Evaluation\n",
                "\n",
                "**Judges:**\n",
                "- 🔵 **GPT-5.2** (Azure OpenAI)\n",
                "- 🟢 **Gemini** (Google AI - tries 3 Pro first, then falls back to 2.5 Pro, 2.0 Flash, and 1.5 Pro)\n",
                "\n",
                "**Model:** `NurseCitizenDeveloper/nursing-llama-3-8b-fons`"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 1. Install Dependencies\n",
                "# %pip (rather than !pip) installs into the environment of the running kernel,\n",
                "# so the packages are importable from the cells below.\n",
                "%pip install -U bitsandbytes transformers accelerate openai langchain-google-genai google-generativeai -q\n",
                "print(\"✅ Installed! Restart runtime if needed, then run Cell 2\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 2. Load Model from Hugging Face\n",
                "import torch\n",
                "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
                "\n",
                "# Hub repository id of the model under evaluation.\n",
                "HF_MODEL = \"NurseCitizenDeveloper/nursing-llama-3-8b-fons\"\n",
                "print(f\"🔄 Loading model: {HF_MODEL}\")\n",
                "\n",
                "# Tokenizer and weights are both pulled from the same repository.\n",
                "tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)\n",
                "\n",
                "# Weights are loaded in float16; device_map=\"auto\" leaves device placement to the library.\n",
                "model = AutoModelForCausalLM.from_pretrained(\n",
                "    HF_MODEL,\n",
                "    device_map=\"auto\",\n",
                "    torch_dtype=torch.float16,\n",
                ")\n",
                "print(\"✅ Model loaded!\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 3. Setup GPT-5.2 Judge (Azure)\n",
                "import os\n",
                "from getpass import getpass\n",
                "\n",
                "from openai import AzureOpenAI\n",
                "\n",
                "# Never store the key in the notebook: read it from the environment and,\n",
                "# if it is not set, prompt for it with hidden input.\n",
                "azure_api_key = os.environ.get(\"AZURE_OPENAI_API_KEY\") or getpass(\"Azure OpenAI API key: \")\n",
                "\n",
                "gpt5_client = AzureOpenAI(\n",
                "    api_version=\"2024-12-01-preview\",\n",
                "    azure_endpoint=\"https://ai-lincoln0303ai530606275924.cognitiveservices.azure.com/\",\n",
                "    api_key=azure_api_key,\n",
                ")\n",
                "print(\"✅ GPT-5.2 Judge ready!\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 4. Setup Gemini Judge (Google) - Auto-detects best available model\n",
                "import os\n",
                "from getpass import getpass\n",
                "\n",
                "import google.generativeai as genai\n",
                "\n",
                "# Keep an existing GOOGLE_API_KEY; only prompt (hidden input) when it is missing,\n",
                "# so no secret is written into the notebook source.\n",
                "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
                "    os.environ[\"GOOGLE_API_KEY\"] = getpass(\"Gemini API key: \")\n",
                "genai.configure(api_key=os.environ[\"GOOGLE_API_KEY\"])\n",
                "\n",
                "# Probe the candidates in order of preference; the first model that answers a\n",
                "# trivial request becomes the judge.\n",
                "gemini_model_name = None\n",
                "gemini_judge = None  # always defined, even when every candidate fails\n",
                "for model_name in [\"gemini-3-pro\", \"gemini-2.5-pro\", \"gemini-2.0-flash\", \"gemini-1.5-pro\"]:\n",
                "    try:\n",
                "        test_model = genai.GenerativeModel(model_name)\n",
                "        test_model.generate_content(\"test\")\n",
                "    except Exception as e:\n",
                "        print(f\"⚠️ {model_name} not available: {str(e)[:50]}\")\n",
                "    else:\n",
                "        gemini_model_name = model_name\n",
                "        gemini_judge = test_model  # reuse the instance that just passed the probe\n",
                "        print(f\"✅ Gemini Judge ready: {model_name}\")\n",
                "        break\n",
                "\n",
                "if gemini_judge is None:\n",
                "    print(\"❌ No Gemini model available\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 5. Define Test Cases\n",
                "# Alpaca-style template with three positional slots: instruction, input, response.\n",
                "alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
                "\n",
                "### Instruction:\n",
                "{}\n",
                "\n",
                "### Input:\n",
                "{}\n",
                "\n",
                "### Response:\n",
                "{}\"\"\"\n",
                "\n",
                "# (instruction, input) pairs, expanded below into dicts keyed \"instruction\" / \"input\".\n",
                "_case_pairs = [\n",
                "    (\"Summarize key nursing interventions for a patient with delirium.\",\n",
                "     \"Patient is an 85-year-old male with acute confusion and visual hallucinations.\"),\n",
                "    (\"What are the FONS principles for person-centred care?\",\n",
                "     \"A nurse is documenting care for a patient with dementia.\"),\n",
                "    (\"Explain why skin tone documentation is important in pressure ulcer risk assessment.\",\n",
                "     \"Using the Braden Scale for a patient with darker skin.\"),\n",
                "    (\"Describe the ADPIE nursing process.\",\n",
                "     \"Training a new nursing student on documentation.\"),\n",
                "]\n",
                "test_cases = [\n",
                "    {\"instruction\": instruction, \"input\": context}\n",
                "    for instruction, context in _case_pairs\n",
                "]\n",
                "\n",
                "# Judge prompt with named slots: instruction, context, response.\n",
                "eval_prompt_template = \"\"\"You are an expert nursing educator. Evaluate this AI response on a scale of 1-10:\n",
                "\n",
                "1. **Clinical Accuracy** (1-10): Is the information clinically correct?\n",
                "2. **Person-Centred Language** (1-10): Does it use respectful, dignified language?\n",
                "3. **FONS Alignment** (1-10): Does it reflect FONS principles (relational care, practice development)?\n",
                "\n",
                "**Instruction:** {instruction}\n",
                "**Context:** {context}\n",
                "**Model Response:** {response}\n",
                "\n",
                "Provide ONLY the three scores in this exact format:\n",
                "Accuracy: X/10\n",
                "Person-Centred: X/10\n",
                "FONS: X/10\n",
                "Brief rationale:\"\"\"\n",
                "\n",
                "print(f\"📋 {len(test_cases)} test cases loaded\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 6. Run Multi-Judge Evaluation\n",
                "# For every test case: generate an answer with the local model, then ask each\n",
                "# judge to score it. Judge failures are reported and recorded as \"N/A\" so one\n",
                "# failing call does not abort the whole run.\n",
                "print(\"\\n\" + \"=\"*70)\n",
                "print(f\"🏁 Relational Ai for Nursing MULTI-JUDGE EVALUATION (GPT-5.2 vs {gemini_model_name})\")\n",
                "print(\"=\"*70)\n",
                "\n",
                "# Generation below samples (do_sample=True); seed torch so every run of this\n",
                "# cell starts from the same RNG state.\n",
                "SEED = 42\n",
                "torch.manual_seed(SEED)\n",
                "\n",
                "results = []\n",
                "\n",
                "for i, case in enumerate(test_cases, 1):\n",
                "    print(f\"\\n{'='*70}\")\n",
                "    print(f\"Test {i}/{len(test_cases)}: {case['instruction']}\")\n",
                "    print(\"=\"*70)\n",
                "\n",
                "    # Generate response from our model\n",
                "    prompt = alpaca_prompt.format(case[\"instruction\"], case[\"input\"], \"\")\n",
                "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
                "    with torch.no_grad():\n",
                "        outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)\n",
                "    # Decode only the newly generated tokens. Splitting the full decode on\n",
                "    # \"### Response:\" would drop text whenever the model emits that marker itself.\n",
                "    prompt_len = inputs[\"input_ids\"].shape[-1]\n",
                "    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()\n",
                "\n",
                "    print(f\"\\n🤖 Model Response: {response[:250]}...\")\n",
                "\n",
                "    eval_prompt = eval_prompt_template.format(\n",
                "        instruction=case[\"instruction\"],\n",
                "        context=case[\"input\"],\n",
                "        response=response\n",
                "    )\n",
                "\n",
                "    # GPT-5.2 Evaluation\n",
                "    print(\"\\n🔵 GPT-5.2 Judge:\")\n",
                "    try:\n",
                "        gpt5_response = gpt5_client.chat.completions.create(\n",
                "            model=\"gpt-5.2-chat\",\n",
                "            messages=[{\"role\": \"user\", \"content\": eval_prompt}],\n",
                "            # max_tokens is deprecated; newer model families only accept max_completion_tokens.\n",
                "            max_completion_tokens=500\n",
                "        )\n",
                "        # content may be None (e.g. filtered or empty completion); keep results as strings.\n",
                "        gpt5_eval = gpt5_response.choices[0].message.content or \"N/A\"\n",
                "        print(gpt5_eval)\n",
                "    except Exception as e:\n",
                "        print(f\"Error: {e}\")\n",
                "        gpt5_eval = \"N/A\"\n",
                "\n",
                "    # Gemini Evaluation\n",
                "    print(f\"\\n🟢 {gemini_model_name} Judge:\")\n",
                "    if gemini_model_name is None:\n",
                "        # No Gemini model passed the availability probe, so there is no judge to call.\n",
                "        print(\"Skipped: no Gemini model available\")\n",
                "        gemini_eval = \"N/A\"\n",
                "    else:\n",
                "        try:\n",
                "            gemini_response = gemini_judge.generate_content(eval_prompt)\n",
                "            gemini_eval = gemini_response.text\n",
                "            print(gemini_eval)\n",
                "        except Exception as e:\n",
                "            print(f\"Error: {e}\")\n",
                "            gemini_eval = \"N/A\"\n",
                "\n",
                "    results.append({\n",
                "        \"test\": case[\"instruction\"],\n",
                "        \"response\": response,\n",
                "        \"gpt5\": gpt5_eval,\n",
                "        \"gemini\": gemini_eval\n",
                "    })\n",
                "\n",
                "print(\"\\n\" + \"=\"*70)\n",
                "print(\"✅ MULTI-JUDGE EVALUATION COMPLETE\")\n",
                "print(\"=\"*70)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "metadata": {},
            "outputs": [],
            "source": [
                "# 7. Summary Comparison\n",
                "# Prints the first 300 characters of each judge's verdict for every test case.\n",
                "print(f\"\\n📊 SUMMARY: GPT-5.2 vs {gemini_model_name}\")\n",
                "print(\"=\"*60)\n",
                "for i, r in enumerate(results, 1):\n",
                "    # A judge may have produced no text (None); fall back to \"N/A\" so slicing cannot fail.\n",
                "    gpt5_text = str(r['gpt5'] or \"N/A\")\n",
                "    gemini_text = str(r['gemini'] or \"N/A\")\n",
                "    print(f\"\\n--- Test {i}: {r['test'][:40]}... ---\")\n",
                "    print(f\"\\n🔵 GPT-5.2:\\n{gpt5_text[:300]}\")\n",
                "    print(f\"\\n🟢 Gemini:\\n{gemini_text[:300]}\")\n",
                "    print(\"=\"*60)"
            ]
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "name": "python",
            "version": "3.10.12"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 4
}