{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 🧪 Relational Ai for Nursing Multi-Judge Evaluation\n",
    "\n",
    "**Judges:**\n",
    "- 🔵 **GPT-5.2** (Azure OpenAI)\n",
    "- 🟢 **Gemini** (Google AI - tries 3 Pro first, then falls back to 2.5 Pro, 2.0 Flash and 1.5 Pro)\n",
    "\n",
    "**Model:** `NurseCitizenDeveloper/nursing-llama-3-8b-fons`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Install Dependencies\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -U bitsandbytes transformers accelerate openai langchain-google-genai google-generativeai -q\n",
    "print(\"✅ Installed! Restart runtime if needed, then run Cell 2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2. Load Model from Hugging Face\n",
    "import torch\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "\n",
    "HF_MODEL = \"NurseCitizenDeveloper/nursing-llama-3-8b-fons\"\n",
    "print(f\"🔄 Loading model: {HF_MODEL}\")\n",
    "\n",
    "# Weights are loaded in float16 and placed on the available device(s) automatically.\n",
    "tokenizer = AutoTokenizer.from_pretrained(HF_MODEL)\n",
    "model = AutoModelForCausalLM.from_pretrained(HF_MODEL, device_map=\"auto\", torch_dtype=torch.float16)\n",
    "print(\"✅ Model loaded!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. Setup GPT-5.2 Judge (Azure)\n",
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "from openai import AzureOpenAI\n",
    "\n",
    "# Credentials come from the environment (or an interactive prompt) so that no\n",
    "# secret is ever stored in the notebook itself.\n",
    "AZURE_OPENAI_ENDPOINT = os.environ.get(\n",
    "    \"AZURE_OPENAI_ENDPOINT\",\n",
    "    \"https://ai-lincoln0303ai530606275924.cognitiveservices.azure.com/\",\n",
    ")\n",
    "azure_api_key = os.environ.get(\"AZURE_OPENAI_API_KEY\") or getpass(\"Azure OpenAI API key: \")\n",
    "\n",
    "gpt5_client = AzureOpenAI(\n",
    "    api_version=\"2024-12-01-preview\",\n",
    "    azure_endpoint=AZURE_OPENAI_ENDPOINT,\n",
    "    api_key=azure_api_key,\n",
    ")\n",
    "print(\"✅ GPT-5.2 Judge ready!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 4. Setup Gemini Judge (Google) - Auto-detects best available model\n",
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "import google.generativeai as genai\n",
    "\n",
    "# Keep a key that is already exported; only prompt when none is set.\n",
    "if not os.environ.get(\"GOOGLE_API_KEY\"):\n",
    "    os.environ[\"GOOGLE_API_KEY\"] = getpass(\"Gemini API key: \")\n",
    "genai.configure(api_key=os.environ[\"GOOGLE_API_KEY\"])\n",
    "\n",
    "# Candidates in order of preference; the first one that answers a probe request wins.\n",
    "GEMINI_CANDIDATES = [\"gemini-3-pro\", \"gemini-2.5-pro\", \"gemini-2.0-flash\", \"gemini-1.5-pro\"]\n",
    "\n",
    "gemini_model_name = None\n",
    "gemini_judge = None  # stays None when no candidate is reachable, so later cells can skip Gemini\n",
    "for model_name in GEMINI_CANDIDATES:\n",
    "    try:\n",
    "        test_model = genai.GenerativeModel(model_name)\n",
    "        test_model.generate_content(\"test\")\n",
    "        gemini_model_name = model_name\n",
    "        print(f\"✅ Gemini Judge ready: {model_name}\")\n",
    "        break\n",
    "    except Exception as e:\n",
    "        # Best-effort probe: report why this candidate failed and try the next one.\n",
    "        print(f\"⚠️ {model_name} not available: {str(e)[:50]}\")\n",
    "\n",
    "if gemini_model_name:\n",
    "    gemini_judge = genai.GenerativeModel(gemini_model_name)\n",
    "else:\n",
    "    print(\"❌ No Gemini model available\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5. Define Test Cases\n",
    "# Prompt layout with three positional slots: instruction, input, response.\n",
    "alpaca_prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
    "\n",
    "### Instruction:\n",
    "{}\n",
    "\n",
    "### Input:\n",
    "{}\n",
    "\n",
    "### Response:\n",
    "{}\"\"\"\n",
    "\n",
    "test_cases = [\n",
    "    {\"instruction\": \"Summarize key nursing interventions for a patient with delirium.\",\n",
    "     \"input\": \"Patient is an 85-year-old male with acute confusion and visual hallucinations.\"},\n",
    "    {\"instruction\": \"What are the FONS principles for person-centred care?\",\n",
    "     \"input\": \"A nurse is documenting care for a patient with dementia.\"},\n",
    "    {\"instruction\": \"Explain why skin tone documentation is important in pressure ulcer risk assessment.\",\n",
    "     \"input\": \"Using the Braden Scale for a patient with darker skin.\"},\n",
    "    {\"instruction\": \"Describe the ADPIE nursing process.\",\n",
    "     \"input\": \"Training a new nursing student on documentation.\"},\n",
    "]\n",
    "\n",
    "# Prompt sent verbatim to both judges; filled with instruction, context and response.\n",
    "eval_prompt_template = \"\"\"You are an expert nursing educator. Evaluate this AI response on a scale of 1-10:\n",
    "\n",
    "1. **Clinical Accuracy** (1-10): Is the information clinically correct?\n",
    "2. **Person-Centred Language** (1-10): Does it use respectful, dignified language?\n",
    "3. **FONS Alignment** (1-10): Does it reflect FONS principles (relational care, practice development)?\n",
    "\n",
    "**Instruction:** {instruction}\n",
    "**Context:** {context}\n",
    "**Model Response:** {response}\n",
    "\n",
    "Provide ONLY the three scores in this exact format:\n",
    "Accuracy: X/10\n",
    "Person-Centred: X/10\n",
    "FONS: X/10\n",
    "Brief rationale:\"\"\"\n",
    "\n",
    "print(f\"📋 {len(test_cases)} test cases loaded\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 6. Run Multi-Judge Evaluation\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(f\"🏁 Relational Ai for Nursing MULTI-JUDGE EVALUATION (GPT-5.2 vs {gemini_model_name})\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "# Generation below samples tokens; fixing the seed makes repeated runs comparable.\n",
    "torch.manual_seed(42)\n",
    "\n",
    "results = []\n",
    "\n",
    "for i, case in enumerate(test_cases, 1):\n",
    "    print(f\"\\n{'='*70}\")\n",
    "    print(f\"Test {i}/{len(test_cases)}: {case['instruction']}\")\n",
    "    print(\"=\"*70)\n",
    "\n",
    "    # Generate response from our model\n",
    "    prompt = alpaca_prompt.format(case[\"instruction\"], case[\"input\"], \"\")\n",
    "    inputs = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
    "    with torch.no_grad():\n",
    "        outputs = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.7)\n",
    "    # generate() returns prompt + completion; decode only the new tokens so the answer is not\n",
    "    # lost if the model itself emits another \"### Response:\" marker.\n",
    "    prompt_length = inputs[\"input_ids\"].shape[-1]\n",
    "    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()\n",
    "\n",
    "    print(f\"\\n🤖 Model Response: {response[:250]}...\")\n",
    "\n",
    "    eval_prompt = eval_prompt_template.format(\n",
    "        instruction=case[\"instruction\"],\n",
    "        context=case[\"input\"],\n",
    "        response=response\n",
    "    )\n",
    "\n",
    "    # GPT-5.2 Evaluation\n",
    "    print(\"\\n🔵 GPT-5.2 Judge:\")\n",
    "    try:\n",
    "        # NOTE(review): some newer Azure deployments expect max_completion_tokens instead of\n",
    "        # max_tokens - confirm against the \"gpt-5.2-chat\" deployment.\n",
    "        gpt5_response = gpt5_client.chat.completions.create(\n",
    "            model=\"gpt-5.2-chat\",\n",
    "            messages=[{\"role\": \"user\", \"content\": eval_prompt}],\n",
    "            max_tokens=500\n",
    "        )\n",
    "        # message.content may be None (no text returned); fall back to \"N/A\" so the\n",
    "        # summary cell can still slice the stored value.\n",
    "        gpt5_eval = gpt5_response.choices[0].message.content or \"N/A\"\n",
    "        print(gpt5_eval)\n",
    "    except Exception as e:\n",
    "        # Best-effort: one failing judge call must not abort the whole evaluation.\n",
    "        print(f\"Error: {e}\")\n",
    "        gpt5_eval = \"N/A\"\n",
    "\n",
    "    # Gemini Evaluation\n",
    "    print(f\"\\n🟢 {gemini_model_name} Judge:\")\n",
    "    if gemini_judge is None:\n",
    "        # No Gemini model was reachable during setup; record a placeholder instead of failing.\n",
    "        print(\"Skipped: no Gemini model available\")\n",
    "        gemini_eval = \"N/A\"\n",
    "    else:\n",
    "        try:\n",
    "            gemini_response = gemini_judge.generate_content(eval_prompt)\n",
    "            gemini_eval = gemini_response.text\n",
    "            print(gemini_eval)\n",
    "        except Exception as e:\n",
    "            print(f\"Error: {e}\")\n",
    "            gemini_eval = \"N/A\"\n",
    "\n",
    "    results.append({\n",
    "        \"test\": case[\"instruction\"],\n",
    "        \"response\": response,\n",
    "        \"gpt5\": gpt5_eval,\n",
    "        \"gemini\": gemini_eval\n",
    "    })\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)\n",
    "print(\"✅ MULTI-JUDGE EVALUATION COMPLETE\")\n",
    "print(\"=\"*70)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 7. Summary Comparison\n",
    "# Prints the first 300 characters of each judge's verdict side by side per test case.\n",
    "print(f\"\\n📊 SUMMARY: GPT-5.2 vs {gemini_model_name}\")\n",
    "print(\"=\"*60)\n",
    "for i, r in enumerate(results, 1):\n",
    "    print(f\"\\n--- Test {i}: {r['test'][:40]}... ---\")\n",
    "    print(f\"\\n🔵 GPT-5.2:\\n{r['gpt5'][:300]}\")\n",
    "    print(f\"\\n🟢 Gemini:\\n{r['gemini'][:300]}\")\n",
    "    print(\"=\"*60)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}