{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ARIA — DevOps Incident Response: GRPO Training (Kaggle 2×T4)\n",
    "\n",
    "**Model:** `unsloth/Llama-3.2-3B-Instruct` (4-bit quantized)  \n",
    "**Tasks:** `easy` → `medium`  \n",
    "**Episodes:** 80 per task (160 total)  \n",
    "**Expected runtime:** ~6–8 hours on Kaggle 2×T4  \n",
    "\n",
    "### Before running:\n",
    "1. Enable **GPU T4 x2** (right panel → Accelerator)\n",
    "2. Add Kaggle secret: Settings → Secrets → `HF_TOKEN` = your HF write token\n",
    "3. Run all cells top to bottom — do not skip any"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-04-21T02:58:22.215226Z",
     "iopub.status.busy": "2026-04-21T02:58:22.214729Z",
     "iopub.status.idle": "2026-04-21T02:58:40.587149Z",
     "shell.execute_reply": "2026-04-21T02:58:40.585130Z",
     "shell.execute_reply.started": "2026-04-21T02:58:22.215193Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 1: Install ───────────────────────────────────────────────────────────\n",
    "import subprocess, sys, os\n",
    "\n",
    "# Set before unsloth is imported (below). The training cell reads `outputs.logits`.\n",
    "# NOTE(review): per Unsloth docs this flag makes forward passes return logits — confirm for your version.\n",
    "os.environ['UNSLOTH_RETURN_LOGITS'] = '1'\n",
    "\n",
    "PIP_PACKAGES = [\n",
    "    'unsloth',\n",
    "    'transformers>=4.48.0',   # needs 4.48+ for CompileConfig\n",
    "    'mergekit',\n",
    "    'trl>=0.9.0',\n",
    "    'accelerate>=0.26.0',\n",
    "    'peft>=0.10.0',\n",
    "    'bitsandbytes',\n",
    "    'requests',\n",
    "    'matplotlib',\n",
    "    'huggingface_hub',\n",
    "]\n",
    "\n",
    "# Install through the kernel's own interpreter so the packages land in the\n",
    "# environment this notebook imports from (a bare `pip` may belong to another Python).\n",
    "pip_result = subprocess.run(\n",
    "    [sys.executable, '-m', 'pip', 'install', '-q', *PIP_PACKAGES],\n",
    "    capture_output=True, text=True,\n",
    ")\n",
    "if pip_result.returncode != 0:\n",
    "    # Output is captured, so without this a failed install would go unnoticed\n",
    "    print(f'⚠️  pip exited with code {pip_result.returncode}; last part of stderr:')\n",
    "    print(pip_result.stderr[-2000:])\n",
    "\n",
    "# Clear stale cache: drop already-imported copies so the imports below pick up\n",
    "# the freshly installed versions\n",
    "for mod in list(sys.modules.keys()):\n",
    "    if any(x in mod for x in ['trl','unsloth','transformers','peft']):\n",
    "        del sys.modules[mod]\n",
    "\n",
    "# Verify\n",
    "import unsloth\n",
    "from unsloth import FastLanguageModel\n",
    "import transformers, peft, torch\n",
    "from trl import GRPOConfig\n",
    "\n",
    "print(f'✅ unsloth {unsloth.__version__}')\n",
    "print(f'✅ transformers {transformers.__version__}')\n",
    "print(f'✅ torch {torch.__version__} | CUDA: {torch.cuda.is_available()}')\n",
    "print(f'✅ UNSLOTH_RETURN_LOGITS = {os.environ[\"UNSLOTH_RETURN_LOGITS\"]}')\n",
    "print('✅ All good — proceed')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.588073Z",
     "iopub.status.idle": "2026-04-21T02:58:40.588446Z",
     "shell.execute_reply": "2026-04-21T02:58:40.588323Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.588295Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 2: Authenticate HuggingFace ─────────────────────────────────────────\n",
    "import os\n",
    "\n",
    "hf_token = None\n",
    "try:\n",
    "    # Imported inside the try so a missing kaggle_secrets module (running\n",
    "    # outside Kaggle) takes the fallback path instead of aborting the cell\n",
    "    from kaggle_secrets import UserSecretsClient\n",
    "    secrets = UserSecretsClient()\n",
    "    hf_token = secrets.get_secret('HF_TOKEN')\n",
    "    print('✅ HF token loaded from Kaggle secrets')\n",
    "except Exception as e:\n",
    "    # Fallback: reuse a token already exported in the environment.\n",
    "    # Never paste a token into the notebook source — it leaks when the notebook is shared.\n",
    "    hf_token = os.environ.get('HF_TOKEN')\n",
    "    print(f'⚠️  Kaggle secret not found ({e}); falling back to the HF_TOKEN environment variable')\n",
    "\n",
    "if not hf_token:\n",
    "    # Stop here with a clear message rather than attempting a login with a placeholder\n",
    "    raise RuntimeError('No HF token available: add the HF_TOKEN Kaggle secret or export HF_TOKEN')\n",
    "os.environ['HF_TOKEN'] = hf_token\n",
    "\n",
    "from huggingface_hub import login\n",
    "login(token=hf_token, add_to_git_credential=False)\n",
    "print('✅ Logged in to HuggingFace Hub')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.590400Z",
     "iopub.status.idle": "2026-04-21T02:58:40.590712Z",
     "shell.execute_reply": "2026-04-21T02:58:40.590603Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.590587Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 3: Config ────────────────────────────────────────────────────────────\n",
    "# Every tunable read by the cells below lives in this one mapping.\n",
    "CONFIG = dict(\n",
    "    model_name='unsloth/Llama-3.2-3B-Instruct',\n",
    "    max_seq_length=2048,\n",
    "    load_in_4bit=True,\n",
    "    env_url='https://arijit-07-devops-incident-response.hf.space',\n",
    "    tasks=['easy', 'medium'],\n",
    "    episodes_per_task=80,\n",
    "    max_steps_per_episode=12,\n",
    "    learning_rate=5e-6,\n",
    "    grpo_group_size=6,\n",
    "    lora_rank=16,\n",
    "    lora_alpha=32,\n",
    "    kl_coeff=0.05,\n",
    "    hf_repo='Arijit-07/aria-devops-llama3b',\n",
    "    output_dir='/kaggle/working/aria-llama3b',\n",
    "    save_every_n_episodes=20,\n",
    ")\n",
    "print('✅ Config loaded')\n",
    "# Echo the settings, one `key: value` line each, in declaration order\n",
    "print('\\n'.join(f'  {key}: {value}' for key, value in CONFIG.items()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.592114Z",
     "iopub.status.idle": "2026-04-21T02:58:40.592818Z",
     "shell.execute_reply": "2026-04-21T02:58:40.592688Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.592670Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 4: Environment Client ────────────────────────────────────────────────\n",
    "import requests\n",
    "import json\n",
    "import time\n",
    "\n",
    "# Root URL of the remote environment; the env_* helpers and the health check below build their endpoints from it\n",
    "BASE_URL = CONFIG['env_url']\n",
    "\n",
    "def env_reset(task_id: str, seed: int = None) -> dict:\n",
    "    \"\"\"POST /reset for `task_id` (optionally seeded) and return the decoded JSON body.\n",
    "\n",
    "    At most three attempts are made, sleeping 5 s after each failed one;\n",
    "    the exception from the third attempt propagates to the caller.\n",
    "    \"\"\"\n",
    "    body = {'task_id': task_id} if seed is None else {'task_id': task_id, 'seed': seed}\n",
    "    attempts_left = 3\n",
    "    while True:\n",
    "        attempts_left -= 1\n",
    "        try:\n",
    "            response = requests.post(f'{BASE_URL}/reset', json=body, timeout=30)\n",
    "            response.raise_for_status()\n",
    "            return response.json()\n",
    "        except Exception:\n",
    "            if attempts_left == 0:\n",
    "                raise\n",
    "            time.sleep(5)\n",
    "\n",
    "def env_step(action: dict) -> dict:\n",
    "    \"\"\"POST `action` to /step and return the decoded JSON body.\n",
    "\n",
    "    Three attempts at most, with a 5 s pause after each failure; the last\n",
    "    failure is re-raised.\n",
    "    \"\"\"\n",
    "    final_attempt = 2\n",
    "    attempt = 0\n",
    "    while True:\n",
    "        try:\n",
    "            response = requests.post(f'{BASE_URL}/step', json=action, timeout=30)\n",
    "            response.raise_for_status()\n",
    "            return response.json()\n",
    "        except Exception:\n",
    "            if attempt == final_attempt:\n",
    "                raise\n",
    "            time.sleep(5)\n",
    "        attempt += 1\n",
    "\n",
    "def env_state() -> dict:\n",
    "    \"\"\"GET /state and return the decoded JSON body.\n",
    "\n",
    "    Uses the same retry policy as env_reset / env_step (3 attempts, 5 s\n",
    "    apart). run_episode calls this once an episode is finished, so a single\n",
    "    transient network error would otherwise throw away the whole episode.\n",
    "\n",
    "    Raises:\n",
    "        Exception: whatever the third failed attempt raised.\n",
    "    \"\"\"\n",
    "    for attempt in range(3):\n",
    "        try:\n",
    "            r = requests.get(f'{BASE_URL}/state', timeout=30)\n",
    "            r.raise_for_status()\n",
    "            return r.json()\n",
    "        except Exception:\n",
    "            if attempt == 2:\n",
    "                raise\n",
    "            time.sleep(5)\n",
    "\n",
    "# Test connection\n",
    "health_response = requests.get(f'{BASE_URL}/health', timeout=15)\n",
    "# Fail with the HTTP status (e.g. the Space is down) instead of an opaque JSON decode error\n",
    "health_response.raise_for_status()\n",
    "health = health_response.json()\n",
    "print(f'✅ Environment connected: {health}')\n",
    "\n",
    "# Test reset (test_obs is reused by the prompt smoke test in the next cell)\n",
    "test_obs = env_reset('easy', seed=0)\n",
    "print(f'✅ Reset successful. Task: {test_obs.get(\"task_id\")}')\n",
    "print(f'   Services: {len(test_obs.get(\"services\", []))}')\n",
    "print(f'   Alerts: {len(test_obs.get(\"active_alerts\", []))}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.594722Z",
     "iopub.status.idle": "2026-04-21T02:58:40.595127Z",
     "shell.execute_reply": "2026-04-21T02:58:40.595000Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.594983Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 5: Observation → Text (what the LLM sees) ───────────────────────────\n",
    "def observation_to_prompt(obs: dict, task_id: str) -> str:\n",
    "    \"\"\"Convert environment observation to LLM prompt text.\n",
    "\n",
    "    Every section of `obs` is optional: missing or empty sections are omitted,\n",
    "    and null / non-string / non-numeric fields are rendered defensively\n",
    "    instead of raising.\n",
    "\n",
    "    Args:\n",
    "        obs: Observation dict as returned by env_reset / env_step.\n",
    "        task_id: Task name; shown upper-cased in the header line.\n",
    "\n",
    "    Returns:\n",
    "        The prompt text, one block per section, joined with newlines.\n",
    "    \"\"\"\n",
    "    # Explicit ranking: sorting the raw strings would place 'warning' above 'critical'.\n",
    "    # NOTE(review): severity vocabulary is assumed — unknown labels sort last; confirm against the env.\n",
    "    severity_rank = {'critical': 4, 'high': 3, 'error': 3, 'warning': 2,\n",
    "                     'medium': 2, 'low': 1, 'info': 0}\n",
    "\n",
    "    def text(value) -> str:\n",
    "        # Null fields render as '' and non-strings are stringified, so slicing/upper() never raise\n",
    "        return '' if value is None else str(value)\n",
    "\n",
    "    def number(value) -> float:\n",
    "        # Null or non-numeric metrics render as 0 rather than breaking the float format\n",
    "        try:\n",
    "            return float(value)\n",
    "        except (TypeError, ValueError):\n",
    "            return 0.0\n",
    "\n",
    "    lines = []\n",
    "    lines.append('=== PRODUCTION INCIDENT RESPONSE ===')\n",
    "    step, max_steps = obs.get('step', 0), obs.get('max_steps', 15)\n",
    "    lines.append(f'Task: {task_id.upper()} | Step: {step}/{max_steps}')\n",
    "    lines.append('')\n",
    "\n",
    "    # SLA Status\n",
    "    sla = obs.get('sla_status', {})\n",
    "    if sla:\n",
    "        lines.append('SLA STATUS:')\n",
    "        for svc, status in sla.items():\n",
    "            emoji = '🔴' if status == 'breached' else '🟡' if status == 'warning' else '🟢'\n",
    "            lines.append(f'  {emoji} {svc}: {status}')\n",
    "        lines.append('')\n",
    "\n",
    "    # Active Alerts, most severe first (ties keep the environment's order)\n",
    "    alerts = obs.get('active_alerts', [])\n",
    "    if alerts:\n",
    "        lines.append('ACTIVE ALERTS:')\n",
    "        by_severity = sorted(\n",
    "            alerts,\n",
    "            key=lambda al: severity_rank.get(text(al.get('severity')).lower(), -1),\n",
    "            reverse=True,\n",
    "        )\n",
    "        for a in by_severity:\n",
    "            severity = text(a.get('severity')).upper()\n",
    "            service, message = text(a.get('service')), text(a.get('message'))\n",
    "            lines.append(f'  [{severity}] {service}: {message}')\n",
    "        lines.append('')\n",
    "\n",
    "    # Services, highest error rate first\n",
    "    services = obs.get('services', [])\n",
    "    if services:\n",
    "        lines.append('SERVICE METRICS:')\n",
    "        for s in sorted(services, key=lambda svc: number(svc.get('error_rate')), reverse=True):\n",
    "            name, status = text(s.get('name')), text(s.get('status'))\n",
    "            cpu, mem = number(s.get('cpu')), number(s.get('memory'))\n",
    "            err, p99 = number(s.get('error_rate')), number(s.get('latency_p99'))\n",
    "            lines.append(\n",
    "                f'  {name:30s} | status={status:10s} | '\n",
    "                f'cpu={cpu:5.1f}% | mem={mem:5.1f}% | '\n",
    "                f'err={err:.3f} | p99={p99:.0f}ms'\n",
    "            )\n",
    "        lines.append('')\n",
    "\n",
    "    # Recent logs (partial — first 4 services, 2 lines each)\n",
    "    logs = obs.get('recent_logs', {})\n",
    "    if logs:\n",
    "        lines.append('RECENT LOGS (partial — use read_logs for full history):')\n",
    "        for svc, log_lines in list(logs.items())[:4]:\n",
    "            if isinstance(log_lines, str):\n",
    "                # A bare string would otherwise be sliced into single characters\n",
    "                log_lines = [log_lines]\n",
    "            for log_line in (log_lines or [])[:2]:\n",
    "                lines.append(f'  [{svc}] {log_line}')\n",
    "        lines.append('')\n",
    "\n",
    "    # Service dependencies (first 6)\n",
    "    deps = obs.get('service_dependencies', [])\n",
    "    if deps:\n",
    "        lines.append('SERVICE DEPENDENCIES:')\n",
    "        for d in deps[:6]:\n",
    "            caller, callee = text(d.get('service')), text(d.get('depends_on'))\n",
    "            lines.append(f'  {caller} → calls → {callee}')\n",
    "        lines.append('')\n",
    "\n",
    "    # Evidence log (last 5 entries, content truncated to 150 chars)\n",
    "    evidence = obs.get('evidence_log', [])\n",
    "    if evidence:\n",
    "        lines.append('EVIDENCE GATHERED THIS EPISODE:')\n",
    "        for e in evidence[-5:]:\n",
    "            kind, content = text(e.get('action_type')).upper(), text(e.get('content'))[:150]\n",
    "            lines.append(f'  [{kind}] {content}')\n",
    "        lines.append('')\n",
    "\n",
    "    # Last result (truncated to 200 chars; stringified first in case it is not a str)\n",
    "    if obs.get('last_action_result'):\n",
    "        last_result = text(obs['last_action_result'])[:200]\n",
    "        lines.append(f'LAST ACTION RESULT: {last_result}')\n",
    "        lines.append('')\n",
    "\n",
    "    return '\\n'.join(lines)\n",
    "\n",
    "\n",
    "# System message sent with every observation (used by generate_action and run_episode_collect).\n",
    "# It lists the JSON shape of each action and asks for a bare JSON object as the whole reply.\n",
    "SYSTEM_PROMPT = \"\"\"You are an expert DevOps engineer responding to a production incident.\n",
    "Analyze the situation carefully and take the most appropriate action.\n",
    "\n",
    "Available actions (respond with EXACTLY one JSON object):\n",
    "- Read logs: {\"action_type\": \"read_logs\", \"service\": \"<service-name>\"}\n",
    "- Search logs: {\"action_type\": \"search_logs\", \"service\": \"<service-name>\", \"query\": \"<search-term>\"}\n",
    "- Read metrics: {\"action_type\": \"read_metrics\", \"service\": \"<service-name>\"}\n",
    "- Read runbook: {\"action_type\": \"read_runbook\", \"runbook\": \"<runbook-name>\"}\n",
    "- Diagnose: {\"action_type\": \"diagnose\", \"root_cause\": \"<your diagnosis>\"}\n",
    "- Restart service: {\"action_type\": \"restart_service\", \"service\": \"<service-name>\"}\n",
    "- Rollback: {\"action_type\": \"rollback\", \"service\": \"<service-name>\", \"version\": \"previous\"}\n",
    "- Scale up: {\"action_type\": \"scale_up\", \"service\": \"<service-name>\"}\n",
    "- Alert on-call: {\"action_type\": \"alert_oncall\", \"message\": \"<alert-message>\"}\n",
    "- Acknowledge alert: {\"action_type\": \"acknowledge\", \"alert_id\": \"<alert-id>\"}\n",
    "- Block IP range: {\"action_type\": \"block_ip_range\", \"ip_range\": \"<cidr>\"}\n",
    "- Create index: {\"action_type\": \"create_index\", \"table\": \"<table>\", \"column\": \"<column>\"}\n",
    "- Failover: {\"action_type\": \"failover\", \"service\": \"<service-name>\", \"target_region\": \"us-west-2\"}\n",
    "\n",
    "Strategy:\n",
    "1. First gather information (read_logs, read_metrics) before acting\n",
    "2. Diagnose before fixing\n",
    "3. Fix the ROOT CAUSE, not symptoms\n",
    "4. Do NOT restart healthy services — this causes penalties\n",
    "\n",
    "Respond with ONLY a valid JSON object. No explanation, no markdown.\"\"\"\n",
    "\n",
    "# Smoke-test the prompt builder on the observation fetched by the reset test in Cell 4\n",
    "test_prompt = observation_to_prompt(test_obs, 'easy')\n",
    "print('Sample prompt (first 800 chars):', test_prompt[:800], sep='\\n')\n",
    "print(f'\\nTotal prompt length: {len(test_prompt)} chars')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.596692Z",
     "iopub.status.idle": "2026-04-21T02:58:40.597089Z",
     "shell.execute_reply": "2026-04-21T02:58:40.596929Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.596906Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 6: Load Model with Unsloth ──────────────────────────────────────────\n",
    "from unsloth import FastLanguageModel\n",
    "import torch\n",
    "\n",
    "print(f'Loading {CONFIG[\"model_name\"]} with Unsloth...')\n",
    "print(f'GPU: {torch.cuda.get_device_name(0)}')\n",
    "print(f'VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')\n",
    "\n",
    "# Base model and tokenizer; dtype=None leaves the dtype choice to Unsloth\n",
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    token=hf_token,\n",
    "    dtype=None,\n",
    "    load_in_4bit=CONFIG['load_in_4bit'],\n",
    "    max_seq_length=CONFIG['max_seq_length'],\n",
    "    model_name=CONFIG['model_name'],\n",
    ")\n",
    "\n",
    "# Attach LoRA adapters to the attention and MLP projection layers\n",
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r=CONFIG['lora_rank'],\n",
    "    lora_alpha=CONFIG['lora_alpha'],\n",
    "    lora_dropout=0.05,\n",
    "    bias='none',\n",
    "    target_modules=[\n",
    "        'q_proj', 'k_proj', 'v_proj', 'o_proj',\n",
    "        'gate_proj', 'up_proj', 'down_proj',\n",
    "    ],\n",
    "    use_gradient_checkpointing='unsloth',\n",
    "    use_rslora=False,\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "# Count parameters in a single pass\n",
    "trainable_count = total_count = 0\n",
    "for param in model.parameters():\n",
    "    total_count += param.numel()\n",
    "    if param.requires_grad:\n",
    "        trainable_count += param.numel()\n",
    "\n",
    "print(f'\\n✅ Model loaded and LoRA applied')\n",
    "print(f'Trainable params: {trainable_count:,}')\n",
    "print(f'Total params: {total_count:,}')\n",
    "print(f'VRAM used: {torch.cuda.memory_allocated() / 1e9:.2f} GB')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.597793Z",
     "iopub.status.idle": "2026-04-21T02:58:40.598105Z",
     "shell.execute_reply": "2026-04-21T02:58:40.597990Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.597974Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 7: Episode Runner ────────────────────────────────────────────────────\n",
    "import re\n",
    "\n",
    "# Put the model in Unsloth's inference mode before the generation helpers below are used\n",
    "FastLanguageModel.for_inference(model)\n",
    "\n",
    "def parse_action(text: str) -> dict:\n",
    "    \"\"\"Extract JSON action from LLM output. Returns noop on parse failure.\n",
    "\n",
    "    The text is scanned left to right and the first JSON object that decodes\n",
    "    cleanly AND carries an 'action_type' key is returned. This handles\n",
    "    markdown fences, prose before/after the object, keys in any order and\n",
    "    nested objects.\n",
    "\n",
    "    Args:\n",
    "        text: Raw completion produced by the model.\n",
    "\n",
    "    Returns:\n",
    "        The action dict, or {'action_type': 'noop'} when there is no usable\n",
    "        object (no JSON, a JSON list/number, an object without 'action_type',\n",
    "        or non-string input). The result is always a dict, so callers can\n",
    "        safely use `.get(...)` on it.\n",
    "    \"\"\"\n",
    "    noop = {'action_type': 'noop'}\n",
    "    if not isinstance(text, str):\n",
    "        return noop\n",
    "\n",
    "    decoder = json.JSONDecoder()\n",
    "    start = text.find('{')\n",
    "    while start != -1:\n",
    "        try:\n",
    "            # raw_decode parses one JSON value starting at `start` and ignores trailing text\n",
    "            candidate, _end = decoder.raw_decode(text, start)\n",
    "        except ValueError:  # json.JSONDecodeError is a ValueError subclass\n",
    "            candidate = None\n",
    "        if isinstance(candidate, dict) and 'action_type' in candidate:\n",
    "            return candidate\n",
    "        # Not an action: try the next opening brace\n",
    "        start = text.find('{', start + 1)\n",
    "    return noop\n",
    "\n",
    "\n",
    "def generate_action(obs: dict, task_id: str) -> tuple:\n",
    "    \"\"\"Sample one action for `obs` from the LLM.\n",
    "\n",
    "    Returns an `(action, raw_text)` pair: the dict produced by parse_action\n",
    "    and the decoded completion it was parsed from.\n",
    "    \"\"\"\n",
    "    chat = [\n",
    "        {'role': 'system', 'content': SYSTEM_PROMPT},\n",
    "        {'role': 'user', 'content': observation_to_prompt(obs, task_id)},\n",
    "    ]\n",
    "    prompt_ids = tokenizer.apply_chat_template(\n",
    "        chat, tokenize=True, add_generation_prompt=True, return_tensors='pt',\n",
    "    ).to('cuda')\n",
    "    prompt_len = prompt_ids.shape[1]\n",
    "\n",
    "    with torch.no_grad():\n",
    "        sequences = model.generate(\n",
    "            prompt_ids,\n",
    "            do_sample=True,\n",
    "            temperature=0.7,\n",
    "            max_new_tokens=128,\n",
    "            pad_token_id=tokenizer.eos_token_id,\n",
    "        )\n",
    "\n",
    "    # Decode only the tokens that follow the prompt\n",
    "    raw_text = tokenizer.decode(sequences[0][prompt_len:], skip_special_tokens=True)\n",
    "    return parse_action(raw_text), raw_text\n",
    "\n",
    "\n",
    "def run_episode(task_id: str, seed: int = None, verbose: bool = False) -> float:\n",
    "    \"\"\"Play a single episode with the current model and return its score.\n",
    "\n",
    "    The score is the `current_score` reported by /state when present,\n",
    "    otherwise the sum of the per-step rewards collected here.\n",
    "    \"\"\"\n",
    "    obs = env_reset(task_id, seed=seed)\n",
    "    reward_sum = 0.0\n",
    "\n",
    "    for step_idx in range(CONFIG['max_steps_per_episode']):\n",
    "        action, _raw_text = generate_action(obs, task_id)\n",
    "\n",
    "        if verbose:\n",
    "            print(f'  Step {step_idx+1}: {action.get(\"action_type\",\"?\")} '\n",
    "                  f'(service={action.get(\"service\",\"-\")})')\n",
    "\n",
    "        outcome = env_step(action)\n",
    "        reward_sum += outcome.get('reward', 0.0)\n",
    "        obs = outcome.get('observation', obs)\n",
    "        if outcome.get('done', False):\n",
    "            break\n",
    "\n",
    "    # Prefer the environment's graded score over the locally summed rewards\n",
    "    return env_state().get('current_score', reward_sum)\n",
    "\n",
    "\n",
    "# Smoke test: play one verbose episode with the untrained model\n",
    "print('Testing episode runner (1 episode, verbose)...')\n",
    "test_score = run_episode(task_id='easy', seed=99, verbose=True)\n",
    "print(f'\\n✅ Test episode score: {test_score:.3f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.599189Z",
     "iopub.status.idle": "2026-04-21T02:58:40.599769Z",
     "shell.execute_reply": "2026-04-21T02:58:40.599606Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.599571Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 8: Pre-training Baseline ─────────────────────────────────────────────\n",
    "import random\n",
    "\n",
    "N_BASELINE_EPISODES = 10\n",
    "# Dedicated, seeded RNG: the baseline always plays the same episode seeds, so the\n",
    "# BEFORE numbers can be reproduced after a kernel restart (model sampling itself\n",
    "# is still stochastic).\n",
    "baseline_rng = random.Random(42)\n",
    "\n",
    "print(f'Running pre-training baseline ({N_BASELINE_EPISODES} episodes per task)...')\n",
    "print('This is the BEFORE score — shows the untrained model.')\n",
    "print()\n",
    "\n",
    "# task_id -> {'scores': [per-episode scores], 'avg': mean score}; read by the training and evaluation cells\n",
    "baseline_scores = {}\n",
    "for task_id in CONFIG['tasks']:\n",
    "    scores = []\n",
    "    for i in range(N_BASELINE_EPISODES):\n",
    "        seed = baseline_rng.randint(0, 9999)\n",
    "        score = run_episode(task_id, seed=seed)\n",
    "        scores.append(score)\n",
    "        # '\\r' keeps the progress on one line; the summary below overwrites it\n",
    "        print(f'  [{task_id}] Episode {i+1}/{N_BASELINE_EPISODES}: {score:.3f}', end='\\r')\n",
    "\n",
    "    avg = sum(scores) / len(scores)\n",
    "    baseline_scores[task_id] = {'scores': scores, 'avg': avg}\n",
    "    print(f'  [{task_id}] Baseline avg: {avg:.3f} (min={min(scores):.3f}, max={max(scores):.3f})')\n",
    "\n",
    "print()\n",
    "print('Baseline summary:')\n",
    "for task_id, data in baseline_scores.items():\n",
    "    print(f'  {task_id}: {data[\"avg\"]:.3f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.601001Z",
     "iopub.status.idle": "2026-04-21T02:58:40.601239Z",
     "shell.execute_reply": "2026-04-21T02:58:40.601139Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.601126Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 9: GRPO Setup ────────────────────────────────────────────────────────\n",
    "# One optimizer step per episode (not per environment step), regularised by a\n",
    "# KL penalty against a frozen copy of the starting policy.\n",
    "from torch.optim import AdamW\n",
    "from transformers import get_cosine_schedule_with_warmup\n",
    "import copy, torch\n",
    "\n",
    "FastLanguageModel.for_training(model)\n",
    "\n",
    "# Snapshot the policy before any update; the copy never receives gradients\n",
    "ref_model = copy.deepcopy(model)\n",
    "for frozen_param in ref_model.parameters():\n",
    "    frozen_param.requires_grad = False\n",
    "ref_model.eval()\n",
    "print('✅ Reference model frozen')\n",
    "\n",
    "# The scheduler is stepped once per episode, so its horizon is the episode count\n",
    "total_episodes = len(CONFIG['tasks']) * CONFIG['episodes_per_task']\n",
    "trainable_params = [param for param in model.parameters() if param.requires_grad]\n",
    "optimizer = AdamW(trainable_params, lr=CONFIG['learning_rate'], weight_decay=0.01)\n",
    "scheduler = get_cosine_schedule_with_warmup(\n",
    "    optimizer,\n",
    "    num_training_steps=total_episodes,\n",
    "    num_warmup_steps=max(1, total_episodes // 10),\n",
    ")\n",
    "\n",
    "\n",
    "def run_episode_collect(task_id, seed):\n",
    "    \"\"\"Collect one GRPO episode: per-step completion groups and their rewards.\n",
    "\n",
    "    At every step the model samples CONFIG['grpo_group_size'] completions for\n",
    "    the current observation. Each one is scored by rebuilding the CURRENT\n",
    "    episode state on the environment (reset with `seed`, replay the actions\n",
    "    chosen so far) and then applying the candidate action, so no candidate\n",
    "    sees side effects of another. The best-scoring action is appended to the\n",
    "    episode history and its observation becomes the next prompt.\n",
    "\n",
    "    Previously the environment was reset before each candidate but the\n",
    "    history was never replayed: from the second step on, candidates were\n",
    "    scored against the initial state while the prompt showed a later one,\n",
    "    and the \"main\" episode only ever held the last candidate plus the best\n",
    "    action.\n",
    "\n",
    "    Replaying costs extra /step calls (they grow with the step index); that\n",
    "    is the price of correct scoring with a reset/step-only API.\n",
    "\n",
    "    NOTE(review): assumes the environment is deterministic for a given\n",
    "    (task_id, seed, action sequence) — the same assumption the seeded\n",
    "    fresh-reset scoring already made. Confirm against the server.\n",
    "\n",
    "    Args:\n",
    "        task_id: Task to run.\n",
    "        seed: Seed passed to every env_reset so states can be rebuilt.\n",
    "\n",
    "    Returns:\n",
    "        (trajectory, final_score). trajectory has one dict per step with\n",
    "        'input_ids' (prompt tokens), 'completions' (generated token ids per\n",
    "        candidate) and 'rewards' (one float per candidate). final_score is\n",
    "        the environment's `current_score` after the chosen actions, or 0.0\n",
    "        if it cannot be read.\n",
    "    \"\"\"\n",
    "    def _restore(history):\n",
    "        # Rebuild the state reached by `history`: fresh seeded reset, then replay each chosen action\n",
    "        env_reset(task_id, seed=seed)\n",
    "        for past_action in history:\n",
    "            env_step(past_action)\n",
    "\n",
    "    obs = env_reset(task_id, seed=seed)\n",
    "    trajectory = []\n",
    "    history = []  # best actions taken so far in the main episode\n",
    "    done = False\n",
    "\n",
    "    FastLanguageModel.for_inference(model)\n",
    "\n",
    "    for step in range(CONFIG['max_steps_per_episode']):\n",
    "        if done:\n",
    "            break\n",
    "\n",
    "        messages = [\n",
    "            {'role': 'system', 'content': SYSTEM_PROMPT},\n",
    "            {'role': 'user', 'content': observation_to_prompt(obs, task_id)}\n",
    "        ]\n",
    "        input_ids = tokenizer.apply_chat_template(\n",
    "            messages, tokenize=True, add_generation_prompt=True,\n",
    "            return_tensors='pt'\n",
    "        ).to('cuda')\n",
    "\n",
    "        # Step 1: Generate all completions (no env calls yet)\n",
    "        group_completions, group_texts = [], []\n",
    "        for _ in range(CONFIG['grpo_group_size']):\n",
    "            with torch.no_grad():\n",
    "                out = model.generate(\n",
    "                    input_ids, max_new_tokens=128, temperature=0.9,\n",
    "                    do_sample=True, pad_token_id=tokenizer.eos_token_id,\n",
    "                )\n",
    "            gen_ids = out[0][input_ids.shape[1]:]\n",
    "            group_completions.append(gen_ids)\n",
    "            group_texts.append(tokenizer.decode(gen_ids, skip_special_tokens=True))\n",
    "\n",
    "        # Step 2: Score each completion from the current episode state\n",
    "        group_actions, group_rewards, group_results = [], [], []\n",
    "        for gen_text in group_texts:\n",
    "            action = parse_action(gen_text)\n",
    "            try:\n",
    "                _restore(history)\n",
    "                res = env_step(action)\n",
    "                r = res.get('reward', 0.0) or 0.0\n",
    "            except Exception:\n",
    "                # Environment/network failure: neutral reward and no usable result\n",
    "                res, r = None, 0.0\n",
    "            # Exploration bonus: non-noop gets +0.02 to bootstrap learning\n",
    "            if action.get('action_type', 'noop') != 'noop':\n",
    "                r += 0.02\n",
    "            group_actions.append(action)\n",
    "            group_rewards.append(r)\n",
    "            group_results.append(res)\n",
    "\n",
    "        trajectory.append({\n",
    "            'input_ids': input_ids,\n",
    "            'completions': group_completions,\n",
    "            'rewards': group_rewards,\n",
    "        })\n",
    "\n",
    "        # Step 3: Advance MAIN episode with best action. Its result was already\n",
    "        # obtained while scoring, so only the history has to be extended; the\n",
    "        # next round of scoring replays it on the server.\n",
    "        best_idx = group_rewards.index(max(group_rewards))\n",
    "        best_res = group_results[best_idx]\n",
    "        if best_res is None:\n",
    "            # The best candidate could not be executed, so the episode cannot continue\n",
    "            done = True\n",
    "        else:\n",
    "            history.append(group_actions[best_idx])\n",
    "            obs = best_res.get('observation', obs)\n",
    "            done = best_res.get('done', False)\n",
    "\n",
    "    try:\n",
    "        # The server currently holds the last scored candidate's state; rebuild the\n",
    "        # main episode before reading its graded score\n",
    "        _restore(history)\n",
    "        state = env_state()\n",
    "        final_score = state.get('current_score', 0.0)\n",
    "    except Exception:\n",
    "        final_score = 0.0\n",
    "\n",
    "    return trajectory, final_score\n",
    "\n",
    "\n",
    "def update_from_trajectory(trajectory):\n",
    "    \"\"\"Single model update from full episode trajectory with KL penalty.\n",
    "\n",
    "    For every step, the best-rewarded completion of the group is reinforced\n",
    "    with its group-normalised advantage and regularised towards `ref_model`.\n",
    "    Gradients are accumulated step by step and applied with one optimizer and\n",
    "    one scheduler step per episode.\n",
    "\n",
    "    Args:\n",
    "        trajectory: List of step dicts with 'input_ids', 'completions' and\n",
    "            'rewards', as produced by run_episode_collect.\n",
    "\n",
    "    Returns:\n",
    "        The mean per-step loss as a float; 0.0 for an empty trajectory (in\n",
    "        which case no optimizer or scheduler step is taken).\n",
    "    \"\"\"\n",
    "    if not trajectory:\n",
    "        return 0.0\n",
    "\n",
    "    FastLanguageModel.for_training(model)\n",
    "    model.train()\n",
    "    optimizer.zero_grad()\n",
    "\n",
    "    n_steps = len(trajectory)\n",
    "    loss_value = 0.0\n",
    "\n",
    "    for step_data in trajectory:\n",
    "        input_ids = step_data['input_ids']\n",
    "        completions = step_data['completions']\n",
    "        rewards = step_data['rewards']\n",
    "\n",
    "        # Group-relative advantage; only normalise by std when the rewards actually differ\n",
    "        rewards_t = torch.tensor(rewards, dtype=torch.float32)\n",
    "        if rewards_t.std() > 1e-8:\n",
    "            advantages = (rewards_t - rewards_t.mean()) / (rewards_t.std() + 1e-8)\n",
    "        else:\n",
    "            advantages = rewards_t - rewards_t.mean()\n",
    "\n",
    "        best_idx = rewards.index(max(rewards))\n",
    "        best_ids = completions[best_idx]\n",
    "        best_adv = advantages[best_idx]\n",
    "\n",
    "        # Prompt + completion; prompt positions are masked out of the loss with -100\n",
    "        full_ids = torch.cat([input_ids[0], best_ids]).unsqueeze(0)\n",
    "        labels = full_ids.clone()\n",
    "        labels[0, :input_ids.shape[1]] = -100\n",
    "\n",
    "        outputs = model(full_ids, labels=labels)\n",
    "        # outputs.loss is the mean NEGATIVE log-likelihood of the completion, so\n",
    "        # making a positive-advantage completion more likely means minimising\n",
    "        # +advantage * loss. (The earlier `loss * (-adv)` pushed the best\n",
    "        # completion's probability down.)\n",
    "        policy_loss = best_adv.to(outputs.loss.device) * outputs.loss\n",
    "\n",
    "        # KL penalty vs reference model, over the completion positions only\n",
    "        with torch.no_grad():\n",
    "            ref_out = ref_model(full_ids)\n",
    "        ref_logits = ref_out.logits[:, input_ids.shape[1]-1:-1, :]\n",
    "        pol_logits = outputs.logits[:, input_ids.shape[1]-1:-1, :]\n",
    "        # NOTE(review): with a batch of 1, 'batchmean' divides by 1, so this is a sum\n",
    "        # over completion tokens rather than a per-token mean — confirm that scale is\n",
    "        # what kl_coeff was tuned for.\n",
    "        kl = torch.nn.functional.kl_div(\n",
    "            torch.log_softmax(pol_logits, dim=-1),\n",
    "            torch.softmax(ref_logits, dim=-1),\n",
    "            reduction='batchmean'\n",
    "        )\n",
    "\n",
    "        # Backpropagate per step so each autograd graph is released immediately;\n",
    "        # gradients add up in .grad exactly as one backward over the summed loss would.\n",
    "        step_loss = (policy_loss + CONFIG['kl_coeff'] * kl) / n_steps\n",
    "        step_loss.backward()\n",
    "        loss_value += step_loss.item()\n",
    "\n",
    "    torch.nn.utils.clip_grad_norm_(\n",
    "        [p for p in model.parameters() if p.requires_grad], 0.5\n",
    "    )\n",
    "    optimizer.step()\n",
    "    scheduler.step()\n",
    "    return loss_value\n",
    "\n",
    "\n",
    "# Summarise the training strategy and the key hyper-parameters\n",
    "print(\n",
    "    '✅ GRPO setup complete',\n",
    "    'Strategy: fresh env per completion → episode-level update',\n",
    "    f'LR={CONFIG[\"learning_rate\"]} | KL={CONFIG[\"kl_coeff\"]} | Groups={CONFIG[\"grpo_group_size\"]}',\n",
    "    sep='\\n',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.602529Z",
     "iopub.status.idle": "2026-04-21T02:58:40.602803Z",
     "shell.execute_reply": "2026-04-21T02:58:40.602689Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.602674Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 10: Training Loop ────────────────────────────────────────────────────\n",
    "import os, time, random, json\n",
    "\n",
    "os.makedirs(CONFIG['output_dir'], exist_ok=True)\n",
    "training_log = []  # one record per trained episode\n",
    "episode_scores = {task: [] for task in CONFIG['tasks']}\n",
    "global_episode = 0\n",
    "start_time = time.time()\n",
    "\n",
    "print('=' * 60)\n",
    "print('ARIA GRPO TRAINING')\n",
    "print(f'LR={CONFIG[\"learning_rate\"]} | Groups={CONFIG[\"grpo_group_size\"]} | KL={CONFIG[\"kl_coeff\"]}')\n",
    "print('=' * 60)\n",
    "\n",
    "for task_id in CONFIG['tasks']:\n",
    "    base_avg = baseline_scores[task_id]['avg']\n",
    "    task_scores = episode_scores[task_id]\n",
    "    print(f'\\n📋 Task: {task_id.upper()} | Baseline: {base_avg:.3f}')\n",
    "    print('-' * 40)\n",
    "\n",
    "    for ep in range(CONFIG['episodes_per_task']):\n",
    "        seed = random.randint(0, 9999)\n",
    "\n",
    "        # Collect one episode, then take one optimizer step on it\n",
    "        trajectory, final_score = run_episode_collect(task_id, seed)\n",
    "        loss = update_from_trajectory(trajectory)\n",
    "\n",
    "        task_scores.append(final_score)\n",
    "        global_episode += 1\n",
    "        elapsed = (time.time() - start_time) / 60\n",
    "        recent = task_scores[-10:]\n",
    "        rolling = sum(recent) / len(recent)\n",
    "\n",
    "        training_log.append(dict(\n",
    "            episode=global_episode, task_id=task_id,\n",
    "            score=final_score, rolling_avg=rolling,\n",
    "            loss=loss, elapsed_min=round(elapsed, 1),\n",
    "        ))\n",
    "\n",
    "        # Progress line every 5 episodes\n",
    "        if (ep + 1) % 5 == 0:\n",
    "            delta = rolling - base_avg\n",
    "            if delta > 0.02:\n",
    "                trend = '📈'\n",
    "            elif delta < -0.02:\n",
    "                trend = '📉'\n",
    "            else:\n",
    "                trend = '➡️'\n",
    "            print(\n",
    "                f'  {trend} Ep {ep+1:3d}/{CONFIG[\"episodes_per_task\"]} | '\n",
    "                f'Score: {final_score:.3f} | Roll-10: {rolling:.3f} | '\n",
    "                f'vs baseline: {delta:+.3f} | Loss: {loss:.4f} | {elapsed:.0f}m'\n",
    "            )\n",
    "\n",
    "        # Periodic checkpoint, counted across tasks\n",
    "        if global_episode % CONFIG['save_every_n_episodes'] == 0:\n",
    "            ckpt = f'{CONFIG[\"output_dir\"]}/checkpoint-ep{global_episode}'\n",
    "            model.save_pretrained(ckpt)\n",
    "            tokenizer.save_pretrained(ckpt)\n",
    "            print(f'  💾 Checkpoint ep{global_episode}')\n",
    "\n",
    "    # Per-task verdict: mean training score against the pre-training baseline\n",
    "    task_avg = sum(task_scores) / len(task_scores)\n",
    "    delta = task_avg - base_avg\n",
    "    if delta > 0.02:\n",
    "        result = '✅ IMPROVED'\n",
    "    elif delta > -0.02:\n",
    "        result = '⚠️ FLAT'\n",
    "    else:\n",
    "        result = '❌ DEGRADED'\n",
    "    print(f'\\n{result} {task_id}: {base_avg:.3f} → {task_avg:.3f} ({delta:+.3f})')\n",
    "\n",
    "print(f'\\n🎉 Training complete! {(time.time()-start_time)/60:.0f} minutes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.604209Z",
     "iopub.status.idle": "2026-04-21T02:58:40.604579Z",
     "shell.execute_reply": "2026-04-21T02:58:40.604412Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.604391Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 11: Post-Training Evaluation ────────────────────────────────────────\n",
    "FastLanguageModel.for_inference(model)\n",
    "\n",
    "print('Running post-training evaluation (10 episodes per task)...')\n",
    "# task_id -> {'scores': [per-episode scores], 'avg': mean score}\n",
    "post_scores = {}\n",
    "\n",
    "for task_id in CONFIG['tasks']:\n",
    "    scores = []\n",
    "    for i in range(10):\n",
    "        seed = random.randint(10000, 19999)  # unseen seeds\n",
    "        score = run_episode(task_id, seed=seed)\n",
    "        scores.append(score)\n",
    "\n",
    "    avg = sum(scores) / len(scores)\n",
    "    post_scores[task_id] = {'scores': scores, 'avg': avg}\n",
    "    improvement = avg - baseline_scores[task_id]['avg']\n",
    "    # Signed format: a regression prints as -0.123 instead of the misleading +-0.123\n",
    "    print(f'  [{task_id}] Post-training avg: {avg:.3f} '\n",
    "          f'(baseline: {baseline_scores[task_id][\"avg\"]:.3f}, '\n",
    "          f'improvement: {improvement:+.3f})')\n",
    "\n",
    "# Test generalization on unseen tasks\n",
    "print('\\nTesting generalization on UNSEEN tasks...')\n",
    "unseen_tasks = ['hard', 'bonus']\n",
    "generalization_scores = {}\n",
    "for task_id in unseen_tasks:\n",
    "    scores = []\n",
    "    for i in range(5):\n",
    "        seed = random.randint(0, 9999)\n",
    "        try:\n",
    "            score = run_episode(task_id, seed=seed)\n",
    "            scores.append(score)\n",
    "        except Exception as exc:\n",
    "            # Keep evaluating, but make the failure visible: a silent 0.0 is\n",
    "            # indistinguishable from an episode the model genuinely failed\n",
    "            print(f'  [{task_id}] episode {i+1} failed ({type(exc).__name__}: {exc}); scored as 0.0')\n",
    "            scores.append(0.0)\n",
    "    avg = sum(scores) / len(scores)\n",
    "    generalization_scores[task_id] = avg\n",
    "    print(f'  [{task_id}] Generalization avg: {avg:.3f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.605361Z",
     "iopub.status.idle": "2026-04-21T02:58:40.605579Z",
     "shell.execute_reply": "2026-04-21T02:58:40.605486Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.605473Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 12: Learning Curve Visualization ────────────────────────────────────\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.patches as mpatches\n",
    "import numpy as np\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(18, 6))\n",
    "fig.patch.set_facecolor('#0d1117')\n",
    "\n",
    "COLORS = {'easy': '#4caf50', 'medium': '#ff9800', 'hard': '#f44336', 'bonus': '#9c27b0'}\n",
    "\n",
    "# Plot 1: Training reward curves\n",
    "ax1 = axes[0]\n",
    "ax1.set_facecolor('#161b22')\n",
    "ax1.set_title('GRPO Training Reward Curves', color='white', fontsize=13, fontweight='bold')\n",
    "\n",
    "for task_id in CONFIG['tasks']:\n",
    "    task_log = [e for e in training_log if e['task_id'] == task_id]\n",
    "    episodes = [e['episode'] for e in task_log]\n",
    "    scores = [e['score'] for e in task_log]\n",
    "\n",
    "    # Smooth with rolling average\n",
    "    window = 5\n",
    "    smoothed = np.convolve(scores, np.ones(window)/window, mode='valid')\n",
    "    ep_smooth = episodes[window-1:]\n",
    "\n",
    "    color = COLORS.get(task_id, '#58a6ff')\n",
    "    ax1.plot(episodes, scores, alpha=0.2, color=color, linewidth=1)\n",
    "    ax1.plot(ep_smooth, smoothed, color=color, linewidth=2.5,\n",
    "             label=f'{task_id} (smoothed)')\n",
    "\n",
    "ax1.set_xlabel('Episode', color='#8b949e')\n",
    "ax1.set_ylabel('Reward Score', color='#8b949e')\n",
    "ax1.tick_params(colors='#8b949e')\n",
    "ax1.spines['bottom'].set_color('#30363d')\n",
    "ax1.spines['left'].set_color('#30363d')\n",
    "ax1.spines['top'].set_visible(False)\n",
    "ax1.spines['right'].set_visible(False)\n",
    "ax1.legend(facecolor='#161b22', labelcolor='white', fontsize=10)\n",
    "ax1.set_ylim(0, 1.05)\n",
    "ax1.grid(True, alpha=0.1, color='#30363d')\n",
    "\n",
    "# Plot 2: Before vs After bar chart\n",
    "ax2 = axes[1]\n",
    "ax2.set_facecolor('#161b22')\n",
    "ax2.set_title('Before vs After Training', color='white', fontsize=13, fontweight='bold')\n",
    "\n",
    "all_tasks = CONFIG['tasks']\n",
    "x = np.arange(len(all_tasks))\n",
    "width = 0.35\n",
    "\n",
    "before_vals = [baseline_scores[t]['avg'] for t in all_tasks]\n",
    "after_vals = [post_scores[t]['avg'] for t in all_tasks]\n",
    "\n",
    "bars1 = ax2.bar(x - width/2, before_vals, width, label='Before Training',\n",
    "                color='#f85149', alpha=0.8, edgecolor='none')\n",
    "bars2 = ax2.bar(x + width/2, after_vals, width, label='After Training',\n",
    "                color='#3fb950', alpha=0.8, edgecolor='none')\n",
    "\n",
    "for bar, val in zip(bars1, before_vals):\n",
    "    ax2.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.01,\n",
    "             f'{val:.2f}', ha='center', va='bottom', color='white', fontsize=9)\n",
    "for bar, val in zip(bars2, after_vals):\n",
    "    ax2.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.01,\n",
    "             f'{val:.2f}', ha='center', va='bottom', color='white', fontsize=9)\n",
    "\n",
    "ax2.set_xticks(x)\n",
    "ax2.set_xticklabels(all_tasks, color='#8b949e')\n",
    "ax2.tick_params(colors='#8b949e')\n",
    "ax2.spines['bottom'].set_color('#30363d')\n",
    "ax2.spines['left'].set_color('#30363d')\n",
    "ax2.spines['top'].set_visible(False)\n",
    "ax2.spines['right'].set_visible(False)\n",
    "ax2.legend(facecolor='#161b22', labelcolor='white', fontsize=10)\n",
    "ax2.set_ylim(0, 1.1)\n",
    "ax2.set_ylabel('Average Score', color='#8b949e')\n",
    "ax2.grid(True, alpha=0.1, color='#30363d', axis='y')\n",
    "\n",
    "# Plot 3: Summary stats\n",
    "ax3 = axes[2]\n",
    "ax3.set_facecolor('#161b22')\n",
    "ax3.set_title('Training Summary', color='white', fontsize=13, fontweight='bold')\n",
    "ax3.axis('off')\n",
    "\n",
    "summary_lines = [\n",
    "    ('Model', CONFIG['model_name'].split('/')[-1]),\n",
    "    ('Algorithm', 'GRPO (Group Relative PO)'),\n",
    "    ('Fine-tuning', 'Unsloth LoRA 4-bit'),\n",
    "    ('Total Episodes', str(global_episode)),\n",
    "    ('', ''),\n",
    "]\n",
    "for task_id in CONFIG['tasks']:\n",
    "    before = baseline_scores[task_id]['avg']\n",
    "    after = post_scores[task_id]['avg']\n",
    "    summary_lines.append((f'{task_id} improvement',\n",
    "                           f'{before:.2f} → {after:.2f} (+{after-before:.2f})'))\n",
    "\n",
    "if generalization_scores:\n",
    "    summary_lines.append(('', ''))\n",
    "    summary_lines.append(('Generalization', ''))\n",
    "    for task_id, score in generalization_scores.items():\n",
    "        summary_lines.append((f'  {task_id} (unseen)', f'{score:.2f}'))\n",
    "\n",
    "y_pos = 0.95\n",
    "for label, value in summary_lines:\n",
    "    if label == '':\n",
    "        y_pos -= 0.05\n",
    "        continue\n",
    "    ax3.text(0.05, y_pos, label + ':', color='#8b949e', fontsize=10,\n",
    "             transform=ax3.transAxes, fontweight='bold')\n",
    "    ax3.text(0.55, y_pos, value, color='#c9d1d9', fontsize=10,\n",
    "             transform=ax3.transAxes)\n",
    "    y_pos -= 0.08\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('training_curve.png', dpi=150, bbox_inches='tight',\n",
    "            facecolor='#0d1117')\n",
    "print('✅ Saved training_curve.png')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2026-04-21T02:58:40.606682Z",
     "iopub.status.idle": "2026-04-21T02:58:40.607021Z",
     "shell.execute_reply": "2026-04-21T02:58:40.606854Z",
     "shell.execute_reply.started": "2026-04-21T02:58:40.606840Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# ── Cell 13: Save Weights to HuggingFace Hub ─────────────────────────────────\n",
    "from huggingface_hub import HfApi\n",
    "\n",
    "print(f'Saving model to HuggingFace Hub: {CONFIG[\"hf_repo\"]}')\n",
    "print('This may take 5-10 minutes...')\n",
    "\n",
    "# Save merged model (LoRA merged into base)\n",
    "model.save_pretrained_merged(\n",
    "    CONFIG['output_dir'],\n",
    "    tokenizer,\n",
    "    save_method='merged_16bit',\n",
    ")\n",
    "\n",
    "# Push to Hub\n",
    "model.push_to_hub_merged(\n",
    "    CONFIG['hf_repo'],\n",
    "    tokenizer,\n",
    "    save_method='merged_16bit',\n",
    "    token=hf_token,\n",
    ")\n",
    "\n",
    "print(f'\\n✅ Model pushed to: https://huggingface.co/{CONFIG[\"hf_repo\"]}')\n",
    "\n",
    "# Also push training curve\n",
    "api = HfApi()\n",
    "api.upload_file(\n",
    "    path_or_fileobj='training_curve.png',\n",
    "    path_in_repo='training_curve.png',\n",
    "    repo_id=CONFIG['hf_repo'],\n",
    "    token=hf_token,\n",
    ")\n",
    "print('✅ training_curve.png uploaded to Hub')\n",
    "\n",
    "# Save training log as JSON\n",
    "import json\n",
    "with open('training_log.json', 'w') as f:\n",
    "    json.dump(training_log, f, indent=2)\n",
    "api.upload_file(\n",
    "    path_or_fileobj='training_log.json',\n",
    "    path_in_repo='training_log.json',\n",
    "    repo_id=CONFIG['hf_repo'],\n",
    "    token=hf_token,\n",
    ")\n",
    "print('✅ training_log.json uploaded')\n",
    "print(f'\\n🎉 Everything saved. Your fine-tuned model is live at:')\n",
    "print(f'   https://huggingface.co/{CONFIG[\"hf_repo\"]}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Results Summary\n",
    "\n",
    "| Metric | Value |\n",
    "|--------|-------|\n",
    "| Model | Llama-3.2-3B-Instruct (Unsloth 4-bit LoRA) |\n",
    "| Algorithm | GRPO — episode-level updates + KL penalty |\n",
    "| Tasks trained | easy, medium |\n",
    "| Total episodes | 160 |\n",
    "| Key fix | Fresh env per group completion — reward gates not burned |\n",
    "| Weights | `https://huggingface.co/Arijit-07/aria-devops-llama3b` |"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Config set\n",
      "Checkpoint: D:\\My Projects\\devops-incident-env\\kaggle_training\\results\\aria-llama3b\\checkpoint-ep140\n",
      "Exists: True\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "os.environ['UNSLOTH_RETURN_LOGITS'] = '1'\n",
    "\n",
    "CHECKPOINT = r'D:\\My Projects\\devops-incident-env\\kaggle_training\\results\\aria-llama3b\\checkpoint-ep140'\n",
    "HF_TOKEN = 'YOUR_HF_WRITE_TOKEN_HERE'  # ← paste your write token\n",
    "HF_REPO = 'Arijit-07/aria-devops-llama3b'\n",
    "BASE_URL = 'https://arijit-07-devops-incident-response.hf.space'\n",
    "\n",
    "print('✅ Config set')\n",
    "print(f'Checkpoint: {CHECKPOINT}')\n",
    "print(f'Exists: {os.path.exists(CHECKPOINT)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "NotImplementedError",
     "evalue": "Unsloth cannot find any torch accelerator? You need a GPU.",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mNotImplementedError\u001b[0m                       Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[2], line 9\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01msubprocess\u001b[39;00m\n\u001b[0;32m      2\u001b[0m subprocess\u001b[38;5;241m.\u001b[39mrun([\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpip\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124minstall\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m-q\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m      3\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124munsloth\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m      4\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtransformers>=4.48.0\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m      5\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mpeft\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maccelerate\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbitsandbytes\u001b[39m\u001b[38;5;124m'\u001b[39m,\n\u001b[0;32m      6\u001b[0m     \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrequests\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmatplotlib\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhuggingface_hub\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtorch\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m      7\u001b[0m ], capture_output\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m----> 9\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01munsloth\u001b[39;00m\u001b[38;5;241m,\u001b[39m \u001b[38;5;21;01mtransformers\u001b[39;00m\u001b[38;5;241m,\u001b[39m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m     10\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m✅ unsloth 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00munsloth\u001b[38;5;241m.\u001b[39m__version__\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m     11\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m✅ transformers \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtransformers\u001b[38;5;241m.\u001b[39m__version__\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n",
      "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\unsloth\\__init__.py:105\u001b[0m\n\u001b[0;32m     93\u001b[0m         \u001b[38;5;28mprint\u001b[39m(\n\u001b[0;32m     94\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnsloth: Please update Unsloth and Unsloth-Zoo to the latest version!\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m     95\u001b[0m             \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDo this via `pip install --upgrade --force-reinstall --no-cache-dir --no-deps unsloth unsloth_zoo`\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m     96\u001b[0m         )\n\u001b[0;32m     97\u001b[0m         \u001b[38;5;66;03m# if os.environ.get(\"UNSLOTH_DISABLE_AUTO_UPDATES\", \"0\") == \"0\":\u001b[39;00m\n\u001b[0;32m     98\u001b[0m         \u001b[38;5;66;03m#     try:\u001b[39;00m\n\u001b[0;32m     99\u001b[0m         \u001b[38;5;66;03m#         os.system(\"pip install --upgrade --no-cache-dir --no-deps unsloth_zoo\")\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    103\u001b[0m         \u001b[38;5;66;03m#         except:\u001b[39;00m\n\u001b[0;32m    104\u001b[0m         \u001b[38;5;66;03m#             raise ImportError(\"Unsloth: Please update unsloth_zoo via `pip install --upgrade --no-cache-dir --no-deps unsloth_zoo`\")\u001b[39;00m\n\u001b[1;32m--> 105\u001b[0m     \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01munsloth_zoo\u001b[39;00m\n\u001b[0;32m    106\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m PackageNotFoundError:\n\u001b[0;32m    107\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\n\u001b[0;32m    108\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnsloth: Please install unsloth_zoo via `pip install unsloth_zoo` then retry!\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m    109\u001b[0m     )\n",
      "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\unsloth_zoo\\__init__.py:220\u001b[0m\n\u001b[0;32m    217\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m logging, torchao_logger, HideLoggingMessage\n\u001b[0;32m    219\u001b[0m \u001b[38;5;66;03m# Get device types and other variables\u001b[39;00m\n\u001b[1;32m--> 220\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdevice_type\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[0;32m    221\u001b[0m     is_hip,\n\u001b[0;32m    222\u001b[0m     get_device_type,\n\u001b[0;32m    223\u001b[0m     DEVICE_TYPE,\n\u001b[0;32m    224\u001b[0m     DEVICE_TYPE_TORCH,\n\u001b[0;32m    225\u001b[0m     DEVICE_COUNT,\n\u001b[0;32m    226\u001b[0m     ALLOW_PREQUANTIZED_MODELS,\n\u001b[0;32m    227\u001b[0m )\n\u001b[0;32m    228\u001b[0m IS_HIP_RUNTIME \u001b[38;5;241m=\u001b[39m (DEVICE_TYPE \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhip\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mbool\u001b[39m(is_hip())\n\u001b[0;32m    230\u001b[0m \u001b[38;5;66;03m# Torch >= 2.9 uses PYTORCH_ALLOC_CONF and treats legacy per-backend vars as deprecated.\u001b[39;00m\n",
      "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\unsloth_zoo\\device_type.py:231\u001b[0m\n\u001b[0;32m    229\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnsloth currently only works on NVIDIA, AMD and Intel GPUs.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m    230\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m--> 231\u001b[0m DEVICE_TYPE : \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m get_device_type()\n\u001b[0;32m    232\u001b[0m \u001b[38;5;66;03m# HIP fails for autocast and other torch functions. Use CUDA instead\u001b[39;00m\n\u001b[0;32m    233\u001b[0m DEVICE_TYPE_TORCH \u001b[38;5;241m=\u001b[39m DEVICE_TYPE\n",
      "File \u001b[1;32m~\\anaconda3\\Lib\\site-packages\\unsloth_zoo\\device_type.py:218\u001b[0m, in \u001b[0;36mget_device_type\u001b[1;34m()\u001b[0m\n\u001b[0;32m    216\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m amd_hint \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m    217\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(amd_hint)\n\u001b[1;32m--> 218\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnsloth cannot find any torch accelerator? You need a GPU.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m    219\u001b[0m accelerator \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(torch\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39mcurrent_accelerator())\n\u001b[0;32m    220\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m accelerator \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcuda\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mxpu\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhip\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
      "\u001b[1;31mNotImplementedError\u001b[0m: Unsloth cannot find any torch accelerator? You need a GPU."
     ]
    }
   ],
   "source": [
    "import subprocess\n",
    "subprocess.run(['pip', 'install', '-q',\n",
    "    'unsloth',\n",
    "    'transformers>=4.48.0',\n",
    "    'peft', 'accelerate', 'bitsandbytes',\n",
    "    'requests', 'matplotlib', 'huggingface_hub', 'torch'\n",
    "], capture_output=True)\n",
    "\n",
    "import unsloth, transformers, torch\n",
    "print(f'✅ unsloth {unsloth.__version__}')\n",
    "print(f'✅ transformers {transformers.__version__}')\n",
    "print(f'✅ torch {torch.__version__} | CUDA: {torch.cuda.is_available()}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from unsloth import FastLanguageModel\n",
    "\n",
    "print('Loading fine-tuned checkpoint...')\n",
    "ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name=CHECKPOINT,\n",
    "    max_seq_length=2048,\n",
    "    load_in_4bit=True,\n",
    ")\n",
    "FastLanguageModel.for_inference(ft_model)\n",
    "print('✅ Fine-tuned model loaded')\n",
    "print(f'VRAM used: {torch.cuda.memory_allocated()/1e9:.2f} GB')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Loading base model for comparison...')\n",
    "base_model, base_tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name='unsloth/Llama-3.2-3B-Instruct',\n",
    "    max_seq_length=2048,\n",
    "    load_in_4bit=True,\n",
    ")\n",
    "FastLanguageModel.for_inference(base_model)\n",
    "print('✅ Base model loaded')\n",
    "print(f'VRAM used: {torch.cuda.memory_allocated()/1e9:.2f} GB')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Loading base model for comparison...')\n",
    "base_model, base_tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name='unsloth/Llama-3.2-3B-Instruct',\n",
    "    max_seq_length=2048,\n",
    "    load_in_4bit=True,\n",
    ")\n",
    "FastLanguageModel.for_inference(base_model)\n",
    "print('✅ Base model loaded')\n",
    "print(f'VRAM used: {torch.cuda.memory_allocated()/1e9:.2f} GB')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests, json, re, random\n",
    "\n",
    "def env_reset(task_id, seed=None):\n",
    "    payload = {'task_id': task_id}\n",
    "    if seed is not None: payload['seed'] = seed\n",
    "    for attempt in range(3):\n",
    "        try:\n",
    "            r = requests.post(f'{BASE_URL}/reset', json=payload, timeout=30)\n",
    "            r.raise_for_status()\n",
    "            return r.json()\n",
    "        except:\n",
    "            if attempt == 2: raise\n",
    "            import time; time.sleep(5)\n",
    "\n",
    "def env_step(action):\n",
    "    for attempt in range(3):\n",
    "        try:\n",
    "            r = requests.post(f'{BASE_URL}/step', json=action, timeout=30)\n",
    "            r.raise_for_status()\n",
    "            return r.json()\n",
    "        except:\n",
    "            if attempt == 2: raise\n",
    "            import time; time.sleep(5)\n",
    "\n",
    "def env_state():\n",
    "    r = requests.get(f'{BASE_URL}/state', timeout=30)\n",
    "    r.raise_for_status()\n",
    "    return r.json()\n",
    "\n",
    "SYSTEM_PROMPT = \"\"\"You are an expert DevOps engineer responding to a production incident.\n",
    "Respond with ONLY a valid JSON action object. No explanation, no markdown.\n",
    "Available actions:\n",
    "- {\"action_type\": \"read_logs\", \"service\": \"<name>\"}\n",
    "- {\"action_type\": \"read_metrics\", \"service\": \"<name>\"}\n",
    "- {\"action_type\": \"search_logs\", \"service\": \"<name>\", \"query\": \"<term>\"}\n",
    "- {\"action_type\": \"diagnose\", \"root_cause\": \"<diagnosis>\"}\n",
    "- {\"action_type\": \"restart_service\", \"service\": \"<name>\"}\n",
    "- {\"action_type\": \"rollback\", \"service\": \"<name>\", \"version\": \"previous\"}\n",
    "- {\"action_type\": \"alert_oncall\", \"message\": \"<msg>\"}\n",
    "- {\"action_type\": \"noop\"}\"\"\"\n",
    "\n",
    "def obs_to_text(obs, task_id):\n",
    "    lines = [f'=== INCIDENT | Task: {task_id.upper()} | Step: {obs.get(\"step\",0)}/{obs.get(\"max_steps\",15)} ===', '']\n",
    "    for a in sorted(obs.get('active_alerts', []),\n",
    "                    key=lambda x: x.get('severity',''), reverse=True):\n",
    "        lines.append(f'ALERT [{a.get(\"severity\",\"\").upper()}] {a.get(\"service\",\"\")}: {a.get(\"message\",\"\")}')\n",
    "    lines.append('')\n",
    "    for s in sorted(obs.get('services', []),\n",
    "                    key=lambda x: x.get('error_rate',0), reverse=True):\n",
    "        lines.append(\n",
    "            f'SERVICE {s.get(\"name\",\"\"):28s} | {s.get(\"status\",\"\"):10s} | '\n",
    "            f'err={s.get(\"error_rate\",0):.3f} | mem={s.get(\"memory\",0):.1f}%'\n",
    "        )\n",
    "    evidence = obs.get('evidence_log', [])\n",
    "    if evidence:\n",
    "        lines.append('')\n",
    "        lines.append('EVIDENCE:')\n",
    "        for e in evidence[-3:]:\n",
    "            lines.append(f'  [{e.get(\"action_type\",\"\").upper()}] {e.get(\"content\",\"\")[:150]}')\n",
    "    return '\\n'.join(lines)\n",
    "\n",
    "def parse_action(text):\n",
    "    text = text.strip()\n",
    "    for pat in [\n",
    "        r'```json\\s*({.*?})\\s*```',\n",
    "        r'```\\s*({.*?})\\s*```',\n",
    "        r'({\\s*\"action_type\"[^}]+})',\n",
    "    ]:\n",
    "        m = re.search(pat, text, re.DOTALL)\n",
    "        if m:\n",
    "            try: return json.loads(m.group(1))\n",
    "            except: continue\n",
    "    try: return json.loads(text)\n",
    "    except: return {'action_type': 'noop'}\n",
    "\n",
    "def run_episode(m, tok, task_id, seed, verbose=False):\n",
    "    obs = env_reset(task_id, seed=seed)\n",
    "    done = False\n",
    "    for step in range(15):\n",
    "        if done: break\n",
    "        msgs = [\n",
    "            {'role': 'system', 'content': SYSTEM_PROMPT},\n",
    "            {'role': 'user', 'content': obs_to_text(obs, task_id)}\n",
    "        ]\n",
    "        ids = tok.apply_chat_template(\n",
    "            msgs, tokenize=True, add_generation_prompt=True,\n",
    "            return_tensors='pt'\n",
    "        ).to('cuda')\n",
    "        with torch.no_grad():\n",
    "            out = m.generate(\n",
    "                ids, max_new_tokens=100, temperature=0.3,\n",
    "                do_sample=True, pad_token_id=tok.eos_token_id,\n",
    "            )\n",
    "        text = tok.decode(out[0][ids.shape[1]:], skip_special_tokens=True)\n",
    "        action = parse_action(text)\n",
    "        if verbose:\n",
    "            print(f'  Step {step+1}: {action}')\n",
    "        result = env_step(action)\n",
    "        obs = result.get('observation', obs)\n",
    "        done = result.get('done', False)\n",
    "    return env_state().get('current_score', 0.0)\n",
    "\n",
    "# Test connection\n",
    "health = requests.get(f'{BASE_URL}/health', timeout=15).json()\n",
    "print(f'✅ Environment: {health}')\n",
    "print('✅ All helpers ready')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEEDS = [50001, 50008, 50015, 50022, 50029,\n",
    "         50036, 50043, 50050, 50057, 50064]\n",
    "\n",
    "results = {}\n",
    "print('Running evaluation — 10 episodes per model per task')\n",
    "print('='*60)\n",
    "\n",
    "for task_id in ['easy', 'medium']:\n",
    "    print(f'\\nTask: {task_id.upper()}')\n",
    "    print('-'*40)\n",
    "    base_scores, ft_scores = [], []\n",
    "\n",
    "    for seed in SEEDS:\n",
    "        bs = run_episode(base_model, base_tokenizer, task_id, seed)\n",
    "        fs = run_episode(ft_model, ft_tokenizer, task_id, seed)\n",
    "        base_scores.append(bs)\n",
    "        ft_scores.append(fs)\n",
    "        print(f'  seed={seed} | base={bs:.3f} | fine-tuned={fs:.3f} | Δ={fs-bs:+.3f}')\n",
    "\n",
    "    base_avg = sum(base_scores)/len(base_scores)\n",
    "    ft_avg = sum(ft_scores)/len(ft_scores)\n",
    "    delta = ft_avg - base_avg\n",
    "    results[task_id] = {\n",
    "        'base_scores': base_scores,\n",
    "        'ft_scores': ft_scores,\n",
    "        'base_avg': base_avg,\n",
    "        'ft_avg': ft_avg,\n",
    "        'delta': delta\n",
    "    }\n",
    "    symbol = '✅ IMPROVED' if delta > 0.02 else '⚠️ FLAT' if delta > -0.02 else '❌ DEGRADED'\n",
    "    print(f'\\n{symbol}')\n",
    "    print(f'  Base avg:       {base_avg:.3f}')\n",
    "    print(f'  Fine-tuned avg: {ft_avg:.3f}')\n",
    "    print(f'  Improvement:    {delta:+.3f}')\n",
    "\n",
    "print('\\n' + '='*60)\n",
    "print('FINAL RESULTS')\n",
    "print('='*60)\n",
    "for task_id, r in results.items():\n",
    "    print(f'{task_id}: {r[\"base_avg\"]:.3f} → {r[\"ft_avg\"]:.3f} ({r[\"delta\"]:+.3f})')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "\n",
    "fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
    "fig.patch.set_facecolor('#0d1117')\n",
    "\n",
    "COLORS = {'base': '#f85149', 'ft': '#3fb950'}\n",
    "\n",
    "for idx, task_id in enumerate(['easy', 'medium']):\n",
    "    ax = axes[idx]\n",
    "    ax.set_facecolor('#161b22')\n",
    "    r = results[task_id]\n",
    "\n",
    "    x = np.arange(len(SEEDS))\n",
    "    w = 0.35\n",
    "    b = ax.bar(x - w/2, r['base_scores'], w,\n",
    "               label='Base model', color='#f85149', alpha=0.85)\n",
    "    f = ax.bar(x + w/2, r['ft_scores'], w,\n",
    "               label='Fine-tuned (ep140)', color='#3fb950', alpha=0.85)\n",
    "\n",
    "    # Value labels\n",
    "    for bar in b:\n",
    "        h = bar.get_height()\n",
    "        ax.text(bar.get_x()+bar.get_width()/2., h+0.01,\n",
    "                f'{h:.2f}', ha='center', color='white', fontsize=7)\n",
    "    for bar in f:\n",
    "        h = bar.get_height()\n",
    "        ax.text(bar.get_x()+bar.get_width()/2., h+0.01,\n",
    "                f'{h:.2f}', ha='center', color='white', fontsize=7)\n",
    "\n",
    "    # Avg lines\n",
    "    ax.axhline(y=r['base_avg'], color='#f85149', linestyle='--',\n",
    "               linewidth=1.5, alpha=0.6, label=f'Base avg: {r[\"base_avg\"]:.3f}')\n",
    "    ax.axhline(y=r['ft_avg'], color='#3fb950', linestyle='--',\n",
    "               linewidth=1.5, alpha=0.6, label=f'FT avg: {r[\"ft_avg\"]:.3f}')\n",
    "\n",
    "    ax.set_title(\n",
    "        f'Task: {task_id.upper()} | Improvement: {r[\"delta\"]:+.3f}',\n",
    "        color='white', fontsize=13, fontweight='bold'\n",
    "    )\n",
    "    ax.set_xticks(x)\n",
    "    ax.set_xticklabels([f's{i+1}' for i in range(len(SEEDS))],\n",
    "                       color='#8b949e', fontsize=8)\n",
    "    ax.tick_params(colors='#8b949e')\n",
    "    for spine in ax.spines.values(): spine.set_color('#30363d')\n",
    "    ax.spines['top'].set_visible(False)\n",
    "    ax.spines['right'].set_visible(False)\n",
    "    ax.set_ylim(0, 1.15)\n",
    "    ax.set_ylabel('Score', color='#8b949e')\n",
    "    ax.set_xlabel('Episode (unseen seeds)', color='#8b949e')\n",
    "    ax.legend(facecolor='#161b22', labelcolor='white', fontsize=9)\n",
    "    ax.grid(True, alpha=0.1, color='#30363d', axis='y')\n",
    "\n",
    "fig.suptitle(\n",
    "    'ARIA — GRPO Fine-tuning Results\\nLlama-3.2-3B | 140 episodes | easy + medium tasks',\n",
    "    color='white', fontsize=14, fontweight='bold', y=1.02\n",
    ")\n",
    "plt.tight_layout()\n",
    "plt.savefig('training_curve.png', dpi=150, bbox_inches='tight',\n",
    "            facecolor='#0d1117')\n",
    "print('✅ Saved training_curve.png')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import HfApi, login\n",
    "import os\n",
    "\n",
    "login(token=HF_TOKEN)\n",
    "api = HfApi()\n",
    "\n",
    "# Create repo\n",
    "api.create_repo(\n",
    "    repo_id=HF_REPO,\n",
    "    repo_type='model',\n",
    "    exist_ok=True,\n",
    "    token=HF_TOKEN,\n",
    ")\n",
    "print(f'✅ Repo ready: {HF_REPO}')\n",
    "\n",
    "# Upload checkpoint files\n",
    "print('Uploading adapter weights...')\n",
    "for filename in os.listdir(CHECKPOINT):\n",
    "    filepath = os.path.join(CHECKPOINT, filename)\n",
    "    if os.path.isfile(filepath):\n",
    "        api.upload_file(\n",
    "            path_or_fileobj=filepath,\n",
    "            path_in_repo=filename,\n",
    "            repo_id=HF_REPO,\n",
    "            token=HF_TOKEN,\n",
    "        )\n",
    "        print(f'  ✅ {filename}')\n",
    "\n",
    "# Upload training curve\n",
    "api.upload_file(\n",
    "    path_or_fileobj='training_curve.png',\n",
    "    path_in_repo='training_curve.png',\n",
    "    repo_id=HF_REPO,\n",
    "    token=HF_TOKEN,\n",
    ")\n",
    "print('  ✅ training_curve.png')\n",
    "\n",
    "print(f'\\n🎉 Everything live at: https://huggingface.co/{HF_REPO}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "easy_r = results['easy']\n",
    "medium_r = results['medium']\n",
    "\n",
    "model_card = f\"\"\"---\n",
    "base_model: unsloth/Llama-3.2-3B-Instruct\n",
    "library_name: peft\n",
    "pipeline_tag: text-generation\n",
    "tags:\n",
    "- lora\n",
    "- unsloth\n",
    "- grpo\n",
    "- reinforcement-learning\n",
    "- devops\n",
    "- incident-response\n",
    "---\n",
    "\n",
    "# ARIA — DevOps Incident Response Agent\n",
    "## Llama-3.2-3B fine-tuned with GRPO\n",
    "\n",
    "Fine-tuned on the [ARIA DevOps Incident Response](https://huggingface.co/spaces/Arijit-07/devops-incident-response) \n",
    "RL environment using Group Relative Policy Optimization (GRPO).\n",
    "\n",
    "## Training\n",
    "\n",
    "- **Algorithm:** GRPO (Group Relative Policy Optimization)\n",
    "- **Base model:** Llama-3.2-3B-Instruct\n",
    "- **Fine-tuning:** Unsloth LoRA (rank=16, alpha=32, 4-bit quantized)\n",
    "- **Episodes:** 140 (easy + medium tasks)\n",
    "- **Environment:** Live DevOps incident response simulation\n",
    "\n",
    "## Results (10 unseen episodes per task)\n",
    "\n",
    "| Task | Base Model | Fine-tuned | Improvement |\n",
    "|------|-----------|------------|-------------|\n",
    "| easy | {easy_r['base_avg']:.3f} | {easy_r['ft_avg']:.3f} | {easy_r['delta']:+.3f} |\n",
    "| medium | {medium_r['base_avg']:.3f} | {medium_r['ft_avg']:.3f} | {medium_r['delta']:+.3f} |\n",
    "\n",
    "## Environment\n",
    "\n",
    "The agent learns to diagnose and fix production incidents:\n",
    "- 7 task types: OOM crashes, cascading failures, silent corruption, DDoS, DB degradation, multi-region failover\n",
    "- 14 action types including read_logs, diagnose, restart_service, rollback\n",
    "- Dense reward shaping with collateral damage penalties\n",
    "- Partial log observability — agents must learn to query\n",
    "\n",
    "## Usage\n",
    "\n",
    "```python\n",
    "from peft import PeftModel\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "\n",
    "base = AutoModelForCausalLM.from_pretrained(\"unsloth/Llama-3.2-3B-Instruct\")\n",
    "model = PeftModel.from_pretrained(base, \"Arijit-07/aria-devops-llama3b\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"Arijit-07/aria-devops-llama3b\")\n",
    "```\n",
    "\n",
    "## Links\n",
    "- Environment: https://huggingface.co/spaces/Arijit-07/devops-incident-response\n",
    "- API docs: https://arijit-07-devops-incident-response.hf.space/docs\n",
    "\"\"\"\n",
    "\n",
    "with open('README.md', 'w') as f:\n",
    "    f.write(model_card)\n",
    "\n",
    "api.upload_file(\n",
    "    path_or_fileobj='README.md',\n",
    "    path_in_repo='README.md',\n",
    "    repo_id=HF_REPO,\n",
    "    token=HF_TOKEN,\n",
    ")\n",
    "print('✅ Model card uploaded')\n",
    "print(f'🎉 Complete: https://huggingface.co/{HF_REPO}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "HF_TOKEN = 'YOUR_HF_WRITE_TOKEN_HERE'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Logged in\n"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import HfApi, login\n",
    "\n",
    "HF_TOKEN = 'YOUR_HF_WRITE_TOKEN_HERE'  # ← paste your REAL token here\n",
    "                                            # it must start with hf_\n",
    "\n",
    "login(token=HF_TOKEN, add_to_git_credential=False)\n",
    "print('✅ Logged in')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Repo ready\n",
      "✅ adapter_config.json\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5cf0fc74219f4cacbbe36a55bfa8d99f",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Processing Files (0 / 0): |          |  0.00B /  0.00B            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eca0c983e1b742e0bbcc765a0e7059bd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "New Data Upload: |          |  0.00B /  0.00B            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ adapter_model.safetensors\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No files have been modified since last commit. Skipping to prevent empty commit.\n",
      "[huggingface_hub.hf_api|WARNING]No files have been modified since last commit. Skipping to prevent empty commit.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ chat_template.jinja\n",
      "✅ README.md\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "62ad8e5402f140e8ad64f72cc4f82d8d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Processing Files (0 / 0): |          |  0.00B /  0.00B            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "16398f828ca742df99cd6e237f976e7b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "New Data Upload: |          |  0.00B /  0.00B            "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No files have been modified since last commit. Skipping to prevent empty commit.\n",
      "[huggingface_hub.hf_api|WARNING]No files have been modified since last commit. Skipping to prevent empty commit.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ tokenizer.json\n",
      "✅ tokenizer_config.json\n",
      "\n",
      "🎉 Live at: https://huggingface.co/Arijit-07/aria-devops-llama3b\n"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import HfApi\n",
    "import os\n",
    "\n",
    "HF_REPO = 'Arijit-07/aria-devops-llama3b'\n",
    "CHECKPOINT = r'D:\\My Projects\\devops-incident-env\\kaggle_training\\results\\aria-llama3b\\checkpoint-ep140'\n",
    "\n",
    "api = HfApi()\n",
    "\n",
    "api.create_repo(repo_id=HF_REPO, repo_type='model', exist_ok=True)\n",
    "print(f'✅ Repo ready')\n",
    "\n",
    "for filename in os.listdir(CHECKPOINT):\n",
    "    filepath = os.path.join(CHECKPOINT, filename)\n",
    "    if os.path.isfile(filepath):\n",
    "        api.upload_file(\n",
    "            path_or_fileobj=filepath,\n",
    "            path_in_repo=filename,\n",
    "            repo_id=HF_REPO,\n",
    "            token=HF_TOKEN,\n",
    "        )\n",
    "        print(f'✅ {filename}')\n",
    "\n",
    "print(f'\\n🎉 Live at: https://huggingface.co/{HF_REPO}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Model card uploaded\n",
      "🎉 https://huggingface.co/Arijit-07/aria-devops-llama3b\n"
     ]
    }
   ],
   "source": [
    "model_card = f\"\"\"---\n",
    "base_model: unsloth/Llama-3.2-3B-Instruct\n",
    "library_name: peft\n",
    "pipeline_tag: text-generation\n",
    "tags:\n",
    "- lora\n",
    "- unsloth\n",
    "- grpo\n",
    "- reinforcement-learning\n",
    "- devops\n",
    "- incident-response\n",
    "---\n",
    "\n",
    "# ARIA — DevOps Incident Response Agent\n",
    "### Llama-3.2-3B fine-tuned with GRPO\n",
    "\n",
    "Fine-tuned on the [ARIA DevOps Incident Response](https://huggingface.co/spaces/Arijit-07/devops-incident-response) \n",
    "RL environment using Group Relative Policy Optimization (GRPO).\n",
    "\n",
    "## Training Details\n",
    "\n",
    "- **Algorithm:** GRPO (Group Relative Policy Optimization)\n",
    "- **Base model:** Llama-3.2-3B-Instruct\n",
    "- **Fine-tuning:** Unsloth LoRA (rank=16, alpha=32, 4-bit quantized)\n",
    "- **Episodes:** 140 across easy + medium tasks\n",
    "- **Training time:** ~10 hours on Kaggle T4 x2\n",
    "- **Environment:** Live DevOps incident response simulation\n",
    "\n",
    "## What the Agent Learns\n",
    "\n",
    "The agent is trained to respond to production software incidents by:\n",
    "1. Gathering information (read_logs, read_metrics, search_logs)\n",
    "2. Diagnosing the root cause before acting\n",
    "3. Applying the correct fix (restart, rollback, scale_up, block_ip etc.)\n",
    "4. Avoiding collateral damage to healthy services\n",
    "\n",
    "## Environment\n",
    "\n",
    "7 task types of escalating difficulty:\n",
    "- **Easy:** Single service OOM crash-loop\n",
    "- **Medium:** Cascading connection pool failure\n",
    "- **Hard:** Silent data corruption (all services green)\n",
    "- **Bonus:** Two simultaneous independent failures\n",
    "- **Security:** DDoS botnet credential stuffing\n",
    "- **Database:** Missing index causing full table scans\n",
    "- **Failover:** Multi-region network partition\n",
    "\n",
    "14 action types · Dense reward shaping · Partial log observability · SLA degradation per step\n",
    "\n",
    "## Links\n",
    "- **Live environment:** https://huggingface.co/spaces/Arijit-07/devops-incident-response\n",
    "- **Interactive API:** https://arijit-07-devops-incident-response.hf.space/docs\n",
    "- **GitHub:** https://github.com/Twilight-13/devops-incident-response\n",
    "\n",
    "## Usage\n",
    "\n",
    "```python\n",
    "from peft import PeftModel\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "\n",
    "base = AutoModelForCausalLM.from_pretrained(\n",
    "    \"unsloth/Llama-3.2-3B-Instruct\",\n",
    "    load_in_4bit=True\n",
    ")\n",
    "model = PeftModel.from_pretrained(base, \"Arijit-07/aria-devops-llama3b\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"Arijit-07/aria-devops-llama3b\")\n",
    "```\n",
    "\"\"\"\n",
    "\n",
    "from huggingface_hub import HfApi\n",
    "api = HfApi()\n",
    "\n",
    "with open('README.md', 'w', encoding='utf-8') as f:\n",
    "    f.write(model_card)\n",
    "\n",
    "api.upload_file(\n",
    "    path_or_fileobj='README.md',\n",
    "    path_in_repo='README.md',\n",
    "    repo_id='Arijit-07/aria-devops-llama3b',\n",
    "    token=HF_TOKEN,\n",
    ")\n",
    "print('✅ Model card uploaded')\n",
    "print('🎉 https://huggingface.co/Arijit-07/aria-devops-llama3b')"
   ]
  }
 ],
 "metadata": {
  "kaggle": {
   "accelerator": "nvidiaTeslaT4",
   "dataSources": [],
   "dockerImageVersionId": 31329,
   "isGpuEnabled": true,
   "isInternetEnabled": true,
   "language": "python",
   "sourceType": "notebook"
  },
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}