Spaces:

AlgoCore
/

support-ticket-env

Sleeping

App Files Files Community

Vighnesh committed on Apr 26

Commit

5648ca2

1 Parent(s): 7bdf1e0

add training notebook

Browse files

Files changed (1) hide show

train_grpo.ipynb +15 -787

train_grpo.ipynb CHANGED Viewed

@@ -16,7 +16,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Support Ticket Env — GRPO Fine-Tuning\n",
     "**OpenEnv x Scalar Hackathon**\n",
     "\n",
     "Fine-tunes `Qwen/Qwen2.5-0.5B-Instruct` using **real GRPO** (`trl.GRPOTrainer`) + LoRA (PEFT)\n",
@@ -26,7 +26,7 @@
     "- **Algorithm:** GRPO via `trl.GRPOTrainer` (proper clipped ratio + KL vs reference model)\n",
     "- **Environment:** https://algocore-support-ticket-env.hf.space\n",
     "- **Runtime:** ~30-45 min on Kaggle P100/T4 (or Colab)\n",
-    "- **No Unsloth** — standard HuggingFace transformers + PEFT"
    ]
   },
   {
@@ -35,10 +35,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Install dependencies\n",
-    "!pip install -q 'trl>=0.18.2,<=0.24.0' 'transformers>=4.51.3,<=5.5.0' 'datasets>=3.4.1,<4.4.0' accelerate peft\n",
-    "!pip install -q bitsandbytes requests matplotlib\n",
-    "print('Installation complete')"
    ]
   },
   {
@@ -47,52 +44,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "\n",
-    "# Load HF_TOKEN: Colab -> Kaggle -> env var\n",
-    "HF_TOKEN = ''\n",
-    "try:\n",
-    "    from google.colab import userdata\n",
-    "    HF_TOKEN = userdata.get('HF_TOKEN') or ''\n",
-    "except Exception:\n",
-    "    pass\n",
-    "\n",
-    "if not HF_TOKEN:\n",
-    "    try:\n",
-    "        from kaggle_secrets import UserSecretsClient\n",
-    "        HF_TOKEN = UserSecretsClient().get_secret('HF_TOKEN') or ''\n",
-    "    except Exception:\n",
-    "        pass\n",
-    "\n",
-    "if not HF_TOKEN:\n",
-    "    HF_TOKEN = os.environ.get('HF_TOKEN', '')\n",
-    "\n",
-    "if not HF_TOKEN:\n",
-    "    raise ValueError('HF_TOKEN not found. Kaggle: Add-ons -> Secrets -> HF_TOKEN. Colab: key icon -> Secrets.')\n",
-    "\n",
-    "print('HF_TOKEN loaded OK')\n",
-    "\n",
-    "ENV_BASE_URL = 'https://algocore-support-ticket-env.hf.space'\n",
-    "MODEL_NAME   = 'Qwen/Qwen2.5-0.5B-Instruct'\n",
-    "# To use SFT pre-trained model instead (recommended - run train_sft.ipynb first):\n",
-    "# MODEL_NAME = '/kaggle/working/sft-model'         # local SFT output\n",
-    "# MODEL_NAME = 'AlgoCore/support-ticket-sft-model' # HF Hub SFT model\n",
-    "HF_REPO_ID   = 'AlgoCore/support-ticket-grpo-model'\n",
-    "\n",
-    "RUNTIME     = 'kaggle' if os.path.exists('/kaggle/working') else 'colab'\n",
-    "OUTPUT_DIR  = '/kaggle/working/support-ticket-grpo' if RUNTIME == 'kaggle' else '/content/support-ticket-grpo'\n",
-    "RESULTS_IMG = '/kaggle/working/grpo_results.png'   if RUNTIME == 'kaggle' else '/content/grpo_results.png'\n",
-    "print(f'Runtime: {RUNTIME} | Output: {OUTPUT_DIR}')\n",
-    "\n",
-    "os.environ['HF_TOKEN'] = HF_TOKEN\n",
-    "os.environ['HUGGING_FACE_HUB_TOKEN'] = HF_TOKEN\n",
-    "\n",
-    "import torch\n",
-    "print('GPU:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'NO GPU — switch runtime!')\n",
-    "if torch.cuda.is_available():\n",
-    "    print('VRAM:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')\n",
-    "print('Model:', MODEL_NAME)\n",
-    "print('Env:  ', ENV_BASE_URL)"
    ]
   },
   {
@@ -101,135 +53,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import requests, json, re, random\n",
-    "from dataclasses import dataclass\n",
-    "from typing import Optional\n",
-    "\n",
-    "TICKETS = [\n",
-    "    {'id':'T001','text':'I was charged twice for my subscription this month.','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'T002','text':'I cannot log into my account. Password reset email never arrives.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'T003','text':'Your app crashes every time I upload a file larger than 10 MB.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T004','text':'I want a full refund. I have not used the service at all.','category':'refund','correct_action':'reply'},\n",
-    "    {'id':'T005','text':'What are your business hours and do you have a phone number?','category':'general','correct_action':'reply'},\n",
-    "    {'id':'T006','text':'My invoice shows a charge for a plan I never subscribed to.','category':'billing','correct_action':'escalate'},\n",
-    "    {'id':'T007','text':'How do I cancel my subscription? I cannot find the option.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'T008','text':'The API is returning 500 errors intermittently for 2 hours.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T009','text':'Thank you! The issue has been resolved. You guys are awesome.','category':'general','correct_action':'close'},\n",
-    "    {'id':'T010','text':'I need an itemised invoice for my company accounting department.','category':'billing','correct_action':'reply'},\n",
-    "]\n",
-    "\n",
-    "KEYWORD_REWARDS = {\n",
-    "    'billing':   ['charge','invoice','payment','billing','refund'],\n",
-    "    'account':   ['password','login','account','cancel','subscription'],\n",
-    "    'technical': ['engineering','escalate','bug','crash','error'],\n",
-    "    'refund':    ['refund','return','credit','process'],\n",
-    "    'general':   ['hours','contact','phone','information','help'],\n",
-    "}\n",
-    "\n",
-    "@dataclass\n",
-    "class Obs:\n",
-    "    ticket_id: str\n",
-    "    ticket_text: str\n",
-    "    task_id: int\n",
-    "    current_category: Optional[str]\n",
-    "    resolved: bool\n",
-    "    step_count: int\n",
-    "    feedback: str\n",
-    "    score: float\n",
-    "    reward: float\n",
-    "    done: bool\n",
-    "\n",
-    "class LocalEnv:\n",
-    "    \"\"\"Local mirror of live HF Space — same reward logic, used as fallback.\"\"\"\n",
-    "    def reset(self, task_id=1, seed=42):\n",
-    "        rng = random.Random(seed)\n",
-    "        self.task_id = task_id\n",
-    "        self.ticket  = rng.choice(TICKETS)\n",
-    "        self.classified = False\n",
-    "        self.step_count = 0\n",
-    "        return Obs(self.ticket['id'], self.ticket['text'], task_id,\n",
-    "                   None, False, 0, 'New ticket. Take action.', 0.0, 0.0, False)\n",
-    "    def step(self, action):\n",
-    "        self.step_count += 1\n",
-    "        at    = action.get('action_type', '')\n",
-    "        cat   = action.get('category', '')\n",
-    "        reply = action.get('reply_text', '')\n",
-    "        reward = 0.0; done = False\n",
-    "        if self.task_id == 1:\n",
-    "            reward = 1.0 if cat == self.ticket['category'] else 0.0\n",
-    "            done   = True\n",
-    "        elif self.task_id == 2:\n",
-    "            if not self.classified:\n",
-    "                reward = 0.3 if cat == self.ticket['category'] else 0.1\n",
-    "                self.classified = True\n",
-    "            else:\n",
-    "                reward = 1.0 if at == self.ticket['correct_action'] else 0.0\n",
-    "                done   = True\n",
-    "        else:\n",
-    "            if not self.classified:\n",
-    "                reward = 0.2 if cat == self.ticket['category'] else 0.0\n",
-    "                self.classified = True\n",
-    "            else:\n",
-    "                action_score = 0.4 if at == self.ticket['correct_action'] else 0.0\n",
-    "                kws          = KEYWORD_REWARDS.get(self.ticket['category'], [])\n",
-    "                reply_score  = min(0.25, sum(0.05 for kw in kws if kw in reply.lower()))\n",
-    "                reward       = action_score + reply_score\n",
-    "                done         = True\n",
-    "        return Obs(self.ticket['id'], self.ticket['text'], self.task_id,\n",
-    "                   self.ticket['category'] if self.classified else None,\n",
-    "                   done, self.step_count, f'reward={reward:.2f}', reward, reward, done)\n",
-    "\n",
-    "class RemoteEnv:\n",
-    "    \"\"\"Live HF Space API.\"\"\"\n",
-    "    def __init__(self, base_url):\n",
-    "        self.base_url = base_url.rstrip('/')\n",
-    "        self.session  = requests.Session()\n",
-    "        self.session.headers.update({'Content-Type': 'application/json'})\n",
-    "    def health(self):\n",
-    "        try:\n",
-    "            r = self.session.get(f'{self.base_url}/health', timeout=8)\n",
-    "            return r.status_code == 200\n",
-    "        except: return False\n",
-    "    def reset(self, task_id=1, seed=42):\n",
-    "        r   = self.session.post(f'{self.base_url}/reset', json={'task_id': task_id, 'seed': seed}, timeout=15)\n",
-    "        r.raise_for_status()\n",
-    "        obs = r.json().get('observation', r.json())\n",
-    "        return self._parse_obs(obs)\n",
-    "    def step(self, action):\n",
-    "        r   = self.session.post(f'{self.base_url}/step', json={'action': action}, timeout=15)\n",
-    "        r.raise_for_status()\n",
-    "        obs = r.json().get('observation', r.json())\n",
-    "        return self._parse_obs(obs)\n",
-    "    def _parse_obs(self, obs):\n",
-    "        # Safely coerce each field — avoids 'Field' object errors from dataclass defaults\n",
-    "        fields = Obs.__dataclass_fields__\n",
-    "        def safe(k, fallback):\n",
-    "            v = obs.get(k, fallback)\n",
-    "            if isinstance(v, type): return fallback  # guard against dataclass Field objects\n",
-    "            return v\n",
-    "        return Obs(\n",
-    "            ticket_id=safe('ticket_id', ''),\n",
-    "            ticket_text=safe('ticket_text', ''),\n",
-    "            task_id=int(safe('task_id', 1)),\n",
-    "            current_category=safe('current_category', None),\n",
-    "            resolved=bool(safe('resolved', False)),\n",
-    "            step_count=int(safe('step_count', 0)),\n",
-    "            feedback=safe('feedback', ''),\n",
-    "            score=float(safe('score', 0.0)),\n",
-    "            reward=float(safe('reward', 0.0)),\n",
-    "            done=bool(safe('done', False)),\n",
-    "        )\n",
-    "\n",
-    "_remote = RemoteEnv(ENV_BASE_URL)\n",
-    "if _remote.health():\n",
-    "    env_client = _remote\n",
-    "    print('Using LIVE environment:', ENV_BASE_URL)\n",
-    "else:\n",
-    "    env_client = LocalEnv()\n",
-    "    print('Live API unreachable — using LOCAL mirror')\n",
-    "\n",
-    "obs = env_client.reset(task_id=1, seed=42)\n",
-    "print(f'Ticket: {obs.ticket_id} — {obs.ticket_text[:60]}')"
    ]
   },
   {
@@ -238,39 +62,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import torch\n",
-    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
-    "from peft import LoraConfig, TaskType\n",
-    "\n",
-    "MAX_SEQ_LENGTH = 512\n",
-    "print(f'Loading {MODEL_NAME}...')\n",
-    "\n",
-    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)\n",
-    "tokenizer.pad_token    = tokenizer.eos_token\n",
-    "tokenizer.padding_side = 'left'\n",
-    "\n",
-    "# Qwen2.5-0.5B = ~1GB in fp16 — fits easily in 15.6GB T4, no quantization needed\n",
-    "# bitsandbytes 4-bit + DataParallel + gradient checkpointing = CUDA illegal memory access\n",
-    "DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'\n",
-    "model = AutoModelForCausalLM.from_pretrained(\n",
-    "    MODEL_NAME,\n",
-    "    dtype=torch.float16,\n",
-    "    device_map={'': 0},\n",
-    "    token=HF_TOKEN,\n",
-    ")\n",
-    "model.config.use_cache = False\n",
-    "\n",
-    "peft_config = LoraConfig(\n",
-    "    task_type=TaskType.CAUSAL_LM,\n",
-    "    r=16,\n",
-    "    lora_alpha=32,\n",
-    "    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],\n",
-    "    lora_dropout=0.05,\n",
-    "    bias='none',\n",
-    ")\n",
-    "\n",
-    "print('Model loaded — LoRA config ready (GRPOTrainer will apply PEFT internally)')\n",
-    "print(f'Model params: {sum(p.numel() for p in model.parameters()):,}')"
    ]
   },
   {
@@ -279,64 +71,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "SYSTEM_PROMPT = '''You are a customer support AI agent. Respond ONLY with a JSON object.\n",
-    "\n",
-    "VALID action_type values: classify, reply, escalate, close\n",
-    "VALID category values: billing, technical, account, general, refund\n",
-    "\n",
-    "For classify: {\"action_type\": \"classify\", \"category\": \"<category>\"}\n",
-    "For reply:    {\"action_type\": \"reply\", \"reply_text\": \"<response>\"}\n",
-    "For escalate: {\"action_type\": \"escalate\", \"reply_text\": \"Escalating to engineering.\"}\n",
-    "For close:    {\"action_type\": \"close\", \"reply_text\": \"Closing ticket.\"}\n",
-    "\n",
-    "RULES:\n",
-    "- task_id=1: ALWAYS output action_type=classify first\n",
-    "- task_id=2: step=0 -> classify, step=1 -> reply/escalate/close\n",
-    "- task_id=3: step=0 -> classify, step=1 -> reply/escalate/close\n",
-    "- technical/crash/error/bug tickets -> escalate\n",
-    "- thank you/resolved tickets -> close\n",
-    "- billing/account/refund/general -> reply\n",
-    "- DO NOT use action_type=respond or action_type=resolve — those are INVALID'''\n",
-    "\n",
-    "def make_prompt(ticket_text, task_id, current_category=None, feedback='New ticket.', step=0):\n",
-    "    user_msg = json.dumps({\n",
-    "        'ticket': ticket_text,\n",
-    "        'task_id': task_id,\n",
-    "        'current_category': current_category,\n",
-    "        'feedback': feedback,\n",
-    "        'step': step,\n",
-    "    })\n",
-    "    messages = [\n",
-    "        {'role': 'system', 'content': SYSTEM_PROMPT},\n",
-    "        {'role': 'user',   'content': user_msg},\n",
-    "    ]\n",
-    "    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
-    "\n",
-    "def parse_action(text):\n",
-    "    text = text.strip()\n",
-    "    # Strip markdown code blocks\n",
-    "    text = re.sub(r'^```(?:json)?\\s*', '', text)\n",
-    "    text = re.sub(r'\\s*```$', '', text.strip())\n",
-    "    try:\n",
-    "        return json.loads(text)\n",
-    "    except Exception:\n",
-    "        match = re.search(r'\\{[^{}]*\\}', text, re.DOTALL)\n",
-    "        if match:\n",
-    "            try: return json.loads(match.group())\n",
-    "            except: pass\n",
-    "    return {'action_type': 'classify', 'category': 'general'}\n",
-    "\n",
-    "def _safe_parse(completion):\n",
-    "    \"\"\"Always returns a dict, never a string.\"\"\"\n",
-    "    result = parse_action(completion) if isinstance(completion, str) else {}\n",
-    "    if not isinstance(result, dict):\n",
-    "        return {'action_type': '', 'category': '', 'reply_text': ''}\n",
-    "    return result\n",
-    "\n",
-    "print('Prompt builder OK')\n",
-    "# Quick sanity check\n",
-    "sample = make_prompt('I was charged twice', task_id=1)\n",
-    "print('Sample prompt length (chars):', len(sample))"
    ]
   },
   {
@@ -345,146 +80,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "# Build LARGE dataset for GRPOTrainer\n",
-    "# Strategy:\n",
-    "#   1. Expanded ticket bank (50 tickets across all categories)\n",
-    "#   2. All 3 task types x many seeds\n",
-    "#   3. Multi-step contexts: step-0 (classify) AND step-1 (resolve)\n",
-    "#   4. Paraphrase augmentation of ticket text\n",
-    "# Target: ~500+ training samples\n",
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "from datasets import Dataset\n",
-    "\n",
-    "MAX_STEPS = 6\n",
-    "TASK_IDS  = [1, 2, 3]\n",
-    "\n",
-    "# Large seed pool\n",
-    "SEEDS = list(range(0, 200))  # 200 seeds\n",
-    "\n",
-    "# Expanded ticket bank — 50 tickets covering all categories\n",
-    "ALL_TICKETS = [\n",
-    "    # billing (12)\n",
-    "    {'id':'B001','text':'I was charged twice for my subscription this month.','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B002','text':'My invoice shows a charge for a plan I never subscribed to.','category':'billing','correct_action':'escalate'},\n",
-    "    {'id':'B003','text':'I need an itemised invoice for my company accounting department.','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B004','text':'Why was I charged before my trial period ended?','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B005','text':'I switched plans but was still billed at the old rate.','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B006','text':'My payment method was charged three times in one day.','category':'billing','correct_action':'escalate'},\n",
-    "    {'id':'B007','text':'I cancelled my plan but the charge still appeared this month.','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B008','text':'Can you send me a receipt for my last payment?','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B009','text':'I was charged in USD but I signed up for GBP billing.','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B010','text':'The discount code I applied is not reflected in my invoice.','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B011','text':'I need to update my billing address on the invoice.','category':'billing','correct_action':'reply'},\n",
-    "    {'id':'B012','text':'My credit card was charged even though payment failed notification was sent.','category':'billing','correct_action':'escalate'},\n",
-    "    # account (10)\n",
-    "    {'id':'A001','text':'I cannot log into my account. Password reset email never arrives.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'A002','text':'How do I cancel my subscription? I cannot find the option.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'A003','text':'I want to change my email address associated with the account.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'A004','text':'My account was locked after too many failed login attempts.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'A005','text':'I accidentally deleted my account. Can it be restored?','category':'account','correct_action':'reply'},\n",
-    "    {'id':'A006','text':'I need to transfer my account to a different email.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'A007','text':'Two-factor authentication is not working for my account.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'A008','text':'I cannot find where to download my data for GDPR purposes.','category':'account','correct_action':'reply'},\n",
-    "    {'id':'A009','text':'My username was changed without my permission.','category':'account','correct_action':'escalate'},\n",
-    "    {'id':'A010','text':'I want to upgrade my account from free to premium.','category':'account','correct_action':'reply'},\n",
-    "    # technical (10)\n",
-    "    {'id':'T001','text':'Your app crashes every time I upload a file larger than 10 MB.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T002','text':'The API is returning 500 errors intermittently for 2 hours.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T003','text':'The dashboard is completely blank after the latest update.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T004','text':'Export to CSV is broken — it downloads an empty file.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T005','text':'Notifications are not being delivered to my email or phone.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T006','text':'The mobile app freezes on the login screen on iOS 17.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T007','text':'Search functionality returns no results for any query.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T008','text':'Data sync between devices stopped working 3 days ago.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T009','text':'The webhook integration keeps timing out and losing events.','category':'technical','correct_action':'escalate'},\n",
-    "    {'id':'T010','text':'Browser extension throws a JavaScript error on every page load.','category':'technical','correct_action':'escalate'},\n",
-    "    # refund (8)\n",
-    "    {'id':'R001','text':'I want a full refund. I have not used the service at all.','category':'refund','correct_action':'reply'},\n",
-    "    {'id':'R002','text':'I was double charged and need a refund for the extra payment.','category':'refund','correct_action':'reply'},\n",
-    "    {'id':'R003','text':'The product did not work as advertised. I want my money back.','category':'refund','correct_action':'reply'},\n",
-    "    {'id':'R004','text':'I cancelled within the 30-day window but have not received my refund.','category':'refund','correct_action':'reply'},\n",
-    "    {'id':'R005','text':'I would like a partial refund for the unused months of my annual plan.','category':'refund','correct_action':'reply'},\n",
-    "    {'id':'R006','text':'A refund was promised by your support agent 2 weeks ago but never arrived.','category':'refund','correct_action':'escalate'},\n",
-    "    {'id':'R007','text':'I need a refund processed urgently as it was a fraudulent charge.','category':'refund','correct_action':'escalate'},\n",
-    "    {'id':'R008','text':'How long does a refund take to appear on my credit card?','category':'refund','correct_action':'reply'},\n",
-    "    # general (10)\n",
-    "    {'id':'G001','text':'What are your business hours and do you have a phone number?','category':'general','correct_action':'reply'},\n",
-    "    {'id':'G002','text':'Thank you! The issue has been resolved. You guys are awesome.','category':'general','correct_action':'close'},\n",
-    "    {'id':'G003','text':'Do you offer a student discount or non-profit pricing?','category':'general','correct_action':'reply'},\n",
-    "    {'id':'G004','text':'Where can I find your terms of service and privacy policy?','category':'general','correct_action':'reply'},\n",
-    "    {'id':'G005','text':'Is your service available in my country? I am based in Brazil.','category':'general','correct_action':'reply'},\n",
-    "    {'id':'G006','text':'Can I use your product for commercial purposes?','category':'general','correct_action':'reply'},\n",
-    "    {'id':'G007','text':'Problem resolved, thanks for the quick response!','category':'general','correct_action':'close'},\n",
-    "    {'id':'G008','text':'Do you have an affiliate or referral program?','category':'general','correct_action':'reply'},\n",
-    "    {'id':'G009','text':'What integrations do you support with third-party tools?','category':'general','correct_action':'reply'},\n",
-    "    {'id':'G010','text':'I just wanted to say your product has been amazing for our team.','category':'general','correct_action':'close'},\n",
-    "]\n",
-    "\n",
-    "KEYWORD_REWARDS_FULL = {\n",
-    "    'billing':   ['charge','invoice','payment','billing','refund','receipt'],\n",
-    "    'account':   ['password','login','account','cancel','subscription','email'],\n",
-    "    'technical': ['engineering','escalate','bug','crash','error','fix'],\n",
-    "    'refund':    ['refund','return','credit','process','reimburse'],\n",
-    "    'general':   ['hours','contact','phone','information','help','available'],\n",
-    "}\n",
-    "\n",
-    "def build_grpo_dataset():\n",
-    "    rows = []\n",
-    "    rng  = random.Random(2026)\n",
-    "\n",
-    "    for task_id in TASK_IDS:\n",
-    "        for seed in SEEDS:\n",
-    "            # Pick a ticket deterministically from expanded bank\n",
-    "            ticket = ALL_TICKETS[seed % len(ALL_TICKETS)]\n",
-    "\n",
-    "            # --- Step 0: classify context ---\n",
-    "            prompt_step0 = make_prompt(\n",
-    "                ticket_text=ticket['text'],\n",
-    "                task_id=task_id,\n",
-    "                current_category=None,\n",
-    "                feedback='New ticket. Classify it first.',\n",
-    "                step=0,\n",
-    "            )\n",
-    "            rows.append({\n",
-    "                'prompt':      prompt_step0,\n",
-    "                'ticket_text': ticket['text'],\n",
-    "                'task_id':     task_id,\n",
-    "                'seed':        seed,\n",
-    "                'step':        0,\n",
-    "            })\n",
-    "\n",
-    "            # --- Step 1: resolve context (tasks 2 & 3 only) ---\n",
-    "            if task_id in (2, 3):\n",
-    "                prompt_step1 = make_prompt(\n",
-    "                    ticket_text=ticket['text'],\n",
-    "                    task_id=task_id,\n",
-    "                    current_category=ticket['category'],\n",
-    "                    feedback=f\"Category set to {ticket['category']}. Now resolve the ticket.\",\n",
-    "                    step=1,\n",
-    "                )\n",
-    "                rows.append({\n",
-    "                    'prompt':      prompt_step1,\n",
-    "                    'ticket_text': ticket['text'],\n",
-    "                    'task_id':     task_id,\n",
-    "                    'seed':        seed + 10000,  # unique seed key for step-1\n",
-    "                    'step':        1,\n",
-    "                })\n",
-    "\n",
-    "    # Shuffle so tasks/steps are interleaved during training\n",
-    "    rng.shuffle(rows)\n",
-    "    return Dataset.from_list(rows)\n",
-    "\n",
-    "grpo_dataset = build_grpo_dataset()\n",
-    "print(f'Dataset built: {len(grpo_dataset)} samples')\n",
-    "# breakdown\n",
-    "from collections import Counter\n",
-    "task_counts = Counter(grpo_dataset['task_id'])\n",
-    "step_counts = Counter(grpo_dataset['step'])\n",
-    "print(f'  By task:  {dict(task_counts)}')\n",
-    "print(f'  By step:  {dict(step_counts)}')\n",
-    "print('Sample prompt (first 300 chars):')\n",
-    "print(grpo_dataset[0]['prompt'][:300])"
    ]
   },
   {
@@ -493,142 +89,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "# Reward functions — exact mirror of graders.py\n",
-    "# grade_task1 / grade_task2 / grade_task3 / loop_penalty\n",
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "\n",
-    "# Partial-credit action pairs (from graders.py)\n",
-    "_PARTIAL_CREDIT_PAIRS = {frozenset({'reply', 'escalate'})}\n",
-    "\n",
-    "# Keyword lists (from graders.py)\n",
-    "_KEYWORD_REWARDS = {\n",
-    "    'billing':   ['refund', 'charge', 'invoice', 'payment', 'billing'],\n",
-    "    'account':   ['password', 'login', 'account', 'cancel', 'subscription'],\n",
-    "    'technical': ['engineering', 'escalate', 'bug', 'crash', 'error', 'fix'],\n",
-    "    'refund':    ['refund', 'return', 'credit', 'process'],\n",
-    "    'general':   ['hours', 'contact', 'phone', 'information', 'help'],\n",
-    "}\n",
-    "\n",
-    "def _reply_quality(reply_text, category):\n",
-    "    \"\"\"Exact copy of graders._reply_quality: 0.0–0.5 keyword score.\"\"\"\n",
-    "    if not reply_text: return 0.0\n",
-    "    hits = sum(1 for kw in _KEYWORD_REWARDS.get(category, []) if kw in reply_text.lower())\n",
-    "    return min(0.5, hits * 0.1)\n",
-    "\n",
-    "def _grade_task1(at, cat, correct_cat):\n",
-    "    \"\"\"Exact copy of graders.grade_task1.\"\"\"\n",
-    "    return 1.0 if (at == 'classify' and cat == correct_cat) else 0.0\n",
-    "\n",
-    "def _grade_task2(at, correct_action, step, cat, correct_cat):\n",
-    "    \"\"\"Exact copy of graders.grade_task2 + classify step.\"\"\"\n",
-    "    if step == 0:\n",
-    "        # classify step: partial credit for correct category\n",
-    "        if at == 'classify' and cat == correct_cat: return 0.3\n",
-    "        if at == 'classify': return 0.1\n",
-    "        return 0.0\n",
-    "    # action step\n",
-    "    if at == correct_action: return 1.0\n",
-    "    if frozenset({at, correct_action}) in _PARTIAL_CREDIT_PAIRS: return 0.5\n",
-    "    if at == 'close': return 0.0\n",
-    "    return 0.0\n",
-    "\n",
-    "def _grade_task3(at, cat, correct_cat, correct_action, reply, step, steps_taken=2, max_steps=5):\n",
-    "    \"\"\"Exact copy of graders.grade_task3.\"\"\"\n",
-    "    if step == 0:\n",
-    "        # classification step only\n",
-    "        return 0.20 if (at == 'classify' and cat == correct_cat) else 0.0\n",
-    "    # resolution step: 0.40 action + up to 0.50 reply + 0.15 efficiency\n",
-    "    score = 0.0\n",
-    "    classified_correctly = True   # step-1 means step-0 already happened\n",
-    "    score += 0.20  # classification credit carried from step 0\n",
-    "    action_correct  = (at == correct_action)\n",
-    "    action_partial  = (frozenset({at, correct_action}) in _PARTIAL_CREDIT_PAIRS)\n",
-    "    if action_correct:  score += 0.40\n",
-    "    elif action_partial: score += 0.20\n",
-    "    score += _reply_quality(reply, cat)  # 0.0–0.5\n",
-    "    # efficiency bonus (assume 2 steps taken for step-1 samples)\n",
-    "    resolved = action_correct or action_partial\n",
-    "    if resolved and steps_taken <= max_steps:\n",
-    "        efficiency = max(0.0, (max_steps - steps_taken) / (max_steps - 1))\n",
-    "        score += 0.15 * efficiency\n",
-    "    return round(min(1.0, score), 4)\n",
-    "\n",
-    "def _loop_penalty(step_count, max_steps=10):\n",
-    "    \"\"\"Exact copy of graders.loop_penalty.\"\"\"\n",
-    "    return -0.05 * (step_count - max_steps) if step_count > max_steps else 0.0\n",
-    "\n",
-    "def _local_reward(completion, task_id, seed, step=0):\n",
-    "    \"\"\"Full reward using exact graders.py logic. No API calls needed.\"\"\"\n",
-    "    ticket = ALL_TICKETS[seed % len(ALL_TICKETS)]\n",
-    "    action = _safe_parse(completion)\n",
-    "    if not isinstance(action, dict): action = {'action_type': '', 'category': '', 'reply_text': ''}\n",
-    "    at     = action.get('action_type', '')\n",
-    "    cat    = action.get('category', '')\n",
-    "    reply  = action.get('reply_text', '') or ''\n",
-    "    correct_cat    = ticket['category']\n",
-    "    correct_action = ticket['correct_action']\n",
-    "\n",
-    "    if task_id == 1:\n",
-    "        return _grade_task1(at, cat, correct_cat)\n",
-    "    elif task_id == 2:\n",
-    "        return _grade_task2(at, correct_action, step, cat, correct_cat)\n",
-    "    else:  # task 3\n",
-    "        return _grade_task3(at, cat, correct_cat, correct_action, reply, step)\n",
-    "\n",
-    "def env_reward_fn(prompts, completions, **kwargs):\n",
-    "    \"\"\"Primary reward: exact graders.py logic, no API calls.\"\"\"\n",
-    "    task_ids = kwargs.get('task_id', [1]  * len(completions))\n",
-    "    seeds    = kwargs.get('seed',    [42] * len(completions))\n",
-    "    steps    = kwargs.get('step',    [0]  * len(completions))\n",
-    "    rewards  = []\n",
-    "    for i, completion in enumerate(completions):\n",
-    "        tid  = int(task_ids[i]) if hasattr(task_ids, '__getitem__') else 1\n",
-    "        seed = int(seeds[i])    if hasattr(seeds,    '__getitem__') else 42\n",
-    "        step = int(steps[i])    if hasattr(steps,    '__getitem__') else 0\n",
-    "        actual_seed = seed % 10000 if seed >= 10000 else seed\n",
-    "        r = _local_reward(completion, tid, actual_seed, step)\n",
-    "        # apply loop penalty if step is high\n",
-    "        r += _loop_penalty(step)\n",
-    "        rewards.append(r)\n",
-    "    return rewards\n",
-    "\n",
-    "def format_reward_fn(prompts, completions, **kwargs):\n",
-    "    \"\"\"Format bonus/penalty: valid action_type = +0.15/+0.20, invalid = -0.20.\"\"\"\n",
-    "    rewards = []\n",
-    "    for completion in completions:\n",
-    "        action = _safe_parse(completion)\n",
-    "        if not isinstance(action, dict): action = {'action_type': '', 'category': '', 'reply_text': ''}\n",
-    "        at = action.get('action_type', '')\n",
-    "        if at in ('classify', 'reply', 'escalate', 'close'):\n",
-    "            bonus = 0.15\n",
-    "            if at == 'classify' and action.get('category') in ('billing','technical','account','general','refund'):\n",
-    "                bonus = 0.20\n",
-    "            rewards.append(bonus)\n",
-    "        else:\n",
-    "            rewards.append(-0.20)\n",
-    "    return rewards\n",
-    "\n",
-    "# Print ticket map\n",
-    "print('Reward functions synced to graders.py')\n",
-    "print('Ticket map (seed % len):')\n",
-    "for _i in range(6):\n",
-    "    _tt = ALL_TICKETS[_i]\n",
-    "    print(f'  [{_i}] {_tt[\"id\"]} cat={_tt[\"category\"]} action={_tt[\"correct_action\"]}')\n",
-    "\n",
-    "# Sanity: seed=0->B001(billing,reply), seed=22->T001(technical,escalate)\n",
-    "_t0  = ALL_TICKETS[0]   # B001 billing reply\n",
-    "_t22 = ALL_TICKETS[22]  # T001 technical escalate\n",
-    "r1 = _local_reward(json.dumps({'action_type':'classify','category':_t0['category']}), 1, 0, 0)\n",
-    "r2 = _local_reward(json.dumps({'action_type':'classify','category':_t0['category']}), 2, 0, 0)\n",
-    "r3 = _local_reward(json.dumps({'action_type':'escalate'}), 2, 0, 1)\n",
-    "r4 = _local_reward(json.dumps({'action_type':_t22['correct_action'],'reply_text':'escalating this crash bug error to engineering team for a fix'}), 3, 22, 1)\n",
-    "r5 = format_reward_fn(prompts=['x'], completions=[json.dumps({'action_type':'respond'})])[0]\n",
-    "print(f'task1 correct classify:        {r1}  (expect 1.0)')\n",
-    "print(f'task2 step0 correct classify:  {r2}  (expect 0.3)')\n",
-    "print(f'task2 step1 partial escalate:  {r3}  (expect 0.5)')\n",
-    "print(f'task3 step1 correct+keywords:  {r4}  (expect 0.87+)')\n",
-    "print(f'bad format penalty:            {r5}  (expect -0.2)')\n"
    ]
   },
   {
@@ -637,65 +98,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "# Baseline evaluation BEFORE training\n",
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "def quick_generate(prompt_text, max_new_tokens=120):\n",
-    "    model.eval()\n",
-    "    model.config.use_cache = True\n",
-    "    inputs = tokenizer(\n",
-    "        prompt_text, return_tensors='pt',\n",
-    "        truncation=True, max_length=MAX_SEQ_LENGTH\n",
-    "    ).to(DEVICE)\n",
-    "    with torch.no_grad():\n",
-    "        out = model.generate(\n",
-    "            **inputs,\n",
-    "            max_new_tokens=max_new_tokens,\n",
-    "            do_sample=False,          # greedy for eval — deterministic\n",
-    "            pad_token_id=tokenizer.eos_token_id,\n",
-    "            use_cache=True,\n",
-    "        )\n",
-    "    new_tokens = out[0][inputs['input_ids'].shape[1]:]\n",
-    "    return tokenizer.decode(new_tokens, skip_special_tokens=True)\n",
-    "\n",
-    "def evaluate(n_seeds=3, verbose=False):\n",
-    "    model.config.use_cache = True\n",
-    "    results = {}\n",
-    "    for task_id in [1, 2, 3]:\n",
-    "        task_rewards = []\n",
-    "        # Use LocalEnv for eval - live env is stateful/single-instance, causes 500s\n",
-    "        _eval_env = LocalEnv()\n",
-    "        EVAL_SEEDS = [42, 7, 123, 99, 13, 0, 1, 2, 5, 8]\n",
-    "        for seed in EVAL_SEEDS[:n_seeds]:\n",
-    "            obs   = _eval_env.reset(task_id=task_id, seed=seed)\n",
-    "            total = 0.0\n",
-    "            done  = False\n",
-    "            steps = 0\n",
-    "            for _ in range(MAX_STEPS):\n",
-    "                if done: break\n",
-    "                prompt     = make_prompt(obs.ticket_text, obs.task_id, obs.current_category, obs.feedback, obs.step_count)\n",
-    "                completion = quick_generate(prompt)\n",
-    "                action     = parse_action(completion)\n",
-    "                if verbose: print(f'  T{task_id} s{seed} step{steps+1}: {action}')\n",
-    "                try:\n",
-    "                    obs   = _eval_env.step(action)\n",
-    "                    total += float(obs.reward or 0.0)\n",
-    "                    done   = obs.done\n",
-    "                except Exception as e:\n",
-    "                    if verbose: print(f'  [err] {e}')\n",
-    "                    done = True\n",
-    "                steps += 1\n",
-    "            norm = round(max(0.0, min(1.0, total / max(steps, 1))), 3)\n",
-    "            task_rewards.append(norm)\n",
-    "        avg = round(sum(task_rewards) / len(task_rewards), 3)\n",
-    "        results[f'task{task_id}'] = avg\n",
-    "        print(f'  Task {task_id}: {avg:.3f}')\n",
-    "    results['overall'] = round(sum(results[k] for k in ['task1','task2','task3']) / 3, 3)\n",
-    "    print(f'  Overall: {results[\"overall\"]:.3f}')\n",
-    "    return results\n",
-    "\n",
-    "print('=== BASELINE (before training) ===')\n",
-    "baseline_scores = evaluate(n_seeds=3, verbose=True)"
    ]
   },
   {
@@ -704,74 +107,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "# GRPO Training with trl.GRPOTrainer\n",
-    "# This is REAL GRPO:\n",
-    "#   - Maintains a frozen reference model for KL divergence\n",
-    "#   - Clips probability ratios (PPO-style)\n",
-    "#   - Groups completions, normalises advantages within group\n",
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "from trl import GRPOConfig, GRPOTrainer\n",
-    "\n",
-    "grpo_config = GRPOConfig(\n",
-    "    # Output\n",
-    "    output_dir=OUTPUT_DIR,\n",
-    "\n",
-    "    # Training scale\n",
-    "    num_train_epochs=3,          # 3 passes over ~500 prompts (optimizer-step count depends on batch size / accumulation / G)\n",
-    "    per_device_train_batch_size=2,\n",
-    "    gradient_accumulation_steps=4,\n",
-    "\n",
-    "    # GRPO-specific\n",
-    "    num_generations=4,           # group size G — completions sampled per prompt\n",
-    "    max_prompt_length=384,\n",
-    "    max_completion_length=128,\n",
-    "    temperature=0.9,\n",
-    "    beta=0.04,                   # KL coefficient against reference model\n",
-    "\n",
-    "    # Optimiser\n",
-    "    learning_rate=5e-5,\n",
-    "    lr_scheduler_type='cosine',\n",
-    "    warmup_ratio=0.1,\n",
-    "    weight_decay=0.01,\n",
-    "    max_grad_norm=1.0,\n",
-    "    optim='adamw_torch',\n",
-    "\n",
-    "    # Logging\n",
-    "    logging_steps=5,\n",
-    "    save_strategy='no',\n",
-    "    report_to='none',\n",
-    "\n",
-    "    # Memory — model loaded in fp16 natively, no quantization wrapper\n",
-    "    bf16=False,\n",
-    "    fp16=True,   # enables fp16 mixed-precision (AMP) training; this flag does not set the optimizer dtype\n",
-    "    dataloader_pin_memory=False,\n",
-    "    remove_unused_columns=False,  # keep task_id, seed, step columns for reward fn\n",
-    "    ddp_find_unused_parameters=False,  # only forwarded to DistributedDataParallel; does not disable DataParallel\n",
-    ")\n",
-    "\n",
-    "trainer = GRPOTrainer(\n",
-    "    model=model,\n",
-    "    args=grpo_config,\n",
-    "    train_dataset=grpo_dataset,\n",
-    "    reward_funcs=[env_reward_fn, format_reward_fn],  # multiple reward signals\n",
-    "    peft_config=peft_config,\n",
-    "    processing_class=tokenizer,\n",
-    ")\n",
-    "\n",
-    "print('GRPOTrainer initialised')\n",
-    "print(f'Dataset size:      {len(grpo_dataset)} samples')\n",
-    "print(f'Group size (G):    {grpo_config.num_generations}')\n",
-    "print(f'KL beta:           {grpo_config.beta}')\n",
-    "print(f'Max completion:    {grpo_config.max_completion_length} tokens')\n",
-    "print('Starting GRPO training...')\n",
-    "print('=' * 60)\n",
-    "\n",
-    "train_result = trainer.train()\n",
-    "\n",
-    "print('=' * 60)\n",
-    "print('Training complete!')\n",
-    "print(f'Loss: {train_result.training_loss:.4f}')"
    ]
   },
   {
@@ -780,22 +116,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "# Post-training evaluation\n",
-    "# ─────────────────────────────────────────────────────────────────\n",
-    "model.config.use_cache = True\n",
-    "model.eval()\n",
-    "\n",
-    "print('=== POST-TRAINING EVALUATION ===')\n",
-    "trained_scores = evaluate(n_seeds=3)\n",
-    "\n",
-    "print('\\n=== IMPROVEMENT SUMMARY ===')\n",
-    "print(f'{\"Task\":<10} {\"Before\":>8} {\"After\":>8} {\"Delta\":>8}')\n",
-    "print('-' * 38)\n",
-    "for key, label in [('task1','Task 1'),('task2','Task 2'),('task3','Task 3'),('overall','Overall')]:\n",
-    "    b = baseline_scores.get(key, 0)\n",
-    "    a = trained_scores.get(key, 0)\n",
-    "    print(f'{label:<10} {b:>8.3f} {a:>8.3f} {a-b:>+8.3f}')"
    ]
   },
   {
@@ -804,52 +125,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import matplotlib.pyplot as plt\n",
-    "import numpy as np\n",
-    "\n",
-    "# Extract loss history (and the steps whose log entry mentions a reward) from trainer logs\n",
-    "log_history = trainer.state.log_history\n",
-    "train_steps   = [l['step']                  for l in log_history if 'loss' in l]\n",
-    "train_losses  = [l['loss']                  for l in log_history if 'loss' in l]\n",
-    "reward_steps  = [l['step']                  for l in log_history if 'reward' in str(l)]\n",
-    "\n",
-    "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n",
-    "fig.suptitle('Support Ticket Env — GRPO Training Results', fontsize=14, fontweight='bold')\n",
-    "\n",
-    "# Left: training loss\n",
-    "ax1 = axes[0]\n",
-    "if train_steps:\n",
-    "    ax1.plot(train_steps, train_losses, color='#3498db', linewidth=2)\n",
-    "    ax1.set_xlabel('Step'); ax1.set_ylabel('Loss')\n",
-    "    ax1.set_title('GRPO Training Loss')\n",
-    "    ax1.grid(True, alpha=0.3)\n",
-    "else:\n",
-    "    ax1.text(0.5, 0.5, 'No loss logs', ha='center', va='center', transform=ax1.transAxes)\n",
-    "\n",
-    "# Right: before vs after bar chart\n",
-    "ax2 = axes[1]\n",
-    "tasks = ['Task 1', 'Task 2', 'Task 3', 'Overall']\n",
-    "keys  = ['task1',  'task2',  'task3',  'overall']\n",
-    "bv    = [baseline_scores.get(k, 0) for k in keys]\n",
-    "av    = [trained_scores.get(k, 0)  for k in keys]\n",
-    "x     = np.arange(len(tasks)); w = 0.35\n",
-    "b1 = ax2.bar(x - w/2, bv, w, label='Before GRPO', color='#95a5a6')\n",
-    "b2 = ax2.bar(x + w/2, av, w, label='After GRPO',  color='#2ecc71')\n",
-    "for bar in b1:\n",
-    "    ax2.text(bar.get_x()+bar.get_width()/2., bar.get_height()+0.01,\n",
-    "             f'{bar.get_height():.2f}', ha='center', va='bottom', fontsize=9)\n",
-    "for bar in b2:\n",
-    "    ax2.text(bar.get_x()+bar.get_width()/2., bar.get_height()+0.01,\n",
-    "             f'{bar.get_height():.2f}', ha='center', va='bottom',\n",
-    "             fontsize=9, fontweight='bold', color='#27ae60')\n",
-    "ax2.set_xticks(x); ax2.set_xticklabels(tasks)\n",
-    "ax2.set_ylabel('Score (0–1)'); ax2.set_title('Before vs After GRPO')\n",
-    "ax2.legend(); ax2.grid(True, alpha=0.3, axis='y'); ax2.set_ylim(0, 1.15)\n",
-    "\n",
-    "plt.tight_layout()\n",
-    "plt.savefig(RESULTS_IMG, dpi=150, bbox_inches='tight')\n",
-    "plt.show()\n",
-    "print(f'Chart saved to {RESULTS_IMG}')"
    ]
   },
   {
@@ -858,27 +134,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import os\n",
-    "os.makedirs(OUTPUT_DIR, exist_ok=True)\n",
-    "trainer.save_model(OUTPUT_DIR)\n",
-    "tokenizer.save_pretrained(OUTPUT_DIR)\n",
-    "print(f'Model saved to {OUTPUT_DIR}')\n",
-    "\n",
-    "try:\n",
-    "    from huggingface_hub import HfApi\n",
-    "    api = HfApi(token=HF_TOKEN)\n",
-    "    api.create_repo(HF_REPO_ID, exist_ok=True, private=False)\n",
-    "    api.upload_folder(folder_path=OUTPUT_DIR, repo_id=HF_REPO_ID, repo_type='model')\n",
-    "    api.upload_file(\n",
-    "        path_or_fileobj=RESULTS_IMG,\n",
-    "        path_in_repo='grpo_results.png',\n",
-    "        repo_id=HF_REPO_ID,\n",
-    "        repo_type='model'\n",
-    "    )\n",
-    "    print(f'Model pushed to: https://huggingface.co/{HF_REPO_ID}')\n",
-    "except Exception as e:\n",
-    "    print(f'HF push failed: {e}')\n",
-    "    print(f'Model saved locally at {OUTPUT_DIR}')"
    ]
   },
   {
@@ -887,35 +143,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Download chart (Colab only — Kaggle: Output tab)\n",
-    "if RUNTIME == 'colab':\n",
-    "    try:\n",
-    "        from google.colab import files\n",
-    "        files.download(RESULTS_IMG)\n",
-    "    except Exception as e:\n",
-    "        print(f'Download skipped: {e}')\n",
-    "else:\n",
-    "    print(f'Kaggle: chart in Output tab -> {RESULTS_IMG}')\n",
-    "\n",
-    "print('\\n' + '='*55)\n",
-    "print('FINAL TRAINING SUMMARY')\n",
-    "print('='*55)\n",
-    "print(f'Model:         {MODEL_NAME}')\n",
-    "print(f'Algorithm:     GRPO (trl.GRPOTrainer) + LoRA')\n",
-    "print(f'Group size G:  {grpo_config.num_generations}')\n",
-    "print(f'KL beta:       {grpo_config.beta}')\n",
-    "print(f'Dataset size:  {len(grpo_dataset)} prompts')\n",
-    "print(f'Env:           {ENV_BASE_URL}')\n",
-    "print(f'Final loss:    {train_result.training_loss:.4f}')\n",
-    "print()\n",
-    "print(f'{\"Task\":<10} {\"Before\":>8} {\"After\":>8} {\"Delta\":>8}')\n",
-    "print('-' * 42)\n",
-    "for key, label in [('task1','Task 1'),('task2','Task 2'),('task3','Task 3'),('overall','Overall')]:\n",
-    "    b = baseline_scores.get(key, 0)\n",
-    "    a = trained_scores.get(key, 0)\n",
-    "    print(f'{label:<10} {b:>8.3f} {a:>8.3f} {a-b:>+8.3f}')\n",
-    "print('='*55)\n",
-    "print(f'HF Model: https://huggingface.co/{HF_REPO_ID}')"
    ]
   }
  ]

    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "# Support Ticket Env \u2014 GRPO Fine-Tuning\n",
     "**OpenEnv x Scalar Hackathon**\n",
     "\n",
     "Fine-tunes `Qwen/Qwen2.5-0.5B-Instruct` using **real GRPO** (`trl.GRPOTrainer`) + LoRA (PEFT)\n",
     "- **Algorithm:** GRPO via `trl.GRPOTrainer` (proper clipped ratio + KL vs reference model)\n",
     "- **Environment:** https://algocore-support-ticket-env.hf.space\n",
     "- **Runtime:** ~30-45 min on Kaggle P100/T4 (or Colab)\n",
+    "- **No Unsloth** \u2014 standard HuggingFace transformers + PEFT"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Install dependencies\n!pip install -q 'trl>=0.18.2,<=0.24.0' 'transformers>=4.51.3,<=5.5.0' 'datasets>=3.4.1,<4.4.0' accelerate peft\n!pip install -q bitsandbytes requests matplotlib wandb\nprint('Installation complete')"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "import os\n\n# Kaggle dataset path \u2014 graders.py, tickets.py, support_environment.py\nimport sys\nsys.path.insert(0, '/kaggle/input/support-ticket/')\n\n# Load HF_TOKEN: Colab -> Kaggle -> env var\nHF_TOKEN = ''\ntry:\n    from google.colab import userdata\n    HF_TOKEN = userdata.get('HF_TOKEN') or ''\nexcept Exception:\n    pass\n\nif not HF_TOKEN:\n    try:\n        from kaggle_secrets import UserSecretsClient\n        HF_TOKEN = UserSecretsClient().get_secret('HF_TOKEN') or ''\n    except Exception:\n        pass\n\nif not HF_TOKEN:\n    HF_TOKEN = os.environ.get('HF_TOKEN', '')\n\nif not HF_TOKEN:\n    raise ValueError('HF_TOKEN not found. Kaggle: Add-ons -> Secrets -> HF_TOKEN. Colab: key icon -> Secrets.')\n\nprint('HF_TOKEN loaded OK')\n\nENV_BASE_URL = 'https://algocore-support-ticket-env.hf.space'\nMODEL_NAME   = 'Qwen/Qwen2.5-0.5B-Instruct'\n# To use SFT pre-trained model instead (recommended - run train_sft.ipynb first):\n# MODEL_NAME = '/kaggle/working/sft-model'         # local SFT output\n# MODEL_NAME = 'AlgoCore/support-ticket-sft-model' # HF Hub SFT model\nHF_REPO_ID   = 'AlgoCore/support-ticket-grpo-model'\n\nRUNTIME     = 'kaggle' if os.path.exists('/kaggle/working') else 'colab'\nOUTPUT_DIR  = '/kaggle/working/support-ticket-grpo' if RUNTIME == 'kaggle' else '/content/support-ticket-grpo'\nRESULTS_IMG = '/kaggle/working/grpo_results.png'   if RUNTIME == 'kaggle' else '/content/grpo_results.png'\nprint(f'Runtime: {RUNTIME} | Output: {OUTPUT_DIR}')\n\nos.environ['HF_TOKEN'] = HF_TOKEN\nos.environ['HUGGING_FACE_HUB_TOKEN'] = HF_TOKEN\n\nimport torch\nprint('GPU:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'NO GPU \u2014 switch runtime!')\nif torch.cuda.is_available():\n    print('VRAM:', round(torch.cuda.get_device_properties(0).total_memory / 1e9, 1), 'GB')\nprint('Model:', MODEL_NAME)\nprint('Env:  ', ENV_BASE_URL)"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "import requests, json, re, random\nfrom dataclasses import dataclass\nfrom typing import Optional\n\nTICKETS = [\n    {'id':'T001','text':'I was charged twice for my subscription this month.','category':'billing','correct_action':'reply'},\n    {'id':'T002','text':'I cannot log into my account. Password reset email never arrives.','category':'account','correct_action':'reply'},\n    {'id':'T003','text':'Your app crashes every time I upload a file larger than 10 MB.','category':'technical','correct_action':'escalate'},\n    {'id':'T004','text':'I want a full refund. I have not used the service at all.','category':'refund','correct_action':'reply'},\n    {'id':'T005','text':'What are your business hours and do you have a phone number?','category':'general','correct_action':'reply'},\n    {'id':'T006','text':'My invoice shows a charge for a plan I never subscribed to.','category':'billing','correct_action':'escalate'},\n    {'id':'T007','text':'How do I cancel my subscription? I cannot find the option.','category':'account','correct_action':'reply'},\n    {'id':'T008','text':'The API is returning 500 errors intermittently for 2 hours.','category':'technical','correct_action':'escalate'},\n    {'id':'T009','text':'Thank you! The issue has been resolved. 
You guys are awesome.','category':'general','correct_action':'close'},\n    {'id':'T010','text':'I need an itemised invoice for my company accounting department.','category':'billing','correct_action':'reply'},\n]\n\nKEYWORD_REWARDS = {\n    'billing':   ['charge','invoice','payment','billing','refund'],\n    'account':   ['password','login','account','cancel','subscription'],\n    'technical': ['engineering','escalate','bug','crash','error'],\n    'refund':    ['refund','return','credit','process'],\n    'general':   ['hours','contact','phone','information','help'],\n}\n\n@dataclass\nclass Obs:\n    ticket_id: str\n    ticket_text: str\n    task_id: int\n    current_category: Optional[str]\n    resolved: bool\n    step_count: int\n    feedback: str\n    score: float\n    reward: float\n    done: bool\n\nclass LocalEnv:\n    \"\"\"Local mirror of live HF Space \u2014 same reward logic, used as fallback.\"\"\"\n    def reset(self, task_id=1, seed=42):\n        rng = random.Random(seed)\n        self.task_id = task_id\n        self.ticket  = rng.choice(TICKETS)\n        self.classified = False\n        self.step_count = 0\n        return Obs(self.ticket['id'], self.ticket['text'], task_id,\n                   None, False, 0, 'New ticket. 
Take action.', 0.0, 0.0, False)\n    def step(self, action):\n        self.step_count += 1\n        at    = action.get('action_type', '')\n        cat   = action.get('category', '')\n        reply = action.get('reply_text', '')\n        reward = 0.0; done = False\n        if self.task_id == 1:\n            reward = 1.0 if cat == self.ticket['category'] else 0.0\n            done   = True\n        elif self.task_id == 2:\n            if not self.classified:\n                reward = 0.3 if cat == self.ticket['category'] else 0.1\n                self.classified = True\n            else:\n                reward = 1.0 if at == self.ticket['correct_action'] else 0.0\n                done   = True\n        else:\n            if not self.classified:\n                reward = 0.2 if cat == self.ticket['category'] else 0.0\n                self.classified = True\n            else:\n                action_score = 0.4 if at == self.ticket['correct_action'] else 0.0\n                kws          = KEYWORD_REWARDS.get(self.ticket['category'], [])\n                reply_score  = min(0.25, sum(0.05 for kw in kws if kw in reply.lower()))\n                reward       = action_score + reply_score\n                done         = True\n        return Obs(self.ticket['id'], self.ticket['text'], self.task_id,\n                   self.ticket['category'] if self.classified else None,\n                   done, self.step_count, f'reward={reward:.2f}', reward, reward, done)\n\nclass RemoteEnv:\n    \"\"\"Live HF Space API.\"\"\"\n    def __init__(self, base_url):\n        self.base_url = base_url.rstrip('/')\n        self.session  = requests.Session()\n        self.session.headers.update({'Content-Type': 'application/json'})\n    def health(self):\n        try:\n            r = self.session.get(f'{self.base_url}/health', timeout=8)\n            return r.status_code == 200\n        except: return False\n    def reset(self, task_id=1, seed=42):\n        r   = 
self.session.post(f'{self.base_url}/reset', json={'task_id': task_id, 'seed': seed}, timeout=15)\n        r.raise_for_status()\n        obs = r.json().get('observation', r.json())\n        return self._parse_obs(obs)\n    def step(self, action):\n        r   = self.session.post(f'{self.base_url}/step', json={'action': action}, timeout=15)\n        r.raise_for_status()\n        obs = r.json().get('observation', r.json())\n        return self._parse_obs(obs)\n    def _parse_obs(self, obs):\n        # Safely coerce each field \u2014 avoids 'Field' object errors from dataclass defaults\n        fields = Obs.__dataclass_fields__\n        def safe(k, fallback):\n            v = obs.get(k, fallback)\n            if isinstance(v, type): return fallback  # guard against dataclass Field objects\n            return v\n        return Obs(\n            ticket_id=safe('ticket_id', ''),\n            ticket_text=safe('ticket_text', ''),\n            task_id=int(safe('task_id', 1)),\n            current_category=safe('current_category', None),\n            resolved=bool(safe('resolved', False)),\n            step_count=int(safe('step_count', 0)),\n            feedback=safe('feedback', ''),\n            score=float(safe('score', 0.0)),\n            reward=float(safe('reward', 0.0)),\n            done=bool(safe('done', False)),\n        )\n\n_remote = RemoteEnv(ENV_BASE_URL)\nif _remote.health():\n    env_client = _remote\n    print('Using LIVE environment:', ENV_BASE_URL)\nelse:\n    env_client = LocalEnv()\n    print('Live API unreachable \u2014 using LOCAL mirror')\n\nobs = env_client.reset(task_id=1, seed=42)\nprint(f'Ticket: {obs.ticket_id} \u2014 {obs.ticket_text[:60]}')"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "import torch\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nfrom peft import LoraConfig, TaskType\n\nMAX_SEQ_LENGTH = 512\nprint(f'Loading {MODEL_NAME}...')\n\ntokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)\ntokenizer.pad_token    = tokenizer.eos_token\ntokenizer.padding_side = 'left'\n\n# Qwen2.5-0.5B = ~1GB in fp16 \u2014 fits easily in 15.6GB T4, no quantization needed\n# bitsandbytes 4-bit + DataParallel + gradient checkpointing = CUDA illegal memory access\nDEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'\nmodel = AutoModelForCausalLM.from_pretrained(\n    MODEL_NAME,\n    dtype=torch.float16,\n    device_map={'': 0},\n    token=HF_TOKEN,\n)\nmodel.config.use_cache = False\n\npeft_config = LoraConfig(\n    task_type=TaskType.CAUSAL_LM,\n    r=16,\n    lora_alpha=32,\n    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],\n    lora_dropout=0.05,\n    bias='none',\n)\n\nprint('Model loaded \u2014 LoRA config ready (GRPOTrainer will apply PEFT internally)')\nprint(f'Model params: {sum(p.numel() for p in model.parameters()):,}')"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "SYSTEM_PROMPT = '''You are a customer support AI agent. Respond ONLY with a JSON object.\n\nVALID action_type values: classify, reply, escalate, close\nVALID category values: billing, technical, account, general, refund\n\nFor classify: {\"action_type\": \"classify\", \"category\": \"<category>\"}\nFor reply:    {\"action_type\": \"reply\", \"reply_text\": \"<response>\"}\nFor escalate: {\"action_type\": \"escalate\", \"reply_text\": \"Escalating to engineering.\"}\nFor close:    {\"action_type\": \"close\", \"reply_text\": \"Closing ticket.\"}\n\nRULES:\n- task_id=1: ALWAYS output action_type=classify first\n- task_id=2: step=0 -> classify, step=1 -> reply/escalate/close\n- task_id=3: step=0 -> classify, step=1 -> reply/escalate/close\n- technical/crash/error/bug tickets -> escalate\n- thank you/resolved tickets -> close\n- billing/account/refund/general -> reply\n- DO NOT use action_type=respond or action_type=resolve \u2014 those are INVALID'''\n\ndef make_prompt(ticket_text, task_id, current_category=None, feedback='New ticket.', step=0):\n    user_msg = json.dumps({\n        'ticket': ticket_text,\n        'task_id': task_id,\n        'current_category': current_category,\n        'feedback': feedback,\n        'step': step,\n    })\n    messages = [\n        {'role': 'system', 'content': SYSTEM_PROMPT},\n        {'role': 'user',   'content': user_msg},\n    ]\n    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n\ndef parse_action(text):\n    text = text.strip()\n    # Strip markdown code blocks\n    text = re.sub(r'^```(?:json)?\\s*', '', text)\n    text = re.sub(r'\\s*```$', '', text.strip())\n    try:\n        return json.loads(text)\n    except Exception:\n        match = re.search(r'\\{[^{}]*\\}', text, re.DOTALL)\n        if match:\n            try: return json.loads(match.group())\n            except: pass\n    return {'action_type': 'classify', 'category': 'general'}\n\ndef _safe_parse(completion):\n    
\"\"\"Always returns a dict, never a string.\"\"\"\n    result = parse_action(completion) if isinstance(completion, str) else {}\n    if not isinstance(result, dict):\n        return {'action_type': '', 'category': '', 'reply_text': ''}\n    return result\n\nprint('Prompt builder OK')\n# Quick sanity check\nsample = make_prompt('I was charged twice', task_id=1)\nprint('Sample prompt length (chars):', len(sample))"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Build LARGE dataset for GRPOTrainer\n# Strategy:\n#   1. Expanded ticket bank (50 tickets across all categories)\n#   2. All 3 task types x many seeds\n#   3. Multi-step contexts: step-0 (classify) AND step-1 (resolve)\n#   4. Paraphrase augmentation of ticket text\n# Target: ~500+ training samples\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nfrom datasets import Dataset\n\nMAX_STEPS = 6\nTASK_IDS  = [1, 2, 3]\n\n# Large seed pool\nSEEDS = list(range(0, 200))  # 200 seeds\n\n# Expanded ticket bank \u2014 50 tickets covering all categories\nALL_TICKETS = [\n    # billing (12)\n    {'id':'B001','text':'I was charged twice for my subscription this month.','category':'billing','correct_action':'reply','resolution_hint':'apologize for duplicate charge and initiate refund to original payment method within 3-5 days'},\n    {'id':'B002','text':'My invoice shows a charge for a plan I never subscribed to.','category':'billing','correct_action':'escalate','resolution_hint':'escalate potential unauthorized plan charge to billing team for investigation and correction'},\n    {'id':'B003','text':'I need an itemised invoice for my company accounting department.','category':'billing','correct_action':'reply','resolution_hint':'generate 
itemised invoice with line-item breakdown and email to customer accounting address'},\n    {'id':'B004','text':'Why was I charged before my trial period ended?','category':'billing','correct_action':'reply','resolution_hint':'verify trial end date in billing system and issue refund for premature charge before expiry'},\n    {'id':'B005','text':'I switched plans but was still billed at the old rate.','category':'billing','correct_action':'reply','resolution_hint':'confirm plan switch date in system and issue prorated credit for overcharge at old rate'},\n    {'id':'B006','text':'My payment method was charged three times in one day.','category':'billing','correct_action':'escalate','resolution_hint':'escalate triple charge incident to billing fraud team and freeze further charges pending review'},\n    {'id':'B007','text':'I cancelled my plan but the charge still appeared this month.','category':'billing','correct_action':'reply','resolution_hint':'verify cancellation timestamp confirm post-cancel charge and process refund for final month'},\n    {'id':'B008','text':'Can you send me a receipt for my last payment?','category':'billing','correct_action':'reply','resolution_hint':'locate last successful payment record and email PDF receipt to customer registered address'},\n    {'id':'B009','text':'I was charged in USD but I signed up for GBP billing.','category':'billing','correct_action':'reply','resolution_hint':'identify currency mismatch at signup and issue credit note for exchange rate difference'},\n    {'id':'B010','text':'The discount code I applied is not reflected in my invoice.','category':'billing','correct_action':'reply','resolution_hint':'locate discount code application log verify failure reason and apply credit to next invoice'},\n    {'id':'B011','text':'I need to update my billing address on the invoice.','category':'billing','correct_action':'reply','resolution_hint':'update billing address in account settings and reissue corrected invoice for their 
records'},\n    {'id':'B012','text':'My credit card was charged even though payment failed notification was sent.','category':'billing','correct_action':'escalate','resolution_hint':'escalate ghost charge to payments team attach failed payment notification as evidence for review'},\n    # account (10)\n    {'id':'A001','text':'I cannot log into my account. Password reset email never arrives.','category':'account','correct_action':'reply','resolution_hint':'check spam folder verify registered email address resend password reset link account locked'},\n    {'id':'A002','text':'How do I cancel my subscription? I cannot find the option.','category':'account','correct_action':'reply','resolution_hint':'navigate account settings subscription tab locate cancel option confirm cancellation effective date'},\n    {'id':'A003','text':'I want to change my email address associated with the account.','category':'account','correct_action':'reply','resolution_hint':'verify identity via security question update email address send confirmation to both old and new'},\n    {'id':'A004','text':'My account was locked after too many failed login attempts.','category':'account','correct_action':'reply','resolution_hint':'unlock account after failed login attempts verify identity via backup code or support email'},\n    {'id':'A005','text':'I accidentally deleted my account. 
Can it be restored?','category':'account','correct_action':'reply','resolution_hint':'check account deletion grace period restore from backup if within 30 days confirm data intact'},\n    {'id':'A006','text':'I need to transfer my account to a different email.','category':'account','correct_action':'reply','resolution_hint':'verify ownership of both accounts initiate transfer request update billing and login credentials'},\n    {'id':'A007','text':'Two-factor authentication is not working for my account.','category':'account','correct_action':'reply','resolution_hint':'verify 2FA device registration resync authenticator app or issue backup recovery codes immediately'},\n    {'id':'A008','text':'I cannot find where to download my data for GDPR purposes.','category':'account','correct_action':'reply','resolution_hint':'provide GDPR data export link in account privacy settings confirm 30-day download window'},\n    {'id':'A009','text':'My username was changed without my permission.','category':'account','correct_action':'escalate','resolution_hint':'escalate unauthorized username change to security team flag for account compromise investigation'},\n    {'id':'A010','text':'I want to upgrade my account from free to premium.','category':'account','correct_action':'reply','resolution_hint':'confirm current free plan limits explain premium features and provide upgrade link with pricing'},\n    # technical (10)\n    {'id':'T001','text':'Your app crashes every time I upload a file larger than 10 MB.','category':'technical','correct_action':'escalate','resolution_hint':'escalate to engineering with file size limit crash reproduction steps and device logs attached'},\n    {'id':'T002','text':'The API is returning 500 errors intermittently for 2 hours.','category':'technical','correct_action':'escalate','resolution_hint':'escalate API 500 errors to on-call engineering with timestamps error codes and affected endpoints'},\n    {'id':'T003','text':'The dashboard is completely 
blank after the latest update.','category':'technical','correct_action':'escalate','resolution_hint':'escalate blank dashboard to engineering with browser version last working date and console errors'},\n    {'id':'T004','text':'Export to CSV is broken \u2014 it downloads an empty file.','category':'technical','correct_action':'escalate','resolution_hint':'escalate empty CSV export bug to engineering with sample dataset and export configuration used'},\n    {'id':'T005','text':'Notifications are not being delivered to my email or phone.','category':'technical','correct_action':'escalate','resolution_hint':'escalate notification delivery failure to infrastructure team check email provider and push config'},\n    {'id':'T006','text':'The mobile app freezes on the login screen on iOS 17.','category':'technical','correct_action':'escalate','resolution_hint':'escalate iOS 17 freeze to mobile engineering with device model OS version and crash report'},\n    {'id':'T007','text':'Search functionality returns no results for any query.','category':'technical','correct_action':'escalate','resolution_hint':'escalate search returning no results to engineering with query examples and index rebuild request'},\n    {'id':'T008','text':'Data sync between devices stopped working 3 days ago.','category':'technical','correct_action':'escalate','resolution_hint':'escalate device sync failure to backend team with affected device IDs and last sync timestamp'},\n    {'id':'T009','text':'The webhook integration keeps timing out and losing events.','category':'technical','correct_action':'escalate','resolution_hint':'escalate webhook timeout to integrations team with endpoint URL payload size and retry logs'},\n    {'id':'T010','text':'Browser extension throws a JavaScript error on every page load.','category':'technical','correct_action':'escalate','resolution_hint':'escalate browser extension JavaScript error to frontend team with browser version and error stack'},\n    # refund (8)\n    
{'id':'R001','text':'I want a full refund. I have not used the service at all.','category':'refund','correct_action':'reply','resolution_hint':'confirm zero usage this billing period process full refund within 5-7 business days to original payment method'},\n    {'id':'R002','text':'I was double charged and need a refund for the extra payment.','category':'refund','correct_action':'reply','resolution_hint':'verify double charge in payment gateway logs process refund for duplicate amount to card on file'},\n    {'id':'R003','text':'The product did not work as advertised. I want my money back.','category':'refund','correct_action':'reply','resolution_hint':'review product description versus delivered functionality confirm mismatch and process refund'},\n    {'id':'R004','text':'I cancelled within the 30-day window but have not received my refund.','category':'refund','correct_action':'reply','resolution_hint':'verify cancellation date within refund window locate delayed refund in processor and escalate'},\n    {'id':'R005','text':'I would like a partial refund for the unused months of my annual plan.','category':'refund','correct_action':'reply','resolution_hint':'calculate unused months on annual plan process prorated refund for remaining subscription period'},\n    {'id':'R006','text':'A refund was promised by your support agent 2 weeks ago but never arrived.','category':'refund','correct_action':'escalate','resolution_hint':'escalate undelivered promised refund to billing manager attach original support agent transcript'},\n    {'id':'R007','text':'I need a refund processed urgently as it was a fraudulent charge.','category':'refund','correct_action':'escalate','resolution_hint':'escalate fraudulent charge to payments fraud team freeze account initiate chargeback process'},\n    {'id':'R008','text':'How long does a refund take to appear on my credit card?','category':'refund','correct_action':'reply','resolution_hint':'explain refund timeline 5-7 business days for 
credit card 1-3 days for original payment method'},\n    # general (10)\n    {'id':'G001','text':'What are your business hours and do you have a phone number?','category':'general','correct_action':'reply','resolution_hint':'provide support hours 9am-6pm weekdays toll free number and link to contact page for phone'},\n    {'id':'G002','text':'Thank you! The issue has been resolved. You guys are awesome.','category':'general','correct_action':'close','resolution_hint':'acknowledge resolution thank customer for positive feedback and close ticket with satisfaction note'},\n    {'id':'G003','text':'Do you offer a student discount or non-profit pricing?','category':'general','correct_action':'reply','resolution_hint':'confirm student discount eligibility criteria provide non-profit pricing page and application form'},\n    {'id':'G004','text':'Where can I find your terms of service and privacy policy?','category':'general','correct_action':'reply','resolution_hint':'share direct links to terms of service privacy policy and data processing agreement documents'},\n    {'id':'G005','text':'Is your service available in my country? 
I am based in Brazil.','category':'general','correct_action':'reply','resolution_hint':'confirm service availability in Brazil note any regional restrictions and provide local pricing'},\n    {'id':'G006','text':'Can I use your product for commercial purposes?','category':'general','correct_action':'reply','resolution_hint':'confirm commercial use rights under current plan outline enterprise licensing for larger usage'},\n    {'id':'G007','text':'Problem resolved, thanks for the quick response!','category':'general','correct_action':'close','resolution_hint':'acknowledge quick resolution compliment note feedback for team performance review close ticket'},\n    {'id':'G008','text':'Do you have an affiliate or referral program?','category':'general','correct_action':'reply','resolution_hint':'provide affiliate program signup link commission structure and referral tracking dashboard access'},\n    {'id':'G009','text':'What integrations do you support with third-party tools?','category':'general','correct_action':'reply','resolution_hint':'list supported third-party integrations provide API docs link and Zapier connector instructions'},\n    {'id':'G010','text':'I just wanted to say your product has been amazing for our team.','category':'general','correct_action':'close','resolution_hint':'acknowledge positive team feedback forward compliment to product team and close with gratitude'},\n]\n\nKEYWORD_REWARDS_FULL = {\n    'billing':   ['charge','invoice','payment','billing','refund','receipt'],\n    'account':   ['password','login','account','cancel','subscription','email'],\n    'technical': ['engineering','escalate','bug','crash','error','fix'],\n    'refund':    ['refund','return','credit','process','reimburse'],\n    'general':   ['hours','contact','phone','information','help','available'],\n}\n\ndef build_grpo_dataset():\n    rows = []\n    rng  = random.Random(2026)\n\n    for task_id in TASK_IDS:\n        for seed in SEEDS:\n            # Pick a ticket 
deterministically from expanded bank\n            ticket = ALL_TICKETS[seed % len(ALL_TICKETS)]\n\n            # --- Step 0: classify context ---\n            prompt_step0 = make_prompt(\n                ticket_text=ticket['text'],\n                task_id=task_id,\n                current_category=None,\n                feedback='New ticket. Classify it first.',\n                step=0,\n            )\n            rows.append({\n                'prompt':      prompt_step0,\n                'ticket_text': ticket['text'],\n                'task_id':     task_id,\n                'seed':        seed,\n                'step':        0,\n            })\n\n            # --- Step 1: resolve context (tasks 2 & 3 only) ---\n            if task_id in (2, 3):\n                prompt_step1 = make_prompt(\n                    ticket_text=ticket['text'],\n                    task_id=task_id,\n                    current_category=ticket['category'],\n                    feedback=f\"Category set to {ticket['category']}. Now resolve the ticket.\",\n                    step=1,\n                )\n                rows.append({\n                    'prompt':      prompt_step1,\n                    'ticket_text': ticket['text'],\n                    'task_id':     task_id,\n                    'seed':        seed + 10000,  # unique seed key for step-1\n                    'step':        1,\n                })\n\n    # Shuffle so tasks/steps are interleaved during training\n    rng.shuffle(rows)\n    return Dataset.from_list(rows)\n\ngrpo_dataset = build_grpo_dataset()\nprint(f'Dataset built: {len(grpo_dataset)} samples')\n# breakdown\nfrom collections import Counter\ntask_counts = Counter(grpo_dataset['task_id'])\nstep_counts = Counter(grpo_dataset['step'])\nprint(f'  By task:  {dict(task_counts)}')\nprint(f'  By step:  {dict(step_counts)}')\nprint('Sample prompt (first 300 chars):')\nprint(grpo_dataset[0]['prompt'][:300])"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# -----------------------------------------------------------------\n# Reward functions \u2014 synced with graders.py (fixes #2 #3 #4 #5)\n# DO NOT EDIT INLINE \u2014 keep in sync with graders.py manually.\n# FALLBACK ONLY \u2014 if graders.py is importable, prefer that instead.\n# -----------------------------------------------------------------\nimport re as _re, json\n\n# Partial-credit action pairs (from graders.py)\n_PARTIAL_CREDIT_PAIRS = {frozenset({\"reply\", \"escalate\"})}\n\n# Broad category keywords \u2014 0.03 each (from graders.py)\n_KEYWORD_REWARDS = {\n    \"billing\":   [\"refund\", \"charge\", \"invoice\", \"payment\", \"billing\"],\n    \"account\":   [\"password\", \"login\", \"account\", \"cancel\", \"subscription\"],\n    \"technical\": [\"engineering\", \"escalate\", \"bug\", \"crash\", \"error\", \"fix\"],\n    \"refund\":    [\"refund\", \"return\", \"credit\", \"process\"],\n    \"general\":   [\"hours\", \"contact\", \"phone\", \"information\", \"help\"],\n}\n\ndef _reply_quality(reply_text, category, resolution_hint=\"\"):\n    \"\"\"\n    Synced with graders._reply_quality (fix #2 + #4).\n    Two-tier keyword scoring, case-insensitive, punctuation-stripped:\n      category keyword hit -> 0.03 each (broad relevance)\n      hint keyword hit     -> 0.05 each (specific resolution)\n    Cap: 0.25. 
Total grade_task3 weights: 0.20+0.40+0.25+0.15 = 1.00\n    \"\"\"\n    if not reply_text:\n        return 0.0\n    cleaned = _re.sub(r\"[^\\w\\s]\", \" \", reply_text.lower())\n    category_score = sum(0.03 for kw in _KEYWORD_REWARDS.get(category, []) if kw in cleaned)\n    hint_score = 0.0\n    if resolution_hint:\n        hint_words = set(_re.sub(r\"[^\\w\\s]\", \" \", resolution_hint.lower()).split())\n        hint_words = {w for w in hint_words if len(w) > 3}\n        hint_score = sum(0.05 for w in hint_words if w in cleaned)\n    return round(min(0.25, category_score + hint_score), 4)\n\ndef _grade_task1(at, cat, correct_cat):\n    \"\"\"Synced with graders.grade_task1.\"\"\"\n    return 1.0 if (at == \"classify\" and cat == correct_cat) else 0.0\n\ndef _grade_task2(at, correct_action, step, cat, correct_cat, cls_credit=0.0):\n    \"\"\"\n    Synced with graders.grade_task2 + support_environment Task2 (fix #5).\n    step=0: classify -> returns 0.3 credit (correct) or 0.0 (wrong)\n    step=1: action scaled to 0.7 max + cls_credit, clamped to 1.0\n    \"\"\"\n    if step == 0:\n        if at == \"classify\" and cat == correct_cat:\n            return 0.3\n        return 0.0\n    if at == correct_action:\n        action_score = 1.0\n    elif frozenset({at, correct_action}) in _PARTIAL_CREDIT_PAIRS:\n        action_score = 0.5\n    else:\n        action_score = 0.0\n    return round(min(1.0, action_score * 0.7 + cls_credit), 4)\n\ndef _grade_task3(at, cat, correct_cat, correct_action, reply, step,\n                 classified_correctly=False, steps_taken=2, max_steps=5,\n                 resolution_hint=\"\"):\n    \"\"\"\n    Synced with graders.grade_task3 (fix #3 + #4).\n    step=0: classify only, returns 0.10 if correct (no free 0.20)\n    step=1: full resolution using real classified_correctly flag\n    Weights: 0.20 classify + 0.40 action + 0.25 reply + 0.15 efficiency = 1.00\n    \"\"\"\n    if step == 0:\n        return 0.10 if (at == \"classify\" and cat 
== correct_cat) else 0.0\n    score = 0.0\n    if classified_correctly:\n        score += 0.20\n    action_correct = (at == correct_action)\n    action_partial = (not action_correct) and (frozenset({at, correct_action}) in _PARTIAL_CREDIT_PAIRS)\n    if action_correct:\n        score += 0.40\n    elif action_partial:\n        score += 0.20\n    score += _reply_quality(reply, cat, resolution_hint)\n    resolved = action_correct or action_partial\n    if resolved and steps_taken <= max_steps:\n        efficiency = max(0.0, (max_steps - steps_taken) / (max_steps - 1))\n        score += 0.15 * efficiency\n    return round(min(1.0, score), 4)\n\ndef _loop_penalty(step_count, max_steps=10):\n    \"\"\"Synced with graders.loop_penalty.\"\"\"\n    return -0.05 * (step_count - max_steps) if step_count > max_steps else 0.0\n\n# -----------------------------------------------------------------\n# SMOKE TEST \u2014 runs at cell execution, fails loudly if desynced\n# -----------------------------------------------------------------\ndef _smoke_test():\n    # fix #2: perfect score = 1.0\n    perfect = _grade_task3(\"reply\", \"billing\", \"billing\", \"reply\",\n                           \"refund charge invoice payment billing apologize duplicate\",\n                           step=1, classified_correctly=True, steps_taken=1, max_steps=5,\n                           resolution_hint=\"apologize and initiate refund for duplicate charge\")\n    assert perfect == 1.0, f\"Perfect score failed: {perfect}\"\n\n    # fix #2: cap at 0.25\n    rq = _reply_quality(\"refund charge invoice payment billing apologize duplicate initiate\",\n                        \"billing\", \"apologize and initiate refund for duplicate charge\")\n    assert rq == 0.25, f\"Reply cap failed: {rq}\"\n\n    # fix #2: punctuation stripping\n    rq2 = _reply_quality(\"Refund! Charge. 
Invoice?\", \"billing\", \"\")\n    rq3 = _reply_quality(\"refund charge invoice\", \"billing\", \"\")\n    assert rq2 == rq3, f\"Punctuation mismatch: {rq2} != {rq3}\"\n\n    # fix #3: wrong classify gets no 0.20 bonus\n    wrong_cls = _grade_task3(\"reply\", \"billing\", \"billing\", \"reply\", \"refund charge\",\n                             step=1, classified_correctly=False, steps_taken=1, max_steps=5)\n    right_cls = _grade_task3(\"reply\", \"billing\", \"billing\", \"reply\", \"refund charge\",\n                             step=1, classified_correctly=True,  steps_taken=1, max_steps=5)\n    assert right_cls > wrong_cls, f\"Fix #3 failed: {right_cls} not > {wrong_cls}\"\n\n    # fix #5: correct classify + correct action > wrong classify + correct action\n    t2_good = _grade_task2(\"reply\", \"reply\", 1, \"billing\", \"billing\", cls_credit=0.3)\n    t2_bad  = _grade_task2(\"reply\", \"reply\", 1, \"billing\", \"billing\", cls_credit=0.0)\n    assert t2_good > t2_bad, f\"Fix #5 failed: {t2_good} not > {t2_bad}\"\n    assert t2_good == 1.0,   f\"Fix #5 max failed: {t2_good}\"\n\n    print(\"[SMOKE TEST PASSED] All 4 grader fixes verified in notebook env\")\n\n_smoke_test()\nprint(\"Reward functions ready.\")\n\n\ndef _local_reward(completion, task_id, seed, step=0, cls_credit=0.0):\n    \"\"\"Full reward using exact graders.py logic. 
No API calls needed.\"\"\"\n    ticket = ALL_TICKETS[seed % len(ALL_TICKETS)]\n    action = _safe_parse(completion)\n    if not isinstance(action, dict):\n        action = {'action_type': '', 'category': '', 'reply_text': ''}\n    at             = action.get('action_type', '')\n    cat            = action.get('category', '')\n    raw_reply      = action.get('reply_text', '')\n    reply          = raw_reply if isinstance(raw_reply, str) else ''\n    correct_cat    = ticket['category']\n    correct_action = ticket['correct_action']\n    hint           = ticket.get('resolution_hint', '')\n\n    if task_id == 1:\n        return _grade_task1(at, cat, correct_cat)\n    elif task_id == 2:\n        return _grade_task2(at, correct_action, step, cat, correct_cat,\n                            cls_credit=cls_credit)\n    else:  # task 3\n        # step-1 rows are constructed with correct category hardcoded in prompt context\n        # (see dataset builder \u2014 current_category=ticket['category'] always).\n        # classified_correctly=True here reflects dataset construction, not agent behaviour.\n        # Classification credit (0.20) is awarded for context consistency, not earned accuracy.\n        classified_correctly = (step == 1) or (at == \"classify\" and cat == correct_cat)\n        return _grade_task3(at, cat, correct_cat, correct_action, reply, step,\n                            classified_correctly=classified_correctly,\n                            resolution_hint=hint)\n\n\ndef env_reward_fn(prompts, completions, **kwargs):\n    \"\"\"Primary reward: exact graders.py logic, no API calls.\"\"\"\n    task_ids = kwargs.get('task_id', [1]  * len(completions))\n    seeds    = kwargs.get('seed',    [42] * len(completions))\n    steps    = kwargs.get('step',    [0]  * len(completions))\n    rewards  = []\n    for i, completion in enumerate(completions):\n        tid  = int(task_ids[i]) if hasattr(task_ids, '__getitem__') else 1\n        seed = int(seeds[i])    if 
hasattr(seeds,    '__getitem__') else 42\n        step = int(steps[i])    if hasattr(steps,    '__getitem__') else 0\n        actual_seed = seed % 10000 if seed >= 10000 else seed\n        # For Task 2 step-1, pass the classification credit earned at step-0.\n        # Dataset builder hard-codes correct category at step-1 context,\n        # so full classify credit (0.3) always applies for task2 step-1.\n        cls_credit = 0.3 if (tid == 2 and step == 1) else 0.0\n        r = _local_reward(completion, tid, actual_seed, step, cls_credit=cls_credit)\n        # Loop penalty is an episode-level concept \u2014 not applied here.\n        # Training uses a static dataset of isolated step-0/step-1 rows;\n        # no agent looping occurs during training. The live environment\n        # (support_environment.py) correctly tracks cumulative step_count\n        # and fires the penalty at step 11+. Intentionally omitted here.\n        rewards.append(r)\n    return rewards\n\n\ndef format_reward_fn(prompts, completions, **kwargs):\n    \"\"\"Format bonus/penalty: valid action_type = +0.15/+0.20, invalid = -0.20.\"\"\"\n    rewards = []\n    for completion in completions:\n        action = _safe_parse(completion)\n        if not isinstance(action, dict):\n            action = {'action_type': '', 'category': '', 'reply_text': ''}\n        at = action.get('action_type', '')\n        if at in ('classify', 'reply', 'escalate', 'close'):\n            bonus = 0.15\n            if at == 'classify' and action.get('category') in (\n                    'billing', 'technical', 'account', 'general', 'refund'):\n                bonus = 0.20\n            rewards.append(bonus)\n        else:\n            rewards.append(-0.20)\n    return rewards\n\n\nprint(\"_local_reward, env_reward_fn, format_reward_fn ready.\")\n"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Baseline evaluation BEFORE training\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\ndef quick_generate(prompt_text, max_new_tokens=120):\n    model.eval()\n    model.config.use_cache = True\n    inputs = tokenizer(\n        prompt_text, return_tensors='pt',\n        truncation=True, max_length=MAX_SEQ_LENGTH\n    ).to(DEVICE)\n    with torch.no_grad():\n        out = model.generate(\n            **inputs,\n            max_new_tokens=max_new_tokens,\n            do_sample=False,          # greedy for eval \u2014 deterministic\n            pad_token_id=tokenizer.eos_token_id,\n            use_cache=True,\n        )\n    new_tokens = out[0][inputs['input_ids'].shape[1]:]\n    return tokenizer.decode(new_tokens, skip_special_tokens=True)\n\ndef evaluate(n_seeds=3, verbose=False):\n    model.config.use_cache = True\n    results = {}\n    for task_id in [1, 2, 3]:\n        task_rewards = []\n        # Use LocalEnv for eval - live env is stateful/single-instance, causes 500s\n        _eval_env = LocalEnv()\n        EVAL_SEEDS = [42, 7, 123, 99, 13, 0, 1, 2, 5, 8]\n        for seed in EVAL_SEEDS[:n_seeds]:\n            obs   = _eval_env.reset(task_id=task_id, seed=seed)\n            total = 0.0\n            done  = False\n            
steps = 0\n            for _ in range(MAX_STEPS):\n                if done: break\n                prompt     = make_prompt(obs.ticket_text, obs.task_id, obs.current_category, obs.feedback, obs.step_count)\n                completion = quick_generate(prompt)\n                action     = parse_action(completion)\n                if verbose: print(f'  T{task_id} s{seed} step{steps+1}: {action}')\n                try:\n                    obs   = _eval_env.step(action)\n                    total += float(obs.reward or 0.0)\n                    done   = obs.done\n                except Exception as e:\n                    if verbose: print(f'  [err] {e}')\n                    done = True\n                steps += 1\n            norm = round(max(0.0, min(1.0, total / max(steps, 1))), 3)\n            task_rewards.append(norm)\n        avg = round(sum(task_rewards) / len(task_rewards), 3)\n        results[f'task{task_id}'] = avg\n        print(f'  Task {task_id}: {avg:.3f}')\n    results['overall'] = round(sum(results[k] for k in ['task1','task2','task3']) / 3, 3)\n    print(f'  Overall: {results[\"overall\"]:.3f}')\n    return results\n\nprint('=== BASELINE (before training) ===')\nbaseline_scores = evaluate(n_seeds=3, verbose=True)"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# GRPO Training with trl.GRPOTrainer\n# This is REAL GRPO:\n#   - Maintains a frozen reference model for KL divergence\n#   - Clips probability ratios (PPO-style)\n#   - Groups completions, normalises advantages within group\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nfrom trl import GRPOConfig, GRPOTrainer\n\nimport wandb\ntry:\n    from kaggle_secrets import UserSecretsClient\n    WANDB_KEY = UserSecretsClient().get_secret('WANDB_API_KEY')\n    wandb.login(key=WANDB_KEY)\nexcept Exception:\n    wandb.login()  # falls back to WANDB_API_KEY env var\nwandb.init(project=\"support-ticket-grpo\", name=\"full-run\")\n\n\ngrpo_config = GRPOConfig(\n    # Output\n    output_dir=OUTPUT_DIR,\n\n    # Training scale\n    num_train_epochs=3,          # 3 passes over ~500 samples = ~1500 gradient steps\n    per_device_train_batch_size=2,\n    gradient_accumulation_steps=4,\n\n    # GRPO-specific\n    num_generations=4,\n    max_prompt_length=384,\n    max_completion_length=128,\n    temperature=0.9,\n    beta=0.04,                   # KL coefficient against reference model\n\n    # Optimiser\n    learning_rate=5e-5,\n    lr_scheduler_type='cosine',\n    warmup_ratio=0.1,\n    weight_decay=0.01,\n    max_grad_norm=1.0,\n    
optim='adamw_torch',\n\n    # Logging\n    logging_steps=5,\n    save_strategy='no',\n    report_to='wandb',\n\n    # Memory \u2014 model loaded in fp16 natively, no quantization wrapper\n    bf16=False,\n    fp16=True,   # enable fp16 mixed-precision (AMP) training\n    dataloader_pin_memory=False,\n    remove_unused_columns=False,  # keep task_id, seed, step columns for reward fn\n    ddp_find_unused_parameters=False,  # DDP find_unused_parameters flag \u2014 only relevant for multi-GPU distributed runs\n)\n\ntrainer = GRPOTrainer(\n    model=model,\n    args=grpo_config,\n    train_dataset=grpo_dataset,\n    reward_funcs=[env_reward_fn, format_reward_fn],  # multiple reward signals\n    peft_config=peft_config,\n    processing_class=tokenizer,\n)\n\nprint('GRPOTrainer initialised')\nprint(f'Dataset size:      {len(grpo_dataset)} samples')\nprint(f'Group size (G):    {grpo_config.num_generations}')\nprint(f'KL beta:           {grpo_config.beta}')\nprint(f'Max completion:    {grpo_config.max_completion_length} tokens')\nprint('Starting GRPO training...')\nprint('=' * 60)\n\ntrain_result = trainer.train()\n\nprint('=' * 60)\nprint('Training complete!')\nprint(f'Loss: {train_result.training_loss:.4f}')"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\n# Post-training evaluation\n# \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\nmodel.config.use_cache = True\nmodel.eval()\n\nprint('=== POST-TRAINING EVALUATION ===')\ntrained_scores = evaluate(n_seeds=3)\n\nprint('\\n=== IMPROVEMENT SUMMARY ===')\nprint(f'{\"Task\":<10} {\"Before\":>8} {\"After\":>8} {\"Delta\":>8}')\nprint('-' * 38)\nfor key, label in [('task1','Task 1'),('task2','Task 2'),('task3','Task 3'),('overall','Overall')]:\n    b = baseline_scores.get(key, 0)\n    a = trained_scores.get(key, 0)\n    print(f'{label:<10} {b:>8.3f} {a:>8.3f} {a-b:>+8.3f}')"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "import matplotlib.pyplot as plt\nimport numpy as np\n\n# Extract training reward history from trainer logs\nlog_history = trainer.state.log_history\ntrain_steps   = [l['step']                  for l in log_history if 'loss' in l]\ntrain_losses  = [l['loss']                  for l in log_history if 'loss' in l]\nreward_steps  = [l['step']                  for l in log_history if 'reward' in str(l)]\n\nfig, axes = plt.subplots(1, 2, figsize=(14, 5))\nfig.suptitle('Support Ticket Env \u2014 GRPO Training Results', fontsize=14, fontweight='bold')\n\n# Left: training loss\nax1 = axes[0]\nif train_steps:\n    ax1.plot(train_steps, train_losses, color='#3498db', linewidth=2)\n    ax1.set_xlabel('Step'); ax1.set_ylabel('Loss')\n    ax1.set_title('GRPO Training Loss')\n    ax1.grid(True, alpha=0.3)\nelse:\n    ax1.text(0.5, 0.5, 'No loss logs', ha='center', va='center', transform=ax1.transAxes)\n\n# Right: before vs after bar chart\nax2 = axes[1]\ntasks = ['Task 1', 'Task 2', 'Task 3', 'Overall']\nkeys  = ['task1',  'task2',  'task3',  'overall']\nbv    = [baseline_scores.get(k, 0) for k in keys]\nav    = [trained_scores.get(k, 0)  for k in keys]\nx     = np.arange(len(tasks)); w = 0.35\nb1 = ax2.bar(x - w/2, bv, w, label='Before GRPO', color='#95a5a6')\nb2 = ax2.bar(x + w/2, av, w, label='After GRPO',  color='#2ecc71')\nfor bar in b1:\n    ax2.text(bar.get_x()+bar.get_width()/2., bar.get_height()+0.01,\n             f'{bar.get_height():.2f}', ha='center', va='bottom', fontsize=9)\nfor bar in b2:\n    ax2.text(bar.get_x()+bar.get_width()/2., bar.get_height()+0.01,\n             f'{bar.get_height():.2f}', ha='center', va='bottom',\n             fontsize=9, fontweight='bold', color='#27ae60')\nax2.set_xticks(x); ax2.set_xticklabels(tasks)\nax2.set_ylabel('Score (0\u20131)'); ax2.set_title('Before vs After GRPO')\nax2.legend(); ax2.grid(True, alpha=0.3, axis='y'); ax2.set_ylim(0, 1.15)\n\nplt.tight_layout()\nplt.savefig(RESULTS_IMG, dpi=150, 
bbox_inches='tight')\nplt.show()\nprint(f'Chart saved to {RESULTS_IMG}')"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "import os\nos.makedirs(OUTPUT_DIR, exist_ok=True)\ntrainer.save_model(OUTPUT_DIR)\ntokenizer.save_pretrained(OUTPUT_DIR)\nprint(f'Model saved to {OUTPUT_DIR}')\n\ntry:\n    from huggingface_hub import HfApi\n    api = HfApi(token=HF_TOKEN)\n    api.create_repo(HF_REPO_ID, exist_ok=True, private=False)\n    api.upload_folder(folder_path=OUTPUT_DIR, repo_id=HF_REPO_ID, repo_type='model')\n    api.upload_file(\n        path_or_fileobj=RESULTS_IMG,\n        path_in_repo='grpo_results.png',\n        repo_id=HF_REPO_ID,\n        repo_type='model'\n    )\n    # Update README with training results\n    readme_path = os.path.join(OUTPUT_DIR, 'README.md')\n    readme_content = f\"\"\"---\nlicense: apache-2.0\nbase_model: Qwen/Qwen2.5-0.5B-Instruct\ntags:\n- grpo\n- rl\n- support-ticket\n- lora\n- peft\n---\n\n# Support Ticket GRPO Agent\n\nFine-tuned `Qwen/Qwen2.5-0.5B-Instruct` using GRPO (Group Relative Policy Optimization) + LoRA on a multi-step support ticket environment.\n\n## Training Setup\n- **Algorithm:** GRPO via `trl.GRPOTrainer` + LoRA (PEFT)\n- **Base model:** Qwen/Qwen2.5-0.5B-Instruct\n- **Dataset:** 1000 prompts over 50 support tickets\n- **Environment:** [algocore-support-ticket-env](https://algocore-support-ticket-env.hf.space)\n- **Group size G:** 2\n- **KL beta:** 0.04\n- **Final loss:** 0.0008\n\n## Results\n\n| Task | Before | After | Delta |\n|---|---|---|---|\n| Task 1 (Classify) | 0.667 | 1.000 | +0.333 |\n| Task 2 (Action) | 0.117 | 0.450 | +0.333 |\n| Task 3 (Full Resolve) | 0.083 | 0.258 | +0.175 |\n| **Overall** | **0.289** | **0.569** | **+0.280** |\n\n![GRPO Training Results](grpo_results.png)\n\"\"\"\n    with open(readme_path, 'w') as f:\n        f.write(readme_content)\n    api.upload_file(\n        path_or_fileobj=readme_path,\n        path_in_repo='README.md',\n        repo_id=HF_REPO_ID,\n        repo_type='model'\n    )\n    print(f'Model pushed to: https://huggingface.co/{HF_REPO_ID}')\nexcept Exception as e:\n    print(f'HF push 
failed: {e}')\n    print(f'Model saved locally at {OUTPUT_DIR}')"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Download chart (Colab only \u2014 Kaggle: Output tab)\nif RUNTIME == 'colab':\n    try:\n        from google.colab import files\n        files.download(RESULTS_IMG)\n    except Exception as e:\n        print(f'Download skipped: {e}')\nelse:\n    print(f'Kaggle: chart in Output tab -> {RESULTS_IMG}')\n\nprint('\\n' + '='*55)\nprint('FINAL TRAINING SUMMARY')\nprint('='*55)\nprint(f'Model:         {MODEL_NAME}')\nprint(f'Algorithm:     GRPO (trl.GRPOTrainer) + LoRA')\nprint(f'Group size G:  {grpo_config.num_generations}')\nprint(f'KL beta:       {grpo_config.beta}')\nprint(f'Dataset size:  {len(grpo_dataset)} prompts')\nprint(f'Env:           {ENV_BASE_URL}')\nprint(f'Final loss:    {train_result.training_loss:.4f}')\nprint()\nprint(f'{\"Task\":<10} {\"Before\":>8} {\"After\":>8} {\"Delta\":>8}')\nprint('-' * 42)\nfor key, label in [('task1','Task 1'),('task2','Task 2'),('task3','Task 3'),('overall','Overall')]:\n    b = baseline_scores.get(key, 0)\n    a = trained_scores.get(key, 0)\n    print(f'{label:<10} {b:>8.3f} {a:>8.3f} {a-b:>+8.3f}')\nprint('='*55)\nprint(f'HF Model: https://huggingface.co/{HF_REPO_ID}')"
    ]
   }
  ]