{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "trusted": true }, "outputs": [], "source": [ "%%capture\n", "!pip install huggingface_hub\n", "!pip install llama-cpp-python\n", "!pip install datasets\n", "!pip install torch==2.2.0 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cu121\n", "!pip install unsloth\n", "!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "from llama_cpp import Llama\n", "from huggingface_hub import hf_hub_download\n", "from datasets import load_dataset\n", "from unsloth import FastLanguageModel\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "# Load model (unchanged)\n", "qlora_model_path = hf_hub_download(\n", " repo_id=\"ebbalg/llama-finetome\",\n", " filename=\"llama-3.2-1b-instruct.Q4_K_M.gguf\"\n", ")\n", "qlora_model = Llama(model_path=qlora_model_path, \n", " n_ctx=2048, n_threads=2, verbose=False, chat_format=\"llama-3\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "from tqdm import tqdm # progress bar\n", "import re\n", "\n", "def extract_choice_from_text(text):\n", " \"\"\"\n", " Scan the text and return the first valid choice letter that appears.\n", " \"\"\"\n", " text_upper = text.upper()\n", "\n", " # ---- 1. Strongest match: \"Answer: X\" ----\n", " m = re.search(r\"ANSWER[:\\s\\-]*([A-E])\", text_upper)\n", " if m:\n", " return m.group(1)\n", "\n", " # ---- 2. Next: lines that START with the choice (allowing whitespace) ----\n", " m = re.match(r\"\\s*([A-E])(\\.|\\s|$)\", text_upper)\n", " if m:\n", " return m.group(1)\n", "\n", " # ---- 3. Other patterns like \"Best answer is C\" ----\n", " m = re.search(r\"BEST ANSWER.*?([A-E])\", text_upper)\n", " if m:\n", " return m.group(1)\n", "\n", " # ---- 4. Look for something like \"C.\" inside the text ----\n", " m = re.search(r\"\\b([A-E])\\.\", text_upper)\n", " if m:\n", " return m.group(1)\n", "\n", " # ---- 5. Weakest fallback: first standalone letter ----\n", " m = re.search(r\"\\b([A-E])\\b\", text_upper)\n", " if m:\n", " return m.group(1)\n", " \n", " print(text)\n", " return None # couldn't find\n", "\n", "def eval_arc(model_fn, dataset_split):\n", " correct = 0\n", " total = len(dataset_split)\n", "\n", " for i, row in enumerate(dataset_split):\n", " question = row[\"question\"]\n", " answer = row[\"answerKey\"]\n", "\n", " # Build choices dicts from ARC structure\n", " choices_texts = row['choices']['text']\n", " choices_labels = row['choices']['label']\n", " choices = [{\"label\": l, \"text\": t} for l, t in zip(choices_labels, choices_texts)]\n", "\n", " # Build prompt\n", " prompt = (\n", " f\"Question: {question}\\n\"\n", " + \"\\n\".join([f\"{c['label']}. 
{c['text']}\" for c in choices])\n", " + \"\\nAnswer:\"\n", " )\n", "\n", " # Run model\n", " out = model_fn(prompt)\n", " pred = extract_choice_from_text(out)\n", " \n", " if pred == answer:\n", " correct += 1\n", " \n", " if (i + 1) % 10 == 0:\n", " print(f\"{i+1}/{total} Accuracy = {correct / total * 100}\")\n", "\n", " return correct / total" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = \"unsloth/Llama-3.2-3B-Instruct\", # or choose \"unsloth/Llama-3.2-1B-Instruct\"\n", " max_seq_length = 2048,\n", " dtype = None,\n", " load_in_4bit = True\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "arc = load_dataset(\"allenai/ai2_arc\", \"ARC-Challenge\")\n", "arc_eval = arc['test'].select(range(500))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "# ---- GGUF QLoRA model runner ----\n", "def run_gguf(llm, prompt, max_tokens=128):\n", " out = llm(prompt, max_tokens=max_tokens, temperature=0)\n", " return out[\"choices\"][0][\"text\"]\n", " \n", "score_qlora = eval_arc(lambda p: run_gguf(qlora_model, p), arc_eval)\n", "print(\"QLoRA model:\", score_qlora)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "# ---- Base Unsloth model runner ----\n", "def run_unsloth(model, tokenizer, prompt, max_tokens=128):\n", " inputs = tokenizer(prompt, return_tensors='pt').to(model.device)\n", " out = model.generate(**inputs, max_new_tokens=max_tokens)\n", " return tokenizer.decode(out[0], skip_special_tokens=True)\n", " \n", "score_base = eval_arc(lambda p: run_unsloth(model, tokenizer, p), arc_eval)\n", "print(\"Base model:\", score_base)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "import string \n", "\n", "def convert_truthfulqa_row(row):\n", " target = row[\"mc1_targets\"] # or mc2_targets\n", " choices = target[\"choices\"]\n", " answer_idx = target[\"labels\"].index(1) # correct index\n", "\n", " # Generate labels dynamically\n", " labels = list(string.ascii_uppercase[:len(choices)])\n", " \n", " # Build prompt\n", " prompt = f\"Question: {row['question']}\\n\"\n", " prompt += \"\\n\".join([f\"{labels[i]}. 
{c}\" for i, c in enumerate(choices)])\n", " prompt += \"\\nAnswer (choose one letter):\"\n", " \n", " correct_label = labels[answer_idx]\n", " return prompt, correct_label\n", "\n", "def eval_truthfulqa(model_fn, dataset_split):\n", " i = 0\n", " correct = 0\n", " total = len(dataset_split)\n", "\n", " for prompt, correct_label in dataset_split:\n", " pred_text = model_fn(prompt)\n", "\n", " # Extract predicted label: first uppercase letter in the output\n", " pred_label = extract_choice_from_text(pred_text)\n", " \n", " if pred_label == correct_label:\n", " correct += 1\n", " \n", " if (i + 1) % 20 == 0:\n", " print(f\"{i+1}/{total} Accuracy = {correct / total * 100}\")\n", " i += 1\n", "\n", " return correct / total\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "ds = load_dataset(\"truthful_qa\", \"multiple_choice\")\n", "qa_eval_raw = ds[\"validation\"].select(range(500))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "qa_eval = [convert_truthfulqa_row(row) for row in qa_eval_raw]\n", "\n", "for row in qa_eval:\n", " print(row)\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "score_qlora = eval_truthfulqa(lambda p: run_gguf(qlora_model, p), qa_eval)\n", "print(\"QLoRA model accuracy:\", score_qlora)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "score_base = eval_truthfulqa(lambda p: run_unsloth(model, tokenizer, p), qa_eval)\n", "print(\"Base model accuracy:\", score_base)" ] } ], "metadata": { "kaggle": { "accelerator": "nvidiaTeslaT4", "dataSources": [], "dockerImageVersionId": 31193, "isGpuEnabled": true, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 4 }