{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "trusted": true }, "outputs": [], "source": [ "%%capture\n", "!pip install huggingface_hub\n", "!pip install llama-cpp-python\n", "!pip install datasets\n", "!pip install torch==2.2.0 torchvision==0.17.1 --index-url https://download.pytorch.org/whl/cu121\n", "!pip install unsloth\n", "!pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "from llama_cpp import Llama\n", "from huggingface_hub import hf_hub_download\n", "from datasets import load_dataset\n", "from unsloth import FastLanguageModel\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "# Load model (unchanged)\n", "qlora_model_path = hf_hub_download(\n", " repo_id=\"ebbalg/llama-finetome\",\n", " filename=\"llama-3.2-1b-instruct.Q4_K_M.gguf\"\n", ")\n", "qlora_model = Llama(model_path=qlora_model_path, \n", " n_ctx=2048, n_threads=2, verbose=False, chat_format=\"llama-3\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "from tqdm import tqdm # progress bar\n", "import re\n", "\n", "def extract_choice_from_text(text):\n", " \"\"\"\n", " Scan the text and return the first valid choice letter that appears.\n", " \"\"\"\n", " text_upper = text.upper()\n", "\n", " # ---- 1. Strongest match: \"Answer: X\" ----\n", " m = re.search(r\"ANSWER[:\\s\\-]*([A-E])\", text_upper)\n", " if m:\n", " return m.group(1)\n", "\n", " # ---- 2. Next: lines that START with the choice (allowing whitespace) ----\n", " m = re.match(r\"\\s*([A-E])(\\.|\\s|$)\", text_upper)\n", " if m:\n", " return m.group(1)\n", "\n", " # ---- 3. Other patterns like \"Best answer is C\" ----\n", " m = re.search(r\"BEST ANSWER.*?([A-E])\", text_upper)\n", " if m:\n", " return m.group(1)\n", "\n", " # ---- 4. Look for something like \"C.\" inside the text ----\n", " m = re.search(r\"\\b([A-E])\\.\", text_upper)\n", " if m:\n", " return m.group(1)\n", "\n", " # ---- 5. Weakest fallback: first standalone letter ----\n", " m = re.search(r\"\\b([A-E])\\b\", text_upper)\n", " if m:\n", " return m.group(1)\n", " \n", " print(text)\n", " return None # couldn't find\n", "\n", "def eval_arc(model_fn, dataset_split):\n", " correct = 0\n", " total = len(dataset_split)\n", "\n", " for i, row in enumerate(dataset_split):\n", " question = row[\"question\"]\n", " answer = row[\"answerKey\"]\n", "\n", " # Build choices dicts from ARC structure\n", " choices_texts = row['choices']['text']\n", " choices_labels = row['choices']['label']\n", " choices = [{\"label\": l, \"text\": t} for l, t in zip(choices_labels, choices_texts)]\n", "\n", " # Build prompt\n", " prompt = (\n", " f\"Question: {question}\\n\"\n", " + \"\\n\".join([f\"{c['label']}. 
{c['text']}\" for c in choices])\n", " + \"\\nAnswer:\"\n", " )\n", "\n", " # Run model\n", " out = model_fn(prompt)\n", " pred = extract_choice_from_text(out)\n", " \n", " if pred == answer:\n", " correct += 1\n", " \n", " if (i + 1) % 10 == 0:\n", " print(f\"{i+1}/{total} Accuracy = {correct / total * 100}\")\n", "\n", " return correct / total" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = \"unsloth/Llama-3.2-3B-Instruct\", # or choose \"unsloth/Llama-3.2-1B-Instruct\"\n", " max_seq_length = 2048,\n", " dtype = None,\n", " load_in_4bit = True\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "arc = load_dataset(\"allenai/ai2_arc\", \"ARC-Challenge\")\n", "arc_eval = arc['test'].select(range(500))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "# ---- GGUF QLoRA model runner ----\n", "def run_gguf(llm, prompt, max_tokens=128):\n", " out = llm(prompt, max_tokens=max_tokens, temperature=0)\n", " return out[\"choices\"][0][\"text\"]\n", " \n", "score_qlora = eval_arc(lambda p: run_gguf(qlora_model, p), arc_eval)\n", "print(\"QLoRA model:\", score_qlora)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "# ---- Base Unsloth model runner ----\n", "def run_unsloth(model, tokenizer, prompt, max_tokens=128):\n", " inputs = tokenizer(prompt, return_tensors='pt').to(model.device)\n", " out = model.generate(**inputs, max_new_tokens=max_tokens)\n", " return tokenizer.decode(out[0], skip_special_tokens=True)\n", " \n", "score_base = eval_arc(lambda p: run_unsloth(model, tokenizer, p), arc_eval)\n", "print(\"Base model:\", score_base)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "import string \n", "\n", "def convert_truthfulqa_row(row):\n", " target = row[\"mc1_targets\"] # or mc2_targets\n", " choices = target[\"choices\"]\n", " answer_idx = target[\"labels\"].index(1) # correct index\n", "\n", " # Generate labels dynamically\n", " labels = list(string.ascii_uppercase[:len(choices)])\n", " \n", " # Build prompt\n", " prompt = f\"Question: {row['question']}\\n\"\n", " prompt += \"\\n\".join([f\"{labels[i]}. 
{c}\" for i, c in enumerate(choices)])\n", " prompt += \"\\nAnswer (choose one letter):\"\n", " \n", " correct_label = labels[answer_idx]\n", " return prompt, correct_label\n", "\n", "def eval_truthfulqa(model_fn, dataset_split):\n", " i = 0\n", " correct = 0\n", " total = len(dataset_split)\n", "\n", " for prompt, correct_label in dataset_split:\n", " pred_text = model_fn(prompt)\n", "\n", " # Extract predicted label: first uppercase letter in the output\n", " pred_label = extract_choice_from_text(pred_text)\n", " \n", " if pred_label == correct_label:\n", " correct += 1\n", " \n", " if (i + 1) % 20 == 0:\n", " print(f\"{i+1}/{total} Accuracy = {correct / total * 100}\")\n", " i += 1\n", "\n", " return correct / total\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "ds = load_dataset(\"truthful_qa\", \"multiple_choice\")\n", "qa_eval_raw = ds[\"validation\"].select(range(500))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "qa_eval = [convert_truthfulqa_row(row) for row in qa_eval_raw]\n", "\n", "for row in qa_eval:\n", " print(row)\n", " break" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "score_qlora = eval_truthfulqa(lambda p: run_gguf(qlora_model, p), qa_eval)\n", "print(\"QLoRA model accuracy:\", score_qlora)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "trusted": true }, "outputs": [], "source": [ "score_base = eval_truthfulqa(lambda p: run_unsloth(model, tokenizer, p), qa_eval)\n", "print(\"Base model accuracy:\", score_base)" ] } ], "metadata": { "kaggle": { "accelerator": "nvidiaTeslaT4", "dataSources": [], "dockerImageVersionId": 31193, "isGpuEnabled": true, "isInternetEnabled": true, "language": "python", "sourceType": "notebook" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.13" } }, "nbformat": 4, "nbformat_minor": 4 }