{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "authorship_tag": "ABX9TyNAdLqevt1eWZCxknXSe62q",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/Kajlid/ft-lora/blob/main/train.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Install dependencies"
      ],
      "metadata": {
        "id": "XsjHWiaqvJrf"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "TUwzrKYWu1V1"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "!pip install unsloth\n",
        "# Also get the latest nightly Unsloth!\n",
        "!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Login to HuggingFace\n",
        "from huggingface_hub import notebook_login\n",
        "notebook_login()"
      ],
      "metadata": {
        "id": "mkSDOk9Ipa3l"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import HfApi\n",
        "\n",
        "# Make sure HuggingFace repo exists inside the 'ft-lora' organization\n",
        "api = HfApi()\n",
        "api.create_repo(\n",
        "    repo_id=\"ft-lora/llama3.2-3b-instruct-finetuned\",\n",
        "    repo_type=\"model\",\n",
        "    exist_ok=True,\n",
        ")"
      ],
      "metadata": {
        "id": "59UAr7pNsS5P"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Chosen Foundation model"
      ],
      "metadata": {
        "id": "-ZXJiO88vX3v"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from unsloth import FastLanguageModel\n",
        "import torch\n",
        "# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
        "max_seq_length = 1024\n",
        "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
        "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n",
        "\n",
        "# 4bit pre quantized models we support for 4x faster downloading + no OOMs.\n",
        "fourbit_models = [\n",
        "    \"unsloth/Meta-Llama-3.1-8B-bnb-4bit\",      # Llama-3.1 2x faster\n",
        "    \"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit\",\n",
        "    \"unsloth/Meta-Llama-3.1-70B-bnb-4bit\",\n",
        "    \"unsloth/Meta-Llama-3.1-405B-bnb-4bit\",    # 4bit for 405b!\n",
        "    \"unsloth/Mistral-Small-Instruct-2409\",     # Mistral 22b 2x faster!\n",
        "    \"unsloth/mistral-7b-instruct-v0.3-bnb-4bit\",\n",
        "    \"unsloth/Phi-3.5-mini-instruct\",           # Phi-3.5 2x faster!\n",
        "    \"unsloth/Phi-3-medium-4k-instruct\",\n",
        "    \"unsloth/gemma-2-9b-bnb-4bit\",\n",
        "    \"unsloth/gemma-2-27b-bnb-4bit\",            # Gemma 2x faster!\n",
        "\n",
        "    \"unsloth/Llama-3.2-1B-bnb-4bit\",           # NEW! Llama 3.2 models\n",
        "    \"unsloth/Llama-3.2-1B-Instruct-bnb-4bit\",\n",
        "    \"unsloth/Llama-3.2-3B-bnb-4bit\",\n",
        "    \"unsloth/Llama-3.2-3B-Instruct-bnb-4bit\",\n",
        "\n",
        "    \"unsloth/Llama-3.3-70B-Instruct-bnb-4bit\" # NEW! Llama 3.3 70B!\n",
        "] # More models at https://huggingface.co/unsloth\n",
        "\n",
        "model, tokenizer = FastLanguageModel.from_pretrained(\n",
        "    model_name = \"unsloth/Llama-3.2-3B-Instruct\", # or choose \"unsloth/Llama-3.2-1B-Instruct\"\n",
        "    max_seq_length = max_seq_length,\n",
        "    dtype = dtype,\n",
        "    load_in_4bit = load_in_4bit,\n",
        "    # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n",
        ")"
      ],
      "metadata": {
        "id": "QDTe7QyavVdK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "model = FastLanguageModel.get_peft_model(\n",
        "    model,\n",
        "    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
        "    target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
        "                      \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
        "    lora_alpha = 16,\n",
        "    lora_dropout = 0, # Supports any, but = 0 is optimized\n",
        "    bias = \"none\",    # Supports any, but = \"none\" is optimized\n",
        "    # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n",
        "    use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for very long context\n",
        "    random_state = 3407,\n",
        "    use_rslora = False,  # We support rank stabilized LoRA\n",
        "    loftq_config = None, # And LoftQ\n",
        ")"
      ],
      "metadata": {
        "id": "c2e82oi5vmkH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Data preparation"
      ],
      "metadata": {
        "id": "s6Yoz4oYvybY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from unsloth.chat_templates import get_chat_template\n",
        "\n",
        "tokenizer = get_chat_template(\n",
        "    tokenizer,\n",
        "    chat_template = \"llama-3.1\",     # 3.2?\n",
        ")\n",
        "\n",
        "def formatting_prompts_func(examples):\n",
        "    convos = examples[\"conversations\"]\n",
        "    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]\n",
        "    return { \"text\" : texts, }\n",
        "pass\n",
        "\n",
        "from datasets import load_dataset\n",
        "dataset = load_dataset(\"mlabonne/FineTome-100k\", split = \"train\")"
      ],
      "metadata": {
        "id": "4pxI_a4Nvo2q"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "We now use standardize_sharegpt to convert ShareGPT style datasets into HuggingFace's generic format. This changes the dataset from looking like:\n",
        "\n",
        "```\n",
        "{\"from\": \"system\", \"value\": \"You are an assistant\"}\n",
        "{\"from\": \"human\", \"value\": \"What is 2+2?\"}\n",
        "{\"from\": \"gpt\", \"value\": \"It's 4.\"}\n",
        "```\n",
        "\n",
        "\n",
        "to\n",
        "```\n",
        "{\"role\": \"system\", \"content\": \"You are an assistant\"}\n",
        "{\"role\": \"user\", \"content\": \"What is 2+2?\"}\n",
        "{\"role\": \"assistant\", \"content\": \"It's 4.\"}\n",
        "```"
      ],
      "metadata": {
        "id": "eR-aB7MlwYOA"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from unsloth.chat_templates import standardize_sharegpt\n",
        "dataset = standardize_sharegpt(dataset)\n",
        "dataset = dataset.map(formatting_prompts_func, batched = True,)"
      ],
      "metadata": {
        "id": "bbVizvjLwEDV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Look at conversation 5"
      ],
      "metadata": {
        "id": "lAeCqzGaw0gO"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "dataset[5][\"conversations\"]"
      ],
      "metadata": {
        "id": "Q_HbtkqHwwx7"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "And we see how the chat template transformed these conversations.\n",
        "\n",
        "[Notice] Llama 3.1 Instruct's default chat template default adds `\"Cutting Knowledge Date: December 2023\\nToday Date: 26 July 2024\"`, so do not be alarmed!"
      ],
      "metadata": {
        "id": "l62cYCW5w90y"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Training the model"
      ],
      "metadata": {
        "id": "lEVG1z-rxKSj"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now let's use Huggingface TRL's SFTTrainer! More docs here: TRL SFT docs. We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!"
      ],
      "metadata": {
        "id": "I-7Mg2KRxTrX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from trl import SFTTrainer\n",
        "from transformers import TrainingArguments, DataCollatorForSeq2Seq\n",
        "from unsloth import is_bfloat16_supported\n",
        "\n",
        "trainer = SFTTrainer(\n",
        "    model = model,\n",
        "    tokenizer = tokenizer,\n",
        "    train_dataset = dataset,\n",
        "    dataset_text_field = \"text\",\n",
        "    max_seq_length = max_seq_length,\n",
        "    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),\n",
        "    dataset_num_proc = 2,\n",
        "    # packing = False,\n",
        "    packing = True,             # Can make training 5x faster for short sequences.\n",
        "    args = TrainingArguments(\n",
        "        per_device_train_batch_size = 2,\n",
        "        gradient_accumulation_steps = 4,\n",
        "        warmup_steps = 5,\n",
        "        # num_train_epochs = 1, # Set this for 1 full training run.\n",
        "        max_steps = 60,\n",
        "        learning_rate = 2e-4,\n",
        "        fp16 = not is_bfloat16_supported(),\n",
        "        bf16 = is_bfloat16_supported(),\n",
        "        logging_steps = 1, # Logs training progress\n",
        "        optim = \"adamw_8bit\",\n",
        "        weight_decay = 0.01,\n",
        "        lr_scheduler_type = \"linear\",\n",
        "        seed = 3407,\n",
        "        output_dir = \"outputs\",\n",
        "        report_to = \"none\", # Use this for WandB etc\n",
        "\n",
        "        # Checkpointing model\n",
        "        save_strategy=\"steps\",       # Save checkpoints during training\n",
        "        save_steps=10,               # Save every 10 steps\n",
        "        save_total_limit=3,         # Keep only the last 3 checkpoints\n",
        "\n",
        "        # The HuggingFace organization to push to\n",
        "        push_to_hub=True,     # set to True for continuous saving in HF\n",
        "        hub_model_id=\"ft-lora/llama3.2-3b-instruct-finetuned\",  # repo name\n",
        "        hub_strategy=\"checkpoint\",   # uploads checkpoints during training\n",
        "    ),\n",
        ")"
      ],
      "metadata": {
        "id": "DyewKvJWxJp8"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs."
      ],
      "metadata": {
        "id": "VtclYRolxc4g"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from unsloth.chat_templates import train_on_responses_only\n",
        "trainer = train_on_responses_only(\n",
        "    trainer,\n",
        "    instruction_part = \"<|start_header_id|>user<|end_header_id|>\\n\\n\",\n",
        "    response_part = \"<|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n",
        ")"
      ],
      "metadata": {
        "id": "jt67y1Rzxa7T"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "We verify masking is actually done:"
      ],
      "metadata": {
        "id": "GEX241ifxhk3"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "tokenizer.decode(trainer.train_dataset[5][\"input_ids\"])"
      ],
      "metadata": {
        "id": "2_c6Yo43xfwn"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "space = tokenizer(\" \", add_special_tokens = False).input_ids[0]\n",
        "tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5][\"labels\"]])"
      ],
      "metadata": {
        "id": "aruT0NV7xlTL"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "We can see the System and Instruction prompts are successfully masked!"
      ],
      "metadata": {
        "id": "eXc-Ga-ExpOk"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Show current memory stats"
      ],
      "metadata": {
        "id": "BmdqinkNxsXu"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Show current memory stats\n",
        "gpu_stats = torch.cuda.get_device_properties(0)\n",
        "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
        "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
        "print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
        "print(f\"{start_gpu_memory} GB of memory reserved.\")"
      ],
      "metadata": {
        "id": "AcvIQf6Exp1g"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "trainer_stats = trainer.train()"
      ],
      "metadata": {
        "id": "LUGdOLqQxwBt"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "#@title Show final memory and time stats\n",
        "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
        "used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
        "used_percentage = round(used_memory         /max_memory*100, 3)\n",
        "lora_percentage = round(used_memory_for_lora/max_memory*100, 3)\n",
        "print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
        "print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n",
        "print(f\"Peak reserved memory = {used_memory} GB.\")\n",
        "print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
        "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
        "print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
      ],
      "metadata": {
        "id": "8B_sJRzWx1b9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Inference"
      ],
      "metadata": {
        "id": "0GvBJABMx6vZ"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Let's run the model! You can change the instruction and input - leave the output blank!\n",
        "\n",
        "[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct here\n",
        "\n",
        "We use min_p = 0.1 and temperature = 1.5."
      ],
      "metadata": {
        "id": "9mPSK3HHx9iV"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from unsloth.chat_templates import get_chat_template\n",
        "\n",
        "tokenizer = get_chat_template(\n",
        "    tokenizer,\n",
        "    chat_template = \"llama-3.1\",\n",
        ")\n",
        "FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
        "\n",
        "messages = [\n",
        "    {\"role\": \"user\", \"content\": \"Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,\"},\n",
        "]\n",
        "inputs = tokenizer.apply_chat_template(\n",
        "    messages,\n",
        "    tokenize = True,\n",
        "    add_generation_prompt = True, # Must add for generation\n",
        "    return_tensors = \"pt\",\n",
        ").to(\"cuda\")\n",
        "\n",
        "outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,\n",
        "                         temperature = 1.5, min_p = 0.1)\n",
        "tokenizer.batch_decode(outputs)"
      ],
      "metadata": {
        "id": "TiffjpKBx3po"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!"
      ],
      "metadata": {
        "id": "rtI2ij1byFi4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
        "\n",
        "messages = [\n",
        "    {\"role\": \"user\", \"content\": \"Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,\"},\n",
        "]\n",
        "inputs = tokenizer.apply_chat_template(\n",
        "    messages,\n",
        "    tokenize = True,\n",
        "    add_generation_prompt = True, # Must add for generation\n",
        "    return_tensors = \"pt\",\n",
        ").to(\"cuda\")\n",
        "\n",
        "from transformers import TextStreamer\n",
        "text_streamer = TextStreamer(tokenizer, skip_prompt = True)\n",
        "_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,\n",
        "                   use_cache = True, temperature = 1.5, min_p = 0.1)"
      ],
      "metadata": {
        "id": "Q6OVg-9PyEOF"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Saving, loading finetuned models"
      ],
      "metadata": {
        "id": "oBwfqD5eyKnT"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.\n",
        "\n",
        "[NOTE] This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!"
      ],
      "metadata": {
        "id": "YuFKfJ6FyQRa"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "model.save_pretrained(\"lora_model\") # Local saving\n",
        "tokenizer.save_pretrained(\"lora_model\")"
      ],
      "metadata": {
        "id": "u_tAwLZ9yJDC",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "fc067ca9-b9d7-4553-a6e0-8bca74124ebd"
      },
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "('lora_model/tokenizer_config.json',\n",
              " 'lora_model/special_tokens_map.json',\n",
              " 'lora_model/chat_template.jinja',\n",
              " 'lora_model/tokenizer.json')"
            ]
          },
          "metadata": {},
          "execution_count": 19
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Save the finetuned model to HuggingFace\n",
        "# Organization name: ft-lora\n",
        "\n",
        "# Online saving\n",
        "# model.push_to_hub(\"ft-lora/llama3.2-3b-instruct-finetuned\")\n",
        "# tokenizer.push_to_hub(\"ft-lora/llama3.2-3b-instruct-finetuned\")\n"
      ],
      "metadata": {
        "id": "6Y6dENUbZedJ"
      },
      "execution_count": 20,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Now if you want to load the LoRA adapters we just saved for inference, set False to True:"
      ],
      "metadata": {
        "id": "DWdxaCN6yZLM"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "if True:\n",
        "    from unsloth import FastLanguageModel\n",
        "    model, tokenizer = FastLanguageModel.from_pretrained(\n",
        "        model_name = \"lora_model\", # the model used for training\n",
        "        max_seq_length = max_seq_length,\n",
        "        dtype = dtype,\n",
        "        load_in_4bit = load_in_4bit,\n",
        "    )\n",
        "    FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
        "\n",
        "messages = [\n",
        "    {\"role\": \"user\", \"content\": \"Describe a tall tower in the capital of France.\"},\n",
        "]\n",
        "inputs = tokenizer.apply_chat_template(\n",
        "    messages,\n",
        "    tokenize = True,\n",
        "    add_generation_prompt = True, # Must add for generation\n",
        "    return_tensors = \"pt\",\n",
        ").to(\"cuda\")\n",
        "\n",
        "from transformers import TextStreamer\n",
        "text_streamer = TextStreamer(tokenizer, skip_prompt = True)\n",
        "_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,\n",
        "                   use_cache = True, temperature = 1.5, min_p = 0.1)"
      ],
      "metadata": {
        "id": "xiVGwJpCyZsA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "### Saving to float16 for VLLM"
      ],
      "metadata": {
        "id": "YZjuI3SAygsU"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "We also support saving to float16 directly. Select merged_16bit for float16 or merged_4bit for int4. We also allow lora adapters as a fallback. Use push_to_hub_merged to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens."
      ],
      "metadata": {
        "id": "PIr6PHZoykG_"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Merge to 16bit, standard HuggingFace model\n",
        "# LoRA weights are merged into the base model\n",
        "# Needed for GGUF\n",
        "if True: model.save_pretrained_merged(\"model\", tokenizer, save_method = \"merged_16bit\",)\n",
        "if True: model.push_to_hub_merged(\"ft-lora/llama3.2-3b-instruct-finetuned\", tokenizer, save_method = \"merged_16bit\")\n",
        "\n",
        "# Merge to 4bit, better for for CPU inference\n",
        "# if True: model.save_pretrained_merged(\"model\", tokenizer, save_method = \"merged_4bit_forced\",)\n",
        "# if True: model.push_to_hub_merged(\"ft-lora/llama3.2-3b-instruct-finetuned\", tokenizer, save_method = \"merged_4bit_forced\")\n",
        "\n",
        "# Just LoRA adapters\n",
        "# if False: model.save_pretrained_merged(\"model\", tokenizer, save_method = \"lora\",)\n",
        "# if False: model.push_to_hub_merged(\"hf/model\", tokenizer, save_method = \"lora\")"
      ],
      "metadata": {
        "id": "ApQZDyNRyiQa"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## GGUF / llama.cpp Conversion"
      ],
      "metadata": {
        "id": "YqvuyKiiyqaa"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "Switch to CPU here?"
      ],
      "metadata": {
        "id": "oJ6bHyTLX2zX"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!git clone https://github.com/ggml-org/llama.cpp.git\n",
        "%cd llama.cpp"
      ],
      "metadata": {
        "id": "OouPlz8TRq1x",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "184453a3-a3e6-4ca6-ba6a-0ecb0c1fc5e0"
      },
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'llama.cpp'...\n",
            "remote: Enumerating objects: 69802, done.\u001b[K\n",
            "remote: Counting objects: 100% (169/169), done.\u001b[K\n",
            "remote: Compressing objects: 100% (112/112), done.\u001b[K\n",
            "remote: Total 69802 (delta 125), reused 62 (delta 57), pack-reused 69633 (from 4)\u001b[K\n",
            "Receiving objects: 100% (69802/69802), 212.52 MiB | 16.06 MiB/s, done.\n",
            "Resolving deltas: 100% (50541/50541), done.\n",
            "/content/llama.cpp\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install -r requirements.txt\n",
        "!pip install -U transformers huggingface_hub"
      ],
      "metadata": {
        "id": "5Rgi7oFXRxdW"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "%cd /content/llama.cpp"
      ],
      "metadata": {
        "id": "nlR9solITpqR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!ls"
      ],
      "metadata": {
        "id": "-jRTZhF2TrH4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "ls -lh /content/llama.cpp/ft-lora/llama3.2-3b-instruct-finetuned\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_GIC5jnovzhx",
        "outputId": "34088bd6-1c73-4b77-fb9d-9376bf23fd65"
      },
      "execution_count": 34,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "ls: cannot access '/content/llama.cpp/ft-lora/llama3.2-3b-instruct-finetuned': No such file or directory\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Define directory to save merged model\n",
        "# merged_model_dir = \"/content/lora_model_merged\"\n",
        "\n",
        "# Save merged model + tokenizer\n",
        "# model.save_pretrained(merged_model_dir, tokenizer)\n",
        "# merged_model.save_pretrained(\"/content/merged_model\", tokenizer)\n",
        "\n",
        "# Convert to GGUF using llama.cpp script\n",
        "#!python convert_hf_to_gguf.py /content/llama.cpp/ft-lora/llama3.2-3b-instruct-finetuned \\\n",
        "    #--outfile /content/llama3.2-3b-instruct-finetuned.gguf \\\n",
        "    #--outtype auto\n",
        "\n",
        "!python convert_hf_to_gguf.py /content/llama.cpp/model \\\n",
        "    --outfile /content/llama3.2-3b-instruct-finetuned.gguf \\\n",
        "    --outtype auto\n",
        "\n",
        "\n",
        "#!python convert_hf_to_gguf.py /content/lora_model \\\n",
        "    #--outfile /content/llama3.2-3b-instruct-finetuned.gguf \\\n",
        "    #--outtype auto"
      ],
      "metadata": {
        "id": "ahUUYPyOR5Ua"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import HfApi\n",
        "\n",
        "api = HfApi()\n",
        "repo_id = \"ft-lora/llama3.2-3b-gguf-q4km\"\n",
        "api.create_repo(repo_id, repo_type=\"model\", exist_ok=True)\n",
        "\n",
        "api.upload_file(\n",
        "    path_or_fileobj=\"/content/llama3.2-3b-instruct-finetuned.gguf\",\n",
        "    path_in_repo=\"llama3.2-3b-instruct-finetuned.gguf\",\n",
        "    repo_id=repo_id\n",
        ")"
      ],
      "metadata": {
        "id": "R7ca98iBSAHX"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}