{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyNAdLqevt1eWZCxknXSe62q", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "source": [ "## Install dependencies" ], "metadata": { "id": "XsjHWiaqvJrf" } }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "TUwzrKYWu1V1" }, "outputs": [], "source": [ "%%capture\n", "!pip install unsloth\n", "# Also get the latest nightly Unsloth!\n", "!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git" ] }, { "cell_type": "code", "source": [ "# Login to HuggingFace\n", "from huggingface_hub import notebook_login\n", "notebook_login()" ], "metadata": { "id": "mkSDOk9Ipa3l" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from huggingface_hub import HfApi\n", "\n", "# Make sure HuggingFace repo exists inside the 'ft-lora' organization\n", "api = HfApi()\n", "api.create_repo(\n", " repo_id=\"ft-lora/llama3.2-3b-instruct-finetuned\",\n", " repo_type=\"model\",\n", " exist_ok=True,\n", ")" ], "metadata": { "id": "59UAr7pNsS5P" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Chosen Foundation model" ], "metadata": { "id": "-ZXJiO88vX3v" } }, { "cell_type": "code", "source": [ "from unsloth import FastLanguageModel\n", "import torch\n", "# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n", "max_seq_length = 1024\n", "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n", "load_in_4bit = True # Use 4bit quantization to reduce memory usage. 
Can be False.\n", "\n", "# 4bit pre quantized models we support for 4x faster downloading + no OOMs.\n", "fourbit_models = [\n", " \"unsloth/Meta-Llama-3.1-8B-bnb-4bit\", # Llama-3.1 2x faster\n", " \"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit\",\n", " \"unsloth/Meta-Llama-3.1-70B-bnb-4bit\",\n", " \"unsloth/Meta-Llama-3.1-405B-bnb-4bit\", # 4bit for 405b!\n", " \"unsloth/Mistral-Small-Instruct-2409\", # Mistral 22b 2x faster!\n", " \"unsloth/mistral-7b-instruct-v0.3-bnb-4bit\",\n", " \"unsloth/Phi-3.5-mini-instruct\", # Phi-3.5 2x faster!\n", " \"unsloth/Phi-3-medium-4k-instruct\",\n", " \"unsloth/gemma-2-9b-bnb-4bit\",\n", " \"unsloth/gemma-2-27b-bnb-4bit\", # Gemma 2x faster!\n", "\n", " \"unsloth/Llama-3.2-1B-bnb-4bit\", # NEW! Llama 3.2 models\n", " \"unsloth/Llama-3.2-1B-Instruct-bnb-4bit\",\n", " \"unsloth/Llama-3.2-3B-bnb-4bit\",\n", " \"unsloth/Llama-3.2-3B-Instruct-bnb-4bit\",\n", "\n", " \"unsloth/Llama-3.3-70B-Instruct-bnb-4bit\" # NEW! Llama 3.3 70B!\n", "] # More models at https://huggingface.co/unsloth\n", "\n", "model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = \"unsloth/Llama-3.2-3B-Instruct\", # or choose \"unsloth/Llama-3.2-1B-Instruct\"\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit,\n", " # token = \"hf_...\", # use one if using gated models like meta-llama/Llama-2-7b-hf\n", ")" ], "metadata": { "id": "QDTe7QyavVdK" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "model = FastLanguageModel.get_peft_model(\n", " model,\n", " r = 16, # Choose any number > 0 ! 
Suggested 8, 16, 32, 64, 128\n", " target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n", " \"gate_proj\", \"up_proj\", \"down_proj\",],\n", " lora_alpha = 16,\n", " lora_dropout = 0, # Supports any, but = 0 is optimized\n", " bias = \"none\", # Supports any, but = \"none\" is optimized\n", " # [NEW] \"unsloth\" uses 30% less VRAM, fits 2x larger batch sizes!\n", " use_gradient_checkpointing = \"unsloth\", # True or \"unsloth\" for very long context\n", " random_state = 3407,\n", " use_rslora = False, # We support rank stabilized LoRA\n", " loftq_config = None, # And LoftQ\n", ")" ], "metadata": { "id": "c2e82oi5vmkH" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Data preparation" ], "metadata": { "id": "s6Yoz4oYvybY" } }, { "cell_type": "code", "source": [ "from unsloth.chat_templates import get_chat_template\n", "\n", "tokenizer = get_chat_template(\n", " tokenizer,\n", " chat_template = \"llama-3.1\", # 3.2?\n", ")\n", "\n", "def formatting_prompts_func(examples):\n", " convos = examples[\"conversations\"]\n", " texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]\n", " return { \"text\" : texts, }\n", "pass\n", "\n", "from datasets import load_dataset\n", "dataset = load_dataset(\"mlabonne/FineTome-100k\", split = \"train\")" ], "metadata": { "id": "4pxI_a4Nvo2q" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "We now use standardize_sharegpt to convert ShareGPT style datasets into HuggingFace's generic format. 
This changes the dataset from looking like:\n", "\n", "```\n", "{\"from\": \"system\", \"value\": \"You are an assistant\"}\n", "{\"from\": \"human\", \"value\": \"What is 2+2?\"}\n", "{\"from\": \"gpt\", \"value\": \"It's 4.\"}\n", "```\n", "\n", "\n", "to\n", "```\n", "{\"role\": \"system\", \"content\": \"You are an assistant\"}\n", "{\"role\": \"user\", \"content\": \"What is 2+2?\"}\n", "{\"role\": \"assistant\", \"content\": \"It's 4.\"}\n", "```" ], "metadata": { "id": "eR-aB7MlwYOA" } }, { "cell_type": "code", "source": [ "from unsloth.chat_templates import standardize_sharegpt\n", "dataset = standardize_sharegpt(dataset)\n", "dataset = dataset.map(formatting_prompts_func, batched = True,)" ], "metadata": { "id": "bbVizvjLwEDV" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "Look at conversation 5" ], "metadata": { "id": "lAeCqzGaw0gO" } }, { "cell_type": "code", "source": [ "dataset[5][\"conversations\"]" ], "metadata": { "id": "Q_HbtkqHwwx7" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "And we see how the chat template transformed these conversations.\n", "\n", "[Notice] Llama 3.1 Instruct's default chat template adds `\"Cutting Knowledge Date: December 2023\\nToday Date: 26 July 2024\"`, so do not be alarmed!" ], "metadata": { "id": "l62cYCW5w90y" } }, { "cell_type": "markdown", "source": [ "## Training the model" ], "metadata": { "id": "lEVG1z-rxKSj" } }, { "cell_type": "markdown", "source": [ "Now let's use Huggingface TRL's SFTTrainer! More docs here: TRL SFT docs. We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off the step limit with `max_steps=None`. We also support TRL's `DPOTrainer`!" 
], "metadata": { "id": "I-7Mg2KRxTrX" } }, { "cell_type": "code", "source": [ "from trl import SFTTrainer\n", "from transformers import TrainingArguments, DataCollatorForSeq2Seq\n", "from unsloth import is_bfloat16_supported\n", "\n", "trainer = SFTTrainer(\n", " model = model,\n", " tokenizer = tokenizer,\n", " train_dataset = dataset,\n", " dataset_text_field = \"text\",\n", " max_seq_length = max_seq_length,\n", " data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),\n", " dataset_num_proc = 2,\n", " # packing = False,\n", " packing = True, # Can make training 5x faster for short sequences.\n", " args = TrainingArguments(\n", " per_device_train_batch_size = 2,\n", " gradient_accumulation_steps = 4,\n", " warmup_steps = 5,\n", " # num_train_epochs = 1, # Set this for 1 full training run.\n", " max_steps = 60,\n", " learning_rate = 2e-4,\n", " fp16 = not is_bfloat16_supported(),\n", " bf16 = is_bfloat16_supported(),\n", " logging_steps = 1, # Logs training progress\n", " optim = \"adamw_8bit\",\n", " weight_decay = 0.01,\n", " lr_scheduler_type = \"linear\",\n", " seed = 3407,\n", " output_dir = \"outputs\",\n", " report_to = \"none\", # Use this for WandB etc\n", "\n", " # Checkpointing model\n", " save_strategy=\"steps\", # Save checkpoints during training\n", " save_steps=10, # Save every 10 steps\n", " save_total_limit=3, # Keep only the last 3 checkpoints\n", "\n", " # The HuggingFace organization to push to\n", " push_to_hub=True, # set to True for continuous saving in HF\n", " hub_model_id=\"ft-lora/llama3.2-3b-instruct-finetuned\", # repo name\n", " hub_strategy=\"checkpoint\", # uploads checkpoints during training\n", " ),\n", ")" ], "metadata": { "id": "DyewKvJWxJp8" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs." 
], "metadata": { "id": "VtclYRolxc4g" } }, { "cell_type": "code", "source": [ "from unsloth.chat_templates import train_on_responses_only\n", "trainer = train_on_responses_only(\n", " trainer,\n", " instruction_part = \"<|start_header_id|>user<|end_header_id|>\\n\\n\",\n", " response_part = \"<|start_header_id|>assistant<|end_header_id|>\\n\\n\",\n", ")" ], "metadata": { "id": "jt67y1Rzxa7T" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "We verify masking is actually done:" ], "metadata": { "id": "GEX241ifxhk3" } }, { "cell_type": "code", "source": [ "tokenizer.decode(trainer.train_dataset[5][\"input_ids\"])" ], "metadata": { "id": "2_c6Yo43xfwn" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "space = tokenizer(\" \", add_special_tokens = False).input_ids[0]\n", "tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5][\"labels\"]])" ], "metadata": { "id": "aruT0NV7xlTL" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "We can see the System and Instruction prompts are successfully masked!" ], "metadata": { "id": "eXc-Ga-ExpOk" } }, { "cell_type": "markdown", "source": [ "### Show current memory stats" ], "metadata": { "id": "BmdqinkNxsXu" } }, { "cell_type": "code", "source": [ "#@title Show current memory stats\n", "gpu_stats = torch.cuda.get_device_properties(0)\n", "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n", "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n", "print(f\"GPU = {gpu_stats.name}. 
Max memory = {max_memory} GB.\")\n", "print(f\"{start_gpu_memory} GB of memory reserved.\")" ], "metadata": { "id": "AcvIQf6Exp1g" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "trainer_stats = trainer.train()" ], "metadata": { "id": "LUGdOLqQxwBt" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "#@title Show final memory and time stats\n", "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n", "used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n", "used_percentage = round(used_memory /max_memory*100, 3)\n", "lora_percentage = round(used_memory_for_lora/max_memory*100, 3)\n", "print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n", "print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n", "print(f\"Peak reserved memory = {used_memory} GB.\")\n", "print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n", "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n", "print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")" ], "metadata": { "id": "8B_sJRzWx1b9" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Inference" ], "metadata": { "id": "0GvBJABMx6vZ" } }, { "cell_type": "markdown", "source": [ "Let's run the model! You can change the instruction and input - leave the output blank!\n", "\n", "[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct here\n", "\n", "We use min_p = 0.1 and temperature = 1.5." 
], "metadata": { "id": "9mPSK3HHx9iV" } }, { "cell_type": "code", "source": [ "from unsloth.chat_templates import get_chat_template\n", "\n", "tokenizer = get_chat_template(\n", " tokenizer,\n", " chat_template = \"llama-3.1\",\n", ")\n", "FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n", "\n", "messages = [\n", " {\"role\": \"user\", \"content\": \"Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,\"},\n", "]\n", "inputs = tokenizer.apply_chat_template(\n", " messages,\n", " tokenize = True,\n", " add_generation_prompt = True, # Must add for generation\n", " return_tensors = \"pt\",\n", ").to(\"cuda\")\n", "\n", "outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,\n", " temperature = 1.5, min_p = 0.1)\n", "tokenizer.batch_decode(outputs)" ], "metadata": { "id": "TiffjpKBx3po" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!" 
], "metadata": { "id": "rtI2ij1byFi4" } }, { "cell_type": "code", "source": [ "FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n", "\n", "messages = [\n", " {\"role\": \"user\", \"content\": \"Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,\"},\n", "]\n", "inputs = tokenizer.apply_chat_template(\n", " messages,\n", " tokenize = True,\n", " add_generation_prompt = True, # Must add for generation\n", " return_tensors = \"pt\",\n", ").to(\"cuda\")\n", "\n", "from transformers import TextStreamer\n", "text_streamer = TextStreamer(tokenizer, skip_prompt = True)\n", "_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,\n", " use_cache = True, temperature = 1.5, min_p = 0.1)" ], "metadata": { "id": "Q6OVg-9PyEOF" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Saving, loading finetuned models" ], "metadata": { "id": "oBwfqD5eyKnT" } }, { "cell_type": "markdown", "source": [ "To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.\n", "\n", "[NOTE] This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!" 
], "metadata": { "id": "YuFKfJ6FyQRa" } }, { "cell_type": "code", "source": [ "model.save_pretrained(\"lora_model\") # Local saving\n", "tokenizer.save_pretrained(\"lora_model\")" ], "metadata": { "id": "u_tAwLZ9yJDC", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "fc067ca9-b9d7-4553-a6e0-8bca74124ebd" }, "execution_count": 19, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "('lora_model/tokenizer_config.json',\n", " 'lora_model/special_tokens_map.json',\n", " 'lora_model/chat_template.jinja',\n", " 'lora_model/tokenizer.json')" ] }, "metadata": {}, "execution_count": 19 } ] }, { "cell_type": "code", "source": [ "# Save the finetuned model to HuggingFace\n", "# Organization name: ft-lora\n", "\n", "# Online saving\n", "# model.push_to_hub(\"ft-lora/llama3.2-3b-instruct-finetuned\")\n", "# tokenizer.push_to_hub(\"ft-lora/llama3.2-3b-instruct-finetuned\")\n" ], "metadata": { "id": "6Y6dENUbZedJ" }, "execution_count": 20, "outputs": [] }, { "cell_type": "markdown", "source": [ "Now if you want to load the LoRA adapters we just saved for inference, set False to True:" ], "metadata": { "id": "DWdxaCN6yZLM" } }, { "cell_type": "code", "source": [ "if True:\n", " from unsloth import FastLanguageModel\n", " model, tokenizer = FastLanguageModel.from_pretrained(\n", " model_name = \"lora_model\", # the model used for training\n", " max_seq_length = max_seq_length,\n", " dtype = dtype,\n", " load_in_4bit = load_in_4bit,\n", " )\n", " FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n", "\n", "messages = [\n", " {\"role\": \"user\", \"content\": \"Describe a tall tower in the capital of France.\"},\n", "]\n", "inputs = tokenizer.apply_chat_template(\n", " messages,\n", " tokenize = True,\n", " add_generation_prompt = True, # Must add for generation\n", " return_tensors = \"pt\",\n", ").to(\"cuda\")\n", "\n", "from transformers import TextStreamer\n", "text_streamer = TextStreamer(tokenizer, skip_prompt = 
True)\n", "_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,\n", " use_cache = True, temperature = 1.5, min_p = 0.1)" ], "metadata": { "id": "xiVGwJpCyZsA" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### Saving to float16 for VLLM" ], "metadata": { "id": "YZjuI3SAygsU" } }, { "cell_type": "markdown", "source": [ "We also support saving to float16 directly. Select merged_16bit for float16 or merged_4bit for int4. We also allow lora adapters as a fallback. Use push_to_hub_merged to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens." ], "metadata": { "id": "PIr6PHZoykG_" } }, { "cell_type": "code", "source": [ "# Merge to 16bit, standard HuggingFace model\n", "# LoRA weights are merged into the base model\n", "# Needed for GGUF\n", "if True: model.save_pretrained_merged(\"model\", tokenizer, save_method = \"merged_16bit\",)\n", "if True: model.push_to_hub_merged(\"ft-lora/llama3.2-3b-instruct-finetuned\", tokenizer, save_method = \"merged_16bit\")\n", "\n", "# Merge to 4bit, better for CPU inference\n", "# if True: model.save_pretrained_merged(\"model\", tokenizer, save_method = \"merged_4bit_forced\",)\n", "# if True: model.push_to_hub_merged(\"ft-lora/llama3.2-3b-instruct-finetuned\", tokenizer, save_method = \"merged_4bit_forced\")\n", "\n", "# Just LoRA adapters\n", "# if False: model.save_pretrained_merged(\"model\", tokenizer, save_method = \"lora\",)\n", "# if False: model.push_to_hub_merged(\"hf/model\", tokenizer, save_method = \"lora\")" ], "metadata": { "id": "ApQZDyNRyiQa" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "## GGUF / llama.cpp Conversion" ], "metadata": { "id": "YqvuyKiiyqaa" } }, { "cell_type": "markdown", "source": [ "Switch to CPU here?" 
], "metadata": { "id": "oJ6bHyTLX2zX" } }, { "cell_type": "code", "source": [ "!git clone https://github.com/ggml-org/llama.cpp.git\n", "%cd llama.cpp" ], "metadata": { "id": "OouPlz8TRq1x", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "184453a3-a3e6-4ca6-ba6a-0ecb0c1fc5e0" }, "execution_count": 25, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Cloning into 'llama.cpp'...\n", "remote: Enumerating objects: 69802, done.\u001b[K\n", "remote: Counting objects: 100% (169/169), done.\u001b[K\n", "remote: Compressing objects: 100% (112/112), done.\u001b[K\n", "remote: Total 69802 (delta 125), reused 62 (delta 57), pack-reused 69633 (from 4)\u001b[K\n", "Receiving objects: 100% (69802/69802), 212.52 MiB | 16.06 MiB/s, done.\n", "Resolving deltas: 100% (50541/50541), done.\n", "/content/llama.cpp\n" ] } ] }, { "cell_type": "code", "source": [ "!pip install -r requirements.txt\n", "!pip install -U transformers huggingface_hub" ], "metadata": { "id": "5Rgi7oFXRxdW" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "%cd /content/llama.cpp" ], "metadata": { "id": "nlR9solITpqR" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "!ls" ], "metadata": { "id": "-jRTZhF2TrH4" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "ls -lh /content/llama.cpp/ft-lora/llama3.2-3b-instruct-finetuned\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_GIC5jnovzhx", "outputId": "34088bd6-1c73-4b77-fb9d-9376bf23fd65" }, "execution_count": 34, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "ls: cannot access '/content/llama.cpp/ft-lora/llama3.2-3b-instruct-finetuned': No such file or directory\n" ] } ] }, { "cell_type": "code", "source": [ "# Define directory to save merged model\n", "# merged_model_dir = \"/content/lora_model_merged\"\n", "\n", "# Save merged model + tokenizer\n", "# 
model.save_pretrained(merged_model_dir, tokenizer)\n", "# merged_model.save_pretrained(\"/content/merged_model\", tokenizer)\n", "\n", "# Convert to GGUF using llama.cpp script\n", "#!python convert_hf_to_gguf.py /content/llama.cpp/ft-lora/llama3.2-3b-instruct-finetuned \\\n", " #--outfile /content/llama3.2-3b-instruct-finetuned.gguf \\\n", " #--outtype auto\n", "\n", "# save_pretrained_merged(\"model\", ...) ran before `%cd llama.cpp`, i.e. from /content,\n", "# so the merged 16-bit checkpoint is in /content/model (not inside the llama.cpp clone).\n", "!python convert_hf_to_gguf.py /content/model \\\n", " --outfile /content/llama3.2-3b-instruct-finetuned.gguf \\\n", " --outtype auto\n", "\n", "\n", "#!python convert_hf_to_gguf.py /content/lora_model \\\n", " #--outfile /content/llama3.2-3b-instruct-finetuned.gguf \\\n", " #--outtype auto" ], "metadata": { "id": "ahUUYPyOR5Ua" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from huggingface_hub import HfApi\n", "\n", "api = HfApi()\n", "repo_id = \"ft-lora/llama3.2-3b-gguf-q4km\"\n", "api.create_repo(repo_id, repo_type=\"model\", exist_ok=True)\n", "\n", "api.upload_file(\n", " path_or_fileobj=\"/content/llama3.2-3b-instruct-finetuned.gguf\",\n", " path_in_repo=\"llama3.2-3b-instruct-finetuned.gguf\",\n", " repo_id=repo_id\n", ")" ], "metadata": { "id": "R7ca98iBSAHX" }, "execution_count": null, "outputs": [] } ] }