Ern You committed on
Commit 131e609 · 1 Parent(s): 6159095

initial commit

Files changed (4)
  1. default_config.yaml +25 -0
  2. train.ipynb +742 -0
  3. train.py +392 -0
  4. use_model.py +88 -0
default_config.yaml ADDED
@@ -0,0 +1,25 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: FSDP
4
+ downcast_bf16: 'no'
5
+ fsdp_config:
6
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
7
+ fsdp_backward_prefetch: BACKWARD_PRE
8
+ fsdp_cpu_ram_efficient_loading: true
9
+ fsdp_forward_prefetch: false
10
+ fsdp_offload_params: true
11
+ fsdp_sharding_strategy: FULL_SHARD
12
+ fsdp_state_dict_type: SHARDED_STATE_DICT
13
+ fsdp_sync_module_states: true
14
+ fsdp_use_orig_params: true
15
+ machine_rank: 0
16
+ main_training_function: train
17
+ mixed_precision: 'no'
18
+ num_machines: 1
19
+ num_processes: 8
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
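This config is the Accelerate launcher file for the training script: with it in place, training starts with `accelerate launch --config_file default_config.yaml train.py`. A minimal pre-launch sanity check is sketched below; it assumes PyYAML is installed, and the field names simply mirror the YAML above:

```python
# Sketch: sanity-check default_config.yaml before an FSDP launch.
# Assumes PyYAML is installed; field names mirror the YAML above.
import yaml
import torch

with open("default_config.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["distributed_type"] == "FSDP"
assert cfg["fsdp_config"]["fsdp_sharding_strategy"] == "FULL_SHARD"

# num_processes (8 here) should not exceed the GPUs visible on this machine.
visible = torch.cuda.device_count()
if cfg["num_processes"] > visible:
    print(f"Config requests {cfg['num_processes']} processes but only "
          f"{visible} GPU(s) are visible; lower num_processes.")
```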
train.ipynb ADDED
@@ -0,0 +1,742 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "6c066f8c-5b3b-486b-b958-76cc9d380146",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import os\n",
11
+ "os.environ[\"TORCH_USE_CUDA_DSA\"] = \"1\" # Enable CUDA device-side assertions for clearer error reports\n",
12
+ "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\""
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": null,
18
+ "id": "c20b0689-1347-47fb-a259-48ab1e1c1420",
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "import sys\n",
23
+ "!{sys.executable} -m pip install datasets transformers accelerate==0.30.0 peft flash-attn"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 12,
29
+ "id": "a4b1142a-c51e-4794-94d7-805e70fb308d",
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "0.45.5\n"
37
+ ]
38
+ }
39
+ ],
40
+ "source": [
41
+ "import bitsandbytes as bab\n",
42
+ "print(bab.__version__)"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 1,
48
+ "id": "b3900693-64f2-4f48-803a-196de8f616f7",
49
+ "metadata": {},
50
+ "outputs": [
51
+ {
52
+ "name": "stdout",
53
+ "output_type": "stream",
54
+ "text": [
55
+ "NVIDIA L4\n",
56
+ "True\n"
57
+ ]
58
+ }
59
+ ],
60
+ "source": [
61
+ "import torch\n",
62
+ "print(torch.cuda.get_device_name(0))\n",
63
+ "print(torch.cuda.is_bf16_supported())"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 2,
69
+ "id": "de679cdb-4fb6-4bd6-be66-6379c4131312",
70
+ "metadata": {},
71
+ "outputs": [
72
+ {
73
+ "name": "stdout",
74
+ "output_type": "stream",
75
+ "text": [
76
+ "(8, 9)\n"
77
+ ]
78
+ }
79
+ ],
80
+ "source": [
81
+ "print(torch.cuda.get_device_capability())"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": 2,
87
+ "id": "b0b4c7df-aa9e-40ed-9827-e5436e33168c",
88
+ "metadata": {},
89
+ "outputs": [
90
+ {
91
+ "name": "stdout",
92
+ "output_type": "stream",
93
+ "text": [
94
+ "Requirement already satisfied: bitsandbytes in /usr/local/lib/python3.10/dist-packages (0.45.5)\n",
95
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from bitsandbytes) (1.26.4)\n",
96
+ "Requirement already satisfied: torch<3,>=2.0 in /usr/local/lib/python3.10/dist-packages (from bitsandbytes) (2.6.0)\n",
97
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (10.3.5.147)\n",
98
+ "Requirement already satisfied: typing-extensions>=4.10.0 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (4.12.2)\n",
99
+ "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (0.6.2)\n",
100
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (12.4.127)\n",
101
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (2024.9.0)\n",
102
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (1.13.1)\n",
103
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (3.4.2)\n",
104
+ "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (12.4.5.8)\n",
105
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (12.4.127)\n",
106
+ "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (11.2.1.3)\n",
107
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (12.3.1.170)\n",
108
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (9.1.0.70)\n",
109
+ "Requirement already satisfied: triton==3.2.0 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (3.2.0)\n",
110
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (11.6.1.9)\n",
111
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (3.1.5)\n",
112
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (12.4.127)\n",
113
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (2.21.5)\n",
114
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (3.17.0)\n",
115
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (12.4.127)\n",
116
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /usr/local/lib/python3.10/dist-packages (from torch<3,>=2.0->bitsandbytes) (12.4.127)\n",
117
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch<3,>=2.0->bitsandbytes) (1.3.0)\n",
118
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch<3,>=2.0->bitsandbytes) (3.0.2)\n",
119
+ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
120
+ "\u001b[0m"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "import sys\n",
126
+ "!{sys.executable} -m pip install -U bitsandbytes "
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 13,
132
+ "id": "08edce1c-0326-4688-a557-5f80b33cb077",
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "device_map = (\n",
137
+ " int(os.environ.get(\"LOCAL_RANK\", -1))\n",
138
+ " if torch.distributed.is_available() and torch.distributed.is_initialized()\n",
139
+ " else \"auto\"\n",
140
+ ") # {\"\": 0}"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 14,
146
+ "id": "ca502d8f-a0e2-421c-b615-5bd232236fb2",
147
+ "metadata": {},
148
+ "outputs": [
149
+ {
150
+ "name": "stdout",
151
+ "output_type": "stream",
152
+ "text": [
153
+ "auto\n"
154
+ ]
155
+ }
156
+ ],
157
+ "source": [
158
+ "print(device_map)"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 1,
164
+ "id": "6d4c31bf-9087-40e2-8cf1-50d793a50cc4",
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "MODEL = \"bigcode/starcoderbase-1b\" # Model checkpoint on the Hugging Face Hub\n",
169
+ "DATASET = \"smangrul/hf-stack-v1\" # Dataset on the Hugging Face Hub\n",
170
+ "DATA_COLUMN = \"content\" # Column name containing the code content\n",
171
+ "\n",
172
+ "SEQ_LENGTH = 2048 # Sequence length\n",
173
+ "\n",
174
+ "MAX_STEPS = 2000 # max_steps\n",
175
+ "BATCH_SIZE = 1 # batch_size\n",
176
+ "GR_ACC_STEPS = 1 # gradient_accumulation_steps\n",
177
+ "LR = 5e-4 # learning_rate\n",
178
+ "LR_SCHEDULER_TYPE = \"cosine\" # lr_scheduler_type\n",
179
+ "WEIGHT_DECAY = 0.01 # weight_decay\n",
180
+ "NUM_WARMUP_STEPS = 30 # num_warmup_steps\n",
181
+ "EVAL_FREQ = 100 # eval_freq\n",
182
+ "SAVE_FREQ = 100 # save_freq\n",
183
+ "LOG_FREQ = 25 # log_freq\n",
184
+ "OUTPUT_DIR = \"peft-starcoder-lora-a100\" # output_dir\n",
185
+ "BF16 = True # bf16\n",
186
+ "FP16 = False # no_fp16\n",
187
+ "\n",
188
+ "# FIM transformation arguments\n",
189
+ "FIM_RATE = 0.5 # fim_rate\n",
190
+ "FIM_SPM_RATE = 0.5 # fim_spm_rate\n",
191
+ "\n",
192
+ "# LORA\n",
193
+ "LORA_R = 8 # lora_r\n",
194
+ "LORA_ALPHA = 32 # lora_alpha\n",
195
+ "LORA_DROPOUT = 0.0 # lora_dropout\n",
196
+ "LORA_TARGET_MODULES = \"c_proj,c_attn,q_attn,c_fc\" # lora_target_modules (c_proj was listed twice)\n",
197
+ "\n",
198
+ "# bitsandbytes config\n",
199
+ "USE_NESTED_QUANT = True # use_nested_quant\n",
200
+ "BNB_4BIT_COMPUTE_DTYPE = \"bfloat16\" # bnb_4bit_compute_dtype\n",
201
+ "\n",
202
+ "SEED = 0"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 2,
208
+ "id": "2e60f3f7-c90f-41ec-91d3-98b6532e9446",
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "from huggingface_hub import login\n",
213
+ "from transformers import (\n",
214
+ " AutoModelForCausalLM,\n",
215
+ " AutoTokenizer,\n",
216
+ " Trainer,\n",
217
+ " TrainingArguments,\n",
218
+ " logging,\n",
219
+ " set_seed,\n",
220
+ " BitsAndBytesConfig,\n",
221
+ ")\n",
222
+ "\n",
223
+ "from datasets import load_dataset\n",
224
+ "import torch\n",
225
+ "from tqdm import tqdm\n",
226
+ "\n",
227
+ "#Prepare Data\n",
228
+ "dataset = load_dataset(\n",
229
+ " DATASET,\n",
230
+ " data_dir=\"data\",\n",
231
+ " split=\"train\",\n",
232
+ " streaming=True,\n",
233
+ ")\n",
234
+ "\n",
235
+ "valid_data = dataset.take(4000)\n",
236
+ "train_data = dataset.skip(4000)\n",
237
+ "train_data = train_data.shuffle(buffer_size=5000, seed=SEED)\n",
238
+ "\n",
239
+ "set_seed(SEED)"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 5,
245
+ "id": "88201294-50c4-44b0-9209-1873feba0dae",
246
+ "metadata": {},
247
+ "outputs": [
248
+ {
249
+ "name": "stderr",
250
+ "output_type": "stream",
251
+ "text": [
252
+ "/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
253
+ " warnings.warn(\n",
254
+ "100%|██████████| 400/400 [00:03<00:00, 109.96it/s]"
255
+ ]
256
+ },
257
+ {
258
+ "name": "stdout",
259
+ "output_type": "stream",
260
+ "text": [
261
+ "The character to token ratio of the dataset is: 2.43\n"
262
+ ]
263
+ },
264
+ {
265
+ "name": "stderr",
266
+ "output_type": "stream",
267
+ "text": [
268
+ "\n"
269
+ ]
270
+ }
271
+ ],
272
+ "source": [
273
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)\n",
274
+ "\n",
275
+ "\n",
276
+ "def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):\n",
277
+ " \"\"\"\n",
278
+ " Estimate the average number of characters per token in the dataset.\n",
279
+ " \"\"\"\n",
280
+ "\n",
281
+ " total_characters, total_tokens = 0, 0\n",
282
+ " for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):\n",
283
+ " total_characters += len(example[data_column])\n",
284
+ " total_tokens += len(tokenizer(example[data_column]).tokens())\n",
285
+ "\n",
286
+ " return total_characters / total_tokens\n",
287
+ "\n",
288
+ "\n",
289
+ "chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)\n",
290
+ "print(f\"The character to token ratio of the dataset is: {chars_per_token:.2f}\")"
291
+ ]
292
+ },
293
+ {
294
+ "cell_type": "code",
295
+ "execution_count": 6,
296
+ "id": "eee2c81e-f0df-435e-b4b9-e2a1d4f8c853",
297
+ "metadata": {},
298
+ "outputs": [],
299
+ "source": [
300
+ "import functools\n",
301
+ "import numpy as np\n",
302
+ "\n",
303
+ "\n",
304
+ "# Helper function to get token ids of the special tokens for prefix, suffix and middle for FIM transformations.\n",
305
+ "@functools.lru_cache(maxsize=None)\n",
306
+ "def get_fim_token_ids(tokenizer):\n",
307
+ " try:\n",
308
+ " FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[\"additional_special_tokens\"][1:5]\n",
309
+ " suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (\n",
310
+ " tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]\n",
311
+ " )\n",
312
+ " except KeyError:\n",
313
+ " suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = None, None, None, None\n",
314
+ " return suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id\n",
315
+ "\n",
316
+ "\n",
317
+ "## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py\n",
318
+ "def permute(\n",
319
+ " sample,\n",
320
+ " np_rng,\n",
321
+ " suffix_tok_id,\n",
322
+ " prefix_tok_id,\n",
323
+ " middle_tok_id,\n",
324
+ " pad_tok_id,\n",
325
+ " fim_rate=0.5,\n",
326
+ " fim_spm_rate=0.5,\n",
327
+ " truncate_or_pad=False,\n",
328
+ "):\n",
329
+ " \"\"\"\n",
330
+ " Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:\n",
331
+ " PSM and SPM (with a probability of fim_spm_rate).\n",
332
+ " \"\"\"\n",
333
+ "\n",
334
+ " # The if condition will trigger with the probability of fim_rate\n",
335
+ " # This means FIM transformations will apply to samples with a probability of fim_rate\n",
336
+ " if np_rng.binomial(1, fim_rate):\n",
337
+ "\n",
338
+ " # Split the sample into prefix, middle, and suffix, based on randomly generated indices stored in the boundaries list.\n",
339
+ " boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))\n",
340
+ " boundaries.sort()\n",
341
+ "\n",
342
+ " prefix = np.array(sample[: boundaries[0]], dtype=np.int64)\n",
343
+ " middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)\n",
344
+ " suffix = np.array(sample[boundaries[1] :], dtype=np.int64)\n",
345
+ "\n",
346
+ " if truncate_or_pad:\n",
347
+ " # calculate the new total length of the sample, taking into account tokens indicating prefix, middle, and suffix\n",
348
+ " new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3\n",
349
+ " diff = new_length - len(sample)\n",
350
+ "\n",
351
+ "            # truncate or pad if there's a difference in length between the new length and the original\n",
352
+ " if diff > 0:\n",
353
+ " if suffix.shape[0] <= diff:\n",
354
+ " return sample, np_rng\n",
355
+ " suffix = suffix[: suffix.shape[0] - diff]\n",
356
+ " elif diff < 0:\n",
357
+ " suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])\n",
358
+ "\n",
359
+ "        # With the probability of fim_spm_rate, apply the SPM variant of FIM transformations\n",
360
+ " # SPM: suffix, prefix, middle\n",
361
+ " if np_rng.binomial(1, fim_spm_rate):\n",
362
+ " new_sample = np.concatenate(\n",
363
+ " [\n",
364
+ " [prefix_tok_id, suffix_tok_id],\n",
365
+ " suffix,\n",
366
+ " [middle_tok_id],\n",
367
+ " prefix,\n",
368
+ " middle,\n",
369
+ " ]\n",
370
+ " )\n",
371
+ " # Otherwise, apply the PSM variant of FIM transformations\n",
372
+ " # PSM: prefix, suffix, middle\n",
373
+ " else:\n",
374
+ "\n",
375
+ " new_sample = np.concatenate(\n",
376
+ " [\n",
377
+ " [prefix_tok_id],\n",
378
+ " prefix,\n",
379
+ " [suffix_tok_id],\n",
380
+ " suffix,\n",
381
+ " [middle_tok_id],\n",
382
+ " middle,\n",
383
+ " ]\n",
384
+ " )\n",
385
+ " else:\n",
386
+ " # don't apply FIM transformations\n",
387
+ " new_sample = sample\n",
388
+ "\n",
389
+ " return list(new_sample), np_rng"
390
+ ]
391
+ },
392
+ {
393
+ "cell_type": "code",
394
+ "execution_count": 7,
395
+ "id": "3a9ebdef-4178-44af-9b35-12c8189c27f7",
396
+ "metadata": {},
397
+ "outputs": [],
398
+ "source": [
399
+ "from torch.utils.data import IterableDataset\n",
400
+ "from torch.utils.data.dataloader import DataLoader\n",
401
+ "import random\n",
402
+ "\n",
403
+ "# Create an Iterable dataset that returns constant-length chunks of tokens from a stream of text files.\n",
404
+ "\n",
405
+ "\n",
406
+ "class ConstantLengthDataset(IterableDataset):\n",
407
+ " \"\"\"\n",
408
+ "    Iterable dataset that returns constant-length chunks of tokens from a stream of text files.\n",
409
+ " Args:\n",
410
+ "        tokenizer (Tokenizer): The processor used for processing the data.\n",
411
+ " dataset (dataset.Dataset): Dataset with text files.\n",
412
+ "        infinite (bool): If True, the iterator restarts when the dataset is exhausted; otherwise iteration stops.\n",
413
+ " seq_length (int): Length of token sequences to return.\n",
414
+ " num_of_sequences (int): Number of token sequences to keep in buffer.\n",
415
+ "        chars_per_token (float): Number of characters per token used to estimate number of tokens in text buffer.\n",
416
+ " fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.\n",
417
+ "        fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permutations that will use SPM.\n",
418
+ " seed (int): Seed for random number generator.\n",
419
+ " \"\"\"\n",
420
+ "\n",
421
+ " def __init__(\n",
422
+ " self,\n",
423
+ " tokenizer,\n",
424
+ " dataset,\n",
425
+ " infinite=False,\n",
426
+ " seq_length=1024,\n",
427
+ " num_of_sequences=1024,\n",
428
+ " chars_per_token=3.6,\n",
429
+ " content_field=\"content\",\n",
430
+ " fim_rate=0.5,\n",
431
+ " fim_spm_rate=0.5,\n",
432
+ " seed=0,\n",
433
+ " ):\n",
434
+ " self.tokenizer = tokenizer\n",
435
+ " self.concat_token_id = tokenizer.eos_token_id\n",
436
+ " self.dataset = dataset\n",
437
+ " self.seq_length = seq_length\n",
438
+ " self.infinite = infinite\n",
439
+ " self.current_size = 0\n",
440
+ " self.max_buffer_size = seq_length * chars_per_token * num_of_sequences\n",
441
+ " self.content_field = content_field\n",
442
+ " self.fim_rate = fim_rate\n",
443
+ " self.fim_spm_rate = fim_spm_rate\n",
444
+ " self.seed = seed\n",
445
+ "\n",
446
+ " (\n",
447
+ " self.suffix_tok_id,\n",
448
+ " self.prefix_tok_id,\n",
449
+ " self.middle_tok_id,\n",
450
+ " self.pad_tok_id,\n",
451
+ " ) = get_fim_token_ids(self.tokenizer)\n",
452
+ " if not self.suffix_tok_id and self.fim_rate > 0:\n",
453
+ " print(\"FIM is not supported by tokenizer, disabling FIM\")\n",
454
+ " self.fim_rate = 0\n",
455
+ "\n",
456
+ " def __iter__(self):\n",
457
+ " iterator = iter(self.dataset)\n",
458
+ " more_examples = True\n",
459
+ " np_rng = np.random.RandomState(seed=self.seed)\n",
460
+ " while more_examples:\n",
461
+ " buffer, buffer_len = [], 0\n",
462
+ " while True:\n",
463
+ " if buffer_len >= self.max_buffer_size:\n",
464
+ " break\n",
465
+ " try:\n",
466
+ " buffer.append(next(iterator)[self.content_field])\n",
467
+ " buffer_len += len(buffer[-1])\n",
468
+ " except StopIteration:\n",
469
+ " if self.infinite:\n",
470
+ " iterator = iter(self.dataset)\n",
471
+ " else:\n",
472
+ " more_examples = False\n",
473
+ " break\n",
474
+ " tokenized_inputs = self.tokenizer(buffer, truncation=False)[\"input_ids\"]\n",
475
+ " all_token_ids = []\n",
476
+ "\n",
477
+ " for tokenized_input in tokenized_inputs:\n",
478
+ " # optionally do FIM permutations\n",
479
+ " if self.fim_rate > 0:\n",
480
+ " tokenized_input, np_rng = permute(\n",
481
+ " tokenized_input,\n",
482
+ " np_rng,\n",
483
+ " self.suffix_tok_id,\n",
484
+ " self.prefix_tok_id,\n",
485
+ " self.middle_tok_id,\n",
486
+ " self.pad_tok_id,\n",
487
+ " fim_rate=self.fim_rate,\n",
488
+ " fim_spm_rate=self.fim_spm_rate,\n",
489
+ " truncate_or_pad=False,\n",
490
+ " )\n",
491
+ "\n",
492
+ " all_token_ids.extend(tokenized_input + [self.concat_token_id])\n",
493
+ " examples = []\n",
494
+ " for i in range(0, len(all_token_ids), self.seq_length):\n",
495
+ " input_ids = all_token_ids[i : i + self.seq_length]\n",
496
+ " if len(input_ids) == self.seq_length:\n",
497
+ " examples.append(input_ids)\n",
498
+ " random.shuffle(examples)\n",
499
+ " for example in examples:\n",
500
+ " self.current_size += 1\n",
501
+ " yield {\n",
502
+ " \"input_ids\": torch.LongTensor(example),\n",
503
+ " \"labels\": torch.LongTensor(example),\n",
504
+ " }\n",
505
+ "\n",
506
+ "\n",
507
+ "train_dataset = ConstantLengthDataset(\n",
508
+ " tokenizer,\n",
509
+ " train_data,\n",
510
+ " infinite=True,\n",
511
+ " seq_length=SEQ_LENGTH,\n",
512
+ " chars_per_token=chars_per_token,\n",
513
+ " content_field=DATA_COLUMN,\n",
514
+ " fim_rate=FIM_RATE,\n",
515
+ " fim_spm_rate=FIM_SPM_RATE,\n",
516
+ " seed=SEED,\n",
517
+ ")\n",
518
+ "eval_dataset = ConstantLengthDataset(\n",
519
+ " tokenizer,\n",
520
+ " valid_data,\n",
521
+ " infinite=False,\n",
522
+ " seq_length=SEQ_LENGTH,\n",
523
+ " chars_per_token=chars_per_token,\n",
524
+ " content_field=DATA_COLUMN,\n",
525
+ " fim_rate=FIM_RATE,\n",
526
+ " fim_spm_rate=FIM_SPM_RATE,\n",
527
+ " seed=SEED,\n",
528
+ ")"
529
+ ]
530
+ },
531
+ {
532
+ "cell_type": "code",
533
+ "execution_count": 8,
534
+ "id": "5021e686-2e1c-4608-9477-1f07adf2de35",
535
+ "metadata": {},
536
+ "outputs": [
537
+ {
538
+ "data": {
539
+ "application/vnd.jupyter.widget-view+json": {
540
+ "model_id": "e4ea2b63e63448938020858c06143ca5",
541
+ "version_major": 2,
542
+ "version_minor": 0
543
+ },
544
+ "text/plain": [
545
+ "model.safetensors: 0%| | 0.00/4.55G [00:00<?, ?B/s]"
546
+ ]
547
+ },
548
+ "metadata": {},
549
+ "output_type": "display_data"
550
+ },
551
+ {
552
+ "ename": "ImportError",
553
+ "evalue": "FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.",
554
+ "output_type": "error",
555
+ "traceback": [
556
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
557
+ "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
558
+ "Cell \u001b[0;32mIn[8], line 26\u001b[0m\n\u001b[1;32m 17\u001b[0m bnb_config \u001b[38;5;241m=\u001b[39m BitsAndBytesConfig(\n\u001b[1;32m 18\u001b[0m load_in_4bit\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 19\u001b[0m bnb_4bit_quant_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnf4\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 20\u001b[0m bnb_4bit_compute_dtype\u001b[38;5;241m=\u001b[39mcompute_dtype,\n\u001b[1;32m 21\u001b[0m bnb_4bit_use_double_quant\u001b[38;5;241m=\u001b[39mUSE_NESTED_QUANT,\n\u001b[1;32m 22\u001b[0m )\n\u001b[1;32m 24\u001b[0m device_map \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m0\u001b[39m}\n\u001b[0;32m---> 26\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mAutoModelForCausalLM\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[43m \u001b[49m\u001b[43mMODEL\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 28\u001b[0m \u001b[43m \u001b[49m\u001b[43mquantization_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbnb_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 29\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 30\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# We will be using gradient checkpointing\u001b[39;49;00m\n\u001b[1;32m 31\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrust_remote_code\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 32\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_implementation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mflash_attention_2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 33\u001b[0m \u001b[43m)\u001b[49m\n",
559
+ "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/models/auto/auto_factory.py:564\u001b[0m, in \u001b[0;36m_BaseAutoModelClass.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 562\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(config) \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys():\n\u001b[1;32m 563\u001b[0m model_class \u001b[38;5;241m=\u001b[39m _get_model_class(config, \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping)\n\u001b[0;32m--> 564\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmodel_class\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_pretrained\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 565\u001b[0m \u001b[43m \u001b[49m\u001b[43mpretrained_model_name_or_path\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_args\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mhub_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 566\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 567\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 568\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnrecognized configuration class \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconfig\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for this kind of AutoModel: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 569\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mModel type should be one of \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(c\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mfor\u001b[39;00m\u001b[38;5;250m \u001b[39mc\u001b[38;5;250m \u001b[39m\u001b[38;5;129;01min\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_model_mapping\u001b[38;5;241m.\u001b[39mkeys())\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 570\u001b[0m )\n",
560
+ "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:3804\u001b[0m, in \u001b[0;36mPreTrainedModel.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 3801\u001b[0m init_contexts\u001b[38;5;241m.\u001b[39mappend(init_empty_weights())\n\u001b[1;32m 3803\u001b[0m config \u001b[38;5;241m=\u001b[39m copy\u001b[38;5;241m.\u001b[39mdeepcopy(config) \u001b[38;5;66;03m# We do not want to modify the config inplace in from_pretrained.\u001b[39;00m\n\u001b[0;32m-> 3804\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_autoset_attn_implementation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 3805\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_flash_attention_2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_flash_attention_2\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\n\u001b[1;32m 3806\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3808\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ContextManagers(init_contexts):\n\u001b[1;32m 3809\u001b[0m \u001b[38;5;66;03m# Let's make sure we don't run the init function of buffer modules\u001b[39;00m\n\u001b[1;32m 3810\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m(config, \u001b[38;5;241m*\u001b[39mmodel_args, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs)\n",
561
+ "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:1534\u001b[0m, in \u001b[0;36mPreTrainedModel._autoset_attn_implementation\u001b[0;34m(cls, config, use_flash_attention_2, torch_dtype, device_map, check_device_map)\u001b[0m\n\u001b[1;32m 1531\u001b[0m config\u001b[38;5;241m.\u001b[39m_attn_implementation \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1533\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m config\u001b[38;5;241m.\u001b[39m_attn_implementation \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attention_2\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m-> 1534\u001b[0m \u001b[38;5;28;43mcls\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_check_and_enable_flash_attn_2\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1535\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1536\u001b[0m \u001b[43m \u001b[49m\u001b[43mtorch_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtorch_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1537\u001b[0m \u001b[43m \u001b[49m\u001b[43mdevice_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdevice_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1538\u001b[0m \u001b[43m \u001b[49m\u001b[43mhard_check_only\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1539\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheck_device_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheck_device_map\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1540\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1541\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m requested_attn_implementation \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;28;01mNone\u001b[39;00m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msdpa\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available():\n\u001b[1;32m 1542\u001b[0m \u001b[38;5;66;03m# use_flash_attention_2 takes priority over SDPA, hence SDPA treated in this elif.\u001b[39;00m\n\u001b[1;32m 1543\u001b[0m config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcls\u001b[39m\u001b[38;5;241m.\u001b[39m_check_and_enable_sdpa(\n\u001b[1;32m 1544\u001b[0m config,\n\u001b[1;32m 1545\u001b[0m hard_check_only\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m requested_attn_implementation \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 1546\u001b[0m )\n",
562
+ "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:1636\u001b[0m, in \u001b[0;36mPreTrainedModel._check_and_enable_flash_attn_2\u001b[0;34m(cls, config, torch_dtype, device_map, check_device_map, hard_check_only)\u001b[0m\n\u001b[1;32m 1633\u001b[0m install_message \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1635\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m importlib\u001b[38;5;241m.\u001b[39mutil\u001b[38;5;241m.\u001b[39mfind_spec(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attn\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1636\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpreface\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m the package flash_attn seems to be not installed. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minstall_message\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 1638\u001b[0m flash_attention_version \u001b[38;5;241m=\u001b[39m version\u001b[38;5;241m.\u001b[39mparse(importlib\u001b[38;5;241m.\u001b[39mmetadata\u001b[38;5;241m.\u001b[39mversion(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mflash_attn\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 1639\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mversion\u001b[38;5;241m.\u001b[39mcuda:\n",
563
+ "\u001b[0;31mImportError\u001b[0m: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2."
564
+ ]
565
+ }
566
+ ],
567
+ "source": [
568
+ "import torch\n",
569
+ "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n",
570
+ "from peft.tuners.lora import LoraLayer\n",
571
+ "\n",
572
+ "load_in_8bit = False\n",
573
+ "\n",
574
+ "# 4-bit quantization\n",
575
+ "compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)\n",
576
+ "\n",
577
+ "bnb_config2 = BitsAndBytesConfig(\n",
578
+ " load_in_4bit=True,\n",
579
+ " bnb_4bit_use_double_quant=True,\n",
580
+ " bnb_4bit_quant_type=\"nf4\",\n",
581
+ " bnb_4bit_compute_dtype=torch.bfloat16\n",
582
+ ")\n",
583
+ "\n",
584
+ "bnb_config = BitsAndBytesConfig(\n",
585
+ " load_in_4bit=True,\n",
586
+ " bnb_4bit_quant_type=\"nf4\",\n",
587
+ " bnb_4bit_compute_dtype=compute_dtype,\n",
588
+ " bnb_4bit_use_double_quant=USE_NESTED_QUANT,\n",
589
+ ")\n",
590
+ "\n",
591
+ "device_map = {\"\": 0}\n",
592
+ "\n",
593
+ "model = AutoModelForCausalLM.from_pretrained(\n",
594
+ " MODEL,\n",
595
+ " quantization_config=bnb_config,\n",
596
+ " device_map=device_map,\n",
597
+ " use_cache=False, # We will be using gradient checkpointing\n",
598
+ " trust_remote_code=True,\n",
599
+ " attn_implementation=\"flash_attention_2\",\n",
600
+ ")"
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "code",
605
+ "execution_count": null,
606
+ "id": "7c0a9fdb-4087-4ec5-aefa-fc5f413252e4",
607
+ "metadata": {},
608
+ "outputs": [],
609
+ "source": [
610
+ "model = prepare_model_for_kbit_training(model)"
611
+ ]
612
+ },
613
+ {
614
+ "cell_type": "code",
615
+ "execution_count": null,
616
+ "id": "b5fd3293-76ef-48eb-8241-478e311ec947",
617
+ "metadata": {},
618
+ "outputs": [],
619
+ "source": [
620
+ "# Set up lora\n",
621
+ "peft_config = LoraConfig(\n",
622
+ " lora_alpha=LORA_ALPHA,\n",
623
+ " lora_dropout=LORA_DROPOUT,\n",
624
+ " r=LORA_R,\n",
625
+ " bias=\"none\",\n",
626
+ " task_type=\"CAUSAL_LM\",\n",
627
+ " target_modules=LORA_TARGET_MODULES.split(\",\"),\n",
628
+ ")\n",
629
+ "\n",
630
+ "model = get_peft_model(model, peft_config)\n",
631
+ "model.print_trainable_parameters()"
632
+ ]
633
+ },
634
+ {
635
+ "cell_type": "code",
636
+ "execution_count": null,
637
+ "id": "082c6a7b-db61-4800-8a94-419331b1fd22",
638
+ "metadata": {},
639
+ "outputs": [],
640
+ "source": [
641
+ "train_data.start_iteration = 0\n",
642
+ "\n",
643
+ "\n",
644
+ "training_args = TrainingArguments(\n",
645
+ " output_dir=f\"ernyou/{OUTPUT_DIR}\",\n",
646
+ " dataloader_drop_last=True,\n",
647
+ " eval_strategy=\"steps\",\n",
648
+ " save_strategy=\"steps\",\n",
649
+ " max_steps=MAX_STEPS,\n",
650
+ " eval_steps=EVAL_FREQ,\n",
651
+ " save_steps=SAVE_FREQ,\n",
652
+ " logging_steps=LOG_FREQ,\n",
653
+ " per_device_train_batch_size=BATCH_SIZE,\n",
654
+ " per_device_eval_batch_size=BATCH_SIZE,\n",
655
+ " learning_rate=LR,\n",
656
+ " lr_scheduler_type=LR_SCHEDULER_TYPE,\n",
657
+ " warmup_steps=NUM_WARMUP_STEPS,\n",
658
+ " gradient_accumulation_steps=GR_ACC_STEPS,\n",
659
+ " gradient_checkpointing_kwargs={\"use_reentrant\": True},\n",
660
+ " gradient_checkpointing=True,\n",
661
+ " fp16=FP16,\n",
662
+ " bf16=BF16,\n",
663
+ " weight_decay=WEIGHT_DECAY,\n",
664
+ " push_to_hub=True,\n",
665
+ " include_tokens_per_second=True,\n",
666
+ ")"
667
+ ]
668
+ },
669
+ {
670
+ "cell_type": "code",
671
+ "execution_count": 10,
672
+ "id": "2c302ded-f017-433c-9622-55ecb45141bd",
673
+ "metadata": {},
674
+ "outputs": [
675
+ {
676
+ "name": "stdout",
677
+ "output_type": "stream",
678
+ "text": [
679
+ "1.3.0\n"
680
+ ]
681
+ }
682
+ ],
683
+ "source": [
684
+ "import accelerate as ac\n",
685
+ "print(ac.__version__)"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": null,
691
+ "id": "5318efa4-83da-41fa-9123-50b505e9a615",
692
+ "metadata": {},
693
+ "outputs": [],
694
+ "source": [
695
+ "import torch\n",
696
+ "torch.cuda.empty_cache()"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": null,
702
+ "id": "ceddceb0-8e1e-493f-99a1-b77c6b0c40b6",
703
+ "metadata": {},
704
+ "outputs": [],
705
+ "source": [
706
+ "trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)\n",
707
+ "\n",
708
+ "print(\"Training...\")\n",
709
+ "trainer.train()"
710
+ ]
711
+ },
712
+ {
713
+ "cell_type": "code",
714
+ "execution_count": null,
715
+ "id": "76fb1530-5e16-4b4b-a30f-b689df3483f3",
716
+ "metadata": {},
717
+ "outputs": [],
718
+ "source": []
719
+ }
720
+ ],
721
+ "metadata": {
722
+ "kernelspec": {
723
+ "display_name": "Python 3 (ipykernel)",
724
+ "language": "python",
725
+ "name": "python3"
726
+ },
727
+ "language_info": {
728
+ "codemirror_mode": {
729
+ "name": "ipython",
730
+ "version": 3
731
+ },
732
+ "file_extension": ".py",
733
+ "mimetype": "text/x-python",
734
+ "name": "python",
735
+ "nbconvert_exporter": "python",
736
+ "pygments_lexer": "ipython3",
737
+ "version": "3.10.12"
738
+ }
739
+ },
740
+ "nbformat": 4,
741
+ "nbformat_minor": 5
742
+ }
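To see concretely what the notebook's `permute` does, the toy rerun below reproduces its PSM/SPM rearrangement on a ten-token sample. The sentinel ids 900-902 are made up for illustration; in the real pipeline they come from the tokenizer's additional special tokens via `get_fim_token_ids`:

```python
# Toy illustration of the PSM/SPM fill-in-the-middle rearrangement.
# Sentinel ids 900-902 are placeholders, not StarCoder's real token ids.
import numpy as np

sample = list(range(10))              # pretend token ids 0..9
np_rng = np.random.RandomState(0)
prefix_id, suffix_id, middle_id = 900, 901, 902

lo, hi = sorted(np_rng.randint(0, len(sample) + 1, size=2))
prefix, middle, suffix = sample[:lo], sample[lo:hi], sample[hi:]

# PSM: <prefix> prefix <suffix> suffix <middle> middle
psm = [prefix_id] + prefix + [suffix_id] + suffix + [middle_id] + middle
# SPM: <prefix> <suffix> suffix <middle> prefix middle
spm = [prefix_id, suffix_id] + suffix + [middle_id] + prefix + middle

print("boundaries:", (lo, hi))
print("PSM:", psm)
print("SPM:", spm)
```

The model learns to emit the middle span after seeing prefix and suffix, which is what powers infilling at inference time (the `<fim_prefix>…<fim_suffix>…<fim_middle>` prompt in use_model.py).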
train.py ADDED
@@ -0,0 +1,392 @@
1
+ MODEL = "bigcode/starcoderbase-1b" # Model checkpoint on the Hugging Face Hub
2
+ DATASET = "smangrul/hf-stack-v1" # Dataset on the Hugging Face Hub
3
+ DATA_COLUMN = "content" # Column name containing the code content
4
+
5
+ SEQ_LENGTH = 2048 # Sequence length
6
+
7
+ MAX_STEPS = 2000 # max_steps
8
+ BATCH_SIZE = 8 # batch_size
9
+ GR_ACC_STEPS = 1 # gradient_accumulation_steps
10
+ LR = 5e-4 # learning_rate
11
+ LR_SCHEDULER_TYPE = "cosine" # lr_scheduler_type
12
+ WEIGHT_DECAY = 0.01 # weight_decay
13
+ NUM_WARMUP_STEPS = 30 # num_warmup_steps
14
+ EVAL_FREQ = 100 # eval_freq
15
+ SAVE_FREQ = 100 # save_freq
16
+ LOG_FREQ = 25 # log_freq
17
+ OUTPUT_DIR = "peft-starcoder-lora-a100" # output_dir
18
+ BF16 = False # bf16
19
+ FP16 = False # no_fp16
20
+
21
+ # FIM transformation arguments
22
+ FIM_RATE = 0.5 # fim_rate
23
+ FIM_SPM_RATE = 0.5 # fim_spm_rate
24
+
25
+ # LORA
26
+ LORA_R = 8 # lora_r
27
+ LORA_ALPHA = 32 # lora_alpha
28
+ LORA_DROPOUT = 0.0 # lora_dropout
29
+ LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc" # lora_target_modules (c_proj was listed twice)
30
+
31
+ # bitsandbytes config
32
+ #USE_NESTED_QUANT = True # use_nested_quant
33
+ #BNB_4BIT_COMPUTE_DTYPE = "bfloat16" # bnb_4bit_compute_dtype
34
+
35
+ SEED = 0
36
+
37
+ from huggingface_hub import login
38
+ from transformers import (
39
+ AutoModelForCausalLM,
40
+ AutoTokenizer,
41
+ Trainer,
42
+ TrainingArguments,
43
+ logging,
44
+ set_seed,
45
+ BitsAndBytesConfig,
46
+ )
47
+
48
+ from datasets import load_dataset
49
+ import torch
50
+ from tqdm import tqdm
51
+
52
+ #Prepare Data
53
+ dataset = load_dataset(
54
+ DATASET,
55
+ data_dir="data",
56
+ split="train",
57
+ streaming=True,
58
+ )
59
+
60
+ valid_data = dataset.take(4000)
61
+ train_data = dataset.skip(4000)
62
+ train_data = train_data.shuffle(buffer_size=5000, seed=SEED)
63
+
64
+ set_seed(SEED)
65
+
66
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
67
+
68
+
69
+ def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
70
+ """
71
+ Estimate the average number of characters per token in the dataset.
72
+ """
73
+
74
+ total_characters, total_tokens = 0, 0
75
+ for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
76
+ total_characters += len(example[data_column])
77
+ total_tokens += len(tokenizer(example[data_column]).tokens())
78
+
79
+ return total_characters / total_tokens
80
+
81
+
82
+ chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
83
+ print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
84
+
85
+ import functools
86
+ import numpy as np
87
+
88
+
89
+ # Helper function to get token ids of the special tokens for prefix, suffix and middle for FIM transformations.
90
+ @functools.lru_cache(maxsize=None)
91
+ def get_fim_token_ids(tokenizer):
92
+ try:
93
+ FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map["additional_special_tokens"][1:5]
94
+ suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
95
+ tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
96
+ )
97
+ except KeyError:
98
+ suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = None, None, None, None
99
+ return suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id
100
+
101
+
102
+ ## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
103
+ def permute(
104
+ sample,
105
+ np_rng,
106
+ suffix_tok_id,
107
+ prefix_tok_id,
108
+ middle_tok_id,
109
+ pad_tok_id,
110
+ fim_rate=0.5,
111
+ fim_spm_rate=0.5,
112
+ truncate_or_pad=False,
113
+ ):
114
+ """
115
+ Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
116
+ PSM and SPM (with a probability of fim_spm_rate).
117
+ """
118
+
119
+ # The if condition will trigger with the probability of fim_rate
120
+ # This means FIM transformations will apply to samples with a probability of fim_rate
121
+ if np_rng.binomial(1, fim_rate):
122
+
123
+ # Split the sample into prefix, middle, and suffix, based on randomly generated indices stored in the boundaries list.
124
+ boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
125
+ boundaries.sort()
126
+
127
+ prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
128
+ middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
129
+ suffix = np.array(sample[boundaries[1] :], dtype=np.int64)
130
+
131
+ if truncate_or_pad:
132
+ # calculate the new total length of the sample, taking into account tokens indicating prefix, middle, and suffix
133
+ new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
134
+ diff = new_length - len(sample)
135
+
136
+ # truncate or pad if there's a difference in length between the new length and the original
137
+ if diff > 0:
138
+ if suffix.shape[0] <= diff:
139
+ return sample, np_rng
140
+ suffix = suffix[: suffix.shape[0] - diff]
141
+ elif diff < 0:
142
+ suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])
143
+
144
+ # With the probability of fim_spm_rate, apply the SPM variant of FIM transformations
145
+ # SPM: suffix, prefix, middle
146
+ if np_rng.binomial(1, fim_spm_rate):
147
+ new_sample = np.concatenate(
148
+ [
149
+ [prefix_tok_id, suffix_tok_id],
150
+ suffix,
151
+ [middle_tok_id],
152
+ prefix,
153
+ middle,
154
+ ]
155
+ )
156
+ # Otherwise, apply the PSM variant of FIM transformations
157
+ # PSM: prefix, suffix, middle
158
+ else:
159
+
160
+ new_sample = np.concatenate(
161
+ [
162
+ [prefix_tok_id],
163
+ prefix,
164
+ [suffix_tok_id],
165
+ suffix,
166
+ [middle_tok_id],
167
+ middle,
168
+ ]
169
+ )
170
+ else:
171
+ # don't apply FIM transformations
172
+ new_sample = sample
173
+
174
+ return list(new_sample), np_rng
175
+
176
+ from torch.utils.data import IterableDataset
177
+ from torch.utils.data.dataloader import DataLoader
178
+ import random
179
+
180
+ # Create an Iterable dataset that returns constant-length chunks of tokens from a stream of text files.
181
+
182
+
183
+ class ConstantLengthDataset(IterableDataset):
184
+ """
185
+ Iterable dataset that returns constant-length chunks of tokens from a stream of text files.
186
+ Args:
187
+ tokenizer (Tokenizer): The processor used for processing the data.
188
+ dataset (dataset.Dataset): Dataset with text files.
189
+ infinite (bool): If True, the iterator restarts when the dataset is exhausted; otherwise iteration stops.
190
+ seq_length (int): Length of token sequences to return.
191
+ num_of_sequences (int): Number of token sequences to keep in buffer.
192
+ chars_per_token (float): Number of characters per token used to estimate number of tokens in text buffer.
193
+ fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
194
+ fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permutations that will use SPM.
195
+ seed (int): Seed for random number generator.
196
+ """
197
+
198
+ def __init__(
199
+ self,
200
+ tokenizer,
201
+ dataset,
202
+ infinite=False,
203
+ seq_length=1024,
204
+ num_of_sequences=1024,
205
+ chars_per_token=3.6,
206
+ content_field="content",
207
+ fim_rate=0.5,
208
+ fim_spm_rate=0.5,
209
+ seed=0,
210
+ ):
211
+ self.tokenizer = tokenizer
212
+ self.concat_token_id = tokenizer.eos_token_id
213
+ self.dataset = dataset
214
+ self.seq_length = seq_length
215
+ self.infinite = infinite
216
+ self.current_size = 0
217
+ self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
218
+ self.content_field = content_field
219
+ self.fim_rate = fim_rate
220
+ self.fim_spm_rate = fim_spm_rate
221
+ self.seed = seed
222
+
223
+ (
224
+ self.suffix_tok_id,
225
+ self.prefix_tok_id,
226
+ self.middle_tok_id,
227
+ self.pad_tok_id,
228
+ ) = get_fim_token_ids(self.tokenizer)
229
+ if not self.suffix_tok_id and self.fim_rate > 0:
230
+ print("FIM is not supported by tokenizer, disabling FIM")
231
+ self.fim_rate = 0
232
+
233
+ def __iter__(self):
234
+ iterator = iter(self.dataset)
235
+ more_examples = True
236
+ np_rng = np.random.RandomState(seed=self.seed)
237
+ while more_examples:
238
+ buffer, buffer_len = [], 0
239
+ while True:
240
+ if buffer_len >= self.max_buffer_size:
241
+ break
242
+ try:
243
+ buffer.append(next(iterator)[self.content_field])
244
+ buffer_len += len(buffer[-1])
245
+ except StopIteration:
246
+ if self.infinite:
247
+ iterator = iter(self.dataset)
248
+ else:
249
+ more_examples = False
250
+ break
251
+ tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
252
+ all_token_ids = []
253
+
254
+ for tokenized_input in tokenized_inputs:
255
+ # optionally do FIM permutations
256
+ if self.fim_rate > 0:
257
+ tokenized_input, np_rng = permute(
258
+ tokenized_input,
259
+ np_rng,
260
+ self.suffix_tok_id,
261
+ self.prefix_tok_id,
262
+ self.middle_tok_id,
263
+ self.pad_tok_id,
264
+ fim_rate=self.fim_rate,
265
+ fim_spm_rate=self.fim_spm_rate,
266
+ truncate_or_pad=False,
267
+ )
268
+
269
+ all_token_ids.extend(tokenized_input + [self.concat_token_id])
270
+ examples = []
271
+ for i in range(0, len(all_token_ids), self.seq_length):
272
+ input_ids = all_token_ids[i : i + self.seq_length]
273
+ if len(input_ids) == self.seq_length:
274
+ examples.append(input_ids)
275
+ random.shuffle(examples)
276
+ for example in examples:
277
+ self.current_size += 1
278
+ yield {
279
+ "input_ids": torch.LongTensor(example),
280
+ "labels": torch.LongTensor(example),
281
+ }
282
+
283
+
284
+ train_dataset = ConstantLengthDataset(
285
+ tokenizer,
286
+ train_data,
287
+ infinite=True,
288
+ seq_length=SEQ_LENGTH,
289
+ chars_per_token=chars_per_token,
290
+ content_field=DATA_COLUMN,
291
+ fim_rate=FIM_RATE,
292
+ fim_spm_rate=FIM_SPM_RATE,
293
+ seed=SEED,
294
+ )
295
+ eval_dataset = ConstantLengthDataset(
296
+ tokenizer,
297
+ valid_data,
298
+ infinite=False,
299
+ seq_length=SEQ_LENGTH,
300
+ chars_per_token=chars_per_token,
301
+ content_field=DATA_COLUMN,
302
+ fim_rate=FIM_RATE,
303
+ fim_spm_rate=FIM_SPM_RATE,
304
+ seed=SEED,
305
+ )
306
+
307
+ import torch
308
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
309
+ from peft.tuners.lora import LoraLayer
310
+
311
+ #load_in_8bit = False
312
+
313
+ # 4-bit quantization
314
+ #compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)
315
+ #compute_float32 = torch.float32
316
+
317
+ #bnb_config = BitsAndBytesConfig(
318
+ # load_in_4bit=True,
319
+ # bnb_4bit_quant_type="nf4",
320
+ # bnb_4bit_compute_dtype=compute_float32,
321
+ # bnb_4bit_use_double_quant=USE_NESTED_QUANT,
322
+ # bnb_4bit_quant_storage=compute_float32
323
+ #)
324
+
325
+ #import os
326
+ #device_map = int(os.environ.get("LOCAL_RANK", -1))
327
+
328
+ model = AutoModelForCausalLM.from_pretrained(
329
+ MODEL,
330
+ #quantization_config=bnb_config,
331
+ device_map=None,
332
+ use_cache=False, # We will be using gradient checkpointing
333
+ trust_remote_code=True,
334
+ torch_dtype=torch.float32,
335
+ )
336
+
337
+ #from collections import Counter
338
+ #print(Counter(p.dtype for p in model.parameters()))
339
+
340
+ #model = prepare_model_for_kbit_training(model)
341
+
342
+ #from collections import Counter
343
+ #print("after prepare_model_for_kbit_training ", Counter(p.dtype for p in model.parameters()))
344
+
345
+ peft_config = LoraConfig(
346
+ lora_alpha=LORA_ALPHA,
347
+ lora_dropout=LORA_DROPOUT,
348
+ r=LORA_R,
349
+ bias="none",
350
+ task_type="CAUSAL_LM",
351
+ target_modules=LORA_TARGET_MODULES.split(","),
352
+ )
353
+
354
+ model = get_peft_model(model, peft_config)
355
+ model.print_trainable_parameters()
356
+ #from collections import Counter
357
+ #print("after get_peft_model ", Counter(p.dtype for p in model.parameters()))
358
+
359
+ train_data.start_iteration = 0
360
+
361
+ training_args = TrainingArguments(
362
+ output_dir=f"limernyou/{OUTPUT_DIR}",
363
+ dataloader_drop_last=True,
364
+ eval_strategy="steps",
365
+ save_strategy="steps",
366
+ max_steps=MAX_STEPS,
367
+ eval_steps=EVAL_FREQ,
368
+ save_steps=SAVE_FREQ,
369
+ logging_steps=LOG_FREQ,
370
+ per_device_train_batch_size=BATCH_SIZE,
371
+ per_device_eval_batch_size=BATCH_SIZE,
372
+ learning_rate=LR,
373
+ lr_scheduler_type=LR_SCHEDULER_TYPE,
374
+ warmup_steps=NUM_WARMUP_STEPS,
375
+ gradient_accumulation_steps=GR_ACC_STEPS,
376
+ gradient_checkpointing_kwargs={"use_reentrant": False},
377
+ gradient_checkpointing=True,
378
+ fp16=FP16,
379
+ bf16=BF16,
380
+ weight_decay=WEIGHT_DECAY,
381
+ push_to_hub=True,
382
+ include_tokens_per_second=True,
383
+ )
384
+
385
+ #from trl import SFTConfig, SFTTrainer
386
+ trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)
387
+
388
+ print("Training...")
389
+ trainer.train()
390
+ # Only meaningful when launched under Accelerate's FSDP plugin (see default_config.yaml);
+ # switches to a full (unsharded) state dict so the saved checkpoint can be loaded standalone.
+ trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
391
+ trainer.save_model()
392
+ trainer.push_to_hub()
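Before committing to a 2000-step run, the data pipeline can be smoke-tested on CPU. A sketch, assuming `ConstantLengthDataset` (with its `permute` and `get_fim_token_ids` helpers) has been copied into a scratch script, since importing train.py directly would execute the whole training run, and assuming access to the gated `bigcode/starcoderbase-1b` tokenizer; `seq_length=64` and the toy texts are arbitrary test values:

```python
# Sketch: smoke-test the ConstantLengthDataset pipeline on CPU.
# Paste ConstantLengthDataset (plus permute and get_fim_token_ids)
# into this scratch script first; importing train.py would start training.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderbase-1b")
tiny = Dataset.from_dict({"content": ["def add(a, b):\n    return a + b\n"] * 50})

ds = ConstantLengthDataset(
    tokenizer, tiny, infinite=False, seq_length=64,
    content_field="content", fim_rate=0.5, fim_spm_rate=0.5, seed=0,
)

batch = next(iter(ds))
assert batch["input_ids"].shape == (64,)              # constant-length chunks
assert (batch["input_ids"] == batch["labels"]).all()  # causal-LM labels
print("pipeline ok")
```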
use_model.py ADDED
@@ -0,0 +1,88 @@
1
+ from peft import PeftModel
2
+ from huggingface_hub import login
3
+ import torch
4
+
5
+ from transformers import (
6
+ AutoModelForCausalLM,
7
+ AutoTokenizer,
8
+ Trainer,
9
+ TrainingArguments,
10
+ logging,
11
+ set_seed,
12
+ BitsAndBytesConfig,
13
+ )
14
+
15
+ MODEL = "bigcode/starcoderbase-1b" # Model checkpoint on the Hugging Face Hub
16
+
17
+ # load the original model first
18
+ print("Load Tokenizer")
19
+ tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
20
+ tokenizer.pad_token = tokenizer.eos_token
21
+ tokenizer.padding_side = "left"
22
+ print("Load Model")
23
+ base_model = AutoModelForCausalLM.from_pretrained(
24
+ MODEL,
25
+ quantization_config=None,
26
+ device_map=None,
27
+ trust_remote_code=True,
28
+ torch_dtype=torch.float32,
29
+ ).cuda()
30
+
31
+ # merge fine-tuned weights with the base model
32
+ peft_model_id = "limernyou/starcoder-peft-conti"
33
+ model = PeftModel.from_pretrained(base_model, peft_model_id, adapter_name="personal_copilot")
34
+ #model.add_weighted_adapter(["personal_copilot"], [0.8], "best_personal_copilot")
35
+ #model.set_adapter("best_personal_copilot")
36
+
37
+ model = model.merge_and_unload()  # merges the LoRA weights into the base model and returns it
38
+
39
+
40
+ #if not hasattr(model, "hf_device_map"):
41
+ # model.cuda()
42
+
43
+ def get_code_completion(prefix, suffix):
44
+ prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"
45
+ base_model.eval()  # base_model carries the merged LoRA weights after merge_and_unload
46
+ outputs = base_model.generate(
47
+ input_ids=tokenizer(prompt, return_tensors="pt").input_ids.cuda(),
48
+ #attention_mask=tokenizer(prompt, return_tensors="pt").to("cuda")["attention_mask"],
49
+ max_new_tokens=128,
50
+ temperature=0.2,
51
+ top_k=50,
52
+ top_p=0.95,
53
+ do_sample=True,
54
+ repetition_penalty=1.0,
55
+ )
56
+ return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
57
+
58
+ def get_code_completion1(prefix, suffix):
59
+ prompt = f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"  # FIM-formatted prompt so the <fim_middle> split below works
60
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
61
+ model.eval()
62
+ with torch.no_grad():
63
+ outputs = model.generate(
64
+ input_ids=inputs["input_ids"],
65
+ attention_mask=inputs["attention_mask"],
66
+ max_new_tokens=128,
67
+ temperature=0.2,
68
+ top_k=50,
69
+ top_p=0.95,
70
+ do_sample=True,
71
+ repetition_penalty=1.0,
72
+ pad_token_id=tokenizer.eos_token_id,
73
+ )
74
+ output_text = tokenizer.decode(outputs[0], skip_special_tokens=False)  # keep special tokens so <fim_middle> survives for the split
75
+ completion = output_text.split("<fim_middle>")[-1].strip()
76
+ return completion
77
+
78
+ prefix = """from peft import LoraConfig, TaskType, get_peft_model
79
+ from transformers import AutoModelForCausalLM
80
+ peft_config = LoraConfig(
81
+ """
82
+ suffix = """"""
83
+
84
+ print("Starcoder generating response")
85
+ #print(tokenizer.special_tokens_map)
86
+ print(get_code_completion(prefix, suffix))
87
+
88
+ print("Successful")
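`get_code_completion1` expects a FIM-formatted prompt, so it is most useful with a non-empty suffix. An illustrative query (the fibonacci snippet is invented for the example and not part of the committed script):

```python
# Illustrative fill-in-the-middle query: the model is asked for the
# recursive return statement between the base case and the print call.
prefix = "def fibonacci(n):\n    if n < 2:\n        return n\n    "
suffix = "\n\nprint(fibonacci(10))\n"

print(get_code_completion1(prefix, suffix))
```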