diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..21cdde5d41ad5dd26b595186125a05c17768f1a1 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb filter=lfs diff=lfs merge=lfs -text
diff --git a/.ipynb_checkpoints/fim-checkpoint.py b/.ipynb_checkpoints/fim-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef1d57bc2cf9994a80ffa0239492bad0ba311854
--- /dev/null
+++ b/.ipynb_checkpoints/fim-checkpoint.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2024 Sourab Mangrulkar. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import numpy as np
+
+
+# this is expensive so we cache it
+@functools.lru_cache(maxsize=None)
+def get_fim_token_ids(tokenizer):
+    if "codellama" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.suffix_id,
+            tokenizer.prefix_id,
+            tokenizer.middle_id,
+            0,
+        )
+    elif "deepseek-coder" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.encode("<｜fim▁hole｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<｜fim▁begin｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<｜fim▁end｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<pad>", add_special_tokens=False)[0],
+        )
+    elif "stable-code" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.encode("<fim_suffix>")[0],
+            tokenizer.encode("<fim_prefix>")[0],
+            tokenizer.encode("<fim_middle>")[0],
+            tokenizer.encode("<fim_pad>")[0],
+        )
+    else:
+        bos_token_id = None
+        try:
+            FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[
+                "additional_special_tokens"
+            ][1:5]
+            suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
+                tokenizer.vocab[tok]
+                for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
+            )
+        except KeyError:
+            suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
+                None,
+                None,
+                None,
+                None,
+            )
+    return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id
+
+
+def _bos_token_processing(prefix_token_list, bos_token):
+    if bos_token is not None:
+        # add the BOS token to the beginning of the list
+        prefix_token_list.insert(0, bos_token)
+
+    return prefix_token_list
+
+
+## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
+def permute(
+    sample,
+    np_rng,
+    suffix_tok_id,
+    prefix_tok_id,
+    middle_tok_id,
+    pad_tok_id,
+    fim_rate=0.5,
+    fim_spm_rate=0.5,
+    truncate_or_pad=False,
+    bos_token_id=None,
+):
+    """
+    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
+    PSM and SPM (with a probability of fim_spm_rate).
+    """
+
+    if np_rng.binomial(1, fim_rate):
+        boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
+        boundaries.sort()
+
+        prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
+        middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
+        suffix = np.array(sample[boundaries[1] :], dtype=np.int64)
+
+        if truncate_or_pad:
+            new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
+            diff = new_length - len(sample)
+            if diff > 0:
+                if suffix.shape[0] <= diff:
+                    return sample, np_rng
+                suffix = suffix[: suffix.shape[0] - diff]
+            elif diff < 0:
+                suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])
+
+        if np_rng.binomial(1, fim_spm_rate):
+            prefix_special_tokens = _bos_token_processing(
+                [prefix_tok_id, suffix_tok_id], bos_token_id
+            )
+            # SPM (variant 2 from FIM paper)
+            new_sample = np.concatenate(
+                [
+                    prefix_special_tokens,
+                    suffix,
+                    [middle_tok_id],
+                    prefix,
+                    middle,
+                ]
+            )
+        else:
+            prefix_special_tokens = _bos_token_processing([prefix_tok_id], bos_token_id)
+            # PSM
+            new_sample = np.concatenate(
+                [
+                    prefix_special_tokens,
+                    prefix,
+                    [suffix_tok_id],
+                    suffix,
+                    [middle_tok_id],
+                    middle,
+                ]
+            )
+    else:
+        # don't do FIM preproc
+        new_sample = sample
+    return list(new_sample), np_rng
diff --git a/.ipynb_checkpoints/requirements-checkpoint.txt b/.ipynb_checkpoints/requirements-checkpoint.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b7b96512c409868cf9c62d05ed6254a9dc6bd5f
--- /dev/null
+++ b/.ipynb_checkpoints/requirements-checkpoint.txt
@@ -0,0 +1,14 @@
+git+https://github.com/huggingface/transformers
+git+https://github.com/huggingface/accelerate
+git+https://github.com/huggingface/peft
+trl
+huggingface-hub
+bitsandbytes
+evaluate
+datasets
+einops
+wandb
+tiktoken
+deepspeed
+tqdm
+safetensors
\ No newline at end of file
diff --git a/.ipynb_checkpoints/run_peft-checkpoint.sh b/.ipynb_checkpoints/run_peft-checkpoint.sh
new file mode 100644
index 0000000000000000000000000000000000000000..540c0e1e72cd3ca63699ac7eaebf293d475951bd
--- /dev/null
+++ b/.ipynb_checkpoints/run_peft-checkpoint.sh
@@ -0,0 +1,40 @@
+# QLoRA run: 4-bit CodeLlama-7b-Instruct with LoRA adapters on GPU 0, launched through train.py.
+CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python3 train.py \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_length 2048 \
+--max_steps 2000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 5 \
+--log_level "info" \
+--logging_strategy "steps" --save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 3e-4 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder" \
+--per_device_train_batch_size 4 \
+--per_device_eval_batch_size 4 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant True \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.5 \
+--use_peft_lora True \
+--lora_r 32 \
+--lora_alpha 64 \
+--lora_dropout 0.1 \
+--lora_target_modules "all-linear" \
+--use_4bit_quantization True \
+--use_nested_quant True \
+--bnb_4bit_compute_dtype "bfloat16" \
+--use_flash_attn True
\ No newline at end of file
diff --git a/.ipynb_checkpoints/train-checkpoint.py b/.ipynb_checkpoints/train-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7d648ac1bcaddd7ff39da29193f1e7f2ccd9f35
--- /dev/null
+++ b/.ipynb_checkpoints/train-checkpoint.py
@@ -0,0 +1,495 @@
+# coding=utf-8
+# Copyright 2024 Sourab Mangrulkar. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Continued pre-training/fine-tuning of code LLMs for code autocompletion.
+"""
+
+import gc
+import os
+import random
+import sys
+from typing import Optional
+from dataclasses import dataclass, field
+
+import numpy as np
+import torch
+from datasets import load_dataset
+from torch.utils.data import IterableDataset
+from tqdm import tqdm
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+    HfArgumentParser,
+    set_seed,
+    BitsAndBytesConfig,
+)
+
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, replace_lora_weights_loftq
+import fim
+
+
+# Define and parse arguments.
+@dataclass
+class ModelArguments:
+    """
+    Options for the base model, the LoRA adapters and quantized loading; every field name doubles as a command-line flag.
+    """
+
+    model_name_or_path: str = field(
+        metadata={
+            "help": "Path to pretrained model or model identifier from huggingface.co/models"
+        }
+    )
+    lora_alpha: Optional[int] = field(default=16)  # lora_alpha, lora_dropout and lora_r are forwarded to the LoRA setup
+    lora_dropout: Optional[float] = field(default=0.1)
+    lora_r: Optional[int] = field(default=64)
+    lora_target_modules: Optional[str] = field(  # the literal "all-linear" is passed through without splitting
+        default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj",
+        metadata={
+            "help": "comma separated list of target modules to apply LoRA layers to"
+        },
+    )
+    use_nested_quant: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Activate nested quantization for 4bit base models"},
+    )
+    bnb_4bit_compute_dtype: Optional[str] = field(  # resolved with getattr(torch, ...), so it must name a torch dtype
+        default="float16",
+        metadata={"help": "Compute dtype for 4bit base models"},
+    )
+    bnb_4bit_quant_type: Optional[str] = field(
+        default="nf4",
+        metadata={"help": "Quantization type fp4 or nf4"},
+    )
+    use_flash_attn: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables Flash attention for training."},
+    )
+    use_peft_lora: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables PEFT LoRA for training."},
+    )
+    use_8bit_qunatization: Optional[bool] = field(  # (sic) misspelt name is read by callers; renaming it would change the flag
+        default=False,
+        metadata={"help": "Enables loading model in 8bit."},
+    )
+    use_4bit_quantization: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables loading model in 4bit."},
+    )
+    use_reentrant: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Gradient Checkpointing param. Refer the related docs"},
+    )
+    use_unsloth: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables UnSloth for training."},
+    )
+    use_loftq: Optional[bool] = field(  # only acted on together with use_4bit_quantization
+        default=False,
+        metadata={"help": "Enables LoftQ init for the LoRA adapters when using QLoRA."},
+    )
+    use_loftq_callback: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables LoftQ callback comparing logits of base model to the ones from LoftQ init. Provides better init."},
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    dataset_name: Optional[str] = field(
+        default="smangrul/hug_stack",
+        metadata={"help": "The preference dataset to use."},
+    )
+    dataset_text_field: str = field(
+        default="text", metadata={"help": "Dataset field to use as input text."}
+    )
+    max_seq_length: Optional[int] = field(default=4096)
+    test_size: Optional[float] = field(default=0.1)
+    fim_rate: Optional[float] = field(default=0.5)
+    fim_spm_rate: Optional[float] = field(default=0.5)
+    splits: Optional[str] = field(
+        default="train",
+        metadata={"help": "Comma separate list of the splits to use from the dataset."},
+    )
+
+
+def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
+    """
+    Estimate the average number of characters per token in the dataset.
+    """
+    total_characters, total_tokens = 0, 0
+    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
+        total_characters += len(example[data_column])
+        total_tokens += len(tokenizer(example[data_column]).tokens())
+
+    return total_characters / total_tokens
+
+
+class ConstantLengthDataset(IterableDataset):
+    """
+    Iterable dataset that returns constant length chunks of tokens from stream of text files.
+        Args:
+            tokenizer (Tokenizer): The processor used for proccessing the data.
+            dataset (dataset.Dataset): Dataset with text files.
+            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
+            seq_length (int): Length of token sequences to return.
+            num_of_sequences (int): Number of token sequences to keep in buffer.
+            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
+            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
+            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permuations that will use SPM.
+            seed (int): Seed for random number generator.
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        dataset,
+        infinite=False,
+        seq_length=1024,
+        num_of_sequences=1024,
+        chars_per_token=3.6,
+        content_field="content",
+        fim_rate=0.5,
+        fim_spm_rate=0.5,
+        seed=0,
+        shuffle=False,
+    ):
+        self.tokenizer = tokenizer
+        self.concat_token_id = tokenizer.eos_token_id
+        self.dataset = dataset
+        self.seq_length = seq_length
+        self.infinite = infinite
+        self.current_size = 0
+        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
+        self.content_field = content_field
+        self.fim_rate = fim_rate
+        self.fim_spm_rate = fim_spm_rate
+        self.seed = seed
+        self.shuffle = shuffle
+
+        (
+            self.bos_token_id,
+            self.suffix_tok_id,
+            self.prefix_tok_id,
+            self.middle_tok_id,
+            self.pad_tok_id,
+        ) = fim.get_fim_token_ids(self.tokenizer)
+        if not self.suffix_tok_id and self.fim_rate > 0:
+            print("FIM is not supported by tokenizer, disabling FIM")
+            self.fim_rate = 0
+
+    def __iter__(self):
+        iterator = iter(self.dataset)
+        more_examples = True
+        np_rng = np.random.RandomState(seed=self.seed)
+        while more_examples:
+            buffer, buffer_len = [], 0
+            while True:
+                if buffer_len >= self.max_buffer_size:
+                    break
+                try:
+                    buffer.append(next(iterator)[self.content_field])
+                    buffer_len += len(buffer[-1])
+                except StopIteration:
+                    if self.infinite:
+                        iterator = iter(self.dataset)
+                    else:
+                        more_examples = False
+                        break
+            tokenized_inputs = self.tokenizer(
+                buffer, truncation=False, add_special_tokens=False
+            )["input_ids"]
+            all_token_ids = []
+
+            for tokenized_input in tokenized_inputs:
+                # optionally do FIM permutations
+                if self.fim_rate > 0:
+                    tokenized_input, np_rng = fim.permute(
+                        tokenized_input,
+                        np_rng,
+                        self.suffix_tok_id,
+                        self.prefix_tok_id,
+                        self.middle_tok_id,
+                        self.pad_tok_id,
+                        fim_rate=self.fim_rate,
+                        fim_spm_rate=self.fim_spm_rate,
+                        truncate_or_pad=False,
+                        bos_token_id=self.bos_token_id,
+                    )
+
+                all_token_ids.extend(tokenized_input + [self.concat_token_id])
+            examples = []
+            for i in range(0, len(all_token_ids), self.seq_length):
+                input_ids = all_token_ids[i : i + self.seq_length]
+                if len(input_ids) == self.seq_length:
+                    examples.append(input_ids)
+            if self.shuffle:
+                random.shuffle(examples)
+            for example in examples:
+                self.current_size += 1
+                yield {
+                    "input_ids": torch.LongTensor(example),
+                    "labels": torch.LongTensor(example),
+                }
+
+
+def create_datasets(tokenizer, args, seed):
+    dataset = load_dataset(args.dataset_name, split=args.splits)
+    dataset = dataset.train_test_split(
+        test_size=args.test_size, seed=seed, shuffle=True
+    )
+    train_data = dataset["train"]
+    valid_data = dataset["test"]
+    print(
+        f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
+    )
+    chars_per_token = chars_token_ratio(train_data, tokenizer, args.dataset_text_field)
+    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
+    train_dataset = ConstantLengthDataset(
+        tokenizer,
+        train_data,
+        infinite=True,
+        seq_length=args.max_seq_length,
+        chars_per_token=chars_per_token,
+        content_field=args.dataset_text_field,
+        fim_rate=args.fim_rate,
+        fim_spm_rate=args.fim_spm_rate,
+        seed=seed,
+        shuffle=True,
+    )
+    valid_dataset = ConstantLengthDataset(
+        tokenizer,
+        valid_data,
+        infinite=False,
+        seq_length=args.max_seq_length,
+        chars_per_token=chars_per_token,
+        content_field=args.dataset_text_field,
+        fim_rate=args.fim_rate,
+        fim_spm_rate=args.fim_spm_rate,
+        seed=seed,
+    )
+    print(f"A sample of valid dataset: {next(iter(valid_dataset))}")
+    return train_dataset, valid_dataset
+
+def get_mae(x, y):
+    return (x - y).abs().mean()
+
+
+def get_mse(x, y):
+    return torch.pow(x - y, 2).mean()
+
+
+def error_report(x, y):
+    mae = get_mae(x, y)
+    mse = get_mse(x, y)
+    print(
+        f"Mean absolute error: {mae:>8.5f}\n"
+        f"Mean squared error:  {mse:>8.5f}"
+    )
+
+    
+def loftq_init(model, tokenizer, train_dataset, max_seq_length, args):
+    if args.use_loftq_callback:
+        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
+        base_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=compute_dtype)
+        base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
+        random_input_ids = torch.randint(0, len(train_dataset), size=(1,)).numpy().tolist()
+        random_inputs = [train_dataset[i]['content'] for i in random_input_ids]
+        random_inputs = tokenizer(random_inputs, return_tensors="pt", padding=True, truncation="max_length", max_length=max_seq_length)
+        logits_base = base_model(**random_inputs).logits
+        del base_model
+        gc.collect()
+        
+        def loftq_callback(model, module_name):
+            """Callable to replace weights with LoFTQ if the mse is lower than the current best one."""
+            global current_mse
+            logits = model(**random_inputs).logits
+            mse = get_mse(logits_base, logits)
+            if mse < current_mse:
+                current_mse = mse
+                print(f"MSE improved for module {module_name}")
+                return True
+            print(f"MSE did not improve for module {module_name}")
+            return False
+        
+        replace_lora_weights_loftq(model, callback=loftq_callback)
+        logits_loftq_callback = model(**random_inputs).logits
+        error_report(logits_base, logits_loftq_callback)
+    else:
+        replace_lora_weights_loftq(model)
+
+
+def create_and_prepare_model(args, data_args, training_args):
+    device_map = None
+    bnb_config = None
+
+    load_in_8bit = args.use_8bit_qunatization
+    load_in_4bit = args.use_4bit_quantization
+
+    if args.use_unsloth:
+        from unsloth import FastLanguageModel
+
+    if args.use_4bit_quantization:
+        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
+
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=args.use_4bit_quantization,
+            bnb_4bit_quant_type=args.bnb_4bit_quant_type,
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_use_double_quant=args.use_nested_quant,
+        )
+
+        if compute_dtype == torch.float16 and args.use_4bit_quantization:
+            major, _ = torch.cuda.get_device_capability()
+            if major >= 8:
+                print("=" * 80)
+                print(
+                    "Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
+                )
+                print("=" * 80)
+
+    if args.use_4bit_quantization or args.use_8bit_qunatization:
+        device_map = (
+            int(os.environ.get("LOCAL_RANK", -1))
+            if torch.distributed.is_available() and torch.distributed.is_initialized()
+            else "auto"
+        )  # {"": 0}
+
+    if args.use_unsloth:
+        # Load model
+        model, _ = FastLanguageModel.from_pretrained(
+            model_name=args.model_name_or_path,
+            max_seq_length=data_args.max_seq_length,
+            dtype=None,
+            load_in_4bit=load_in_4bit,
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,
+            load_in_8bit=load_in_8bit,
+            quantization_config=bnb_config,
+            device_map=device_map,
+            trust_remote_code=True,
+            attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
+        )
+
+    if (
+        (args.use_4bit_quantization or args.use_8bit_qunatization)
+        and args.use_peft_lora
+        and not args.use_unsloth
+    ):
+        model = prepare_model_for_kbit_training(
+            model,
+            use_gradient_checkpointing=training_args.gradient_checkpointing,
+            gradient_checkpointing_kwargs={"use_reentrant": model_args.use_reentrant},
+        )
+
+    if args.use_peft_lora and not args.use_unsloth:
+        peft_config = LoraConfig(
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            r=args.lora_r,
+            bias="none",
+            task_type="CAUSAL_LM",
+            target_modules=args.lora_target_modules.split(",")
+            if args.lora_target_modules != "all-linear"
+            else args.lora_target_modules,
+        )
+        model = get_peft_model(model, peft_config)
+    elif args.use_peft_lora and args.use_unsloth:
+        # Do model patching and add fast LoRA weights
+        model = FastLanguageModel.get_peft_model(
+            model,
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            r=args.lora_r,
+            target_modules=args.lora_target_modules.split(",")
+            if args.lora_target_modules != "all-linear"
+            else args.lora_target_modules,
+            use_gradient_checkpointing=training_args.gradient_checkpointing,
+            random_state=training_args.seed,
+            max_seq_length=data_args.max_seq_length,
+        )
+    return model
+
+
+def main(model_args, data_args, training_args):
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+
+    # load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+
+    # load the datasets
+    train_dataset, eval_dataset = create_datasets(
+        tokenizer, data_args, training_args.seed
+    )
+    train_dataset.start_iteration = 0
+
+    model = create_and_prepare_model(model_args, data_args, training_args)
+    # gradient ckpt
+    model.config.use_cache = not training_args.gradient_checkpointing
+    training_args.gradient_checkpointing = (
+        training_args.gradient_checkpointing and not model_args.use_unsloth
+    )
+    if training_args.gradient_checkpointing:
+        training_args.gradient_checkpointing_kwargs = {
+            "use_reentrant": model_args.use_reentrant
+        }
+
+    # trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+    trainer.accelerator.print(f"{trainer.model}")
+    if model_args.use_peft_lora:
+        trainer.model.print_trainable_parameters()
+
+    # LoftQ initialization when using QLoRA
+    if model_args.use_4bit_quantization and model_args.use_loftq:
+        loftq_init(trainer.model, tokenizer, train_dataset, data_args.max_seq_length ,model_args)
+
+    # train
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    trainer.train(resume_from_checkpoint=checkpoint)
+
+    # saving final model
+    if trainer.is_fsdp_enabled:
+        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
+    trainer.save_model()
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser(
+        (ModelArguments, DataTrainingArguments, TrainingArguments)
+    )
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(
+            json_file=os.path.abspath(sys.argv[1])
+        )
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    main(model_args, data_args, training_args)
diff --git a/__pycache__/fim.cpython-310.pyc b/__pycache__/fim.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b29077b42a2538a642ff269a6b4c4d01eb469dd2
Binary files /dev/null and b/__pycache__/fim.cpython-310.pyc differ
diff --git a/codellama-hugcoder/README.md b/codellama-hugcoder/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..de5239e6f140c1823c2e3f7a14e5c9cacb55e3fd
--- /dev/null
+++ b/codellama-hugcoder/README.md
@@ -0,0 +1,57 @@
+---
+library_name: peft
+license: llama2
+base_model: codellama/CodeLlama-7b-Instruct-hf
+tags:
+- generated_from_trainer
+model-index:
+- name: codellama-hugcoder
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# codellama-hugcoder
+
+This model is a fine-tuned version of [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) on an unknown dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 16
+- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- training_steps: 2000
+
+### Training results
+
+
+
+### Framework versions
+
+- PEFT 0.15.2.dev0
+- Transformers 4.52.0.dev0
+- Pytorch 2.6.0+cu124
+- Datasets 3.2.0
+- Tokenizers 0.21.1
\ No newline at end of file
diff --git a/codellama-hugcoder/adapter_config.json b/codellama-hugcoder/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1
--- /dev/null
+++ b/codellama-hugcoder/adapter_config.json
@@ -0,0 +1,39 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama-hugcoder/adapter_model.safetensors b/codellama-hugcoder/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..58d3ccd4c40a5bb55497cd8825213decfac35527
--- /dev/null
+++ b/codellama-hugcoder/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:456cbd6da326b2c6f27a85ab19d40e13bf3fb60689cbe5ec56653d42193963f8
+size 319876032
diff --git a/codellama-hugcoder/checkpoint-1000/README.md b/codellama-hugcoder/checkpoint-1000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8daf76b93ee8508456247003fa155d657d2e354
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1000/README.md
@@ -0,0 +1,202 @@
+---
+base_model: codellama/CodeLlama-7b-Instruct-hf
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.2.dev0
\ No newline at end of file
diff --git a/codellama-hugcoder/checkpoint-1000/adapter_config.json b/codellama-hugcoder/checkpoint-1000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1000/adapter_config.json
@@ -0,0 +1,39 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama-hugcoder/checkpoint-1000/adapter_model.safetensors b/codellama-hugcoder/checkpoint-1000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e6ddbcefd786f5c0a7c637ee5516b92c5b877848
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a808764ea3b6733b0a7c7a6002b640b2b9246cabcd9ad2d940aa7f43c05d66e3
+size 319876032
diff --git a/codellama-hugcoder/checkpoint-1000/optimizer.pt b/codellama-hugcoder/checkpoint-1000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..6a7dc10c090ce071a89ce8347c0693338c37b5af
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4025993adcd424dc3d3b0c61b41a0e262786b3bb304e6a592a013e59b80a6b38
+size 640009682
diff --git a/codellama-hugcoder/checkpoint-1000/rng_state.pth b/codellama-hugcoder/checkpoint-1000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..991eada42a154d777614c77b2064af65c7abfeeb
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f822cfb134cd0b1f54ce227e6d11176dede74f86c94420156b0a49753efe3b7
+size 14244
diff --git a/codellama-hugcoder/checkpoint-1000/scheduler.pt b/codellama-hugcoder/checkpoint-1000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..5b24b95d3a3c869a6e1fd81aef577a3b307bd3a6
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3569157643c45495d0de4a184cdcaab0e6cab5317a8ad5f0b1bbb2d736dd80d4
+size 1064
diff --git a/codellama-hugcoder/checkpoint-1000/trainer_state.json b/codellama-hugcoder/checkpoint-1000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..dcb698779e3d5b5820312186b79098454e6552ba
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1000/trainer_state.json
@@ -0,0 +1,1434 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5,
+  "eval_steps": 100.0,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.09379793703556061,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.1399833709001541,
+      "learning_rate": 1.3499999999999998e-05,
+      "loss": 0.6954,
+      "step": 10
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.08632303029298782,
+      "learning_rate": 2.1e-05,
+      "loss": 0.6921,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.10006701201200485,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 0.69,
+      "step": 20
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.07633858919143677,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.6722,
+      "step": 25
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.09399061650037766,
+      "learning_rate": 4.3499999999999993e-05,
+      "loss": 0.6453,
+      "step": 30
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.0843738541007042,
+      "learning_rate": 5.1e-05,
+      "loss": 0.6276,
+      "step": 35
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.08583351224660873,
+      "learning_rate": 5.85e-05,
+      "loss": 0.58,
+      "step": 40
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.09571370482444763,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.6355,
+      "step": 45
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.1083935871720314,
+      "learning_rate": 7.35e-05,
+      "loss": 0.589,
+      "step": 50
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.10387319326400757,
+      "learning_rate": 8.1e-05,
+      "loss": 0.6061,
+      "step": 55
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.11083361506462097,
+      "learning_rate": 8.849999999999998e-05,
+      "loss": 0.572,
+      "step": 60
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.12665686011314392,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.5442,
+      "step": 65
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.1308053582906723,
+      "learning_rate": 0.00010349999999999998,
+      "loss": 0.6524,
+      "step": 70
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.13535510003566742,
+      "learning_rate": 0.00011099999999999999,
+      "loss": 0.6404,
+      "step": 75
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12833671271800995,
+      "learning_rate": 0.0001185,
+      "loss": 0.5717,
+      "step": 80
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.11962099373340607,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.6098,
+      "step": 85
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.13898271322250366,
+      "learning_rate": 0.0001335,
+      "loss": 0.6099,
+      "step": 90
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.14486610889434814,
+      "learning_rate": 0.00014099999999999998,
+      "loss": 0.5744,
+      "step": 95
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1432138830423355,
+      "learning_rate": 0.00014849999999999998,
+      "loss": 0.5659,
+      "step": 100
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 0.13487878441810608,
+      "learning_rate": 0.000156,
+      "loss": 0.5622,
+      "step": 105
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.12495309859514236,
+      "learning_rate": 0.0001635,
+      "loss": 0.5951,
+      "step": 110
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.13011734187602997,
+      "learning_rate": 0.00017099999999999998,
+      "loss": 0.6249,
+      "step": 115
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.13987745344638824,
+      "learning_rate": 0.00017849999999999997,
+      "loss": 0.559,
+      "step": 120
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.13373605906963348,
+      "learning_rate": 0.000186,
+      "loss": 0.5475,
+      "step": 125
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.12433867901563644,
+      "learning_rate": 0.0001935,
+      "loss": 0.5274,
+      "step": 130
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.11097615957260132,
+      "learning_rate": 0.000201,
+      "loss": 0.678,
+      "step": 135
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1155027225613594,
+      "learning_rate": 0.00020849999999999997,
+      "loss": 0.5611,
+      "step": 140
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.11431068181991577,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.6054,
+      "step": 145
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.09796140342950821,
+      "learning_rate": 0.00022349999999999998,
+      "loss": 0.5472,
+      "step": 150
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.09489257633686066,
+      "learning_rate": 0.00023099999999999998,
+      "loss": 0.4636,
+      "step": 155
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.10787788033485413,
+      "learning_rate": 0.0002385,
+      "loss": 0.6164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.10261733084917068,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.5408,
+      "step": 165
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.11870352178812027,
+      "learning_rate": 0.0002535,
+      "loss": 0.5268,
+      "step": 170
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.11910569667816162,
+      "learning_rate": 0.000261,
+      "loss": 0.5461,
+      "step": 175
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.10083702206611633,
+      "learning_rate": 0.00026849999999999997,
+      "loss": 0.4794,
+      "step": 180
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.10453511029481888,
+      "learning_rate": 0.000276,
+      "loss": 0.5539,
+      "step": 185
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.101403146982193,
+      "learning_rate": 0.00028349999999999995,
+      "loss": 0.5346,
+      "step": 190
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.10724789649248123,
+      "learning_rate": 0.00029099999999999997,
+      "loss": 0.6026,
+      "step": 195
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1140277311205864,
+      "learning_rate": 0.0002985,
+      "loss": 0.5193,
+      "step": 200
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.09706108272075653,
+      "learning_rate": 0.0002999963446058092,
+      "loss": 0.54,
+      "step": 205
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.10003062337636948,
+      "learning_rate": 0.0002999814948722491,
+      "loss": 0.5365,
+      "step": 210
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.1078687533736229,
+      "learning_rate": 0.00029995522346717746,
+      "loss": 0.5889,
+      "step": 215
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.10538115352392197,
+      "learning_rate": 0.0002999175323912636,
+      "loss": 0.5611,
+      "step": 220
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.1020808294415474,
+      "learning_rate": 0.00029986842451482874,
+      "loss": 0.6103,
+      "step": 225
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.09635835886001587,
+      "learning_rate": 0.0002998079035776279,
+      "loss": 0.5229,
+      "step": 230
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.10287190228700638,
+      "learning_rate": 0.0002997359741885648,
+      "loss": 0.5312,
+      "step": 235
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.09160075336694717,
+      "learning_rate": 0.0002996526418253408,
+      "loss": 0.5673,
+      "step": 240
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.08691006153821945,
+      "learning_rate": 0.000299557912834038,
+      "loss": 0.5326,
+      "step": 245
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.10096988826990128,
+      "learning_rate": 0.00029945179442863594,
+      "loss": 0.6004,
+      "step": 250
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.09594204276800156,
+      "learning_rate": 0.000299334294690462,
+      "loss": 0.5516,
+      "step": 255
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.10281919687986374,
+      "learning_rate": 0.00029920542256757607,
+      "loss": 0.5515,
+      "step": 260
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.08547840267419815,
+      "learning_rate": 0.00029906518787408944,
+      "loss": 0.5243,
+      "step": 265
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.10161560773849487,
+      "learning_rate": 0.0002989136012894168,
+      "loss": 0.5096,
+      "step": 270
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.09101904183626175,
+      "learning_rate": 0.0002987506743574635,
+      "loss": 0.553,
+      "step": 275
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.09769442677497864,
+      "learning_rate": 0.0002985764194857463,
+      "loss": 0.4953,
+      "step": 280
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.10991579294204712,
+      "learning_rate": 0.00029839084994444826,
+      "loss": 0.5152,
+      "step": 285
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.09450916200876236,
+      "learning_rate": 0.00029819397986540836,
+      "loss": 0.5397,
+      "step": 290
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.10876069217920303,
+      "learning_rate": 0.0002979858242410454,
+      "loss": 0.4858,
+      "step": 295
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.097995825111866,
+      "learning_rate": 0.00029776639892321606,
+      "loss": 0.5566,
+      "step": 300
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.1145048514008522,
+      "learning_rate": 0.0002975357206220079,
+      "loss": 0.4531,
+      "step": 305
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.10271880775690079,
+      "learning_rate": 0.00029729380690446654,
+      "loss": 0.5199,
+      "step": 310
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 0.11095371842384338,
+      "learning_rate": 0.0002970406761932583,
+      "loss": 0.5416,
+      "step": 315
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09949438273906708,
+      "learning_rate": 0.00029677634776526673,
+      "loss": 0.4841,
+      "step": 320
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.1163724958896637,
+      "learning_rate": 0.00029650084175012517,
+      "loss": 0.4913,
+      "step": 325
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.10726840049028397,
+      "learning_rate": 0.00029621417912868323,
+      "loss": 0.5203,
+      "step": 330
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.09609931707382202,
+      "learning_rate": 0.00029591638173140947,
+      "loss": 0.5607,
+      "step": 335
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.10824442654848099,
+      "learning_rate": 0.0002956074722367286,
+      "loss": 0.6004,
+      "step": 340
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.10465679317712784,
+      "learning_rate": 0.00029528747416929463,
+      "loss": 0.5216,
+      "step": 345
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.10518354922533035,
+      "learning_rate": 0.0002949564118981994,
+      "loss": 0.499,
+      "step": 350
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 0.0955279991030693,
+      "learning_rate": 0.0002946143106351165,
+      "loss": 0.5607,
+      "step": 355
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.11159654706716537,
+      "learning_rate": 0.0002942611964323817,
+      "loss": 0.5204,
+      "step": 360
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 0.09571187198162079,
+      "learning_rate": 0.0002938970961810086,
+      "loss": 0.6113,
+      "step": 365
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.11854679882526398,
+      "learning_rate": 0.0002935220376086411,
+      "loss": 0.5639,
+      "step": 370
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.1050512045621872,
+      "learning_rate": 0.0002931360492774415,
+      "loss": 0.548,
+      "step": 375
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1053968220949173,
+      "learning_rate": 0.0002927391605819157,
+      "loss": 0.5507,
+      "step": 380
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.10567320138216019,
+      "learning_rate": 0.00029233140174667445,
+      "loss": 0.5312,
+      "step": 385
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.11914283782243729,
+      "learning_rate": 0.0002919128038241318,
+      "loss": 0.5961,
+      "step": 390
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.09915795922279358,
+      "learning_rate": 0.0002914833986921401,
+      "loss": 0.5086,
+      "step": 395
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10796502232551575,
+      "learning_rate": 0.0002910432190515628,
+      "loss": 0.5585,
+      "step": 400
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.10748997330665588,
+      "learning_rate": 0.00029059229842378373,
+      "loss": 0.5466,
+      "step": 405
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.10696308314800262,
+      "learning_rate": 0.0002901306711481544,
+      "loss": 0.5513,
+      "step": 410
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.10418657958507538,
+      "learning_rate": 0.0002896583723793792,
+      "loss": 0.5391,
+      "step": 415
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16421550512313843,
+      "learning_rate": 0.00028917543808483796,
+      "loss": 0.4699,
+      "step": 420
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.12929962575435638,
+      "learning_rate": 0.00028868190504184696,
+      "loss": 0.4984,
+      "step": 425
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 0.10469454526901245,
+      "learning_rate": 0.00028817781083485816,
+      "loss": 0.5119,
+      "step": 430
+    },
+    {
+      "epoch": 0.2175,
+      "grad_norm": 0.0964970663189888,
+      "learning_rate": 0.00028766319385259713,
+      "loss": 0.5167,
+      "step": 435
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12395574152469635,
+      "learning_rate": 0.00028713809328513953,
+      "loss": 0.5692,
+      "step": 440
+    },
+    {
+      "epoch": 0.2225,
+      "grad_norm": 0.10189738124608994,
+      "learning_rate": 0.0002866025491209265,
+      "loss": 0.4628,
+      "step": 445
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.10433454066514969,
+      "learning_rate": 0.0002860566021437197,
+      "loss": 0.4869,
+      "step": 450
+    },
+    {
+      "epoch": 0.2275,
+      "grad_norm": 0.13003456592559814,
+      "learning_rate": 0.0002855002939294951,
+      "loss": 0.5291,
+      "step": 455
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.11692202836275101,
+      "learning_rate": 0.000284933666843277,
+      "loss": 0.5229,
+      "step": 460
+    },
+    {
+      "epoch": 0.2325,
+      "grad_norm": 0.10757846385240555,
+      "learning_rate": 0.0002843567640359119,
+      "loss": 0.435,
+      "step": 465
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 0.10775501281023026,
+      "learning_rate": 0.00028376962944078206,
+      "loss": 0.4418,
+      "step": 470
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.11543692648410797,
+      "learning_rate": 0.00028317230777046015,
+      "loss": 0.4204,
+      "step": 475
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10946698486804962,
+      "learning_rate": 0.00028256484451330403,
+      "loss": 0.49,
+      "step": 480
+    },
+    {
+      "epoch": 0.2425,
+      "grad_norm": 0.11528221517801285,
+      "learning_rate": 0.00028194728592999247,
+      "loss": 0.4752,
+      "step": 485
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 0.10474205762147903,
+      "learning_rate": 0.0002813196790500027,
+      "loss": 0.4847,
+      "step": 490
+    },
+    {
+      "epoch": 0.2475,
+      "grad_norm": 0.10768820345401764,
+      "learning_rate": 0.00028068207166802837,
+      "loss": 0.4664,
+      "step": 495
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.12158560007810593,
+      "learning_rate": 0.00028003451234034037,
+      "loss": 0.4741,
+      "step": 500
+    },
+    {
+      "epoch": 0.2525,
+      "grad_norm": 0.11635497957468033,
+      "learning_rate": 0.0002793770503810886,
+      "loss": 0.4969,
+      "step": 505
+    },
+    {
+      "epoch": 0.255,
+      "grad_norm": 0.12205849587917328,
+      "learning_rate": 0.00027870973585854665,
+      "loss": 0.4798,
+      "step": 510
+    },
+    {
+      "epoch": 0.2575,
+      "grad_norm": 0.10270871222019196,
+      "learning_rate": 0.00027803261959129905,
+      "loss": 0.3888,
+      "step": 515
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.11313367635011673,
+      "learning_rate": 0.0002773457531443712,
+      "loss": 0.4759,
+      "step": 520
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 0.12905193865299225,
+      "learning_rate": 0.00027664918882530225,
+      "loss": 0.4442,
+      "step": 525
+    },
+    {
+      "epoch": 0.265,
+      "grad_norm": 0.11690939962863922,
+      "learning_rate": 0.00027594297968016197,
+      "loss": 0.5535,
+      "step": 530
+    },
+    {
+      "epoch": 0.2675,
+      "grad_norm": 0.10021405667066574,
+      "learning_rate": 0.00027522717948951094,
+      "loss": 0.4717,
+      "step": 535
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.10104178637266159,
+      "learning_rate": 0.0002745018427643051,
+      "loss": 0.4906,
+      "step": 540
+    },
+    {
+      "epoch": 0.2725,
+      "grad_norm": 0.12113891541957855,
+      "learning_rate": 0.00027376702474174425,
+      "loss": 0.5674,
+      "step": 545
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.11330476403236389,
+      "learning_rate": 0.0002730227813810658,
+      "loss": 0.5184,
+      "step": 550
+    },
+    {
+      "epoch": 0.2775,
+      "grad_norm": 0.1025850847363472,
+      "learning_rate": 0.0002722691693592831,
+      "loss": 0.4395,
+      "step": 555
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.11591499298810959,
+      "learning_rate": 0.0002715062460668694,
+      "loss": 0.5003,
+      "step": 560
+    },
+    {
+      "epoch": 0.2825,
+      "grad_norm": 0.11281153559684753,
+      "learning_rate": 0.0002707340696033871,
+      "loss": 0.4672,
+      "step": 565
+    },
+    {
+      "epoch": 0.285,
+      "grad_norm": 0.1123538464307785,
+      "learning_rate": 0.00026995269877306356,
+      "loss": 0.513,
+      "step": 570
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.10776390135288239,
+      "learning_rate": 0.0002691621930803127,
+      "loss": 0.4572,
+      "step": 575
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.10008667409420013,
+      "learning_rate": 0.0002683626127252036,
+      "loss": 0.4618,
+      "step": 580
+    },
+    {
+      "epoch": 0.2925,
+      "grad_norm": 0.13961340487003326,
+      "learning_rate": 0.00026755401859887595,
+      "loss": 0.4819,
+      "step": 585
+    },
+    {
+      "epoch": 0.295,
+      "grad_norm": 0.1476685106754303,
+      "learning_rate": 0.00026673647227890316,
+      "loss": 0.4964,
+      "step": 590
+    },
+    {
+      "epoch": 0.2975,
+      "grad_norm": 0.09795507788658142,
+      "learning_rate": 0.00026591003602460263,
+      "loss": 0.4796,
+      "step": 595
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.10903532058000565,
+      "learning_rate": 0.00026507477277229496,
+      "loss": 0.4775,
+      "step": 600
+    },
+    {
+      "epoch": 0.3025,
+      "grad_norm": 0.10258448123931885,
+      "learning_rate": 0.0002642307461305105,
+      "loss": 0.4519,
+      "step": 605
+    },
+    {
+      "epoch": 0.305,
+      "grad_norm": 0.11204435676336288,
+      "learning_rate": 0.0002633780203751459,
+      "loss": 0.4451,
+      "step": 610
+    },
+    {
+      "epoch": 0.3075,
+      "grad_norm": 0.10147629678249359,
+      "learning_rate": 0.0002625166604445689,
+      "loss": 0.4256,
+      "step": 615
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.10481107234954834,
+      "learning_rate": 0.00026164673193467306,
+      "loss": 0.4381,
+      "step": 620
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.10856641829013824,
+      "learning_rate": 0.00026076830109388255,
+      "loss": 0.4958,
+      "step": 625
+    },
+    {
+      "epoch": 0.315,
+      "grad_norm": 0.09918677806854248,
+      "learning_rate": 0.0002598814348181068,
+      "loss": 0.4335,
+      "step": 630
+    },
+    {
+      "epoch": 0.3175,
+      "grad_norm": 0.10417389869689941,
+      "learning_rate": 0.00025898620064564637,
+      "loss": 0.4603,
+      "step": 635
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0903329998254776,
+      "learning_rate": 0.00025808266675204954,
+      "loss": 0.3932,
+      "step": 640
+    },
+    {
+      "epoch": 0.3225,
+      "grad_norm": 0.11511855572462082,
+      "learning_rate": 0.0002571709019449205,
+      "loss": 0.4169,
+      "step": 645
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.11355557292699814,
+      "learning_rate": 0.0002562509756586793,
+      "loss": 0.4455,
+      "step": 650
+    },
+    {
+      "epoch": 0.3275,
+      "grad_norm": 0.1271187961101532,
+      "learning_rate": 0.00025532295794927437,
+      "loss": 0.4902,
+      "step": 655
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.11936645954847336,
+      "learning_rate": 0.0002543869194888471,
+      "loss": 0.4843,
+      "step": 660
+    },
+    {
+      "epoch": 0.3325,
+      "grad_norm": 0.11935465037822723,
+      "learning_rate": 0.00025344293156035044,
+      "loss": 0.4402,
+      "step": 665
+    },
+    {
+      "epoch": 0.335,
+      "grad_norm": 0.13073407113552094,
+      "learning_rate": 0.00025249106605211986,
+      "loss": 0.467,
+      "step": 670
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 0.10340435802936554,
+      "learning_rate": 0.0002515313954523991,
+      "loss": 0.4827,
+      "step": 675
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.11634550243616104,
+      "learning_rate": 0.00025056399284381983,
+      "loss": 0.466,
+      "step": 680
+    },
+    {
+      "epoch": 0.3425,
+      "grad_norm": 0.10582319647073746,
+      "learning_rate": 0.0002495889318978362,
+      "loss": 0.4751,
+      "step": 685
+    },
+    {
+      "epoch": 0.345,
+      "grad_norm": 0.16781780123710632,
+      "learning_rate": 0.00024860628686911436,
+      "loss": 0.4717,
+      "step": 690
+    },
+    {
+      "epoch": 0.3475,
+      "grad_norm": 0.11522196233272552,
+      "learning_rate": 0.0002476161325898776,
+      "loss": 0.4687,
+      "step": 695
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.11830449104309082,
+      "learning_rate": 0.000246618544464208,
+      "loss": 0.436,
+      "step": 700
+    },
+    {
+      "epoch": 0.3525,
+      "grad_norm": 0.17485427856445312,
+      "learning_rate": 0.0002456135984623034,
+      "loss": 0.4284,
+      "step": 705
+    },
+    {
+      "epoch": 0.355,
+      "grad_norm": 0.12288108468055725,
+      "learning_rate": 0.00024460137111469296,
+      "loss": 0.4261,
+      "step": 710
+    },
+    {
+      "epoch": 0.3575,
+      "grad_norm": 0.11587081104516983,
+      "learning_rate": 0.0002435819395064079,
+      "loss": 0.4493,
+      "step": 715
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.10690271109342575,
+      "learning_rate": 0.0002425553812711123,
+      "loss": 0.4648,
+      "step": 720
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 0.10404397547245026,
+      "learning_rate": 0.00024152177458519014,
+      "loss": 0.4634,
+      "step": 725
+    },
+    {
+      "epoch": 0.365,
+      "grad_norm": 0.11986954510211945,
+      "learning_rate": 0.00024048119816179236,
+      "loss": 0.4525,
+      "step": 730
+    },
+    {
+      "epoch": 0.3675,
+      "grad_norm": 0.10243026167154312,
+      "learning_rate": 0.00023943373124484234,
+      "loss": 0.4572,
+      "step": 735
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.10386748611927032,
+      "learning_rate": 0.00023837945360300129,
+      "loss": 0.3884,
+      "step": 740
+    },
+    {
+      "epoch": 0.3725,
+      "grad_norm": 0.11165735125541687,
+      "learning_rate": 0.0002373184455235934,
+      "loss": 0.4902,
+      "step": 745
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.09951601922512054,
+      "learning_rate": 0.00023625078780649178,
+      "loss": 0.4541,
+      "step": 750
+    },
+    {
+      "epoch": 0.3775,
+      "grad_norm": 0.10347504913806915,
+      "learning_rate": 0.00023517656175796518,
+      "loss": 0.3871,
+      "step": 755
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.10478132963180542,
+      "learning_rate": 0.00023409584918448627,
+      "loss": 0.4329,
+      "step": 760
+    },
+    {
+      "epoch": 0.3825,
+      "grad_norm": 0.1198212131857872,
+      "learning_rate": 0.00023300873238650159,
+      "loss": 0.425,
+      "step": 765
+    },
+    {
+      "epoch": 0.385,
+      "grad_norm": 0.1103711724281311,
+      "learning_rate": 0.00023191529415216434,
+      "loss": 0.4274,
+      "step": 770
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.09940385073423386,
+      "learning_rate": 0.00023081561775102944,
+      "loss": 0.4368,
+      "step": 775
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.11599268019199371,
+      "learning_rate": 0.00022970978692771242,
+      "loss": 0.4386,
+      "step": 780
+    },
+    {
+      "epoch": 0.3925,
+      "grad_norm": 0.10101296752691269,
+      "learning_rate": 0.00022859788589551188,
+      "loss": 0.4696,
+      "step": 785
+    },
+    {
+      "epoch": 0.395,
+      "grad_norm": 0.10112808644771576,
+      "learning_rate": 0.00022747999932999624,
+      "loss": 0.4066,
+      "step": 790
+    },
+    {
+      "epoch": 0.3975,
+      "grad_norm": 0.09595459699630737,
+      "learning_rate": 0.00022635621236255567,
+      "loss": 0.4837,
+      "step": 795
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.10761380940675735,
+      "learning_rate": 0.00022522661057391857,
+      "loss": 0.5446,
+      "step": 800
+    },
+    {
+      "epoch": 0.4025,
+      "grad_norm": 0.11919954419136047,
+      "learning_rate": 0.00022409127998763463,
+      "loss": 0.5027,
+      "step": 805
+    },
+    {
+      "epoch": 0.405,
+      "grad_norm": 0.10851597785949707,
+      "learning_rate": 0.00022295030706352356,
+      "loss": 0.4481,
+      "step": 810
+    },
+    {
+      "epoch": 0.4075,
+      "grad_norm": 0.10030311346054077,
+      "learning_rate": 0.00022180377869109104,
+      "loss": 0.4709,
+      "step": 815
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.111280657351017,
+      "learning_rate": 0.00022065178218291147,
+      "loss": 0.4423,
+      "step": 820
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 0.11253602802753448,
+      "learning_rate": 0.00021949440526797926,
+      "loss": 0.4136,
+      "step": 825
+    },
+    {
+      "epoch": 0.415,
+      "grad_norm": 0.10805424302816391,
+      "learning_rate": 0.00021833173608502732,
+      "loss": 0.4656,
+      "step": 830
+    },
+    {
+      "epoch": 0.4175,
+      "grad_norm": 0.10983198881149292,
+      "learning_rate": 0.00021716386317581542,
+      "loss": 0.3687,
+      "step": 835
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.10653118044137955,
+      "learning_rate": 0.00021599087547838727,
+      "loss": 0.4654,
+      "step": 840
+    },
+    {
+      "epoch": 0.4225,
+      "grad_norm": 0.10856354981660843,
+      "learning_rate": 0.00021481286232029735,
+      "loss": 0.4298,
+      "step": 845
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.11233706772327423,
+      "learning_rate": 0.0002136299134118085,
+      "loss": 0.4484,
+      "step": 850
+    },
+    {
+      "epoch": 0.4275,
+      "grad_norm": 0.1085442528128624,
+      "learning_rate": 0.00021244211883906017,
+      "loss": 0.4776,
+      "step": 855
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.12297824025154114,
+      "learning_rate": 0.0002112495690572077,
+      "loss": 0.4029,
+      "step": 860
+    },
+    {
+      "epoch": 0.4325,
+      "grad_norm": 0.10838114470243454,
+      "learning_rate": 0.00021005235488353428,
+      "loss": 0.4848,
+      "step": 865
+    },
+    {
+      "epoch": 0.435,
+      "grad_norm": 0.10273341834545135,
+      "learning_rate": 0.0002088505674905342,
+      "loss": 0.3989,
+      "step": 870
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.11189126968383789,
+      "learning_rate": 0.0002076442983989705,
+      "loss": 0.438,
+      "step": 875
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.11592905968427658,
+      "learning_rate": 0.0002064336394709048,
+      "loss": 0.4786,
+      "step": 880
+    },
+    {
+      "epoch": 0.4425,
+      "grad_norm": 0.11230389773845673,
+      "learning_rate": 0.0002052186829027017,
+      "loss": 0.3999,
+      "step": 885
+    },
+    {
+      "epoch": 0.445,
+      "grad_norm": 0.12455113977193832,
+      "learning_rate": 0.00020399952121800767,
+      "loss": 0.4856,
+      "step": 890
+    },
+    {
+      "epoch": 0.4475,
+      "grad_norm": 0.1001812294125557,
+      "learning_rate": 0.00020277624726070526,
+      "loss": 0.4689,
+      "step": 895
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.11319112777709961,
+      "learning_rate": 0.00020154895418784242,
+      "loss": 0.3998,
+      "step": 900
+    },
+    {
+      "epoch": 0.4525,
+      "grad_norm": 0.11322236061096191,
+      "learning_rate": 0.00020031773546253824,
+      "loss": 0.4321,
+      "step": 905
+    },
+    {
+      "epoch": 0.455,
+      "grad_norm": 0.12924689054489136,
+      "learning_rate": 0.00019908268484686558,
+      "loss": 0.4208,
+      "step": 910
+    },
+    {
+      "epoch": 0.4575,
+      "grad_norm": 0.11435618251562119,
+      "learning_rate": 0.00019784389639471048,
+      "loss": 0.4682,
+      "step": 915
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.10801081359386444,
+      "learning_rate": 0.00019660146444460975,
+      "loss": 0.428,
+      "step": 920
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.10906939953565598,
+      "learning_rate": 0.0001953554836125667,
+      "loss": 0.4455,
+      "step": 925
+    },
+    {
+      "epoch": 0.465,
+      "grad_norm": 0.10790123790502548,
+      "learning_rate": 0.00019410604878484556,
+      "loss": 0.4544,
+      "step": 930
+    },
+    {
+      "epoch": 0.4675,
+      "grad_norm": 0.10536376386880875,
+      "learning_rate": 0.000192853255110746,
+      "loss": 0.376,
+      "step": 935
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.11744682490825653,
+      "learning_rate": 0.00019159719799535668,
+      "loss": 0.3887,
+      "step": 940
+    },
+    {
+      "epoch": 0.4725,
+      "grad_norm": 0.12954068183898926,
+      "learning_rate": 0.00019033797309228983,
+      "loss": 0.4075,
+      "step": 945
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.1401606798171997,
+      "learning_rate": 0.00018907567629639725,
+      "loss": 0.4454,
+      "step": 950
+    },
+    {
+      "epoch": 0.4775,
+      "grad_norm": 0.12059322744607925,
+      "learning_rate": 0.00018781040373646706,
+      "loss": 0.4339,
+      "step": 955
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.11798987537622452,
+      "learning_rate": 0.00018654225176790336,
+      "loss": 0.4405,
+      "step": 960
+    },
+    {
+      "epoch": 0.4825,
+      "grad_norm": 0.11344211548566818,
+      "learning_rate": 0.00018527131696538846,
+      "loss": 0.4124,
+      "step": 965
+    },
+    {
+      "epoch": 0.485,
+      "grad_norm": 0.10373330116271973,
+      "learning_rate": 0.00018399769611552824,
+      "loss": 0.4329,
+      "step": 970
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.12053704261779785,
+      "learning_rate": 0.0001827214862094814,
+      "loss": 0.4944,
+      "step": 975
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.141033336520195,
+      "learning_rate": 0.00018144278443557328,
+      "loss": 0.4569,
+      "step": 980
+    },
+    {
+      "epoch": 0.4925,
+      "grad_norm": 0.10922867804765701,
+      "learning_rate": 0.0001801616881718947,
+      "loss": 0.3879,
+      "step": 985
+    },
+    {
+      "epoch": 0.495,
+      "grad_norm": 0.09843657910823822,
+      "learning_rate": 0.00017887829497888612,
+      "loss": 0.4106,
+      "step": 990
+    },
+    {
+      "epoch": 0.4975,
+      "grad_norm": 0.12131062150001526,
+      "learning_rate": 0.000177592702591908,
+      "loss": 0.4023,
+      "step": 995
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.11343283206224442,
+      "learning_rate": 0.00017630500891379806,
+      "loss": 0.4824,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.314789078859776e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama-hugcoder/checkpoint-1000/training_args.bin b/codellama-hugcoder/checkpoint-1000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304
diff --git a/codellama-hugcoder/checkpoint-1500/README.md b/codellama-hugcoder/checkpoint-1500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8daf76b93ee8508456247003fa155d657d2e354
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1500/README.md
@@ -0,0 +1,202 @@
+---
+base_model: codellama/CodeLlama-7b-Instruct-hf
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.2.dev0
\ No newline at end of file
diff --git a/codellama-hugcoder/checkpoint-1500/adapter_config.json b/codellama-hugcoder/checkpoint-1500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1500/adapter_config.json
@@ -0,0 +1,39 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama-hugcoder/checkpoint-1500/adapter_model.safetensors b/codellama-hugcoder/checkpoint-1500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..b4a6a146f60aa497ffcbc2b5247f443943944031
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:954883169196fec3dbbf2581acd2ff6690fa789729045bb04113f1bb36637c46
+size 319876032
diff --git a/codellama-hugcoder/checkpoint-1500/optimizer.pt b/codellama-hugcoder/checkpoint-1500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..b62651c4a46378842c8470f5e4e0164bd1e32669
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:135e0fda5af04719269dc4cca8199c95f610932728fc80b6e63f3d656098bd57
+size 640009682
diff --git a/codellama-hugcoder/checkpoint-1500/rng_state.pth b/codellama-hugcoder/checkpoint-1500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..f06fbb84d8d04c17347fd349f63eefcd95addf6d
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fda3c0b12e2631264746b16f7dd8a85fd763004a3c1d20e136ad6fae01987d26
+size 14244
diff --git a/codellama-hugcoder/checkpoint-1500/scheduler.pt b/codellama-hugcoder/checkpoint-1500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a8076c3f23ed44081a0388e235c4ca5039d23d13
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:046c4144f3d3e450ad1c1129a3ed6e680f6f65f10c488eeb2fd00b8cd376efa0
+size 1064
diff --git a/codellama-hugcoder/checkpoint-1500/trainer_state.json b/codellama-hugcoder/checkpoint-1500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea00c923cf539fa4f4d768dc48177acff1149bf3
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1500/trainer_state.json
@@ -0,0 +1,2134 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.75,
+  "eval_steps": 100.0,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.09379793703556061,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.1399833709001541,
+      "learning_rate": 1.3499999999999998e-05,
+      "loss": 0.6954,
+      "step": 10
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.08632303029298782,
+      "learning_rate": 2.1e-05,
+      "loss": 0.6921,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.10006701201200485,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 0.69,
+      "step": 20
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.07633858919143677,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.6722,
+      "step": 25
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.09399061650037766,
+      "learning_rate": 4.3499999999999993e-05,
+      "loss": 0.6453,
+      "step": 30
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.0843738541007042,
+      "learning_rate": 5.1e-05,
+      "loss": 0.6276,
+      "step": 35
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.08583351224660873,
+      "learning_rate": 5.85e-05,
+      "loss": 0.58,
+      "step": 40
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.09571370482444763,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.6355,
+      "step": 45
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.1083935871720314,
+      "learning_rate": 7.35e-05,
+      "loss": 0.589,
+      "step": 50
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.10387319326400757,
+      "learning_rate": 8.1e-05,
+      "loss": 0.6061,
+      "step": 55
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.11083361506462097,
+      "learning_rate": 8.849999999999998e-05,
+      "loss": 0.572,
+      "step": 60
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.12665686011314392,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.5442,
+      "step": 65
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.1308053582906723,
+      "learning_rate": 0.00010349999999999998,
+      "loss": 0.6524,
+      "step": 70
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.13535510003566742,
+      "learning_rate": 0.00011099999999999999,
+      "loss": 0.6404,
+      "step": 75
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12833671271800995,
+      "learning_rate": 0.0001185,
+      "loss": 0.5717,
+      "step": 80
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.11962099373340607,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.6098,
+      "step": 85
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.13898271322250366,
+      "learning_rate": 0.0001335,
+      "loss": 0.6099,
+      "step": 90
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.14486610889434814,
+      "learning_rate": 0.00014099999999999998,
+      "loss": 0.5744,
+      "step": 95
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1432138830423355,
+      "learning_rate": 0.00014849999999999998,
+      "loss": 0.5659,
+      "step": 100
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 0.13487878441810608,
+      "learning_rate": 0.000156,
+      "loss": 0.5622,
+      "step": 105
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.12495309859514236,
+      "learning_rate": 0.0001635,
+      "loss": 0.5951,
+      "step": 110
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.13011734187602997,
+      "learning_rate": 0.00017099999999999998,
+      "loss": 0.6249,
+      "step": 115
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.13987745344638824,
+      "learning_rate": 0.00017849999999999997,
+      "loss": 0.559,
+      "step": 120
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.13373605906963348,
+      "learning_rate": 0.000186,
+      "loss": 0.5475,
+      "step": 125
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.12433867901563644,
+      "learning_rate": 0.0001935,
+      "loss": 0.5274,
+      "step": 130
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.11097615957260132,
+      "learning_rate": 0.000201,
+      "loss": 0.678,
+      "step": 135
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1155027225613594,
+      "learning_rate": 0.00020849999999999997,
+      "loss": 0.5611,
+      "step": 140
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.11431068181991577,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.6054,
+      "step": 145
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.09796140342950821,
+      "learning_rate": 0.00022349999999999998,
+      "loss": 0.5472,
+      "step": 150
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.09489257633686066,
+      "learning_rate": 0.00023099999999999998,
+      "loss": 0.4636,
+      "step": 155
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.10787788033485413,
+      "learning_rate": 0.0002385,
+      "loss": 0.6164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.10261733084917068,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.5408,
+      "step": 165
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.11870352178812027,
+      "learning_rate": 0.0002535,
+      "loss": 0.5268,
+      "step": 170
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.11910569667816162,
+      "learning_rate": 0.000261,
+      "loss": 0.5461,
+      "step": 175
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.10083702206611633,
+      "learning_rate": 0.00026849999999999997,
+      "loss": 0.4794,
+      "step": 180
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.10453511029481888,
+      "learning_rate": 0.000276,
+      "loss": 0.5539,
+      "step": 185
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.101403146982193,
+      "learning_rate": 0.00028349999999999995,
+      "loss": 0.5346,
+      "step": 190
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.10724789649248123,
+      "learning_rate": 0.00029099999999999997,
+      "loss": 0.6026,
+      "step": 195
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1140277311205864,
+      "learning_rate": 0.0002985,
+      "loss": 0.5193,
+      "step": 200
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.09706108272075653,
+      "learning_rate": 0.0002999963446058092,
+      "loss": 0.54,
+      "step": 205
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.10003062337636948,
+      "learning_rate": 0.0002999814948722491,
+      "loss": 0.5365,
+      "step": 210
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.1078687533736229,
+      "learning_rate": 0.00029995522346717746,
+      "loss": 0.5889,
+      "step": 215
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.10538115352392197,
+      "learning_rate": 0.0002999175323912636,
+      "loss": 0.5611,
+      "step": 220
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.1020808294415474,
+      "learning_rate": 0.00029986842451482874,
+      "loss": 0.6103,
+      "step": 225
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.09635835886001587,
+      "learning_rate": 0.0002998079035776279,
+      "loss": 0.5229,
+      "step": 230
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.10287190228700638,
+      "learning_rate": 0.0002997359741885648,
+      "loss": 0.5312,
+      "step": 235
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.09160075336694717,
+      "learning_rate": 0.0002996526418253408,
+      "loss": 0.5673,
+      "step": 240
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.08691006153821945,
+      "learning_rate": 0.000299557912834038,
+      "loss": 0.5326,
+      "step": 245
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.10096988826990128,
+      "learning_rate": 0.00029945179442863594,
+      "loss": 0.6004,
+      "step": 250
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.09594204276800156,
+      "learning_rate": 0.000299334294690462,
+      "loss": 0.5516,
+      "step": 255
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.10281919687986374,
+      "learning_rate": 0.00029920542256757607,
+      "loss": 0.5515,
+      "step": 260
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.08547840267419815,
+      "learning_rate": 0.00029906518787408944,
+      "loss": 0.5243,
+      "step": 265
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.10161560773849487,
+      "learning_rate": 0.0002989136012894168,
+      "loss": 0.5096,
+      "step": 270
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.09101904183626175,
+      "learning_rate": 0.0002987506743574635,
+      "loss": 0.553,
+      "step": 275
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.09769442677497864,
+      "learning_rate": 0.0002985764194857463,
+      "loss": 0.4953,
+      "step": 280
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.10991579294204712,
+      "learning_rate": 0.00029839084994444826,
+      "loss": 0.5152,
+      "step": 285
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.09450916200876236,
+      "learning_rate": 0.00029819397986540836,
+      "loss": 0.5397,
+      "step": 290
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.10876069217920303,
+      "learning_rate": 0.0002979858242410454,
+      "loss": 0.4858,
+      "step": 295
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.097995825111866,
+      "learning_rate": 0.00029776639892321606,
+      "loss": 0.5566,
+      "step": 300
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.1145048514008522,
+      "learning_rate": 0.0002975357206220079,
+      "loss": 0.4531,
+      "step": 305
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.10271880775690079,
+      "learning_rate": 0.00029729380690446654,
+      "loss": 0.5199,
+      "step": 310
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 0.11095371842384338,
+      "learning_rate": 0.0002970406761932583,
+      "loss": 0.5416,
+      "step": 315
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09949438273906708,
+      "learning_rate": 0.00029677634776526673,
+      "loss": 0.4841,
+      "step": 320
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.1163724958896637,
+      "learning_rate": 0.00029650084175012517,
+      "loss": 0.4913,
+      "step": 325
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.10726840049028397,
+      "learning_rate": 0.00029621417912868323,
+      "loss": 0.5203,
+      "step": 330
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.09609931707382202,
+      "learning_rate": 0.00029591638173140947,
+      "loss": 0.5607,
+      "step": 335
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.10824442654848099,
+      "learning_rate": 0.0002956074722367286,
+      "loss": 0.6004,
+      "step": 340
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.10465679317712784,
+      "learning_rate": 0.00029528747416929463,
+      "loss": 0.5216,
+      "step": 345
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.10518354922533035,
+      "learning_rate": 0.0002949564118981994,
+      "loss": 0.499,
+      "step": 350
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 0.0955279991030693,
+      "learning_rate": 0.0002946143106351165,
+      "loss": 0.5607,
+      "step": 355
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.11159654706716537,
+      "learning_rate": 0.0002942611964323817,
+      "loss": 0.5204,
+      "step": 360
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 0.09571187198162079,
+      "learning_rate": 0.0002938970961810086,
+      "loss": 0.6113,
+      "step": 365
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.11854679882526398,
+      "learning_rate": 0.0002935220376086411,
+      "loss": 0.5639,
+      "step": 370
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.1050512045621872,
+      "learning_rate": 0.0002931360492774415,
+      "loss": 0.548,
+      "step": 375
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1053968220949173,
+      "learning_rate": 0.0002927391605819157,
+      "loss": 0.5507,
+      "step": 380
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.10567320138216019,
+      "learning_rate": 0.00029233140174667445,
+      "loss": 0.5312,
+      "step": 385
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.11914283782243729,
+      "learning_rate": 0.0002919128038241318,
+      "loss": 0.5961,
+      "step": 390
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.09915795922279358,
+      "learning_rate": 0.0002914833986921401,
+      "loss": 0.5086,
+      "step": 395
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10796502232551575,
+      "learning_rate": 0.0002910432190515628,
+      "loss": 0.5585,
+      "step": 400
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.10748997330665588,
+      "learning_rate": 0.00029059229842378373,
+      "loss": 0.5466,
+      "step": 405
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.10696308314800262,
+      "learning_rate": 0.0002901306711481544,
+      "loss": 0.5513,
+      "step": 410
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.10418657958507538,
+      "learning_rate": 0.0002896583723793792,
+      "loss": 0.5391,
+      "step": 415
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16421550512313843,
+      "learning_rate": 0.00028917543808483796,
+      "loss": 0.4699,
+      "step": 420
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.12929962575435638,
+      "learning_rate": 0.00028868190504184696,
+      "loss": 0.4984,
+      "step": 425
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 0.10469454526901245,
+      "learning_rate": 0.00028817781083485816,
+      "loss": 0.5119,
+      "step": 430
+    },
+    {
+      "epoch": 0.2175,
+      "grad_norm": 0.0964970663189888,
+      "learning_rate": 0.00028766319385259713,
+      "loss": 0.5167,
+      "step": 435
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12395574152469635,
+      "learning_rate": 0.00028713809328513953,
+      "loss": 0.5692,
+      "step": 440
+    },
+    {
+      "epoch": 0.2225,
+      "grad_norm": 0.10189738124608994,
+      "learning_rate": 0.0002866025491209265,
+      "loss": 0.4628,
+      "step": 445
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.10433454066514969,
+      "learning_rate": 0.0002860566021437197,
+      "loss": 0.4869,
+      "step": 450
+    },
+    {
+      "epoch": 0.2275,
+      "grad_norm": 0.13003456592559814,
+      "learning_rate": 0.0002855002939294951,
+      "loss": 0.5291,
+      "step": 455
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.11692202836275101,
+      "learning_rate": 0.000284933666843277,
+      "loss": 0.5229,
+      "step": 460
+    },
+    {
+      "epoch": 0.2325,
+      "grad_norm": 0.10757846385240555,
+      "learning_rate": 0.0002843567640359119,
+      "loss": 0.435,
+      "step": 465
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 0.10775501281023026,
+      "learning_rate": 0.00028376962944078206,
+      "loss": 0.4418,
+      "step": 470
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.11543692648410797,
+      "learning_rate": 0.00028317230777046015,
+      "loss": 0.4204,
+      "step": 475
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10946698486804962,
+      "learning_rate": 0.00028256484451330403,
+      "loss": 0.49,
+      "step": 480
+    },
+    {
+      "epoch": 0.2425,
+      "grad_norm": 0.11528221517801285,
+      "learning_rate": 0.00028194728592999247,
+      "loss": 0.4752,
+      "step": 485
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 0.10474205762147903,
+      "learning_rate": 0.0002813196790500027,
+      "loss": 0.4847,
+      "step": 490
+    },
+    {
+      "epoch": 0.2475,
+      "grad_norm": 0.10768820345401764,
+      "learning_rate": 0.00028068207166802837,
+      "loss": 0.4664,
+      "step": 495
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.12158560007810593,
+      "learning_rate": 0.00028003451234034037,
+      "loss": 0.4741,
+      "step": 500
+    },
+    {
+      "epoch": 0.2525,
+      "grad_norm": 0.11635497957468033,
+      "learning_rate": 0.0002793770503810886,
+      "loss": 0.4969,
+      "step": 505
+    },
+    {
+      "epoch": 0.255,
+      "grad_norm": 0.12205849587917328,
+      "learning_rate": 0.00027870973585854665,
+      "loss": 0.4798,
+      "step": 510
+    },
+    {
+      "epoch": 0.2575,
+      "grad_norm": 0.10270871222019196,
+      "learning_rate": 0.00027803261959129905,
+      "loss": 0.3888,
+      "step": 515
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.11313367635011673,
+      "learning_rate": 0.0002773457531443712,
+      "loss": 0.4759,
+      "step": 520
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 0.12905193865299225,
+      "learning_rate": 0.00027664918882530225,
+      "loss": 0.4442,
+      "step": 525
+    },
+    {
+      "epoch": 0.265,
+      "grad_norm": 0.11690939962863922,
+      "learning_rate": 0.00027594297968016197,
+      "loss": 0.5535,
+      "step": 530
+    },
+    {
+      "epoch": 0.2675,
+      "grad_norm": 0.10021405667066574,
+      "learning_rate": 0.00027522717948951094,
+      "loss": 0.4717,
+      "step": 535
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.10104178637266159,
+      "learning_rate": 0.0002745018427643051,
+      "loss": 0.4906,
+      "step": 540
+    },
+    {
+      "epoch": 0.2725,
+      "grad_norm": 0.12113891541957855,
+      "learning_rate": 0.00027376702474174425,
+      "loss": 0.5674,
+      "step": 545
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.11330476403236389,
+      "learning_rate": 0.0002730227813810658,
+      "loss": 0.5184,
+      "step": 550
+    },
+    {
+      "epoch": 0.2775,
+      "grad_norm": 0.1025850847363472,
+      "learning_rate": 0.0002722691693592831,
+      "loss": 0.4395,
+      "step": 555
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.11591499298810959,
+      "learning_rate": 0.0002715062460668694,
+      "loss": 0.5003,
+      "step": 560
+    },
+    {
+      "epoch": 0.2825,
+      "grad_norm": 0.11281153559684753,
+      "learning_rate": 0.0002707340696033871,
+      "loss": 0.4672,
+      "step": 565
+    },
+    {
+      "epoch": 0.285,
+      "grad_norm": 0.1123538464307785,
+      "learning_rate": 0.00026995269877306356,
+      "loss": 0.513,
+      "step": 570
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.10776390135288239,
+      "learning_rate": 0.0002691621930803127,
+      "loss": 0.4572,
+      "step": 575
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.10008667409420013,
+      "learning_rate": 0.0002683626127252036,
+      "loss": 0.4618,
+      "step": 580
+    },
+    {
+      "epoch": 0.2925,
+      "grad_norm": 0.13961340487003326,
+      "learning_rate": 0.00026755401859887595,
+      "loss": 0.4819,
+      "step": 585
+    },
+    {
+      "epoch": 0.295,
+      "grad_norm": 0.1476685106754303,
+      "learning_rate": 0.00026673647227890316,
+      "loss": 0.4964,
+      "step": 590
+    },
+    {
+      "epoch": 0.2975,
+      "grad_norm": 0.09795507788658142,
+      "learning_rate": 0.00026591003602460263,
+      "loss": 0.4796,
+      "step": 595
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.10903532058000565,
+      "learning_rate": 0.00026507477277229496,
+      "loss": 0.4775,
+      "step": 600
+    },
+    {
+      "epoch": 0.3025,
+      "grad_norm": 0.10258448123931885,
+      "learning_rate": 0.0002642307461305105,
+      "loss": 0.4519,
+      "step": 605
+    },
+    {
+      "epoch": 0.305,
+      "grad_norm": 0.11204435676336288,
+      "learning_rate": 0.0002633780203751459,
+      "loss": 0.4451,
+      "step": 610
+    },
+    {
+      "epoch": 0.3075,
+      "grad_norm": 0.10147629678249359,
+      "learning_rate": 0.0002625166604445689,
+      "loss": 0.4256,
+      "step": 615
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.10481107234954834,
+      "learning_rate": 0.00026164673193467306,
+      "loss": 0.4381,
+      "step": 620
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.10856641829013824,
+      "learning_rate": 0.00026076830109388255,
+      "loss": 0.4958,
+      "step": 625
+    },
+    {
+      "epoch": 0.315,
+      "grad_norm": 0.09918677806854248,
+      "learning_rate": 0.0002598814348181068,
+      "loss": 0.4335,
+      "step": 630
+    },
+    {
+      "epoch": 0.3175,
+      "grad_norm": 0.10417389869689941,
+      "learning_rate": 0.00025898620064564637,
+      "loss": 0.4603,
+      "step": 635
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0903329998254776,
+      "learning_rate": 0.00025808266675204954,
+      "loss": 0.3932,
+      "step": 640
+    },
+    {
+      "epoch": 0.3225,
+      "grad_norm": 0.11511855572462082,
+      "learning_rate": 0.0002571709019449205,
+      "loss": 0.4169,
+      "step": 645
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.11355557292699814,
+      "learning_rate": 0.0002562509756586793,
+      "loss": 0.4455,
+      "step": 650
+    },
+    {
+      "epoch": 0.3275,
+      "grad_norm": 0.1271187961101532,
+      "learning_rate": 0.00025532295794927437,
+      "loss": 0.4902,
+      "step": 655
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.11936645954847336,
+      "learning_rate": 0.0002543869194888471,
+      "loss": 0.4843,
+      "step": 660
+    },
+    {
+      "epoch": 0.3325,
+      "grad_norm": 0.11935465037822723,
+      "learning_rate": 0.00025344293156035044,
+      "loss": 0.4402,
+      "step": 665
+    },
+    {
+      "epoch": 0.335,
+      "grad_norm": 0.13073407113552094,
+      "learning_rate": 0.00025249106605211986,
+      "loss": 0.467,
+      "step": 670
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 0.10340435802936554,
+      "learning_rate": 0.0002515313954523991,
+      "loss": 0.4827,
+      "step": 675
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.11634550243616104,
+      "learning_rate": 0.00025056399284381983,
+      "loss": 0.466,
+      "step": 680
+    },
+    {
+      "epoch": 0.3425,
+      "grad_norm": 0.10582319647073746,
+      "learning_rate": 0.0002495889318978362,
+      "loss": 0.4751,
+      "step": 685
+    },
+    {
+      "epoch": 0.345,
+      "grad_norm": 0.16781780123710632,
+      "learning_rate": 0.00024860628686911436,
+      "loss": 0.4717,
+      "step": 690
+    },
+    {
+      "epoch": 0.3475,
+      "grad_norm": 0.11522196233272552,
+      "learning_rate": 0.0002476161325898776,
+      "loss": 0.4687,
+      "step": 695
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.11830449104309082,
+      "learning_rate": 0.000246618544464208,
+      "loss": 0.436,
+      "step": 700
+    },
+    {
+      "epoch": 0.3525,
+      "grad_norm": 0.17485427856445312,
+      "learning_rate": 0.0002456135984623034,
+      "loss": 0.4284,
+      "step": 705
+    },
+    {
+      "epoch": 0.355,
+      "grad_norm": 0.12288108468055725,
+      "learning_rate": 0.00024460137111469296,
+      "loss": 0.4261,
+      "step": 710
+    },
+    {
+      "epoch": 0.3575,
+      "grad_norm": 0.11587081104516983,
+      "learning_rate": 0.0002435819395064079,
+      "loss": 0.4493,
+      "step": 715
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.10690271109342575,
+      "learning_rate": 0.0002425553812711123,
+      "loss": 0.4648,
+      "step": 720
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 0.10404397547245026,
+      "learning_rate": 0.00024152177458519014,
+      "loss": 0.4634,
+      "step": 725
+    },
+    {
+      "epoch": 0.365,
+      "grad_norm": 0.11986954510211945,
+      "learning_rate": 0.00024048119816179236,
+      "loss": 0.4525,
+      "step": 730
+    },
+    {
+      "epoch": 0.3675,
+      "grad_norm": 0.10243026167154312,
+      "learning_rate": 0.00023943373124484234,
+      "loss": 0.4572,
+      "step": 735
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.10386748611927032,
+      "learning_rate": 0.00023837945360300129,
+      "loss": 0.3884,
+      "step": 740
+    },
+    {
+      "epoch": 0.3725,
+      "grad_norm": 0.11165735125541687,
+      "learning_rate": 0.0002373184455235934,
+      "loss": 0.4902,
+      "step": 745
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.09951601922512054,
+      "learning_rate": 0.00023625078780649178,
+      "loss": 0.4541,
+      "step": 750
+    },
+    {
+      "epoch": 0.3775,
+      "grad_norm": 0.10347504913806915,
+      "learning_rate": 0.00023517656175796518,
+      "loss": 0.3871,
+      "step": 755
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.10478132963180542,
+      "learning_rate": 0.00023409584918448627,
+      "loss": 0.4329,
+      "step": 760
+    },
+    {
+      "epoch": 0.3825,
+      "grad_norm": 0.1198212131857872,
+      "learning_rate": 0.00023300873238650159,
+      "loss": 0.425,
+      "step": 765
+    },
+    {
+      "epoch": 0.385,
+      "grad_norm": 0.1103711724281311,
+      "learning_rate": 0.00023191529415216434,
+      "loss": 0.4274,
+      "step": 770
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.09940385073423386,
+      "learning_rate": 0.00023081561775102944,
+      "loss": 0.4368,
+      "step": 775
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.11599268019199371,
+      "learning_rate": 0.00022970978692771242,
+      "loss": 0.4386,
+      "step": 780
+    },
+    {
+      "epoch": 0.3925,
+      "grad_norm": 0.10101296752691269,
+      "learning_rate": 0.00022859788589551188,
+      "loss": 0.4696,
+      "step": 785
+    },
+    {
+      "epoch": 0.395,
+      "grad_norm": 0.10112808644771576,
+      "learning_rate": 0.00022747999932999624,
+      "loss": 0.4066,
+      "step": 790
+    },
+    {
+      "epoch": 0.3975,
+      "grad_norm": 0.09595459699630737,
+      "learning_rate": 0.00022635621236255567,
+      "loss": 0.4837,
+      "step": 795
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.10761380940675735,
+      "learning_rate": 0.00022522661057391857,
+      "loss": 0.5446,
+      "step": 800
+    },
+    {
+      "epoch": 0.4025,
+      "grad_norm": 0.11919954419136047,
+      "learning_rate": 0.00022409127998763463,
+      "loss": 0.5027,
+      "step": 805
+    },
+    {
+      "epoch": 0.405,
+      "grad_norm": 0.10851597785949707,
+      "learning_rate": 0.00022295030706352356,
+      "loss": 0.4481,
+      "step": 810
+    },
+    {
+      "epoch": 0.4075,
+      "grad_norm": 0.10030311346054077,
+      "learning_rate": 0.00022180377869109104,
+      "loss": 0.4709,
+      "step": 815
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.111280657351017,
+      "learning_rate": 0.00022065178218291147,
+      "loss": 0.4423,
+      "step": 820
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 0.11253602802753448,
+      "learning_rate": 0.00021949440526797926,
+      "loss": 0.4136,
+      "step": 825
+    },
+    {
+      "epoch": 0.415,
+      "grad_norm": 0.10805424302816391,
+      "learning_rate": 0.00021833173608502732,
+      "loss": 0.4656,
+      "step": 830
+    },
+    {
+      "epoch": 0.4175,
+      "grad_norm": 0.10983198881149292,
+      "learning_rate": 0.00021716386317581542,
+      "loss": 0.3687,
+      "step": 835
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.10653118044137955,
+      "learning_rate": 0.00021599087547838727,
+      "loss": 0.4654,
+      "step": 840
+    },
+    {
+      "epoch": 0.4225,
+      "grad_norm": 0.10856354981660843,
+      "learning_rate": 0.00021481286232029735,
+      "loss": 0.4298,
+      "step": 845
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.11233706772327423,
+      "learning_rate": 0.0002136299134118085,
+      "loss": 0.4484,
+      "step": 850
+    },
+    {
+      "epoch": 0.4275,
+      "grad_norm": 0.1085442528128624,
+      "learning_rate": 0.00021244211883906017,
+      "loss": 0.4776,
+      "step": 855
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.12297824025154114,
+      "learning_rate": 0.0002112495690572077,
+      "loss": 0.4029,
+      "step": 860
+    },
+    {
+      "epoch": 0.4325,
+      "grad_norm": 0.10838114470243454,
+      "learning_rate": 0.00021005235488353428,
+      "loss": 0.4848,
+      "step": 865
+    },
+    {
+      "epoch": 0.435,
+      "grad_norm": 0.10273341834545135,
+      "learning_rate": 0.0002088505674905342,
+      "loss": 0.3989,
+      "step": 870
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.11189126968383789,
+      "learning_rate": 0.0002076442983989705,
+      "loss": 0.438,
+      "step": 875
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.11592905968427658,
+      "learning_rate": 0.0002064336394709048,
+      "loss": 0.4786,
+      "step": 880
+    },
+    {
+      "epoch": 0.4425,
+      "grad_norm": 0.11230389773845673,
+      "learning_rate": 0.0002052186829027017,
+      "loss": 0.3999,
+      "step": 885
+    },
+    {
+      "epoch": 0.445,
+      "grad_norm": 0.12455113977193832,
+      "learning_rate": 0.00020399952121800767,
+      "loss": 0.4856,
+      "step": 890
+    },
+    {
+      "epoch": 0.4475,
+      "grad_norm": 0.1001812294125557,
+      "learning_rate": 0.00020277624726070526,
+      "loss": 0.4689,
+      "step": 895
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.11319112777709961,
+      "learning_rate": 0.00020154895418784242,
+      "loss": 0.3998,
+      "step": 900
+    },
+    {
+      "epoch": 0.4525,
+      "grad_norm": 0.11322236061096191,
+      "learning_rate": 0.00020031773546253824,
+      "loss": 0.4321,
+      "step": 905
+    },
+    {
+      "epoch": 0.455,
+      "grad_norm": 0.12924689054489136,
+      "learning_rate": 0.00019908268484686558,
+      "loss": 0.4208,
+      "step": 910
+    },
+    {
+      "epoch": 0.4575,
+      "grad_norm": 0.11435618251562119,
+      "learning_rate": 0.00019784389639471048,
+      "loss": 0.4682,
+      "step": 915
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.10801081359386444,
+      "learning_rate": 0.00019660146444460975,
+      "loss": 0.428,
+      "step": 920
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.10906939953565598,
+      "learning_rate": 0.0001953554836125667,
+      "loss": 0.4455,
+      "step": 925
+    },
+    {
+      "epoch": 0.465,
+      "grad_norm": 0.10790123790502548,
+      "learning_rate": 0.00019410604878484556,
+      "loss": 0.4544,
+      "step": 930
+    },
+    {
+      "epoch": 0.4675,
+      "grad_norm": 0.10536376386880875,
+      "learning_rate": 0.000192853255110746,
+      "loss": 0.376,
+      "step": 935
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.11744682490825653,
+      "learning_rate": 0.00019159719799535668,
+      "loss": 0.3887,
+      "step": 940
+    },
+    {
+      "epoch": 0.4725,
+      "grad_norm": 0.12954068183898926,
+      "learning_rate": 0.00019033797309228983,
+      "loss": 0.4075,
+      "step": 945
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.1401606798171997,
+      "learning_rate": 0.00018907567629639725,
+      "loss": 0.4454,
+      "step": 950
+    },
+    {
+      "epoch": 0.4775,
+      "grad_norm": 0.12059322744607925,
+      "learning_rate": 0.00018781040373646706,
+      "loss": 0.4339,
+      "step": 955
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.11798987537622452,
+      "learning_rate": 0.00018654225176790336,
+      "loss": 0.4405,
+      "step": 960
+    },
+    {
+      "epoch": 0.4825,
+      "grad_norm": 0.11344211548566818,
+      "learning_rate": 0.00018527131696538846,
+      "loss": 0.4124,
+      "step": 965
+    },
+    {
+      "epoch": 0.485,
+      "grad_norm": 0.10373330116271973,
+      "learning_rate": 0.00018399769611552824,
+      "loss": 0.4329,
+      "step": 970
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.12053704261779785,
+      "learning_rate": 0.0001827214862094814,
+      "loss": 0.4944,
+      "step": 975
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.141033336520195,
+      "learning_rate": 0.00018144278443557328,
+      "loss": 0.4569,
+      "step": 980
+    },
+    {
+      "epoch": 0.4925,
+      "grad_norm": 0.10922867804765701,
+      "learning_rate": 0.0001801616881718947,
+      "loss": 0.3879,
+      "step": 985
+    },
+    {
+      "epoch": 0.495,
+      "grad_norm": 0.09843657910823822,
+      "learning_rate": 0.00017887829497888612,
+      "loss": 0.4106,
+      "step": 990
+    },
+    {
+      "epoch": 0.4975,
+      "grad_norm": 0.12131062150001526,
+      "learning_rate": 0.000177592702591908,
+      "loss": 0.4023,
+      "step": 995
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.11343283206224442,
+      "learning_rate": 0.00017630500891379806,
+      "loss": 0.4824,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5025,
+      "grad_norm": 0.11050508171319962,
+      "learning_rate": 0.00017501531200741534,
+      "loss": 0.4098,
+      "step": 1005
+    },
+    {
+      "epoch": 0.505,
+      "grad_norm": 0.11737144738435745,
+      "learning_rate": 0.00017372371008817256,
+      "loss": 0.3943,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5075,
+      "grad_norm": 0.11473528295755386,
+      "learning_rate": 0.00017243030151655643,
+      "loss": 0.3796,
+      "step": 1015
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.13086555898189545,
+      "learning_rate": 0.00017113518479063738,
+      "loss": 0.4367,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 0.11752833425998688,
+      "learning_rate": 0.00016983845853856837,
+      "loss": 0.4097,
+      "step": 1025
+    },
+    {
+      "epoch": 0.515,
+      "grad_norm": 0.11596900969743729,
+      "learning_rate": 0.0001685402215110739,
+      "loss": 0.3812,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5175,
+      "grad_norm": 0.11850260943174362,
+      "learning_rate": 0.00016724057257392998,
+      "loss": 0.4354,
+      "step": 1035
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.12466365844011307,
+      "learning_rate": 0.00016593961070043498,
+      "loss": 0.4317,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5225,
+      "grad_norm": 0.11178991943597794,
+      "learning_rate": 0.0001646374349638724,
+      "loss": 0.3936,
+      "step": 1045
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 0.11252165585756302,
+      "learning_rate": 0.00016333414452996623,
+      "loss": 0.386,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5275,
+      "grad_norm": 0.12886975705623627,
+      "learning_rate": 0.0001620298386493288,
+      "loss": 0.3965,
+      "step": 1055
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.11716549098491669,
+      "learning_rate": 0.00016072461664990288,
+      "loss": 0.3924,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5325,
+      "grad_norm": 0.11604485660791397,
+      "learning_rate": 0.000159418577929397,
+      "loss": 0.3624,
+      "step": 1065
+    },
+    {
+      "epoch": 0.535,
+      "grad_norm": 0.11538460850715637,
+      "learning_rate": 0.00015811182194771633,
+      "loss": 0.4338,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 0.11618762463331223,
+      "learning_rate": 0.00015680444821938804,
+      "loss": 0.4058,
+      "step": 1075
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.11750835925340652,
+      "learning_rate": 0.00015549655630598343,
+      "loss": 0.4422,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5425,
+      "grad_norm": 0.12725204229354858,
+      "learning_rate": 0.00015418824580853535,
+      "loss": 0.4422,
+      "step": 1085
+    },
+    {
+      "epoch": 0.545,
+      "grad_norm": 0.11274927109479904,
+      "learning_rate": 0.00015287961635995347,
+      "loss": 0.4229,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5475,
+      "grad_norm": 0.11833129078149796,
+      "learning_rate": 0.00015157076761743686,
+      "loss": 0.4442,
+      "step": 1095
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.11384794861078262,
+      "learning_rate": 0.00015026179925488475,
+      "loss": 0.4528,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5525,
+      "grad_norm": 0.11864661425352097,
+      "learning_rate": 0.00014895281095530575,
+      "loss": 0.3988,
+      "step": 1105
+    },
+    {
+      "epoch": 0.555,
+      "grad_norm": 0.11673832684755325,
+      "learning_rate": 0.00014764390240322691,
+      "loss": 0.3544,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5575,
+      "grad_norm": 0.1174502745270729,
+      "learning_rate": 0.00014633517327710202,
+      "loss": 0.4034,
+      "step": 1115
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.12685547769069672,
+      "learning_rate": 0.00014502672324172107,
+      "loss": 0.3595,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.12368053942918777,
+      "learning_rate": 0.00014371865194062007,
+      "loss": 0.3395,
+      "step": 1125
+    },
+    {
+      "epoch": 0.565,
+      "grad_norm": 0.1077839657664299,
+      "learning_rate": 0.000142411058988493,
+      "loss": 0.4199,
+      "step": 1130
+    },
+    {
+      "epoch": 0.5675,
+      "grad_norm": 0.11699855327606201,
+      "learning_rate": 0.00014110404396360576,
+      "loss": 0.3443,
+      "step": 1135
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.13238464295864105,
+      "learning_rate": 0.0001397977064002128,
+      "loss": 0.3499,
+      "step": 1140
+    },
+    {
+      "epoch": 0.5725,
+      "grad_norm": 0.11482933163642883,
+      "learning_rate": 0.0001384921457809772,
+      "loss": 0.3619,
+      "step": 1145
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.13390353322029114,
+      "learning_rate": 0.00013718746152939487,
+      "loss": 0.3684,
+      "step": 1150
+    },
+    {
+      "epoch": 0.5775,
+      "grad_norm": 0.11464900523424149,
+      "learning_rate": 0.00013588375300222283,
+      "loss": 0.3313,
+      "step": 1155
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.10367871820926666,
+      "learning_rate": 0.00013458111948191296,
+      "loss": 0.3323,
+      "step": 1160
+    },
+    {
+      "epoch": 0.5825,
+      "grad_norm": 0.12259294092655182,
+      "learning_rate": 0.0001332796601690512,
+      "loss": 0.3986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.585,
+      "grad_norm": 0.10923358052968979,
+      "learning_rate": 0.00013197947417480292,
+      "loss": 0.3808,
+      "step": 1170
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.12479504942893982,
+      "learning_rate": 0.0001306806605133656,
+      "loss": 0.4429,
+      "step": 1175
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.11521733552217484,
+      "learning_rate": 0.000129383318094428,
+      "loss": 0.4778,
+      "step": 1180
+    },
+    {
+      "epoch": 0.5925,
+      "grad_norm": 0.14112086594104767,
+      "learning_rate": 0.00012808754571563827,
+      "loss": 0.4634,
+      "step": 1185
+    },
+    {
+      "epoch": 0.595,
+      "grad_norm": 0.12947902083396912,
+      "learning_rate": 0.00012679344205507981,
+      "loss": 0.4439,
+      "step": 1190
+    },
+    {
+      "epoch": 0.5975,
+      "grad_norm": 0.13288578391075134,
+      "learning_rate": 0.0001255011056637567,
+      "loss": 0.4402,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.1216069906949997,
+      "learning_rate": 0.00012421063495808853,
+      "loss": 0.4203,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6025,
+      "grad_norm": 0.11649637669324875,
+      "learning_rate": 0.000122922128212416,
+      "loss": 0.4512,
+      "step": 1205
+    },
+    {
+      "epoch": 0.605,
+      "grad_norm": 0.1201406940817833,
+      "learning_rate": 0.00012163568355151628,
+      "loss": 0.3725,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6075,
+      "grad_norm": 0.12117727100849152,
+      "learning_rate": 0.00012035139894313107,
+      "loss": 0.4352,
+      "step": 1215
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.11709322035312653,
+      "learning_rate": 0.00011906937219050556,
+      "loss": 0.4189,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 0.11865726858377457,
+      "learning_rate": 0.0001177897009249405,
+      "loss": 0.3796,
+      "step": 1225
+    },
+    {
+      "epoch": 0.615,
+      "grad_norm": 0.10807759314775467,
+      "learning_rate": 0.0001165124825983573,
+      "loss": 0.4465,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6175,
+      "grad_norm": 0.13788209855556488,
+      "learning_rate": 0.00011523781447587641,
+      "loss": 0.4994,
+      "step": 1235
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.12921364605426788,
+      "learning_rate": 0.00011396579362841044,
+      "loss": 0.4251,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6225,
+      "grad_norm": 0.12162365019321442,
+      "learning_rate": 0.0001126965169252718,
+      "loss": 0.3864,
+      "step": 1245
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.12897826731204987,
+      "learning_rate": 0.00011143008102679559,
+      "loss": 0.3753,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6275,
+      "grad_norm": 0.116109699010849,
+      "learning_rate": 0.00011016658237697866,
+      "loss": 0.3296,
+      "step": 1255
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.12935414910316467,
+      "learning_rate": 0.00010890611719613512,
+      "loss": 0.3797,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6325,
+      "grad_norm": 0.13730891048908234,
+      "learning_rate": 0.0001076487814735685,
+      "loss": 0.3711,
+      "step": 1265
+    },
+    {
+      "epoch": 0.635,
+      "grad_norm": 0.13870631158351898,
+      "learning_rate": 0.00010639467096026211,
+      "loss": 0.4328,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 0.11644043773412704,
+      "learning_rate": 0.00010514388116158701,
+      "loss": 0.3283,
+      "step": 1275
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.12221091985702515,
+      "learning_rate": 0.00010389650733002894,
+      "loss": 0.3898,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6425,
+      "grad_norm": 0.12048634141683578,
+      "learning_rate": 0.00010265264445793464,
+      "loss": 0.3256,
+      "step": 1285
+    },
+    {
+      "epoch": 0.645,
+      "grad_norm": 0.1250566840171814,
+      "learning_rate": 0.00010141238727027761,
+      "loss": 0.408,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6475,
+      "grad_norm": 0.13518592715263367,
+      "learning_rate": 0.00010017583021744454,
+      "loss": 0.3763,
+      "step": 1295
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.13047736883163452,
+      "learning_rate": 9.89430674680425e-05,
+      "loss": 0.3989,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6525,
+      "grad_norm": 0.11474955826997757,
+      "learning_rate": 9.771419290172773e-05,
+      "loss": 0.3374,
+      "step": 1305
+    },
+    {
+      "epoch": 0.655,
+      "grad_norm": 0.11670063436031342,
+      "learning_rate": 9.648930010205619e-05,
+      "loss": 0.3343,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6575,
+      "grad_norm": 0.15385080873966217,
+      "learning_rate": 9.526848234935704e-05,
+      "loss": 0.3432,
+      "step": 1315
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13441519439220428,
+      "learning_rate": 9.405183261362863e-05,
+      "loss": 0.3116,
+      "step": 1320
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.14772167801856995,
+      "learning_rate": 9.283944354745888e-05,
+      "loss": 0.3613,
+      "step": 1325
+    },
+    {
+      "epoch": 0.665,
+      "grad_norm": 0.12146154791116714,
+      "learning_rate": 9.163140747896907e-05,
+      "loss": 0.3411,
+      "step": 1330
+    },
+    {
+      "epoch": 0.6675,
+      "grad_norm": 0.1333102583885193,
+      "learning_rate": 9.042781640478291e-05,
+      "loss": 0.396,
+      "step": 1335
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.12051521986722946,
+      "learning_rate": 8.922876198302062e-05,
+      "loss": 0.3837,
+      "step": 1340
+    },
+    {
+      "epoch": 0.6725,
+      "grad_norm": 0.12071400880813599,
+      "learning_rate": 8.803433552631874e-05,
+      "loss": 0.354,
+      "step": 1345
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.11258620023727417,
+      "learning_rate": 8.684462799487635e-05,
+      "loss": 0.3197,
+      "step": 1350
+    },
+    {
+      "epoch": 0.6775,
+      "grad_norm": 0.11908067762851715,
+      "learning_rate": 8.565972998952814e-05,
+      "loss": 0.377,
+      "step": 1355
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.1252991259098053,
+      "learning_rate": 8.447973174484469e-05,
+      "loss": 0.3438,
+      "step": 1360
+    },
+    {
+      "epoch": 0.6825,
+      "grad_norm": 0.12832245230674744,
+      "learning_rate": 8.330472312226091e-05,
+      "loss": 0.346,
+      "step": 1365
+    },
+    {
+      "epoch": 0.685,
+      "grad_norm": 0.1396942287683487,
+      "learning_rate": 8.213479360323258e-05,
+      "loss": 0.3886,
+      "step": 1370
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.12938210368156433,
+      "learning_rate": 8.097003228242225e-05,
+      "loss": 0.3699,
+      "step": 1375
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.12459377944469452,
+      "learning_rate": 7.9810527860914e-05,
+      "loss": 0.3892,
+      "step": 1380
+    },
+    {
+      "epoch": 0.6925,
+      "grad_norm": 0.1360333263874054,
+      "learning_rate": 7.86563686394587e-05,
+      "loss": 0.3423,
+      "step": 1385
+    },
+    {
+      "epoch": 0.695,
+      "grad_norm": 0.1357765644788742,
+      "learning_rate": 7.750764251174963e-05,
+      "loss": 0.408,
+      "step": 1390
+    },
+    {
+      "epoch": 0.6975,
+      "grad_norm": 0.14453718066215515,
+      "learning_rate": 7.636443695772887e-05,
+      "loss": 0.3398,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.11541519314050674,
+      "learning_rate": 7.522683903692547e-05,
+      "loss": 0.4203,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7025,
+      "grad_norm": 0.13344840705394745,
+      "learning_rate": 7.409493538182545e-05,
+      "loss": 0.3694,
+      "step": 1405
+    },
+    {
+      "epoch": 0.705,
+      "grad_norm": 0.13069866597652435,
+      "learning_rate": 7.296881219127452e-05,
+      "loss": 0.3889,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7075,
+      "grad_norm": 0.12457838654518127,
+      "learning_rate": 7.184855522391359e-05,
+      "loss": 0.3342,
+      "step": 1415
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.11990659683942795,
+      "learning_rate": 7.073424979164794e-05,
+      "loss": 0.3855,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 0.1389523446559906,
+      "learning_rate": 6.962598075315046e-05,
+      "loss": 0.3943,
+      "step": 1425
+    },
+    {
+      "epoch": 0.715,
+      "grad_norm": 0.14108599722385406,
+      "learning_rate": 6.852383250739938e-05,
+      "loss": 0.388,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7175,
+      "grad_norm": 0.1342005580663681,
+      "learning_rate": 6.742788898725065e-05,
+      "loss": 0.3602,
+      "step": 1435
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.13516324758529663,
+      "learning_rate": 6.633823365304648e-05,
+      "loss": 0.3935,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7225,
+      "grad_norm": 0.1302197426557541,
+      "learning_rate": 6.52549494862593e-05,
+      "loss": 0.3618,
+      "step": 1445
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 0.12428996711969376,
+      "learning_rate": 6.417811898317259e-05,
+      "loss": 0.3338,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7275,
+      "grad_norm": 0.11249776184558868,
+      "learning_rate": 6.31078241485982e-05,
+      "loss": 0.3819,
+      "step": 1455
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.1359994113445282,
+      "learning_rate": 6.204414648963159e-05,
+      "loss": 0.3356,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7325,
+      "grad_norm": 0.1118568629026413,
+      "learning_rate": 6.098716700944479e-05,
+      "loss": 0.3223,
+      "step": 1465
+    },
+    {
+      "epoch": 0.735,
+      "grad_norm": 0.12038140743970871,
+      "learning_rate": 5.993696620111741e-05,
+      "loss": 0.3481,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 0.12787550687789917,
+      "learning_rate": 5.889362404150703e-05,
+      "loss": 0.3766,
+      "step": 1475
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.12134893983602524,
+      "learning_rate": 5.7857219985158506e-05,
+      "loss": 0.2916,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7425,
+      "grad_norm": 0.1274223029613495,
+      "learning_rate": 5.682783295825345e-05,
+      "loss": 0.3095,
+      "step": 1485
+    },
+    {
+      "epoch": 0.745,
+      "grad_norm": 0.11817299574613571,
+      "learning_rate": 5.580554135259932e-05,
+      "loss": 0.3422,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7475,
+      "grad_norm": 0.1348387748003006,
+      "learning_rate": 5.479042301965987e-05,
+      "loss": 0.4044,
+      "step": 1495
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.14032681286334991,
+      "learning_rate": 5.378255526462631e-05,
+      "loss": 0.337,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.972183618289664e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama-hugcoder/checkpoint-1500/training_args.bin b/codellama-hugcoder/checkpoint-1500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-1500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304
diff --git a/codellama-hugcoder/checkpoint-2000/README.md b/codellama-hugcoder/checkpoint-2000/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8daf76b93ee8508456247003fa155d657d2e354
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-2000/README.md
@@ -0,0 +1,202 @@
+---
+base_model: codellama/CodeLlama-7b-Instruct-hf
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.2.dev0
\ No newline at end of file
diff --git a/codellama-hugcoder/checkpoint-2000/adapter_config.json b/codellama-hugcoder/checkpoint-2000/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-2000/adapter_config.json
@@ -0,0 +1,39 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama-hugcoder/checkpoint-2000/adapter_model.safetensors b/codellama-hugcoder/checkpoint-2000/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..58d3ccd4c40a5bb55497cd8825213decfac35527
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-2000/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:456cbd6da326b2c6f27a85ab19d40e13bf3fb60689cbe5ec56653d42193963f8
+size 319876032
diff --git a/codellama-hugcoder/checkpoint-2000/optimizer.pt b/codellama-hugcoder/checkpoint-2000/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..4cc780744f8d5428af28cca5b52ee03127c7c1a7
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-2000/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:688ec5889a6aa6b6675276da1e991b1ffaf231ca0b9db550ca1055ee967ab484
+size 640009682
diff --git a/codellama-hugcoder/checkpoint-2000/rng_state.pth b/codellama-hugcoder/checkpoint-2000/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..4c0e5b52927ff54f84fe5d982d2c372833bb465f
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-2000/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d88eee16810615d69e99ef0af6ae2767f80f0c756dab6f8b6315f916e0a2772d
+size 14180
diff --git a/codellama-hugcoder/checkpoint-2000/scheduler.pt b/codellama-hugcoder/checkpoint-2000/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..a1aa08fb4ca7865e35617d28dc511dd492902a0c
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-2000/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0af176d761d71fce3fbce7001f4850782b022af8f40338e8e88b22363a32018f
+size 1064
diff --git a/codellama-hugcoder/checkpoint-2000/trainer_state.json b/codellama-hugcoder/checkpoint-2000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1a1c35192c6d64496c6c47b59e3c26bf2ca1fbb
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-2000/trainer_state.json
@@ -0,0 +1,2834 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 100.0,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.09379793703556061,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.1399833709001541,
+      "learning_rate": 1.3499999999999998e-05,
+      "loss": 0.6954,
+      "step": 10
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.08632303029298782,
+      "learning_rate": 2.1e-05,
+      "loss": 0.6921,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.10006701201200485,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 0.69,
+      "step": 20
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.07633858919143677,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.6722,
+      "step": 25
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.09399061650037766,
+      "learning_rate": 4.3499999999999993e-05,
+      "loss": 0.6453,
+      "step": 30
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.0843738541007042,
+      "learning_rate": 5.1e-05,
+      "loss": 0.6276,
+      "step": 35
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.08583351224660873,
+      "learning_rate": 5.85e-05,
+      "loss": 0.58,
+      "step": 40
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.09571370482444763,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.6355,
+      "step": 45
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.1083935871720314,
+      "learning_rate": 7.35e-05,
+      "loss": 0.589,
+      "step": 50
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.10387319326400757,
+      "learning_rate": 8.1e-05,
+      "loss": 0.6061,
+      "step": 55
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.11083361506462097,
+      "learning_rate": 8.849999999999998e-05,
+      "loss": 0.572,
+      "step": 60
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.12665686011314392,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.5442,
+      "step": 65
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.1308053582906723,
+      "learning_rate": 0.00010349999999999998,
+      "loss": 0.6524,
+      "step": 70
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.13535510003566742,
+      "learning_rate": 0.00011099999999999999,
+      "loss": 0.6404,
+      "step": 75
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12833671271800995,
+      "learning_rate": 0.0001185,
+      "loss": 0.5717,
+      "step": 80
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.11962099373340607,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.6098,
+      "step": 85
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.13898271322250366,
+      "learning_rate": 0.0001335,
+      "loss": 0.6099,
+      "step": 90
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.14486610889434814,
+      "learning_rate": 0.00014099999999999998,
+      "loss": 0.5744,
+      "step": 95
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1432138830423355,
+      "learning_rate": 0.00014849999999999998,
+      "loss": 0.5659,
+      "step": 100
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 0.13487878441810608,
+      "learning_rate": 0.000156,
+      "loss": 0.5622,
+      "step": 105
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.12495309859514236,
+      "learning_rate": 0.0001635,
+      "loss": 0.5951,
+      "step": 110
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.13011734187602997,
+      "learning_rate": 0.00017099999999999998,
+      "loss": 0.6249,
+      "step": 115
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.13987745344638824,
+      "learning_rate": 0.00017849999999999997,
+      "loss": 0.559,
+      "step": 120
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.13373605906963348,
+      "learning_rate": 0.000186,
+      "loss": 0.5475,
+      "step": 125
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.12433867901563644,
+      "learning_rate": 0.0001935,
+      "loss": 0.5274,
+      "step": 130
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.11097615957260132,
+      "learning_rate": 0.000201,
+      "loss": 0.678,
+      "step": 135
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1155027225613594,
+      "learning_rate": 0.00020849999999999997,
+      "loss": 0.5611,
+      "step": 140
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.11431068181991577,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.6054,
+      "step": 145
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.09796140342950821,
+      "learning_rate": 0.00022349999999999998,
+      "loss": 0.5472,
+      "step": 150
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.09489257633686066,
+      "learning_rate": 0.00023099999999999998,
+      "loss": 0.4636,
+      "step": 155
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.10787788033485413,
+      "learning_rate": 0.0002385,
+      "loss": 0.6164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.10261733084917068,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.5408,
+      "step": 165
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.11870352178812027,
+      "learning_rate": 0.0002535,
+      "loss": 0.5268,
+      "step": 170
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.11910569667816162,
+      "learning_rate": 0.000261,
+      "loss": 0.5461,
+      "step": 175
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.10083702206611633,
+      "learning_rate": 0.00026849999999999997,
+      "loss": 0.4794,
+      "step": 180
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.10453511029481888,
+      "learning_rate": 0.000276,
+      "loss": 0.5539,
+      "step": 185
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.101403146982193,
+      "learning_rate": 0.00028349999999999995,
+      "loss": 0.5346,
+      "step": 190
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.10724789649248123,
+      "learning_rate": 0.00029099999999999997,
+      "loss": 0.6026,
+      "step": 195
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1140277311205864,
+      "learning_rate": 0.0002985,
+      "loss": 0.5193,
+      "step": 200
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.09706108272075653,
+      "learning_rate": 0.0002999963446058092,
+      "loss": 0.54,
+      "step": 205
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.10003062337636948,
+      "learning_rate": 0.0002999814948722491,
+      "loss": 0.5365,
+      "step": 210
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.1078687533736229,
+      "learning_rate": 0.00029995522346717746,
+      "loss": 0.5889,
+      "step": 215
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.10538115352392197,
+      "learning_rate": 0.0002999175323912636,
+      "loss": 0.5611,
+      "step": 220
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.1020808294415474,
+      "learning_rate": 0.00029986842451482874,
+      "loss": 0.6103,
+      "step": 225
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.09635835886001587,
+      "learning_rate": 0.0002998079035776279,
+      "loss": 0.5229,
+      "step": 230
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.10287190228700638,
+      "learning_rate": 0.0002997359741885648,
+      "loss": 0.5312,
+      "step": 235
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.09160075336694717,
+      "learning_rate": 0.0002996526418253408,
+      "loss": 0.5673,
+      "step": 240
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.08691006153821945,
+      "learning_rate": 0.000299557912834038,
+      "loss": 0.5326,
+      "step": 245
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.10096988826990128,
+      "learning_rate": 0.00029945179442863594,
+      "loss": 0.6004,
+      "step": 250
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.09594204276800156,
+      "learning_rate": 0.000299334294690462,
+      "loss": 0.5516,
+      "step": 255
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.10281919687986374,
+      "learning_rate": 0.00029920542256757607,
+      "loss": 0.5515,
+      "step": 260
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.08547840267419815,
+      "learning_rate": 0.00029906518787408944,
+      "loss": 0.5243,
+      "step": 265
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.10161560773849487,
+      "learning_rate": 0.0002989136012894168,
+      "loss": 0.5096,
+      "step": 270
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.09101904183626175,
+      "learning_rate": 0.0002987506743574635,
+      "loss": 0.553,
+      "step": 275
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.09769442677497864,
+      "learning_rate": 0.0002985764194857463,
+      "loss": 0.4953,
+      "step": 280
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.10991579294204712,
+      "learning_rate": 0.00029839084994444826,
+      "loss": 0.5152,
+      "step": 285
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.09450916200876236,
+      "learning_rate": 0.00029819397986540836,
+      "loss": 0.5397,
+      "step": 290
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.10876069217920303,
+      "learning_rate": 0.0002979858242410454,
+      "loss": 0.4858,
+      "step": 295
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.097995825111866,
+      "learning_rate": 0.00029776639892321606,
+      "loss": 0.5566,
+      "step": 300
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.1145048514008522,
+      "learning_rate": 0.0002975357206220079,
+      "loss": 0.4531,
+      "step": 305
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.10271880775690079,
+      "learning_rate": 0.00029729380690446654,
+      "loss": 0.5199,
+      "step": 310
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 0.11095371842384338,
+      "learning_rate": 0.0002970406761932583,
+      "loss": 0.5416,
+      "step": 315
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09949438273906708,
+      "learning_rate": 0.00029677634776526673,
+      "loss": 0.4841,
+      "step": 320
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.1163724958896637,
+      "learning_rate": 0.00029650084175012517,
+      "loss": 0.4913,
+      "step": 325
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.10726840049028397,
+      "learning_rate": 0.00029621417912868323,
+      "loss": 0.5203,
+      "step": 330
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.09609931707382202,
+      "learning_rate": 0.00029591638173140947,
+      "loss": 0.5607,
+      "step": 335
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.10824442654848099,
+      "learning_rate": 0.0002956074722367286,
+      "loss": 0.6004,
+      "step": 340
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.10465679317712784,
+      "learning_rate": 0.00029528747416929463,
+      "loss": 0.5216,
+      "step": 345
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.10518354922533035,
+      "learning_rate": 0.0002949564118981994,
+      "loss": 0.499,
+      "step": 350
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 0.0955279991030693,
+      "learning_rate": 0.0002946143106351165,
+      "loss": 0.5607,
+      "step": 355
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.11159654706716537,
+      "learning_rate": 0.0002942611964323817,
+      "loss": 0.5204,
+      "step": 360
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 0.09571187198162079,
+      "learning_rate": 0.0002938970961810086,
+      "loss": 0.6113,
+      "step": 365
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.11854679882526398,
+      "learning_rate": 0.0002935220376086411,
+      "loss": 0.5639,
+      "step": 370
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.1050512045621872,
+      "learning_rate": 0.0002931360492774415,
+      "loss": 0.548,
+      "step": 375
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1053968220949173,
+      "learning_rate": 0.0002927391605819157,
+      "loss": 0.5507,
+      "step": 380
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.10567320138216019,
+      "learning_rate": 0.00029233140174667445,
+      "loss": 0.5312,
+      "step": 385
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.11914283782243729,
+      "learning_rate": 0.0002919128038241318,
+      "loss": 0.5961,
+      "step": 390
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.09915795922279358,
+      "learning_rate": 0.0002914833986921401,
+      "loss": 0.5086,
+      "step": 395
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10796502232551575,
+      "learning_rate": 0.0002910432190515628,
+      "loss": 0.5585,
+      "step": 400
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.10748997330665588,
+      "learning_rate": 0.00029059229842378373,
+      "loss": 0.5466,
+      "step": 405
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.10696308314800262,
+      "learning_rate": 0.0002901306711481544,
+      "loss": 0.5513,
+      "step": 410
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.10418657958507538,
+      "learning_rate": 0.0002896583723793792,
+      "loss": 0.5391,
+      "step": 415
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16421550512313843,
+      "learning_rate": 0.00028917543808483796,
+      "loss": 0.4699,
+      "step": 420
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.12929962575435638,
+      "learning_rate": 0.00028868190504184696,
+      "loss": 0.4984,
+      "step": 425
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 0.10469454526901245,
+      "learning_rate": 0.00028817781083485816,
+      "loss": 0.5119,
+      "step": 430
+    },
+    {
+      "epoch": 0.2175,
+      "grad_norm": 0.0964970663189888,
+      "learning_rate": 0.00028766319385259713,
+      "loss": 0.5167,
+      "step": 435
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12395574152469635,
+      "learning_rate": 0.00028713809328513953,
+      "loss": 0.5692,
+      "step": 440
+    },
+    {
+      "epoch": 0.2225,
+      "grad_norm": 0.10189738124608994,
+      "learning_rate": 0.0002866025491209265,
+      "loss": 0.4628,
+      "step": 445
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.10433454066514969,
+      "learning_rate": 0.0002860566021437197,
+      "loss": 0.4869,
+      "step": 450
+    },
+    {
+      "epoch": 0.2275,
+      "grad_norm": 0.13003456592559814,
+      "learning_rate": 0.0002855002939294951,
+      "loss": 0.5291,
+      "step": 455
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.11692202836275101,
+      "learning_rate": 0.000284933666843277,
+      "loss": 0.5229,
+      "step": 460
+    },
+    {
+      "epoch": 0.2325,
+      "grad_norm": 0.10757846385240555,
+      "learning_rate": 0.0002843567640359119,
+      "loss": 0.435,
+      "step": 465
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 0.10775501281023026,
+      "learning_rate": 0.00028376962944078206,
+      "loss": 0.4418,
+      "step": 470
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.11543692648410797,
+      "learning_rate": 0.00028317230777046015,
+      "loss": 0.4204,
+      "step": 475
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10946698486804962,
+      "learning_rate": 0.00028256484451330403,
+      "loss": 0.49,
+      "step": 480
+    },
+    {
+      "epoch": 0.2425,
+      "grad_norm": 0.11528221517801285,
+      "learning_rate": 0.00028194728592999247,
+      "loss": 0.4752,
+      "step": 485
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 0.10474205762147903,
+      "learning_rate": 0.0002813196790500027,
+      "loss": 0.4847,
+      "step": 490
+    },
+    {
+      "epoch": 0.2475,
+      "grad_norm": 0.10768820345401764,
+      "learning_rate": 0.00028068207166802837,
+      "loss": 0.4664,
+      "step": 495
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.12158560007810593,
+      "learning_rate": 0.00028003451234034037,
+      "loss": 0.4741,
+      "step": 500
+    },
+    {
+      "epoch": 0.2525,
+      "grad_norm": 0.11635497957468033,
+      "learning_rate": 0.0002793770503810886,
+      "loss": 0.4969,
+      "step": 505
+    },
+    {
+      "epoch": 0.255,
+      "grad_norm": 0.12205849587917328,
+      "learning_rate": 0.00027870973585854665,
+      "loss": 0.4798,
+      "step": 510
+    },
+    {
+      "epoch": 0.2575,
+      "grad_norm": 0.10270871222019196,
+      "learning_rate": 0.00027803261959129905,
+      "loss": 0.3888,
+      "step": 515
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.11313367635011673,
+      "learning_rate": 0.0002773457531443712,
+      "loss": 0.4759,
+      "step": 520
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 0.12905193865299225,
+      "learning_rate": 0.00027664918882530225,
+      "loss": 0.4442,
+      "step": 525
+    },
+    {
+      "epoch": 0.265,
+      "grad_norm": 0.11690939962863922,
+      "learning_rate": 0.00027594297968016197,
+      "loss": 0.5535,
+      "step": 530
+    },
+    {
+      "epoch": 0.2675,
+      "grad_norm": 0.10021405667066574,
+      "learning_rate": 0.00027522717948951094,
+      "loss": 0.4717,
+      "step": 535
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.10104178637266159,
+      "learning_rate": 0.0002745018427643051,
+      "loss": 0.4906,
+      "step": 540
+    },
+    {
+      "epoch": 0.2725,
+      "grad_norm": 0.12113891541957855,
+      "learning_rate": 0.00027376702474174425,
+      "loss": 0.5674,
+      "step": 545
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.11330476403236389,
+      "learning_rate": 0.0002730227813810658,
+      "loss": 0.5184,
+      "step": 550
+    },
+    {
+      "epoch": 0.2775,
+      "grad_norm": 0.1025850847363472,
+      "learning_rate": 0.0002722691693592831,
+      "loss": 0.4395,
+      "step": 555
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.11591499298810959,
+      "learning_rate": 0.0002715062460668694,
+      "loss": 0.5003,
+      "step": 560
+    },
+    {
+      "epoch": 0.2825,
+      "grad_norm": 0.11281153559684753,
+      "learning_rate": 0.0002707340696033871,
+      "loss": 0.4672,
+      "step": 565
+    },
+    {
+      "epoch": 0.285,
+      "grad_norm": 0.1123538464307785,
+      "learning_rate": 0.00026995269877306356,
+      "loss": 0.513,
+      "step": 570
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.10776390135288239,
+      "learning_rate": 0.0002691621930803127,
+      "loss": 0.4572,
+      "step": 575
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.10008667409420013,
+      "learning_rate": 0.0002683626127252036,
+      "loss": 0.4618,
+      "step": 580
+    },
+    {
+      "epoch": 0.2925,
+      "grad_norm": 0.13961340487003326,
+      "learning_rate": 0.00026755401859887595,
+      "loss": 0.4819,
+      "step": 585
+    },
+    {
+      "epoch": 0.295,
+      "grad_norm": 0.1476685106754303,
+      "learning_rate": 0.00026673647227890316,
+      "loss": 0.4964,
+      "step": 590
+    },
+    {
+      "epoch": 0.2975,
+      "grad_norm": 0.09795507788658142,
+      "learning_rate": 0.00026591003602460263,
+      "loss": 0.4796,
+      "step": 595
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.10903532058000565,
+      "learning_rate": 0.00026507477277229496,
+      "loss": 0.4775,
+      "step": 600
+    },
+    {
+      "epoch": 0.3025,
+      "grad_norm": 0.10258448123931885,
+      "learning_rate": 0.0002642307461305105,
+      "loss": 0.4519,
+      "step": 605
+    },
+    {
+      "epoch": 0.305,
+      "grad_norm": 0.11204435676336288,
+      "learning_rate": 0.0002633780203751459,
+      "loss": 0.4451,
+      "step": 610
+    },
+    {
+      "epoch": 0.3075,
+      "grad_norm": 0.10147629678249359,
+      "learning_rate": 0.0002625166604445689,
+      "loss": 0.4256,
+      "step": 615
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.10481107234954834,
+      "learning_rate": 0.00026164673193467306,
+      "loss": 0.4381,
+      "step": 620
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.10856641829013824,
+      "learning_rate": 0.00026076830109388255,
+      "loss": 0.4958,
+      "step": 625
+    },
+    {
+      "epoch": 0.315,
+      "grad_norm": 0.09918677806854248,
+      "learning_rate": 0.0002598814348181068,
+      "loss": 0.4335,
+      "step": 630
+    },
+    {
+      "epoch": 0.3175,
+      "grad_norm": 0.10417389869689941,
+      "learning_rate": 0.00025898620064564637,
+      "loss": 0.4603,
+      "step": 635
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0903329998254776,
+      "learning_rate": 0.00025808266675204954,
+      "loss": 0.3932,
+      "step": 640
+    },
+    {
+      "epoch": 0.3225,
+      "grad_norm": 0.11511855572462082,
+      "learning_rate": 0.0002571709019449205,
+      "loss": 0.4169,
+      "step": 645
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.11355557292699814,
+      "learning_rate": 0.0002562509756586793,
+      "loss": 0.4455,
+      "step": 650
+    },
+    {
+      "epoch": 0.3275,
+      "grad_norm": 0.1271187961101532,
+      "learning_rate": 0.00025532295794927437,
+      "loss": 0.4902,
+      "step": 655
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.11936645954847336,
+      "learning_rate": 0.0002543869194888471,
+      "loss": 0.4843,
+      "step": 660
+    },
+    {
+      "epoch": 0.3325,
+      "grad_norm": 0.11935465037822723,
+      "learning_rate": 0.00025344293156035044,
+      "loss": 0.4402,
+      "step": 665
+    },
+    {
+      "epoch": 0.335,
+      "grad_norm": 0.13073407113552094,
+      "learning_rate": 0.00025249106605211986,
+      "loss": 0.467,
+      "step": 670
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 0.10340435802936554,
+      "learning_rate": 0.0002515313954523991,
+      "loss": 0.4827,
+      "step": 675
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.11634550243616104,
+      "learning_rate": 0.00025056399284381983,
+      "loss": 0.466,
+      "step": 680
+    },
+    {
+      "epoch": 0.3425,
+      "grad_norm": 0.10582319647073746,
+      "learning_rate": 0.0002495889318978362,
+      "loss": 0.4751,
+      "step": 685
+    },
+    {
+      "epoch": 0.345,
+      "grad_norm": 0.16781780123710632,
+      "learning_rate": 0.00024860628686911436,
+      "loss": 0.4717,
+      "step": 690
+    },
+    {
+      "epoch": 0.3475,
+      "grad_norm": 0.11522196233272552,
+      "learning_rate": 0.0002476161325898776,
+      "loss": 0.4687,
+      "step": 695
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.11830449104309082,
+      "learning_rate": 0.000246618544464208,
+      "loss": 0.436,
+      "step": 700
+    },
+    {
+      "epoch": 0.3525,
+      "grad_norm": 0.17485427856445312,
+      "learning_rate": 0.0002456135984623034,
+      "loss": 0.4284,
+      "step": 705
+    },
+    {
+      "epoch": 0.355,
+      "grad_norm": 0.12288108468055725,
+      "learning_rate": 0.00024460137111469296,
+      "loss": 0.4261,
+      "step": 710
+    },
+    {
+      "epoch": 0.3575,
+      "grad_norm": 0.11587081104516983,
+      "learning_rate": 0.0002435819395064079,
+      "loss": 0.4493,
+      "step": 715
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.10690271109342575,
+      "learning_rate": 0.0002425553812711123,
+      "loss": 0.4648,
+      "step": 720
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 0.10404397547245026,
+      "learning_rate": 0.00024152177458519014,
+      "loss": 0.4634,
+      "step": 725
+    },
+    {
+      "epoch": 0.365,
+      "grad_norm": 0.11986954510211945,
+      "learning_rate": 0.00024048119816179236,
+      "loss": 0.4525,
+      "step": 730
+    },
+    {
+      "epoch": 0.3675,
+      "grad_norm": 0.10243026167154312,
+      "learning_rate": 0.00023943373124484234,
+      "loss": 0.4572,
+      "step": 735
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.10386748611927032,
+      "learning_rate": 0.00023837945360300129,
+      "loss": 0.3884,
+      "step": 740
+    },
+    {
+      "epoch": 0.3725,
+      "grad_norm": 0.11165735125541687,
+      "learning_rate": 0.0002373184455235934,
+      "loss": 0.4902,
+      "step": 745
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.09951601922512054,
+      "learning_rate": 0.00023625078780649178,
+      "loss": 0.4541,
+      "step": 750
+    },
+    {
+      "epoch": 0.3775,
+      "grad_norm": 0.10347504913806915,
+      "learning_rate": 0.00023517656175796518,
+      "loss": 0.3871,
+      "step": 755
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.10478132963180542,
+      "learning_rate": 0.00023409584918448627,
+      "loss": 0.4329,
+      "step": 760
+    },
+    {
+      "epoch": 0.3825,
+      "grad_norm": 0.1198212131857872,
+      "learning_rate": 0.00023300873238650159,
+      "loss": 0.425,
+      "step": 765
+    },
+    {
+      "epoch": 0.385,
+      "grad_norm": 0.1103711724281311,
+      "learning_rate": 0.00023191529415216434,
+      "loss": 0.4274,
+      "step": 770
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.09940385073423386,
+      "learning_rate": 0.00023081561775102944,
+      "loss": 0.4368,
+      "step": 775
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.11599268019199371,
+      "learning_rate": 0.00022970978692771242,
+      "loss": 0.4386,
+      "step": 780
+    },
+    {
+      "epoch": 0.3925,
+      "grad_norm": 0.10101296752691269,
+      "learning_rate": 0.00022859788589551188,
+      "loss": 0.4696,
+      "step": 785
+    },
+    {
+      "epoch": 0.395,
+      "grad_norm": 0.10112808644771576,
+      "learning_rate": 0.00022747999932999624,
+      "loss": 0.4066,
+      "step": 790
+    },
+    {
+      "epoch": 0.3975,
+      "grad_norm": 0.09595459699630737,
+      "learning_rate": 0.00022635621236255567,
+      "loss": 0.4837,
+      "step": 795
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.10761380940675735,
+      "learning_rate": 0.00022522661057391857,
+      "loss": 0.5446,
+      "step": 800
+    },
+    {
+      "epoch": 0.4025,
+      "grad_norm": 0.11919954419136047,
+      "learning_rate": 0.00022409127998763463,
+      "loss": 0.5027,
+      "step": 805
+    },
+    {
+      "epoch": 0.405,
+      "grad_norm": 0.10851597785949707,
+      "learning_rate": 0.00022295030706352356,
+      "loss": 0.4481,
+      "step": 810
+    },
+    {
+      "epoch": 0.4075,
+      "grad_norm": 0.10030311346054077,
+      "learning_rate": 0.00022180377869109104,
+      "loss": 0.4709,
+      "step": 815
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.111280657351017,
+      "learning_rate": 0.00022065178218291147,
+      "loss": 0.4423,
+      "step": 820
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 0.11253602802753448,
+      "learning_rate": 0.00021949440526797926,
+      "loss": 0.4136,
+      "step": 825
+    },
+    {
+      "epoch": 0.415,
+      "grad_norm": 0.10805424302816391,
+      "learning_rate": 0.00021833173608502732,
+      "loss": 0.4656,
+      "step": 830
+    },
+    {
+      "epoch": 0.4175,
+      "grad_norm": 0.10983198881149292,
+      "learning_rate": 0.00021716386317581542,
+      "loss": 0.3687,
+      "step": 835
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.10653118044137955,
+      "learning_rate": 0.00021599087547838727,
+      "loss": 0.4654,
+      "step": 840
+    },
+    {
+      "epoch": 0.4225,
+      "grad_norm": 0.10856354981660843,
+      "learning_rate": 0.00021481286232029735,
+      "loss": 0.4298,
+      "step": 845
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.11233706772327423,
+      "learning_rate": 0.0002136299134118085,
+      "loss": 0.4484,
+      "step": 850
+    },
+    {
+      "epoch": 0.4275,
+      "grad_norm": 0.1085442528128624,
+      "learning_rate": 0.00021244211883906017,
+      "loss": 0.4776,
+      "step": 855
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.12297824025154114,
+      "learning_rate": 0.0002112495690572077,
+      "loss": 0.4029,
+      "step": 860
+    },
+    {
+      "epoch": 0.4325,
+      "grad_norm": 0.10838114470243454,
+      "learning_rate": 0.00021005235488353428,
+      "loss": 0.4848,
+      "step": 865
+    },
+    {
+      "epoch": 0.435,
+      "grad_norm": 0.10273341834545135,
+      "learning_rate": 0.0002088505674905342,
+      "loss": 0.3989,
+      "step": 870
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.11189126968383789,
+      "learning_rate": 0.0002076442983989705,
+      "loss": 0.438,
+      "step": 875
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.11592905968427658,
+      "learning_rate": 0.0002064336394709048,
+      "loss": 0.4786,
+      "step": 880
+    },
+    {
+      "epoch": 0.4425,
+      "grad_norm": 0.11230389773845673,
+      "learning_rate": 0.0002052186829027017,
+      "loss": 0.3999,
+      "step": 885
+    },
+    {
+      "epoch": 0.445,
+      "grad_norm": 0.12455113977193832,
+      "learning_rate": 0.00020399952121800767,
+      "loss": 0.4856,
+      "step": 890
+    },
+    {
+      "epoch": 0.4475,
+      "grad_norm": 0.1001812294125557,
+      "learning_rate": 0.00020277624726070526,
+      "loss": 0.4689,
+      "step": 895
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.11319112777709961,
+      "learning_rate": 0.00020154895418784242,
+      "loss": 0.3998,
+      "step": 900
+    },
+    {
+      "epoch": 0.4525,
+      "grad_norm": 0.11322236061096191,
+      "learning_rate": 0.00020031773546253824,
+      "loss": 0.4321,
+      "step": 905
+    },
+    {
+      "epoch": 0.455,
+      "grad_norm": 0.12924689054489136,
+      "learning_rate": 0.00019908268484686558,
+      "loss": 0.4208,
+      "step": 910
+    },
+    {
+      "epoch": 0.4575,
+      "grad_norm": 0.11435618251562119,
+      "learning_rate": 0.00019784389639471048,
+      "loss": 0.4682,
+      "step": 915
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.10801081359386444,
+      "learning_rate": 0.00019660146444460975,
+      "loss": 0.428,
+      "step": 920
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.10906939953565598,
+      "learning_rate": 0.0001953554836125667,
+      "loss": 0.4455,
+      "step": 925
+    },
+    {
+      "epoch": 0.465,
+      "grad_norm": 0.10790123790502548,
+      "learning_rate": 0.00019410604878484556,
+      "loss": 0.4544,
+      "step": 930
+    },
+    {
+      "epoch": 0.4675,
+      "grad_norm": 0.10536376386880875,
+      "learning_rate": 0.000192853255110746,
+      "loss": 0.376,
+      "step": 935
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.11744682490825653,
+      "learning_rate": 0.00019159719799535668,
+      "loss": 0.3887,
+      "step": 940
+    },
+    {
+      "epoch": 0.4725,
+      "grad_norm": 0.12954068183898926,
+      "learning_rate": 0.00019033797309228983,
+      "loss": 0.4075,
+      "step": 945
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.1401606798171997,
+      "learning_rate": 0.00018907567629639725,
+      "loss": 0.4454,
+      "step": 950
+    },
+    {
+      "epoch": 0.4775,
+      "grad_norm": 0.12059322744607925,
+      "learning_rate": 0.00018781040373646706,
+      "loss": 0.4339,
+      "step": 955
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.11798987537622452,
+      "learning_rate": 0.00018654225176790336,
+      "loss": 0.4405,
+      "step": 960
+    },
+    {
+      "epoch": 0.4825,
+      "grad_norm": 0.11344211548566818,
+      "learning_rate": 0.00018527131696538846,
+      "loss": 0.4124,
+      "step": 965
+    },
+    {
+      "epoch": 0.485,
+      "grad_norm": 0.10373330116271973,
+      "learning_rate": 0.00018399769611552824,
+      "loss": 0.4329,
+      "step": 970
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.12053704261779785,
+      "learning_rate": 0.0001827214862094814,
+      "loss": 0.4944,
+      "step": 975
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.141033336520195,
+      "learning_rate": 0.00018144278443557328,
+      "loss": 0.4569,
+      "step": 980
+    },
+    {
+      "epoch": 0.4925,
+      "grad_norm": 0.10922867804765701,
+      "learning_rate": 0.0001801616881718947,
+      "loss": 0.3879,
+      "step": 985
+    },
+    {
+      "epoch": 0.495,
+      "grad_norm": 0.09843657910823822,
+      "learning_rate": 0.00017887829497888612,
+      "loss": 0.4106,
+      "step": 990
+    },
+    {
+      "epoch": 0.4975,
+      "grad_norm": 0.12131062150001526,
+      "learning_rate": 0.000177592702591908,
+      "loss": 0.4023,
+      "step": 995
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.11343283206224442,
+      "learning_rate": 0.00017630500891379806,
+      "loss": 0.4824,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5025,
+      "grad_norm": 0.11050508171319962,
+      "learning_rate": 0.00017501531200741534,
+      "loss": 0.4098,
+      "step": 1005
+    },
+    {
+      "epoch": 0.505,
+      "grad_norm": 0.11737144738435745,
+      "learning_rate": 0.00017372371008817256,
+      "loss": 0.3943,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5075,
+      "grad_norm": 0.11473528295755386,
+      "learning_rate": 0.00017243030151655643,
+      "loss": 0.3796,
+      "step": 1015
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.13086555898189545,
+      "learning_rate": 0.00017113518479063738,
+      "loss": 0.4367,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 0.11752833425998688,
+      "learning_rate": 0.00016983845853856837,
+      "loss": 0.4097,
+      "step": 1025
+    },
+    {
+      "epoch": 0.515,
+      "grad_norm": 0.11596900969743729,
+      "learning_rate": 0.0001685402215110739,
+      "loss": 0.3812,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5175,
+      "grad_norm": 0.11850260943174362,
+      "learning_rate": 0.00016724057257392998,
+      "loss": 0.4354,
+      "step": 1035
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.12466365844011307,
+      "learning_rate": 0.00016593961070043498,
+      "loss": 0.4317,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5225,
+      "grad_norm": 0.11178991943597794,
+      "learning_rate": 0.0001646374349638724,
+      "loss": 0.3936,
+      "step": 1045
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 0.11252165585756302,
+      "learning_rate": 0.00016333414452996623,
+      "loss": 0.386,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5275,
+      "grad_norm": 0.12886975705623627,
+      "learning_rate": 0.0001620298386493288,
+      "loss": 0.3965,
+      "step": 1055
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.11716549098491669,
+      "learning_rate": 0.00016072461664990288,
+      "loss": 0.3924,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5325,
+      "grad_norm": 0.11604485660791397,
+      "learning_rate": 0.000159418577929397,
+      "loss": 0.3624,
+      "step": 1065
+    },
+    {
+      "epoch": 0.535,
+      "grad_norm": 0.11538460850715637,
+      "learning_rate": 0.00015811182194771633,
+      "loss": 0.4338,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 0.11618762463331223,
+      "learning_rate": 0.00015680444821938804,
+      "loss": 0.4058,
+      "step": 1075
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.11750835925340652,
+      "learning_rate": 0.00015549655630598343,
+      "loss": 0.4422,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5425,
+      "grad_norm": 0.12725204229354858,
+      "learning_rate": 0.00015418824580853535,
+      "loss": 0.4422,
+      "step": 1085
+    },
+    {
+      "epoch": 0.545,
+      "grad_norm": 0.11274927109479904,
+      "learning_rate": 0.00015287961635995347,
+      "loss": 0.4229,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5475,
+      "grad_norm": 0.11833129078149796,
+      "learning_rate": 0.00015157076761743686,
+      "loss": 0.4442,
+      "step": 1095
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.11384794861078262,
+      "learning_rate": 0.00015026179925488475,
+      "loss": 0.4528,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5525,
+      "grad_norm": 0.11864661425352097,
+      "learning_rate": 0.00014895281095530575,
+      "loss": 0.3988,
+      "step": 1105
+    },
+    {
+      "epoch": 0.555,
+      "grad_norm": 0.11673832684755325,
+      "learning_rate": 0.00014764390240322691,
+      "loss": 0.3544,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5575,
+      "grad_norm": 0.1174502745270729,
+      "learning_rate": 0.00014633517327710202,
+      "loss": 0.4034,
+      "step": 1115
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.12685547769069672,
+      "learning_rate": 0.00014502672324172107,
+      "loss": 0.3595,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.12368053942918777,
+      "learning_rate": 0.00014371865194062007,
+      "loss": 0.3395,
+      "step": 1125
+    },
+    {
+      "epoch": 0.565,
+      "grad_norm": 0.1077839657664299,
+      "learning_rate": 0.000142411058988493,
+      "loss": 0.4199,
+      "step": 1130
+    },
+    {
+      "epoch": 0.5675,
+      "grad_norm": 0.11699855327606201,
+      "learning_rate": 0.00014110404396360576,
+      "loss": 0.3443,
+      "step": 1135
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.13238464295864105,
+      "learning_rate": 0.0001397977064002128,
+      "loss": 0.3499,
+      "step": 1140
+    },
+    {
+      "epoch": 0.5725,
+      "grad_norm": 0.11482933163642883,
+      "learning_rate": 0.0001384921457809772,
+      "loss": 0.3619,
+      "step": 1145
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.13390353322029114,
+      "learning_rate": 0.00013718746152939487,
+      "loss": 0.3684,
+      "step": 1150
+    },
+    {
+      "epoch": 0.5775,
+      "grad_norm": 0.11464900523424149,
+      "learning_rate": 0.00013588375300222283,
+      "loss": 0.3313,
+      "step": 1155
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.10367871820926666,
+      "learning_rate": 0.00013458111948191296,
+      "loss": 0.3323,
+      "step": 1160
+    },
+    {
+      "epoch": 0.5825,
+      "grad_norm": 0.12259294092655182,
+      "learning_rate": 0.0001332796601690512,
+      "loss": 0.3986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.585,
+      "grad_norm": 0.10923358052968979,
+      "learning_rate": 0.00013197947417480292,
+      "loss": 0.3808,
+      "step": 1170
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.12479504942893982,
+      "learning_rate": 0.0001306806605133656,
+      "loss": 0.4429,
+      "step": 1175
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.11521733552217484,
+      "learning_rate": 0.000129383318094428,
+      "loss": 0.4778,
+      "step": 1180
+    },
+    {
+      "epoch": 0.5925,
+      "grad_norm": 0.14112086594104767,
+      "learning_rate": 0.00012808754571563827,
+      "loss": 0.4634,
+      "step": 1185
+    },
+    {
+      "epoch": 0.595,
+      "grad_norm": 0.12947902083396912,
+      "learning_rate": 0.00012679344205507981,
+      "loss": 0.4439,
+      "step": 1190
+    },
+    {
+      "epoch": 0.5975,
+      "grad_norm": 0.13288578391075134,
+      "learning_rate": 0.0001255011056637567,
+      "loss": 0.4402,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.1216069906949997,
+      "learning_rate": 0.00012421063495808853,
+      "loss": 0.4203,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6025,
+      "grad_norm": 0.11649637669324875,
+      "learning_rate": 0.000122922128212416,
+      "loss": 0.4512,
+      "step": 1205
+    },
+    {
+      "epoch": 0.605,
+      "grad_norm": 0.1201406940817833,
+      "learning_rate": 0.00012163568355151628,
+      "loss": 0.3725,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6075,
+      "grad_norm": 0.12117727100849152,
+      "learning_rate": 0.00012035139894313107,
+      "loss": 0.4352,
+      "step": 1215
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.11709322035312653,
+      "learning_rate": 0.00011906937219050556,
+      "loss": 0.4189,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 0.11865726858377457,
+      "learning_rate": 0.0001177897009249405,
+      "loss": 0.3796,
+      "step": 1225
+    },
+    {
+      "epoch": 0.615,
+      "grad_norm": 0.10807759314775467,
+      "learning_rate": 0.0001165124825983573,
+      "loss": 0.4465,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6175,
+      "grad_norm": 0.13788209855556488,
+      "learning_rate": 0.00011523781447587641,
+      "loss": 0.4994,
+      "step": 1235
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.12921364605426788,
+      "learning_rate": 0.00011396579362841044,
+      "loss": 0.4251,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6225,
+      "grad_norm": 0.12162365019321442,
+      "learning_rate": 0.0001126965169252718,
+      "loss": 0.3864,
+      "step": 1245
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.12897826731204987,
+      "learning_rate": 0.00011143008102679559,
+      "loss": 0.3753,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6275,
+      "grad_norm": 0.116109699010849,
+      "learning_rate": 0.00011016658237697866,
+      "loss": 0.3296,
+      "step": 1255
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.12935414910316467,
+      "learning_rate": 0.00010890611719613512,
+      "loss": 0.3797,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6325,
+      "grad_norm": 0.13730891048908234,
+      "learning_rate": 0.0001076487814735685,
+      "loss": 0.3711,
+      "step": 1265
+    },
+    {
+      "epoch": 0.635,
+      "grad_norm": 0.13870631158351898,
+      "learning_rate": 0.00010639467096026211,
+      "loss": 0.4328,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 0.11644043773412704,
+      "learning_rate": 0.00010514388116158701,
+      "loss": 0.3283,
+      "step": 1275
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.12221091985702515,
+      "learning_rate": 0.00010389650733002894,
+      "loss": 0.3898,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6425,
+      "grad_norm": 0.12048634141683578,
+      "learning_rate": 0.00010265264445793464,
+      "loss": 0.3256,
+      "step": 1285
+    },
+    {
+      "epoch": 0.645,
+      "grad_norm": 0.1250566840171814,
+      "learning_rate": 0.00010141238727027761,
+      "loss": 0.408,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6475,
+      "grad_norm": 0.13518592715263367,
+      "learning_rate": 0.00010017583021744454,
+      "loss": 0.3763,
+      "step": 1295
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.13047736883163452,
+      "learning_rate": 9.89430674680425e-05,
+      "loss": 0.3989,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6525,
+      "grad_norm": 0.11474955826997757,
+      "learning_rate": 9.771419290172773e-05,
+      "loss": 0.3374,
+      "step": 1305
+    },
+    {
+      "epoch": 0.655,
+      "grad_norm": 0.11670063436031342,
+      "learning_rate": 9.648930010205619e-05,
+      "loss": 0.3343,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6575,
+      "grad_norm": 0.15385080873966217,
+      "learning_rate": 9.526848234935704e-05,
+      "loss": 0.3432,
+      "step": 1315
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13441519439220428,
+      "learning_rate": 9.405183261362863e-05,
+      "loss": 0.3116,
+      "step": 1320
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.14772167801856995,
+      "learning_rate": 9.283944354745888e-05,
+      "loss": 0.3613,
+      "step": 1325
+    },
+    {
+      "epoch": 0.665,
+      "grad_norm": 0.12146154791116714,
+      "learning_rate": 9.163140747896907e-05,
+      "loss": 0.3411,
+      "step": 1330
+    },
+    {
+      "epoch": 0.6675,
+      "grad_norm": 0.1333102583885193,
+      "learning_rate": 9.042781640478291e-05,
+      "loss": 0.396,
+      "step": 1335
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.12051521986722946,
+      "learning_rate": 8.922876198302062e-05,
+      "loss": 0.3837,
+      "step": 1340
+    },
+    {
+      "epoch": 0.6725,
+      "grad_norm": 0.12071400880813599,
+      "learning_rate": 8.803433552631874e-05,
+      "loss": 0.354,
+      "step": 1345
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.11258620023727417,
+      "learning_rate": 8.684462799487635e-05,
+      "loss": 0.3197,
+      "step": 1350
+    },
+    {
+      "epoch": 0.6775,
+      "grad_norm": 0.11908067762851715,
+      "learning_rate": 8.565972998952814e-05,
+      "loss": 0.377,
+      "step": 1355
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.1252991259098053,
+      "learning_rate": 8.447973174484469e-05,
+      "loss": 0.3438,
+      "step": 1360
+    },
+    {
+      "epoch": 0.6825,
+      "grad_norm": 0.12832245230674744,
+      "learning_rate": 8.330472312226091e-05,
+      "loss": 0.346,
+      "step": 1365
+    },
+    {
+      "epoch": 0.685,
+      "grad_norm": 0.1396942287683487,
+      "learning_rate": 8.213479360323258e-05,
+      "loss": 0.3886,
+      "step": 1370
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.12938210368156433,
+      "learning_rate": 8.097003228242225e-05,
+      "loss": 0.3699,
+      "step": 1375
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.12459377944469452,
+      "learning_rate": 7.9810527860914e-05,
+      "loss": 0.3892,
+      "step": 1380
+    },
+    {
+      "epoch": 0.6925,
+      "grad_norm": 0.1360333263874054,
+      "learning_rate": 7.86563686394587e-05,
+      "loss": 0.3423,
+      "step": 1385
+    },
+    {
+      "epoch": 0.695,
+      "grad_norm": 0.1357765644788742,
+      "learning_rate": 7.750764251174963e-05,
+      "loss": 0.408,
+      "step": 1390
+    },
+    {
+      "epoch": 0.6975,
+      "grad_norm": 0.14453718066215515,
+      "learning_rate": 7.636443695772887e-05,
+      "loss": 0.3398,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.11541519314050674,
+      "learning_rate": 7.522683903692547e-05,
+      "loss": 0.4203,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7025,
+      "grad_norm": 0.13344840705394745,
+      "learning_rate": 7.409493538182545e-05,
+      "loss": 0.3694,
+      "step": 1405
+    },
+    {
+      "epoch": 0.705,
+      "grad_norm": 0.13069866597652435,
+      "learning_rate": 7.296881219127452e-05,
+      "loss": 0.3889,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7075,
+      "grad_norm": 0.12457838654518127,
+      "learning_rate": 7.184855522391359e-05,
+      "loss": 0.3342,
+      "step": 1415
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.11990659683942795,
+      "learning_rate": 7.073424979164794e-05,
+      "loss": 0.3855,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 0.1389523446559906,
+      "learning_rate": 6.962598075315046e-05,
+      "loss": 0.3943,
+      "step": 1425
+    },
+    {
+      "epoch": 0.715,
+      "grad_norm": 0.14108599722385406,
+      "learning_rate": 6.852383250739938e-05,
+      "loss": 0.388,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7175,
+      "grad_norm": 0.1342005580663681,
+      "learning_rate": 6.742788898725065e-05,
+      "loss": 0.3602,
+      "step": 1435
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.13516324758529663,
+      "learning_rate": 6.633823365304648e-05,
+      "loss": 0.3935,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7225,
+      "grad_norm": 0.1302197426557541,
+      "learning_rate": 6.52549494862593e-05,
+      "loss": 0.3618,
+      "step": 1445
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 0.12428996711969376,
+      "learning_rate": 6.417811898317259e-05,
+      "loss": 0.3338,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7275,
+      "grad_norm": 0.11249776184558868,
+      "learning_rate": 6.31078241485982e-05,
+      "loss": 0.3819,
+      "step": 1455
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.1359994113445282,
+      "learning_rate": 6.204414648963159e-05,
+      "loss": 0.3356,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7325,
+      "grad_norm": 0.1118568629026413,
+      "learning_rate": 6.098716700944479e-05,
+      "loss": 0.3223,
+      "step": 1465
+    },
+    {
+      "epoch": 0.735,
+      "grad_norm": 0.12038140743970871,
+      "learning_rate": 5.993696620111741e-05,
+      "loss": 0.3481,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 0.12787550687789917,
+      "learning_rate": 5.889362404150703e-05,
+      "loss": 0.3766,
+      "step": 1475
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.12134893983602524,
+      "learning_rate": 5.7857219985158506e-05,
+      "loss": 0.2916,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7425,
+      "grad_norm": 0.1274223029613495,
+      "learning_rate": 5.682783295825345e-05,
+      "loss": 0.3095,
+      "step": 1485
+    },
+    {
+      "epoch": 0.745,
+      "grad_norm": 0.11817299574613571,
+      "learning_rate": 5.580554135259932e-05,
+      "loss": 0.3422,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7475,
+      "grad_norm": 0.1348387748003006,
+      "learning_rate": 5.479042301965987e-05,
+      "loss": 0.4044,
+      "step": 1495
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.14032681286334991,
+      "learning_rate": 5.378255526462631e-05,
+      "loss": 0.337,
+      "step": 1500
+    },
+    {
+      "epoch": 0.7525,
+      "grad_norm": 0.1196574866771698,
+      "learning_rate": 5.2782014840530366e-05,
+      "loss": 0.3638,
+      "step": 1505
+    },
+    {
+      "epoch": 0.755,
+      "grad_norm": 0.1307535171508789,
+      "learning_rate": 5.178887794239904e-05,
+      "loss": 0.3514,
+      "step": 1510
+    },
+    {
+      "epoch": 0.7575,
+      "grad_norm": 0.12303224951028824,
+      "learning_rate": 5.080322020145224e-05,
+      "loss": 0.3825,
+      "step": 1515
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.11517804116010666,
+      "learning_rate": 4.9825116679343025e-05,
+      "loss": 0.3474,
+      "step": 1520
+    },
+    {
+      "epoch": 0.7625,
+      "grad_norm": 0.1276445835828781,
+      "learning_rate": 4.885464186244154e-05,
+      "loss": 0.3084,
+      "step": 1525
+    },
+    {
+      "epoch": 0.765,
+      "grad_norm": 0.12166495621204376,
+      "learning_rate": 4.789186965616232e-05,
+      "loss": 0.2949,
+      "step": 1530
+    },
+    {
+      "epoch": 0.7675,
+      "grad_norm": 0.13007108867168427,
+      "learning_rate": 4.6936873379336564e-05,
+      "loss": 0.3336,
+      "step": 1535
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.12368687242269516,
+      "learning_rate": 4.598972575862803e-05,
+      "loss": 0.3443,
+      "step": 1540
+    },
+    {
+      "epoch": 0.7725,
+      "grad_norm": 0.11817432940006256,
+      "learning_rate": 4.5050498922995166e-05,
+      "loss": 0.3198,
+      "step": 1545
+    },
+    {
+      "epoch": 0.775,
+      "grad_norm": 0.13239014148712158,
+      "learning_rate": 4.4119264398197843e-05,
+      "loss": 0.3145,
+      "step": 1550
+    },
+    {
+      "epoch": 0.7775,
+      "grad_norm": 0.12305855751037598,
+      "learning_rate": 4.319609310135054e-05,
+      "loss": 0.3276,
+      "step": 1555
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.13063360750675201,
+      "learning_rate": 4.228105533552169e-05,
+      "loss": 0.4115,
+      "step": 1560
+    },
+    {
+      "epoch": 0.7825,
+      "grad_norm": 0.12751415371894836,
+      "learning_rate": 4.137422078437991e-05,
+      "loss": 0.4113,
+      "step": 1565
+    },
+    {
+      "epoch": 0.785,
+      "grad_norm": 0.1429520696401596,
+      "learning_rate": 4.0475658506887136e-05,
+      "loss": 0.3634,
+      "step": 1570
+    },
+    {
+      "epoch": 0.7875,
+      "grad_norm": 0.13072626292705536,
+      "learning_rate": 3.9585436932039846e-05,
+      "loss": 0.3914,
+      "step": 1575
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.13076546788215637,
+      "learning_rate": 3.870362385365755e-05,
+      "loss": 0.3153,
+      "step": 1580
+    },
+    {
+      "epoch": 0.7925,
+      "grad_norm": 0.11764945089817047,
+      "learning_rate": 3.7830286425220234e-05,
+      "loss": 0.331,
+      "step": 1585
+    },
+    {
+      "epoch": 0.795,
+      "grad_norm": 0.12469421327114105,
+      "learning_rate": 3.696549115475434e-05,
+      "loss": 0.3667,
+      "step": 1590
+    },
+    {
+      "epoch": 0.7975,
+      "grad_norm": 0.13257570564746857,
+      "learning_rate": 3.6109303899767875e-05,
+      "loss": 0.3775,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.1399105191230774,
+      "learning_rate": 3.5261789862235235e-05,
+      "loss": 0.3786,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8025,
+      "grad_norm": 0.1299823671579361,
+      "learning_rate": 3.442301358363163e-05,
+      "loss": 0.3984,
+      "step": 1605
+    },
+    {
+      "epoch": 0.805,
+      "grad_norm": 0.12068431079387665,
+      "learning_rate": 3.359303894001809e-05,
+      "loss": 0.3416,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8075,
+      "grad_norm": 0.12825050950050354,
+      "learning_rate": 3.277192913717717e-05,
+      "loss": 0.3973,
+      "step": 1615
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.12794139981269836,
+      "learning_rate": 3.195974670579941e-05,
+      "loss": 0.3942,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 0.1178906112909317,
+      "learning_rate": 3.115655349672141e-05,
+      "loss": 0.3549,
+      "step": 1625
+    },
+    {
+      "epoch": 0.815,
+      "grad_norm": 0.11859016120433807,
+      "learning_rate": 3.036241067621575e-05,
+      "loss": 0.3113,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8175,
+      "grad_norm": 0.12508928775787354,
+      "learning_rate": 2.9577378721332843e-05,
+      "loss": 0.3802,
+      "step": 1635
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.1293668895959854,
+      "learning_rate": 2.8801517415295455e-05,
+      "loss": 0.3098,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8225,
+      "grad_norm": 0.12039236724376678,
+      "learning_rate": 2.8034885842945865e-05,
+      "loss": 0.2876,
+      "step": 1645
+    },
+    {
+      "epoch": 0.825,
+      "grad_norm": 0.14805036783218384,
+      "learning_rate": 2.7277542386246454e-05,
+      "loss": 0.3618,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8275,
+      "grad_norm": 0.12638579308986664,
+      "learning_rate": 2.6529544719833706e-05,
+      "loss": 0.3328,
+      "step": 1655
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.12427478283643723,
+      "learning_rate": 2.5790949806625838e-05,
+      "loss": 0.3394,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8325,
+      "grad_norm": 0.1283419132232666,
+      "learning_rate": 2.5061813893485085e-05,
+      "loss": 0.3392,
+      "step": 1665
+    },
+    {
+      "epoch": 0.835,
+      "grad_norm": 0.12487384676933289,
+      "learning_rate": 2.434219250693419e-05,
+      "loss": 0.3592,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8375,
+      "grad_norm": 0.14032793045043945,
+      "learning_rate": 2.363214044892788e-05,
+      "loss": 0.4099,
+      "step": 1675
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.10917101800441742,
+      "learning_rate": 2.293171179267946e-05,
+      "loss": 0.3204,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8425,
+      "grad_norm": 0.1253073364496231,
+      "learning_rate": 2.2240959878542848e-05,
+      "loss": 0.3378,
+      "step": 1685
+    },
+    {
+      "epoch": 0.845,
+      "grad_norm": 0.14096981287002563,
+      "learning_rate": 2.155993730995077e-05,
+      "loss": 0.378,
+      "step": 1690
+    },
+    {
+      "epoch": 0.8475,
+      "grad_norm": 0.12039178609848022,
+      "learning_rate": 2.0888695949408468e-05,
+      "loss": 0.3197,
+      "step": 1695
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.12723132967948914,
+      "learning_rate": 2.0227286914544353e-05,
+      "loss": 0.3241,
+      "step": 1700
+    },
+    {
+      "epoch": 0.8525,
+      "grad_norm": 0.1309029906988144,
+      "learning_rate": 1.9575760574217147e-05,
+      "loss": 0.3743,
+      "step": 1705
+    },
+    {
+      "epoch": 0.855,
+      "grad_norm": 0.1324499100446701,
+      "learning_rate": 1.893416654468022e-05,
+      "loss": 0.345,
+      "step": 1710
+    },
+    {
+      "epoch": 0.8575,
+      "grad_norm": 0.11905783414840698,
+      "learning_rate": 1.8302553685802917e-05,
+      "loss": 0.3514,
+      "step": 1715
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.12570443749427795,
+      "learning_rate": 1.768097009734985e-05,
+      "loss": 0.3791,
+      "step": 1720
+    },
+    {
+      "epoch": 0.8625,
+      "grad_norm": 0.13414913415908813,
+      "learning_rate": 1.7069463115317788e-05,
+      "loss": 0.3575,
+      "step": 1725
+    },
+    {
+      "epoch": 0.865,
+      "grad_norm": 0.1283785104751587,
+      "learning_rate": 1.6468079308331023e-05,
+      "loss": 0.3496,
+      "step": 1730
+    },
+    {
+      "epoch": 0.8675,
+      "grad_norm": 0.11180217564105988,
+      "learning_rate": 1.587686447409478e-05,
+      "loss": 0.3245,
+      "step": 1735
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.13804157078266144,
+      "learning_rate": 1.5295863635907667e-05,
+      "loss": 0.367,
+      "step": 1740
+    },
+    {
+      "epoch": 0.8725,
+      "grad_norm": 0.12629055976867676,
+      "learning_rate": 1.4725121039232945e-05,
+      "loss": 0.293,
+      "step": 1745
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 0.12774884700775146,
+      "learning_rate": 1.4164680148329088e-05,
+      "loss": 0.3798,
+      "step": 1750
+    },
+    {
+      "epoch": 0.8775,
+      "grad_norm": 0.11681339889764786,
+      "learning_rate": 1.3614583642939718e-05,
+      "loss": 0.3474,
+      "step": 1755
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.14510560035705566,
+      "learning_rate": 1.3074873415043591e-05,
+      "loss": 0.3999,
+      "step": 1760
+    },
+    {
+      "epoch": 0.8825,
+      "grad_norm": 0.1168401762843132,
+      "learning_rate": 1.2545590565664054e-05,
+      "loss": 0.3398,
+      "step": 1765
+    },
+    {
+      "epoch": 0.885,
+      "grad_norm": 0.1411600410938263,
+      "learning_rate": 1.2026775401739348e-05,
+      "loss": 0.3346,
+      "step": 1770
+    },
+    {
+      "epoch": 0.8875,
+      "grad_norm": 0.12797729671001434,
+      "learning_rate": 1.1518467433052863e-05,
+      "loss": 0.3742,
+      "step": 1775
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.12946921586990356,
+      "learning_rate": 1.1020705369224414e-05,
+      "loss": 0.3436,
+      "step": 1780
+    },
+    {
+      "epoch": 0.8925,
+      "grad_norm": 0.13285613059997559,
+      "learning_rate": 1.0533527116762296e-05,
+      "loss": 0.3186,
+      "step": 1785
+    },
+    {
+      "epoch": 0.895,
+      "grad_norm": 0.15213604271411896,
+      "learning_rate": 1.005696977617666e-05,
+      "loss": 0.3629,
+      "step": 1790
+    },
+    {
+      "epoch": 0.8975,
+      "grad_norm": 0.12391404062509537,
+      "learning_rate": 9.591069639154008e-06,
+      "loss": 0.3421,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.11592845618724823,
+      "learning_rate": 9.135862185793636e-06,
+      "loss": 0.3107,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9025,
+      "grad_norm": 0.12540902197360992,
+      "learning_rate": 8.691382081905496e-06,
+      "loss": 0.3605,
+      "step": 1805
+    },
+    {
+      "epoch": 0.905,
+      "grad_norm": 0.14459215104579926,
+      "learning_rate": 8.257663176370389e-06,
+      "loss": 0.3884,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9075,
+      "grad_norm": 0.14139464497566223,
+      "learning_rate": 7.834738498562165e-06,
+      "loss": 0.3728,
+      "step": 1815
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.12125397473573685,
+      "learning_rate": 7.422640255832446e-06,
+      "loss": 0.3237,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9125,
+      "grad_norm": 0.13039612770080566,
+      "learning_rate": 7.021399831057961e-06,
+      "loss": 0.3055,
+      "step": 1825
+    },
+    {
+      "epoch": 0.915,
+      "grad_norm": 0.1337701678276062,
+      "learning_rate": 6.631047780250481e-06,
+      "loss": 0.368,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9175,
+      "grad_norm": 0.13020606338977814,
+      "learning_rate": 6.251613830230013e-06,
+      "loss": 0.3262,
+      "step": 1835
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.12915077805519104,
+      "learning_rate": 5.883126876360872e-06,
+      "loss": 0.3428,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9225,
+      "grad_norm": 0.12774400413036346,
+      "learning_rate": 5.525614980351284e-06,
+      "loss": 0.3735,
+      "step": 1845
+    },
+    {
+      "epoch": 0.925,
+      "grad_norm": 0.12587039172649384,
+      "learning_rate": 5.1791053681162545e-06,
+      "loss": 0.3402,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9275,
+      "grad_norm": 0.12152459472417831,
+      "learning_rate": 4.843624427704329e-06,
+      "loss": 0.2968,
+      "step": 1855
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.11444247514009476,
+      "learning_rate": 4.519197707287986e-06,
+      "loss": 0.3448,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9325,
+      "grad_norm": 0.12532518804073334,
+      "learning_rate": 4.2058499132180734e-06,
+      "loss": 0.3613,
+      "step": 1865
+    },
+    {
+      "epoch": 0.935,
+      "grad_norm": 0.14186476171016693,
+      "learning_rate": 3.903604908142266e-06,
+      "loss": 0.2887,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 0.13014192879199982,
+      "learning_rate": 3.6124857091878845e-06,
+      "loss": 0.2679,
+      "step": 1875
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.1259031891822815,
+      "learning_rate": 3.3325144862090648e-06,
+      "loss": 0.2993,
+      "step": 1880
+    },
+    {
+      "epoch": 0.9425,
+      "grad_norm": 0.12168288230895996,
+      "learning_rate": 3.0637125600983916e-06,
+      "loss": 0.3317,
+      "step": 1885
+    },
+    {
+      "epoch": 0.945,
+      "grad_norm": 0.12291324138641357,
+      "learning_rate": 2.8061004011632302e-06,
+      "loss": 0.3311,
+      "step": 1890
+    },
+    {
+      "epoch": 0.9475,
+      "grad_norm": 0.13629783689975739,
+      "learning_rate": 2.5596976275668757e-06,
+      "loss": 0.3456,
+      "step": 1895
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.17415851354599,
+      "learning_rate": 2.324523003834511e-06,
+      "loss": 0.3589,
+      "step": 1900
+    },
+    {
+      "epoch": 0.9525,
+      "grad_norm": 0.1330641210079193,
+      "learning_rate": 2.100594439424269e-06,
+      "loss": 0.3826,
+      "step": 1905
+    },
+    {
+      "epoch": 0.955,
+      "grad_norm": 0.14203837513923645,
+      "learning_rate": 1.8879289873632907e-06,
+      "loss": 0.3807,
+      "step": 1910
+    },
+    {
+      "epoch": 0.9575,
+      "grad_norm": 0.1222100704908371,
+      "learning_rate": 1.686542842949129e-06,
+      "loss": 0.3084,
+      "step": 1915
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.1441483348608017,
+      "learning_rate": 1.4964513425163694e-06,
+      "loss": 0.3871,
+      "step": 1920
+    },
+    {
+      "epoch": 0.9625,
+      "grad_norm": 0.1402144581079483,
+      "learning_rate": 1.3176689622687474e-06,
+      "loss": 0.3192,
+      "step": 1925
+    },
+    {
+      "epoch": 0.965,
+      "grad_norm": 0.13284745812416077,
+      "learning_rate": 1.1502093171766979e-06,
+      "loss": 0.359,
+      "step": 1930
+    },
+    {
+      "epoch": 0.9675,
+      "grad_norm": 0.1253402829170227,
+      "learning_rate": 9.94085159940533e-07,
+      "loss": 0.3214,
+      "step": 1935
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.13589312136173248,
+      "learning_rate": 8.493083800193034e-07,
+      "loss": 0.3524,
+      "step": 1940
+    },
+    {
+      "epoch": 0.9725,
+      "grad_norm": 0.13623379170894623,
+      "learning_rate": 7.158900027253223e-07,
+      "loss": 0.3711,
+      "step": 1945
+    },
+    {
+      "epoch": 0.975,
+      "grad_norm": 0.12516111135482788,
+      "learning_rate": 5.9384018838457e-07,
+      "loss": 0.3487,
+      "step": 1950
+    },
+    {
+      "epoch": 0.9775,
+      "grad_norm": 0.1211727038025856,
+      "learning_rate": 4.831682315629304e-07,
+      "loss": 0.3079,
+      "step": 1955
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.1348896622657776,
+      "learning_rate": 3.8388256035840615e-07,
+      "loss": 0.322,
+      "step": 1960
+    },
+    {
+      "epoch": 0.9825,
+      "grad_norm": 0.12953124940395355,
+      "learning_rate": 2.959907357592661e-07,
+      "loss": 0.3054,
+      "step": 1965
+    },
+    {
+      "epoch": 0.985,
+      "grad_norm": 0.12745600938796997,
+      "learning_rate": 2.1949945106823909e-07,
+      "loss": 0.3208,
+      "step": 1970
+    },
+    {
+      "epoch": 0.9875,
+      "grad_norm": 0.13108642399311066,
+      "learning_rate": 1.544145313928047e-07,
+      "loss": 0.3641,
+      "step": 1975
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.12415596097707748,
+      "learning_rate": 1.0074093320156517e-07,
+      "loss": 0.3141,
+      "step": 1980
+    },
+    {
+      "epoch": 0.9925,
+      "grad_norm": 0.12116590887308121,
+      "learning_rate": 5.8482743946847153e-08,
+      "loss": 0.3085,
+      "step": 1985
+    },
+    {
+      "epoch": 0.995,
+      "grad_norm": 0.12617753446102142,
+      "learning_rate": 2.764318175336733e-08,
+      "loss": 0.316,
+      "step": 1990
+    },
+    {
+      "epoch": 0.9975,
+      "grad_norm": 0.13097520172595978,
+      "learning_rate": 8.224595173178527e-09,
+      "loss": 0.2772,
+      "step": 1995
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1454041749238968,
+      "learning_rate": 2.284630068460913e-10,
+      "loss": 0.3226,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.629578157719552e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama-hugcoder/checkpoint-2000/training_args.bin b/codellama-hugcoder/checkpoint-2000/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-2000/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304
diff --git a/codellama-hugcoder/checkpoint-500/README.md b/codellama-hugcoder/checkpoint-500/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..a8daf76b93ee8508456247003fa155d657d2e354
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-500/README.md
@@ -0,0 +1,202 @@
+---
+base_model: codellama/CodeLlama-7b-Instruct-hf
+library_name: peft
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+### Framework versions
+
+- PEFT 0.15.2.dev0
\ No newline at end of file
diff --git a/codellama-hugcoder/checkpoint-500/adapter_config.json b/codellama-hugcoder/checkpoint-500/adapter_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..5103ef08b354afd3803f5061efdf7a079a493cb1
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-500/adapter_config.json
@@ -0,0 +1,39 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}
\ No newline at end of file
diff --git a/codellama-hugcoder/checkpoint-500/adapter_model.safetensors b/codellama-hugcoder/checkpoint-500/adapter_model.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..efa8f0d96c743dc9c45dbe44b6751e8b825ccea1
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-500/adapter_model.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ba0a03baab18f0cdae4dfc77bf7b41f7d1435807efac74517b5672e9ef8bedf1
+size 319876032
diff --git a/codellama-hugcoder/checkpoint-500/optimizer.pt b/codellama-hugcoder/checkpoint-500/optimizer.pt
new file mode 100644
index 0000000000000000000000000000000000000000..7893a45a12038c1e0135346bdb8820585f26d596
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-500/optimizer.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2dad4d0839af192a8e721c020748fcd5843aa02d4b867cd03a6da416f3b15a8e
+size 640009682
diff --git a/codellama-hugcoder/checkpoint-500/rng_state.pth b/codellama-hugcoder/checkpoint-500/rng_state.pth
new file mode 100644
index 0000000000000000000000000000000000000000..35d7a6ea2da3f55cb152eac5b23f5fd9af183676
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-500/rng_state.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8b3fe293b4ac5ae1cf2f114644c15f2a8317440ebc1144a8065f3fe94c0e32b8
+size 14244
diff --git a/codellama-hugcoder/checkpoint-500/scheduler.pt b/codellama-hugcoder/checkpoint-500/scheduler.pt
new file mode 100644
index 0000000000000000000000000000000000000000..573d26ce871909d4cca478cf0893de681cf14192
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-500/scheduler.pt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:12f207d7fee0843ba3ccc634c56e770b9b0bfb3e3b7ef4379b8fc405b4c45a03
+size 1064
diff --git a/codellama-hugcoder/checkpoint-500/trainer_state.json b/codellama-hugcoder/checkpoint-500/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..7d81dd11b86352eda67139874fba12a09a20421d
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-500/trainer_state.json
@@ -0,0 +1,734 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.25,
+  "eval_steps": 100.0,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.09379793703556061,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.1399833709001541,
+      "learning_rate": 1.3499999999999998e-05,
+      "loss": 0.6954,
+      "step": 10
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.08632303029298782,
+      "learning_rate": 2.1e-05,
+      "loss": 0.6921,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.10006701201200485,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 0.69,
+      "step": 20
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.07633858919143677,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.6722,
+      "step": 25
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.09399061650037766,
+      "learning_rate": 4.3499999999999993e-05,
+      "loss": 0.6453,
+      "step": 30
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.0843738541007042,
+      "learning_rate": 5.1e-05,
+      "loss": 0.6276,
+      "step": 35
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.08583351224660873,
+      "learning_rate": 5.85e-05,
+      "loss": 0.58,
+      "step": 40
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.09571370482444763,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.6355,
+      "step": 45
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.1083935871720314,
+      "learning_rate": 7.35e-05,
+      "loss": 0.589,
+      "step": 50
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.10387319326400757,
+      "learning_rate": 8.1e-05,
+      "loss": 0.6061,
+      "step": 55
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.11083361506462097,
+      "learning_rate": 8.849999999999998e-05,
+      "loss": 0.572,
+      "step": 60
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.12665686011314392,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.5442,
+      "step": 65
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.1308053582906723,
+      "learning_rate": 0.00010349999999999998,
+      "loss": 0.6524,
+      "step": 70
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.13535510003566742,
+      "learning_rate": 0.00011099999999999999,
+      "loss": 0.6404,
+      "step": 75
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12833671271800995,
+      "learning_rate": 0.0001185,
+      "loss": 0.5717,
+      "step": 80
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.11962099373340607,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.6098,
+      "step": 85
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.13898271322250366,
+      "learning_rate": 0.0001335,
+      "loss": 0.6099,
+      "step": 90
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.14486610889434814,
+      "learning_rate": 0.00014099999999999998,
+      "loss": 0.5744,
+      "step": 95
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1432138830423355,
+      "learning_rate": 0.00014849999999999998,
+      "loss": 0.5659,
+      "step": 100
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 0.13487878441810608,
+      "learning_rate": 0.000156,
+      "loss": 0.5622,
+      "step": 105
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.12495309859514236,
+      "learning_rate": 0.0001635,
+      "loss": 0.5951,
+      "step": 110
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.13011734187602997,
+      "learning_rate": 0.00017099999999999998,
+      "loss": 0.6249,
+      "step": 115
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.13987745344638824,
+      "learning_rate": 0.00017849999999999997,
+      "loss": 0.559,
+      "step": 120
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.13373605906963348,
+      "learning_rate": 0.000186,
+      "loss": 0.5475,
+      "step": 125
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.12433867901563644,
+      "learning_rate": 0.0001935,
+      "loss": 0.5274,
+      "step": 130
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.11097615957260132,
+      "learning_rate": 0.000201,
+      "loss": 0.678,
+      "step": 135
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1155027225613594,
+      "learning_rate": 0.00020849999999999997,
+      "loss": 0.5611,
+      "step": 140
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.11431068181991577,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.6054,
+      "step": 145
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.09796140342950821,
+      "learning_rate": 0.00022349999999999998,
+      "loss": 0.5472,
+      "step": 150
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.09489257633686066,
+      "learning_rate": 0.00023099999999999998,
+      "loss": 0.4636,
+      "step": 155
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.10787788033485413,
+      "learning_rate": 0.0002385,
+      "loss": 0.6164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.10261733084917068,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.5408,
+      "step": 165
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.11870352178812027,
+      "learning_rate": 0.0002535,
+      "loss": 0.5268,
+      "step": 170
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.11910569667816162,
+      "learning_rate": 0.000261,
+      "loss": 0.5461,
+      "step": 175
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.10083702206611633,
+      "learning_rate": 0.00026849999999999997,
+      "loss": 0.4794,
+      "step": 180
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.10453511029481888,
+      "learning_rate": 0.000276,
+      "loss": 0.5539,
+      "step": 185
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.101403146982193,
+      "learning_rate": 0.00028349999999999995,
+      "loss": 0.5346,
+      "step": 190
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.10724789649248123,
+      "learning_rate": 0.00029099999999999997,
+      "loss": 0.6026,
+      "step": 195
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1140277311205864,
+      "learning_rate": 0.0002985,
+      "loss": 0.5193,
+      "step": 200
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.09706108272075653,
+      "learning_rate": 0.0002999963446058092,
+      "loss": 0.54,
+      "step": 205
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.10003062337636948,
+      "learning_rate": 0.0002999814948722491,
+      "loss": 0.5365,
+      "step": 210
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.1078687533736229,
+      "learning_rate": 0.00029995522346717746,
+      "loss": 0.5889,
+      "step": 215
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.10538115352392197,
+      "learning_rate": 0.0002999175323912636,
+      "loss": 0.5611,
+      "step": 220
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.1020808294415474,
+      "learning_rate": 0.00029986842451482874,
+      "loss": 0.6103,
+      "step": 225
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.09635835886001587,
+      "learning_rate": 0.0002998079035776279,
+      "loss": 0.5229,
+      "step": 230
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.10287190228700638,
+      "learning_rate": 0.0002997359741885648,
+      "loss": 0.5312,
+      "step": 235
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.09160075336694717,
+      "learning_rate": 0.0002996526418253408,
+      "loss": 0.5673,
+      "step": 240
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.08691006153821945,
+      "learning_rate": 0.000299557912834038,
+      "loss": 0.5326,
+      "step": 245
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.10096988826990128,
+      "learning_rate": 0.00029945179442863594,
+      "loss": 0.6004,
+      "step": 250
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.09594204276800156,
+      "learning_rate": 0.000299334294690462,
+      "loss": 0.5516,
+      "step": 255
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.10281919687986374,
+      "learning_rate": 0.00029920542256757607,
+      "loss": 0.5515,
+      "step": 260
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.08547840267419815,
+      "learning_rate": 0.00029906518787408944,
+      "loss": 0.5243,
+      "step": 265
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.10161560773849487,
+      "learning_rate": 0.0002989136012894168,
+      "loss": 0.5096,
+      "step": 270
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.09101904183626175,
+      "learning_rate": 0.0002987506743574635,
+      "loss": 0.553,
+      "step": 275
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.09769442677497864,
+      "learning_rate": 0.0002985764194857463,
+      "loss": 0.4953,
+      "step": 280
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.10991579294204712,
+      "learning_rate": 0.00029839084994444826,
+      "loss": 0.5152,
+      "step": 285
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.09450916200876236,
+      "learning_rate": 0.00029819397986540836,
+      "loss": 0.5397,
+      "step": 290
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.10876069217920303,
+      "learning_rate": 0.0002979858242410454,
+      "loss": 0.4858,
+      "step": 295
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.097995825111866,
+      "learning_rate": 0.00029776639892321606,
+      "loss": 0.5566,
+      "step": 300
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.1145048514008522,
+      "learning_rate": 0.0002975357206220079,
+      "loss": 0.4531,
+      "step": 305
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.10271880775690079,
+      "learning_rate": 0.00029729380690446654,
+      "loss": 0.5199,
+      "step": 310
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 0.11095371842384338,
+      "learning_rate": 0.0002970406761932583,
+      "loss": 0.5416,
+      "step": 315
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09949438273906708,
+      "learning_rate": 0.00029677634776526673,
+      "loss": 0.4841,
+      "step": 320
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.1163724958896637,
+      "learning_rate": 0.00029650084175012517,
+      "loss": 0.4913,
+      "step": 325
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.10726840049028397,
+      "learning_rate": 0.00029621417912868323,
+      "loss": 0.5203,
+      "step": 330
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.09609931707382202,
+      "learning_rate": 0.00029591638173140947,
+      "loss": 0.5607,
+      "step": 335
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.10824442654848099,
+      "learning_rate": 0.0002956074722367286,
+      "loss": 0.6004,
+      "step": 340
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.10465679317712784,
+      "learning_rate": 0.00029528747416929463,
+      "loss": 0.5216,
+      "step": 345
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.10518354922533035,
+      "learning_rate": 0.0002949564118981994,
+      "loss": 0.499,
+      "step": 350
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 0.0955279991030693,
+      "learning_rate": 0.0002946143106351165,
+      "loss": 0.5607,
+      "step": 355
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.11159654706716537,
+      "learning_rate": 0.0002942611964323817,
+      "loss": 0.5204,
+      "step": 360
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 0.09571187198162079,
+      "learning_rate": 0.0002938970961810086,
+      "loss": 0.6113,
+      "step": 365
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.11854679882526398,
+      "learning_rate": 0.0002935220376086411,
+      "loss": 0.5639,
+      "step": 370
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.1050512045621872,
+      "learning_rate": 0.0002931360492774415,
+      "loss": 0.548,
+      "step": 375
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1053968220949173,
+      "learning_rate": 0.0002927391605819157,
+      "loss": 0.5507,
+      "step": 380
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.10567320138216019,
+      "learning_rate": 0.00029233140174667445,
+      "loss": 0.5312,
+      "step": 385
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.11914283782243729,
+      "learning_rate": 0.0002919128038241318,
+      "loss": 0.5961,
+      "step": 390
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.09915795922279358,
+      "learning_rate": 0.0002914833986921401,
+      "loss": 0.5086,
+      "step": 395
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10796502232551575,
+      "learning_rate": 0.0002910432190515628,
+      "loss": 0.5585,
+      "step": 400
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.10748997330665588,
+      "learning_rate": 0.00029059229842378373,
+      "loss": 0.5466,
+      "step": 405
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.10696308314800262,
+      "learning_rate": 0.0002901306711481544,
+      "loss": 0.5513,
+      "step": 410
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.10418657958507538,
+      "learning_rate": 0.0002896583723793792,
+      "loss": 0.5391,
+      "step": 415
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16421550512313843,
+      "learning_rate": 0.00028917543808483796,
+      "loss": 0.4699,
+      "step": 420
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.12929962575435638,
+      "learning_rate": 0.00028868190504184696,
+      "loss": 0.4984,
+      "step": 425
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 0.10469454526901245,
+      "learning_rate": 0.00028817781083485816,
+      "loss": 0.5119,
+      "step": 430
+    },
+    {
+      "epoch": 0.2175,
+      "grad_norm": 0.0964970663189888,
+      "learning_rate": 0.00028766319385259713,
+      "loss": 0.5167,
+      "step": 435
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12395574152469635,
+      "learning_rate": 0.00028713809328513953,
+      "loss": 0.5692,
+      "step": 440
+    },
+    {
+      "epoch": 0.2225,
+      "grad_norm": 0.10189738124608994,
+      "learning_rate": 0.0002866025491209265,
+      "loss": 0.4628,
+      "step": 445
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.10433454066514969,
+      "learning_rate": 0.0002860566021437197,
+      "loss": 0.4869,
+      "step": 450
+    },
+    {
+      "epoch": 0.2275,
+      "grad_norm": 0.13003456592559814,
+      "learning_rate": 0.0002855002939294951,
+      "loss": 0.5291,
+      "step": 455
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.11692202836275101,
+      "learning_rate": 0.000284933666843277,
+      "loss": 0.5229,
+      "step": 460
+    },
+    {
+      "epoch": 0.2325,
+      "grad_norm": 0.10757846385240555,
+      "learning_rate": 0.0002843567640359119,
+      "loss": 0.435,
+      "step": 465
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 0.10775501281023026,
+      "learning_rate": 0.00028376962944078206,
+      "loss": 0.4418,
+      "step": 470
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.11543692648410797,
+      "learning_rate": 0.00028317230777046015,
+      "loss": 0.4204,
+      "step": 475
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10946698486804962,
+      "learning_rate": 0.00028256484451330403,
+      "loss": 0.49,
+      "step": 480
+    },
+    {
+      "epoch": 0.2425,
+      "grad_norm": 0.11528221517801285,
+      "learning_rate": 0.00028194728592999247,
+      "loss": 0.4752,
+      "step": 485
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 0.10474205762147903,
+      "learning_rate": 0.0002813196790500027,
+      "loss": 0.4847,
+      "step": 490
+    },
+    {
+      "epoch": 0.2475,
+      "grad_norm": 0.10768820345401764,
+      "learning_rate": 0.00028068207166802837,
+      "loss": 0.4664,
+      "step": 495
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.12158560007810593,
+      "learning_rate": 0.00028003451234034037,
+      "loss": 0.4741,
+      "step": 500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.57394539429888e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}
diff --git a/codellama-hugcoder/checkpoint-500/training_args.bin b/codellama-hugcoder/checkpoint-500/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93
--- /dev/null
+++ b/codellama-hugcoder/checkpoint-500/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304
diff --git a/codellama-hugcoder/training_args.bin b/codellama-hugcoder/training_args.bin
new file mode 100644
index 0000000000000000000000000000000000000000..56c791e5156768b7f6e7b84cd13f2e54e47fad93
--- /dev/null
+++ b/codellama-hugcoder/training_args.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304
diff --git a/configs/deepspeed_config.yaml b/configs/deepspeed_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..7aad2cf962436ad604aa39e18b05c62879001373
--- /dev/null
+++ b/configs/deepspeed_config.yaml
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE                                                                                                                                           
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
\ No newline at end of file
diff --git a/configs/fsdp_config.yaml b/configs/fsdp_config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1089362fb2b8527f36e08ebb439644a89dbed6c0
--- /dev/null
+++ b/configs/fsdp_config.yaml
@@ -0,0 +1,25 @@
+compute_environment: LOCAL_MACHINE                                                                                                                                           
+debug: false                                                                                                                                                                 
+distributed_type: FSDP
+downcast_bf16: 'no'
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch_policy: BACKWARD_PRE
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_forward_prefetch: false
+  fsdp_offload_params: false
+  fsdp_sharding_strategy: 1
+  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
\ No newline at end of file
diff --git a/fim.py b/fim.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef1d57bc2cf9994a80ffa0239492bad0ba311854
--- /dev/null
+++ b/fim.py
@@ -0,0 +1,141 @@
+# coding=utf-8
+# Copyright 2024 Sourab Mangrulkar. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import numpy as np
+
+
+# this is expensive so we cache it (keyed on the tokenizer object, which must be hashable)
+@functools.lru_cache(maxsize=None)
+def get_fim_token_ids(tokenizer):
+    """Return ``(bos, suffix, prefix, middle, pad)`` FIM token ids for ``tokenizer``.
+
+    Known families are matched by substring on ``name_or_path``. Otherwise ``bos``
+    is ``None`` and the FIM tokens are read from positions 1-4 of
+    ``additional_special_tokens``; all four ids are ``None`` if that fails.
+    """
+    if "codellama" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.suffix_id,
+            tokenizer.prefix_id,
+            tokenizer.middle_id,
+            0,
+        )
+    if "deepseek-coder" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.encode("<｜fim▁hole｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<｜fim▁begin｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<｜fim▁end｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<pad>", add_special_tokens=False)[0],
+        )
+    if "stable-code" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.encode("<fim_suffix>")[0],
+            tokenizer.encode("<fim_prefix>")[0],
+            tokenizer.encode("<fim_middle>")[0],
+            tokenizer.encode("<fim_pad>")[0],
+        )
+    bos_token_id = None
+    try:
+        extra_tokens = tokenizer.special_tokens_map["additional_special_tokens"]
+        FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = extra_tokens[1:5]
+        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
+            tokenizer.vocab[tok]
+            for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
+        )
+    except (KeyError, ValueError):
+        # ValueError: fewer than four tokens in the slice, so the unpacking failed.
+        suffix_tok_id = prefix_tok_id = middle_tok_id = pad_tok_id = None
+    return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id
+
+
+def _bos_token_processing(prefix_token_list, bos_token):
+    """Prepend ``bos_token`` (unless ``None``) in place and return the same list."""
+    if bos_token is None:
+        return prefix_token_list
+    prefix_token_list[:0] = [bos_token]  # front insertion, mutating the caller's list
+    return prefix_token_list
+
+
+## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
+def permute(
+    sample,
+    np_rng,
+    suffix_tok_id,
+    prefix_tok_id,
+    middle_tok_id,
+    pad_tok_id,
+    fim_rate=0.5,
+    fim_spm_rate=0.5,
+    truncate_or_pad=False,
+    bos_token_id=None,
+):
+    """
+    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
+    PSM and SPM (with a probability of fim_spm_rate).
+
+    Args:
+        sample: sequence of token ids.
+        np_rng: random source providing ``binomial`` and ``randint`` (e.g. ``np.random.RandomState``).
+        suffix_tok_id, prefix_tok_id, middle_tok_id: FIM sentinel token ids.
+        pad_tok_id: id used to right-pad the suffix when the result would be too short.
+        fim_rate: probability of applying FIM to the sample.
+        fim_spm_rate: probability of SPM (rather than PSM) ordering when FIM is applied.
+        truncate_or_pad: trim/pad the suffix so the result keeps ``len(sample)`` tokens;
+            if the suffix is too short to absorb the cut, ``sample`` is returned unchanged.
+        bos_token_id: optional BOS id placed first in a FIM-transformed sample.
+
+    Returns:
+        ``(tokens, np_rng)``: the (possibly transformed) tokens and the same
+        random source that was passed in.
+    """
+    if not np_rng.binomial(1, fim_rate):
+        # don't do FIM preproc
+        return list(sample), np_rng
+
+    boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
+    boundaries.sort()
+    prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
+    middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
+    suffix = np.array(sample[boundaries[1] :], dtype=np.int64)
+
+    if truncate_or_pad:
+        # Sentinels added around the segments: three FIM markers plus the optional BOS.
+        num_special = 3 if bos_token_id is None else 4
+        new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + num_special
+        diff = new_length - len(sample)
+        if diff > 0:
+            if suffix.shape[0] <= diff:
+                # the suffix cannot absorb the extra tokens: skip FIM for this sample
+                return sample, np_rng
+            suffix = suffix[: suffix.shape[0] - diff]
+        elif diff < 0:
+            suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])
+
+    if np_rng.binomial(1, fim_spm_rate):
+        # SPM (variant 2 from FIM paper)
+        lead = _bos_token_processing([prefix_tok_id, suffix_tok_id], bos_token_id)
+        new_sample = np.concatenate([lead, suffix, [middle_tok_id], prefix, middle])
+    else:
+        # PSM
+        lead = _bos_token_processing([prefix_tok_id], bos_token_id)
+        new_sample = np.concatenate(
+            [lead, prefix, [suffix_tok_id], suffix, [middle_tok_id], middle]
+        )
+
+    return list(new_sample), np_rng
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b7b96512c409868cf9c62d05ed6254a9dc6bd5f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+git+https://github.com/huggingface/transformers
+git+https://github.com/huggingface/accelerate
+git+https://github.com/huggingface/peft
+trl
+huggingface-hub
+bitsandbytes
+evaluate
+datasets
+einops
+wandb
+tiktoken
+deepspeed
+tqdm
+safetensors
\ No newline at end of file
diff --git a/run_deepspeed.sh b/run_deepspeed.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4118c1ea6f811eaa05cc0c7ef15b34b0b8becf23
--- /dev/null
+++ b/run_deepspeed.sh
@@ -0,0 +1,33 @@
+accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_len 2048 \
+--max_steps 2000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 5 \
+--log_level "info" \
+--logging_strategy "steps" \
+--evaluation_strategy "steps" \
+--save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 2e-5 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder-df" \
+--per_device_train_batch_size 16 \
+--per_device_eval_batch_size 16 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant False \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.5 \
+--use_flash_attn True  # runs train.py through accelerate with configs/deepspeed_config.yaml; no LoRA/quantization flags are passed
\ No newline at end of file
diff --git a/run_fsdp.sh b/run_fsdp.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b8f816f6d05a629d27aa022c2424b8a12b27752f
--- /dev/null
+++ b/run_fsdp.sh
@@ -0,0 +1,33 @@
+accelerate launch --config_file "configs/fsdp_config.yaml" train.py \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_len 2048 \
+--max_steps 1000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 25 \
+--log_level "info" \
+--logging_strategy "steps" \
+--evaluation_strategy "steps" \
+--save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 1e-4 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder-fsdp" \
+--per_device_train_batch_size 16 \
+--per_device_eval_batch_size 16 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant True \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.5 \
+--use_flash_attn True  # runs train.py through accelerate with configs/fsdp_config.yaml; the model flag must match ModelArguments.model_name_or_path
\ No newline at end of file
diff --git a/run_peft.sh b/run_peft.sh
new file mode 100644
index 0000000000000000000000000000000000000000..540c0e1e72cd3ca63699ac7eaebf293d475951bd
--- /dev/null
+++ b/run_peft.sh
@@ -0,0 +1,40 @@
+CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python3 train.py \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_len 2048 \
+--max_steps 2000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 5 \
+--log_level "info" \
+--logging_strategy "steps" \
+--save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 3e-4 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder" \
+--per_device_train_batch_size 4 \
+--per_device_eval_batch_size 4 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant True \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.5 \
+--use_peft_lora True \
+--lora_r 32 \
+--lora_alpha 64 \
+--lora_dropout 0.1 \
+--lora_target_modules "all-linear" \
+--use_4bit_quantization True \
+--use_nested_quant True \
+--bnb_4bit_compute_dtype "bfloat16" \
+--use_flash_attn True  # plain python3 launch restricted to GPU 0; passes the LoRA and 4-bit quantization flags and sets WANDB_PROJECT
\ No newline at end of file
diff --git a/run_unsloth_peft.sh b/run_unsloth_peft.sh
new file mode 100644
index 0000000000000000000000000000000000000000..8d8a6b489e29c01cf5f7cdfc65cb97ce2da7de08
--- /dev/null
+++ b/run_unsloth_peft.sh
@@ -0,0 +1,43 @@
+CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python train.py \
+--seed 11 \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_len 2048 \
+--max_steps 2000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 5 \
+--log_level "info" \
+--logging_strategy "steps" \
+--evaluation_strategy "steps" \
+--save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 2e-4 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder" \
+--per_device_train_batch_size 16 \
+--per_device_eval_batch_size 16 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant True \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.0 \
+--use_peft_lora True \
+--lora_r 16 \
+--lora_alpha 16 \
+--lora_dropout 0.1 \
+--lora_target_modules "q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj" \
+--use_4bit_quantization True \
+--use_nested_quant True \
+--bnb_4bit_compute_dtype "bfloat16" \
+--use_flash_attn True \
+--use_unsloth True  # plain python launch restricted to GPU 0; passes the LoRA, 4-bit quantization and unsloth flags with a fixed seed
\ No newline at end of file
diff --git a/train.py b/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7d648ac1bcaddd7ff39da29193f1e7f2ccd9f35
--- /dev/null
+++ b/train.py
@@ -0,0 +1,495 @@
+# coding=utf-8
+# Copyright 2024 Sourab Mangrulkar. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Continued pre-training/fine-tuning of code LLMs for code autocompletion.
+"""
+
+import gc
+import os
+import random
+import sys
+from typing import Optional
+from dataclasses import dataclass, field
+
+import numpy as np
+import torch
+from datasets import load_dataset
+from torch.utils.data import IterableDataset
+from tqdm import tqdm
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+    HfArgumentParser,
+    set_seed,
+    BitsAndBytesConfig,
+)
+
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, replace_lora_weights_loftq
+import fim
+
+
+# Define and parse arguments.
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+
+    Besides the model identifier, this groups the LoRA hyper-parameters, the
+    bitsandbytes quantization switches and the optional Unsloth / LoftQ toggles
+    read by `create_and_prepare_model`, `loftq_init` and `main` in this file.
+    """
+
+    model_name_or_path: str = field(
+        metadata={
+            "help": "Path to pretrained model or model identifier from huggingface.co/models"
+        }
+    )
+    # Forwarded as `lora_alpha` when the LoRA adapter is created.
+    lora_alpha: Optional[int] = field(default=16)
+    # Forwarded as `lora_dropout` when the LoRA adapter is created.
+    lora_dropout: Optional[float] = field(default=0.1)
+    # Forwarded as the LoRA rank `r` when the LoRA adapter is created.
+    lora_r: Optional[int] = field(default=64)
+    # The literal value "all-linear" is passed through unsplit; any other value is split on ",".
+    lora_target_modules: Optional[str] = field(
+        default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj",
+        metadata={
+            "help": "comma separated list of target modules to apply LoRA layers to"
+        },
+    )
+    use_nested_quant: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Activate nested quantization for 4bit base models"},
+    )
+    # Name of a `torch` dtype attribute; resolved with `getattr(torch, ...)`.
+    bnb_4bit_compute_dtype: Optional[str] = field(
+        default="float16",
+        metadata={"help": "Compute dtype for 4bit base models"},
+    )
+    bnb_4bit_quant_type: Optional[str] = field(
+        default="nf4",
+        metadata={"help": "Quantization type fp4 or nf4"},
+    )
+    use_flash_attn: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables Flash attention for training."},
+    )
+    use_peft_lora: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables PEFT LoRA for training."},
+    )
+    # NOTE: the field name is misspelled ("qunatization"). It is kept as is because
+    # it defines the command-line flag and is referenced under this spelling elsewhere in this file.
+    use_8bit_qunatization: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables loading model in 8bit."},
+    )
+    use_4bit_quantization: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables loading model in 4bit."},
+    )
+    use_reentrant: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Gradient Checkpointing param. Refer the related docs"},
+    )
+    use_unsloth: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables UnSloth for training."},
+    )
+    # Only acted upon in `main` when `use_4bit_quantization` is also set.
+    use_loftq: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables LoftQ init for the LoRA adapters when using QLoRA."},
+    )
+    use_loftq_callback: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables LoftQ callback comparing logits of base model to the ones from LoftQ init. Provides better init."},
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    dataset_name: Optional[str] = field(
+        default="smangrul/hug_stack",
+        metadata={"help": "The preference dataset to use."},
+    )
+    dataset_text_field: str = field(
+        default="text", metadata={"help": "Dataset field to use as input text."}
+    )
+    max_seq_length: Optional[int] = field(default=4096)
+    test_size: Optional[float] = field(default=0.1)
+    fim_rate: Optional[float] = field(default=0.5)
+    fim_spm_rate: Optional[float] = field(default=0.5)
+    splits: Optional[str] = field(
+        default="train",
+        metadata={"help": "Comma separate list of the splits to use from the dataset."},
+    )
+
+
+def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
+    """
+    Estimate the average number of characters per token in the dataset.
+    """
+    total_characters, total_tokens = 0, 0
+    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
+        total_characters += len(example[data_column])
+        total_tokens += len(tokenizer(example[data_column]).tokens())
+
+    return total_characters / total_tokens
+
+
+class ConstantLengthDataset(IterableDataset):
+    """
+    Iterable dataset that returns constant length chunks of tokens from stream of text files.
+        Args:
+            tokenizer (Tokenizer): The processor used for processing the data.
+            dataset (dataset.Dataset): Dataset with text files.
+            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
+            seq_length (int): Length of token sequences to return.
+            num_of_sequences (int): Number of token sequences to keep in buffer.
+            chars_per_token (float): Number of characters per token used to estimate number of tokens in text buffer.
+            content_field (str): Key of the text field read from every dataset example.
+            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
+            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permutations that will use SPM.
+            seed (int): Seed for random number generator.
+            shuffle (bool): If True, the chunks cut from each buffer are shuffled before being yielded.
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        dataset,
+        infinite=False,
+        seq_length=1024,
+        num_of_sequences=1024,
+        chars_per_token=3.6,
+        content_field="content",
+        fim_rate=0.5,
+        fim_spm_rate=0.5,
+        seed=0,
+        shuffle=False,
+    ):
+        self.tokenizer = tokenizer
+        # The EOS token id is appended after every document when documents are concatenated.
+        self.concat_token_id = tokenizer.eos_token_id
+        self.dataset = dataset
+        self.seq_length = seq_length
+        self.infinite = infinite
+        # Running count of examples yielded so far (incremented in __iter__).
+        self.current_size = 0
+        # Buffer budget, compared against the summed len() of the buffered texts.
+        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
+        self.content_field = content_field
+        self.fim_rate = fim_rate
+        self.fim_spm_rate = fim_spm_rate
+        self.seed = seed
+        self.shuffle = shuffle
+
+        (
+            self.bos_token_id,
+            self.suffix_tok_id,
+            self.prefix_tok_id,
+            self.middle_tok_id,
+            self.pad_tok_id,
+        ) = fim.get_fim_token_ids(self.tokenizer)
+        # A falsy suffix token id is treated as "tokenizer has no FIM tokens".
+        if not self.suffix_tok_id and self.fim_rate > 0:
+            print("FIM is not supported by tokenizer, disabling FIM")
+            self.fim_rate = 0
+
+    def __iter__(self):
+        """
+        Yield dicts with identical `input_ids` and `labels` LongTensors of length `seq_length`.
+
+        Texts are buffered until the buffer budget is reached, tokenized in one batch,
+        optionally FIM-permuted, concatenated with the EOS token as separator and cut
+        into `seq_length` sized chunks.
+        """
+        iterator = iter(self.dataset)
+        more_examples = True
+        # The FIM random state is re-created from `self.seed` on every call to __iter__.
+        np_rng = np.random.RandomState(seed=self.seed)
+        while more_examples:
+            buffer, buffer_len = [], 0
+            while True:
+                if buffer_len >= self.max_buffer_size:
+                    break
+                try:
+                    buffer.append(next(iterator)[self.content_field])
+                    buffer_len += len(buffer[-1])
+                except StopIteration:
+                    if self.infinite:
+                        # Start over from the beginning of the dataset and keep filling the buffer.
+                        iterator = iter(self.dataset)
+                    else:
+                        # Process what is buffered, then leave the outer loop.
+                        more_examples = False
+                        break
+            tokenized_inputs = self.tokenizer(
+                buffer, truncation=False, add_special_tokens=False
+            )["input_ids"]
+            all_token_ids = []
+
+            for tokenized_input in tokenized_inputs:
+                # optionally do FIM permutations
+                if self.fim_rate > 0:
+                    # `fim.permute` returns the (possibly permuted) tokens and the random state to keep using.
+                    tokenized_input, np_rng = fim.permute(
+                        tokenized_input,
+                        np_rng,
+                        self.suffix_tok_id,
+                        self.prefix_tok_id,
+                        self.middle_tok_id,
+                        self.pad_tok_id,
+                        fim_rate=self.fim_rate,
+                        fim_spm_rate=self.fim_spm_rate,
+                        truncate_or_pad=False,
+                        bos_token_id=self.bos_token_id,
+                    )
+
+                all_token_ids.extend(tokenized_input + [self.concat_token_id])
+            examples = []
+            for i in range(0, len(all_token_ids), self.seq_length):
+                input_ids = all_token_ids[i : i + self.seq_length]
+                # A trailing chunk shorter than `seq_length` is dropped.
+                if len(input_ids) == self.seq_length:
+                    examples.append(input_ids)
+            if self.shuffle:
+                # Uses the module-level `random` state, not `np_rng`.
+                random.shuffle(examples)
+            for example in examples:
+                self.current_size += 1
+                yield {
+                    "input_ids": torch.LongTensor(example),
+                    "labels": torch.LongTensor(example),
+                }
+
+
+def create_datasets(tokenizer, args, seed):
+    dataset = load_dataset(args.dataset_name, split=args.splits)
+    dataset = dataset.train_test_split(
+        test_size=args.test_size, seed=seed, shuffle=True
+    )
+    train_data = dataset["train"]
+    valid_data = dataset["test"]
+    print(
+        f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
+    )
+    chars_per_token = chars_token_ratio(train_data, tokenizer, args.dataset_text_field)
+    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
+    train_dataset = ConstantLengthDataset(
+        tokenizer,
+        train_data,
+        infinite=True,
+        seq_length=args.max_seq_length,
+        chars_per_token=chars_per_token,
+        content_field=args.dataset_text_field,
+        fim_rate=args.fim_rate,
+        fim_spm_rate=args.fim_spm_rate,
+        seed=seed,
+        shuffle=True,
+    )
+    valid_dataset = ConstantLengthDataset(
+        tokenizer,
+        valid_data,
+        infinite=False,
+        seq_length=args.max_seq_length,
+        chars_per_token=chars_per_token,
+        content_field=args.dataset_text_field,
+        fim_rate=args.fim_rate,
+        fim_spm_rate=args.fim_spm_rate,
+        seed=seed,
+    )
+    print(f"A sample of valid dataset: {next(iter(valid_dataset))}")
+    return train_dataset, valid_dataset
+
+def get_mae(x, y):
+    return (x - y).abs().mean()
+
+
+def get_mse(x, y):
+    return torch.pow(x - y, 2).mean()
+
+
+def error_report(x, y):
+    mae = get_mae(x, y)
+    mse = get_mse(x, y)
+    print(
+        f"Mean absolute error: {mae:>8.5f}\n"
+        f"Mean squared error:  {mse:>8.5f}"
+    )
+
+    
+def loftq_init(model, tokenizer, train_dataset, max_seq_length, args):
+    if args.use_loftq_callback:
+        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
+        base_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=compute_dtype)
+        base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
+        random_input_ids = torch.randint(0, len(train_dataset), size=(1,)).numpy().tolist()
+        random_inputs = [train_dataset[i]['content'] for i in random_input_ids]
+        random_inputs = tokenizer(random_inputs, return_tensors="pt", padding=True, truncation="max_length", max_length=max_seq_length)
+        logits_base = base_model(**random_inputs).logits
+        del base_model
+        gc.collect()
+        
+        def loftq_callback(model, module_name):
+            """Callable to replace weights with LoFTQ if the mse is lower than the current best one."""
+            global current_mse
+            logits = model(**random_inputs).logits
+            mse = get_mse(logits_base, logits)
+            if mse < current_mse:
+                current_mse = mse
+                print(f"MSE improved for module {module_name}")
+                return True
+            print(f"MSE did not improve for module {module_name}")
+            return False
+        
+        replace_lora_weights_loftq(model, callback=loftq_callback)
+        logits_loftq_callback = model(**random_inputs).logits
+        error_report(logits_base, logits_loftq_callback)
+    else:
+        replace_lora_weights_loftq(model)
+
+
+def create_and_prepare_model(args, data_args, training_args):
+    device_map = None
+    bnb_config = None
+
+    load_in_8bit = args.use_8bit_qunatization
+    load_in_4bit = args.use_4bit_quantization
+
+    if args.use_unsloth:
+        from unsloth import FastLanguageModel
+
+    if args.use_4bit_quantization:
+        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
+
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=args.use_4bit_quantization,
+            bnb_4bit_quant_type=args.bnb_4bit_quant_type,
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_use_double_quant=args.use_nested_quant,
+        )
+
+        if compute_dtype == torch.float16 and args.use_4bit_quantization:
+            major, _ = torch.cuda.get_device_capability()
+            if major >= 8:
+                print("=" * 80)
+                print(
+                    "Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
+                )
+                print("=" * 80)
+
+    if args.use_4bit_quantization or args.use_8bit_qunatization:
+        device_map = (
+            int(os.environ.get("LOCAL_RANK", -1))
+            if torch.distributed.is_available() and torch.distributed.is_initialized()
+            else "auto"
+        )  # {"": 0}
+
+    if args.use_unsloth:
+        # Load model
+        model, _ = FastLanguageModel.from_pretrained(
+            model_name=args.model_name_or_path,
+            max_seq_length=data_args.max_seq_length,
+            dtype=None,
+            load_in_4bit=load_in_4bit,
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,
+            load_in_8bit=load_in_8bit,
+            quantization_config=bnb_config,
+            device_map=device_map,
+            trust_remote_code=True,
+            attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
+        )
+
+    if (
+        (args.use_4bit_quantization or args.use_8bit_qunatization)
+        and args.use_peft_lora
+        and not args.use_unsloth
+    ):
+        model = prepare_model_for_kbit_training(
+            model,
+            use_gradient_checkpointing=training_args.gradient_checkpointing,
+            gradient_checkpointing_kwargs={"use_reentrant": model_args.use_reentrant},
+        )
+
+    if args.use_peft_lora and not args.use_unsloth:
+        peft_config = LoraConfig(
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            r=args.lora_r,
+            bias="none",
+            task_type="CAUSAL_LM",
+            target_modules=args.lora_target_modules.split(",")
+            if args.lora_target_modules != "all-linear"
+            else args.lora_target_modules,
+        )
+        model = get_peft_model(model, peft_config)
+    elif args.use_peft_lora and args.use_unsloth:
+        # Do model patching and add fast LoRA weights
+        model = FastLanguageModel.get_peft_model(
+            model,
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            r=args.lora_r,
+            target_modules=args.lora_target_modules.split(",")
+            if args.lora_target_modules != "all-linear"
+            else args.lora_target_modules,
+            use_gradient_checkpointing=training_args.gradient_checkpointing,
+            random_state=training_args.seed,
+            max_seq_length=data_args.max_seq_length,
+        )
+    return model
+
+
+def main(model_args, data_args, training_args):
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+
+    # load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+
+    # load the datasets
+    train_dataset, eval_dataset = create_datasets(
+        tokenizer, data_args, training_args.seed
+    )
+    train_dataset.start_iteration = 0
+
+    model = create_and_prepare_model(model_args, data_args, training_args)
+    # gradient ckpt
+    model.config.use_cache = not training_args.gradient_checkpointing
+    training_args.gradient_checkpointing = (
+        training_args.gradient_checkpointing and not model_args.use_unsloth
+    )
+    if training_args.gradient_checkpointing:
+        training_args.gradient_checkpointing_kwargs = {
+            "use_reentrant": model_args.use_reentrant
+        }
+
+    # trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+    trainer.accelerator.print(f"{trainer.model}")
+    if model_args.use_peft_lora:
+        trainer.model.print_trainable_parameters()
+
+    # LoftQ initialization when using QLoRA
+    if model_args.use_4bit_quantization and model_args.use_loftq:
+        loftq_init(trainer.model, tokenizer, train_dataset, data_args.max_seq_length ,model_args)
+
+    # train
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    trainer.train(resume_from_checkpoint=checkpoint)
+
+    # saving final model
+    if trainer.is_fsdp_enabled:
+        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
+    trainer.save_model()
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser(
+        (ModelArguments, DataTrainingArguments, TrainingArguments)
+    )
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(
+            json_file=os.path.abspath(sys.argv[1])
+        )
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    main(model_args, data_args, training_args)
diff --git a/wandb/debug-internal.log b/wandb/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..ee5bf05e14a33a7993620274bc5fffbfa49ca1f9
--- /dev/null
+++ b/wandb/debug-internal.log
@@ -0,0 +1,15 @@
+{"time":"2025-05-16T07:37:47.253769219Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log"}
+{"time":"2025-05-16T07:37:47.470171926Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"}
+{"time":"2025-05-16T07:37:47.471043556Z","level":"INFO","msg":"created new stream","id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471071988Z","level":"INFO","msg":"stream: started","id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.47110475Z","level":"INFO","msg":"writer: Do: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471115831Z","level":"INFO","msg":"handler: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471180815Z","level":"INFO","msg":"sender: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.477686926Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-05-17T22:27:35.403843016Z","level":"INFO","msg":"stream: closing","id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.404804572Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-05-17T22:27:35.404850555Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-05-17T22:27:35.40493109Z","level":"INFO","msg":"handler: closed","stream_id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.404943001Z","level":"INFO","msg":"writer: Close: closed","stream_id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.404963782Z","level":"INFO","msg":"sender: closed","stream_id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.405060219Z","level":"INFO","msg":"stream: closed","id":"jc2tz43q"}
diff --git a/wandb/debug.log b/wandb/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..067508921187c34e3dc0bd7885357818572fbf91
--- /dev/null
+++ b/wandb/debug.log
@@ -0,0 +1,26 @@
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Configure stats pid to 29365
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():852] calling init triggers
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():893] starting backend
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():897] sending inform_init request
+2025-05-16 07:37:47,236 INFO    MainThread:29365 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-05-16 07:37:47,236 INFO    MainThread:29365 [wandb_init.py:init():907] backend started and connected
+2025-05-16 07:37:47,237 INFO    MainThread:29365 [wandb_init.py:init():1005] updated telemetry
+2025-05-16 07:37:47,244 INFO    MainThread:29365 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-05-16 07:37:47,473 INFO    MainThread:29365 [wandb_init.py:init():1104] starting run threads in backend
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_console_start():2573] atexit reg
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-05-16 07:37:47,857 INFO    MainThread:29365 [wandb_init.py:init():1150] run started, returning control to user process
+2025-05-16 07:37:47,859 INFO    MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'down_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj', 'gate_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 
'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 
'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-36-11_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': 
False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
+2025-05-16 07:37:47,866 INFO    MainThread:29365 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7fa638123580>>
+2025-05-16 07:37:47,866 INFO    MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None
+2025-05-17 22:27:35,403 INFO    MsgRouterThr:29365 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 0 handles.
diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/files/output.log b/wandb/offline-run-20250516_073234-ywsmjz3f/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/files/requirements.txt b/wandb/offline-run-20250516_073234-ywsmjz3f/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30272a20e03fdb784d0ddd410c9451a2fc8f06a3
--- /dev/null
+++ b/wandb/offline-run-20250516_073234-ywsmjz3f/files/requirements.txt
@@ -0,0 +1,359 @@
+huggingface-hub==0.31.2
+tokenizers==0.21.1
+bitsandbytes==0.45.5
+py-cpuinfo==9.0.0
+nvidia-ml-py==12.575.51
+hjson==3.1.0
+smmap==5.0.2
+setproctitle==1.3.6
+sentry-sdk==2.28.0
+ninja==1.11.1.4
+msgpack==1.1.0
+einops==0.8.1
+docker-pycreds==0.4.0
+gitdb==4.0.12
+GitPython==3.1.44
+wandb==0.19.11
+transformers==4.52.0.dev0
+deepspeed==0.16.7
+accelerate==1.8.0.dev0
+peft==0.15.2.dev0
+trl==0.17.0
+flash_attn==2.7.4.post1
+APScheduler==3.10.4
+Authlib==1.3.1
+Deprecated==1.2.18
+Flask-Cors==4.0.1
+Mako==1.3.8
+Markdown==3.6
+PyJWT==2.8.0
+PyMySQL==1.1.1
+PyPika==0.48.9
+RTFDE==0.1.2
+SQLAlchemy==2.0.31
+XlsxWriter==3.2.2
+aiohttp==3.9.5
+alembic==1.13.2
+annotated-types==0.7.0
+anthropic==0.45.2
+asgiref==3.8.1
+async-timeout==4.0.3
+av==12.3.0
+backoff==2.2.1
+bcrypt==4.1.3
+beautifulsoup4==4.12.3
+bidict==0.23.1
+black==24.8.0
+blinker==1.9.0
+boto3==1.34.153
+botocore==1.34.162
+build==1.2.2.post1
+cachetools==5.5.1
+chardet==5.2.0
+chroma-hnswlib==0.7.5
+chromadb==0.5.4
+click==8.1.8
+colorclass==2.2.2
+coloredlogs==15.0.1
+compressed-rtf==1.0.6
+cryptography==44.0.0
+ctranslate2==4.5.0
+dataclasses-json==0.6.7
+deepdiff==8.1.1
+distro==1.9.0
+dnspython==2.7.0
+docker==7.1.0
+docx2txt==0.8
+duckduckgo_search==6.2.13
+durationpy==0.9
+easygui==0.98.3
+ebcdic==1.1.1
+ecdsa==0.19.0
+email_validator==2.2.0
+emoji==2.14.1
+extract-msg==0.52.0
+fake-useragent==1.5.1
+fastapi==0.111.0
+fastapi-cli==0.0.7
+faster-whisper==1.0.2
+filetype==1.2.0
+Flask==3.0.3
+flatbuffers==25.1.24
+fonttools==4.55.8
+fpdf2==2.7.9
+google-ai-generativelanguage==0.6.6
+google-api-core==2.24.1
+google-api-python-client==2.160.0
+google-auth==2.38.0
+google-auth-httplib2==0.2.0
+google-generativeai==0.7.2
+googleapis-common-protos==1.66.0
+greenlet==3.1.1
+grpcio==1.70.0
+grpcio-status==1.62.3
+httplib2==0.22.0
+httptools==0.6.4
+humanfriendly==10.0
+importlib_metadata==8.4.0
+importlib_resources==6.5.2
+iniconfig==2.0.0
+itsdangerous==2.2.0
+jiter==0.8.2
+jmespath==1.0.1
+joblib==1.4.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+kubernetes==32.0.0
+langchain==0.2.11
+langchain-chroma==0.1.2
+langchain-community==0.2.10
+langchain-core==0.2.43
+langchain-text-splitters==0.2.4
+langdetect==1.0.9
+langfuse==2.39.2
+langsmith==0.1.147
+lark==1.1.9
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.26.0
+mdurl==0.1.2
+mmh3==5.1.0
+monotonic==1.6
+msoffcrypto-tool==5.4.2
+mypy-extensions==1.0.0
+nltk==3.9.1
+numpy==1.26.4
+oauthlib==3.2.2
+olefile==0.47
+oletools==0.60.2
+onnxruntime==1.20.1
+openai==1.61.0
+opencv-python==4.11.0.86
+opencv-python-headless==4.10.0.84
+opentelemetry-api==1.27.0
+opentelemetry-exporter-otlp-proto-common==1.27.0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-instrumentation==0.48b0
+opentelemetry-instrumentation-asgi==0.48b0
+opentelemetry-instrumentation-fastapi==0.48b0
+opentelemetry-proto==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-semantic-conventions==0.48b0
+opentelemetry-util-http==0.48b0
+orderly-set==5.2.3
+orjson==3.10.15
+packaging==23.2
+pandas==2.2.2
+passlib==1.7.4
+pathspec==0.12.1
+pcodedmp==1.2.6
+peewee==3.17.6
+peewee-migrate==1.12.2
+pillow==11.1.0
+pluggy==1.5.0
+posthog==3.11.0
+primp==0.11.0
+proto-plus==1.26.0
+protobuf==4.25.6
+psycopg2-binary==2.9.9
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pyclipper==1.3.0.post6
+pydantic==2.8.2
+pydantic_core==2.20.1
+pydub==0.25.1
+pymongo==4.11
+pypandoc==1.13
+pyparsing==3.2.1
+pypdf==4.3.1
+pyproject_hooks==1.2.0
+pytest==8.2.2
+pytest-docker==3.1.1
+python-dotenv==1.0.1
+python-engineio==4.11.2
+python-iso639==2025.1.28
+python-jose==3.3.0
+python-magic==0.4.27
+python-multipart==0.0.9
+python-pptx==1.0.0
+python-socketio==5.11.3
+pytube==15.0.0
+pyxlsb==1.0.10
+rank-bm25==0.2.2
+RapidFuzz==3.12.1
+rapidocr-onnxruntime==1.3.24
+red-black-tree-mod==1.20
+redis==5.2.1
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==13.9.4
+rich-toolkit==0.13.2
+rsa==4.9
+s3transfer==0.10.4
+scikit-learn==1.6.1
+scipy==1.15.1
+sentence-transformers==3.0.1
+shapely==2.0.7
+shellingham==1.5.4
+simple-websocket==1.1.0
+starlette==0.37.2
+tabulate==0.9.0
+tenacity==8.5.0
+threadpoolctl==3.5.0
+tiktoken==0.8.0
+typer==0.15.1
+typing-inspect==0.9.0
+tzlocal==5.2
+ujson==5.10.0
+unstructured==0.15.0
+unstructured-client==0.25.9
+uritemplate==4.1.1
+uvicorn==0.22.0
+uvloop==0.21.0
+validators==0.33.0
+watchfiles==1.0.4
+websockets==14.2
+Werkzeug==3.1.3
+wrapt==1.17.2
+wsproto==1.2.0
+xlrd==2.0.1
+youtube-transcript-api==0.6.2
+zipp==3.21.0
+aiohappyeyeballs==2.4.4
+aiosignal==1.3.2
+datasets==3.2.0
+dill==0.3.8
+et_xmlfile==2.0.0
+evaluate==0.4.3
+filelock==3.17.0
+frozenlist==1.5.0
+fsspec==2024.9.0
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+networkx==3.4.2
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openpyxl==3.1.5
+propcache==0.2.1
+pyarrow==19.0.0
+pytz==2025.1
+regex==2024.11.6
+safetensors==0.5.2
+sentencepiece==0.2.0
+sympy==1.13.1
+torch==2.6.0
+tqdm==4.67.1
+triton==3.2.0
+tzdata==2025.1
+xxhash==3.5.0
+yarl==1.18.3
+MarkupSafe==3.0.2
+PyYAML==6.0.2
+Send2Trash==1.8.3
+anyio==4.8.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==3.0.0
+async-lru==2.0.4
+attrs==25.1.0
+babel==2.17.0
+bleach==6.2.0
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+comm==0.2.2
+debugpy==1.8.12
+decorator==5.1.1
+defusedxml==0.7.1
+exceptiongroup==1.2.2
+executing==2.2.0
+fastjsonschema==2.21.1
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+idna==3.10
+ipykernel==6.29.5
+ipython==8.32.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.2
+Jinja2==3.1.5
+json5==0.10.0
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter_client==8.6.3
+jupyter-console==6.6.3
+jupyter_core==5.7.2
+jupyter-events==0.11.0
+jupyter-lsp==2.2.5
+jupyter_server==2.15.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.3.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+matplotlib-inline==0.1.7
+mistune==3.1.1
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook==7.3.2
+notebook_shim==0.2.4
+overrides==7.7.0
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.1
+prompt_toolkit==3.0.50
+psutil==6.1.1
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-json-logger==3.2.1
+pyzmq==26.2.1
+referencing==0.36.2
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.22.3
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.4.0
+tomli==2.2.1
+tornado==6.4.2
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20241206
+typing_extensions==4.12.2
+uri-template==1.3.0
+urllib3==2.3.0
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+pip==22.0.2
+setuptools==59.6.0
+wheel==0.37.1
diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/files/wandb-metadata.json b/wandb/offline-run-20250516_073234-ywsmjz3f/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..7371377be31edea723d676c7a39aefcd7fc28bcc
--- /dev/null
+++ b/wandb/offline-run-20250516_073234-ywsmjz3f/files/wandb-metadata.json
@@ -0,0 +1,162 @@
+{
+  "os": "Linux-5.10.236-228.935.amzn2.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.12",
+  "startedAt": "2025-05-16T07:32:34.057084Z",
+  "args": [
+    "--model_name_or_path",
+    "codellama/CodeLlama-7b-Instruct-hf",
+    "--dataset_name",
+    "smangrul/hug_stack",
+    "--splits",
+    "train",
+    "--max_seq_len",
+    "2048",
+    "--max_steps",
+    "2000",
+    "--save_steps",
+    "500",
+    "--eval_steps",
+    "100",
+    "--logging_steps",
+    "5",
+    "--log_level",
+    "info",
+    "--logging_strategy",
+    "steps",
+    "--save_strategy",
+    "steps",
+    "--push_to_hub",
+    "--hub_private_repo",
+    "True",
+    "--hub_strategy",
+    "every_save",
+    "--bf16",
+    "True",
+    "--learning_rate",
+    "3e-4",
+    "--lr_scheduler_type",
+    "cosine",
+    "--weight_decay",
+    "0.1",
+    "--warmup_ratio",
+    "0.1",
+    "--max_grad_norm",
+    "1.0",
+    "--output_dir",
+    "codellama-hugcoder",
+    "--per_device_train_batch_size",
+    "16",
+    "--per_device_eval_batch_size",
+    "16",
+    "--gradient_accumulation_steps",
+    "4",
+    "--gradient_checkpointing",
+    "True",
+    "--use_reentrant",
+    "True",
+    "--dataset_text_field",
+    "text",
+    "--test_size",
+    "0.1",
+    "--fim_rate",
+    "0.5",
+    "--fim_spm_rate",
+    "0.5",
+    "--use_peft_lora",
+    "True",
+    "--lora_r",
+    "32",
+    "--lora_alpha",
+    "64",
+    "--lora_dropout",
+    "0.1",
+    "--lora_target_modules",
+    "all-linear",
+    "--use_4bit_quantization",
+    "True",
+    "--use_nested_quant",
+    "True",
+    "--bnb_4bit_compute_dtype",
+    "bfloat16",
+    "--use_flash_attn",
+    "True"
+  ],
+  "program": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/train.py",
+  "codePath": "personal_copilot/training/train.py",
+  "git": {
+    "remote": "https://github.com/pacman100/LLM-Workshop.git",
+    "commit": "0ba41561ce6ea16d3993069c03ec1dca3ab6769d"
+  },
+  "root": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training",
+  "host": "project-finecode-65846d7984-mzlgr",
+  "executable": "/usr/bin/python3",
+  "codePathLocal": "train.py",
+  "cpu_count": 96,
+  "cpu_count_logical": 192,
+  "gpu": "NVIDIA L4",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "161048670208",
+      "used": "79656607744"
+    }
+  },
+  "memory": {
+    "total": "781916942336"
+  },
+  "cpu": {
+    "count": 96,
+    "countLogical": 192
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
\ No newline at end of file
diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-core.log b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-core.log
new file mode 100644
index 0000000000000000000000000000000000000000..1e49a796bb0a79c02b80688d665ca8732b41c4d3
--- /dev/null
+++ b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-core.log
@@ -0,0 +1,13 @@
+{"time":"2025-05-16T07:30:58.699801538Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmptbc9b5yr/port-24729.txt","pid":24729,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2025-05-16T07:30:58.726124342Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":24729}
+{"time":"2025-05-16T07:30:58.732312551Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":44465,"Zone":""}}
+{"time":"2025-05-16T07:30:58.74125048Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:46530"}
+{"time":"2025-05-16T07:32:34.066161637Z","level":"INFO","msg":"handleInformInit: received","streamId":"ywsmjz3f","id":"127.0.0.1:46530"}
+{"time":"2025-05-16T07:32:34.365776057Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"ywsmjz3f","id":"127.0.0.1:46530"}
+{"time":"2025-05-16T07:32:45.38484093Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:46530"}
+{"time":"2025-05-16T07:32:45.384915055Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:46530"}
+{"time":"2025-05-16T07:32:45.38497659Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:46530"}
+{"time":"2025-05-16T07:32:45.384963659Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2025-05-16T07:32:45.386267549Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:46530"}
+{"time":"2025-05-16T07:32:45.386310042Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:46530"}
+{"time":"2025-05-16T07:32:45.386340054Z","level":"INFO","msg":"server is closed"}
diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-internal.log b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..0aec8ce735dabb0e7ca0e293430496b387b6fdb3
--- /dev/null
+++ b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-internal.log
@@ -0,0 +1,15 @@
+{"time":"2025-05-16T07:32:34.149387625Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-core.log"}
+{"time":"2025-05-16T07:32:34.364789908Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"}
+{"time":"2025-05-16T07:32:34.365750715Z","level":"INFO","msg":"created new stream","id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:34.365769606Z","level":"INFO","msg":"stream: started","id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:34.365807029Z","level":"INFO","msg":"sender: started","stream_id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:34.365799878Z","level":"INFO","msg":"writer: Do: started","stream_id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:34.366790667Z","level":"INFO","msg":"handler: started","stream_id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:34.371019219Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-05-16T07:32:45.384954188Z","level":"INFO","msg":"stream: closing","id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:45.38498709Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-05-16T07:32:45.385024963Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-05-16T07:32:45.385961187Z","level":"INFO","msg":"handler: closed","stream_id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:45.385974788Z","level":"INFO","msg":"writer: Close: closed","stream_id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:45.386016981Z","level":"INFO","msg":"sender: closed","stream_id":"ywsmjz3f"}
+{"time":"2025-05-16T07:32:45.386092937Z","level":"INFO","msg":"stream: closed","id":"ywsmjz3f"}
diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug.log b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..2b76c6311f0e613e8f0ba2e409c9cd74990344ce
--- /dev/null
+++ b/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug.log
@@ -0,0 +1,26 @@
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_setup.py:_flush():70] Configure stats pid to 24729
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug.log
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073234-ywsmjz3f/logs/debug-internal.log
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_init.py:init():852] calling init triggers
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_init.py:init():893] starting backend
+2025-05-16 07:32:34,052 INFO    MainThread:24729 [wandb_init.py:init():897] sending inform_init request
+2025-05-16 07:32:34,056 INFO    MainThread:24729 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-05-16 07:32:34,056 INFO    MainThread:24729 [wandb_init.py:init():907] backend started and connected
+2025-05-16 07:32:34,057 INFO    MainThread:24729 [wandb_init.py:init():1005] updated telemetry
+2025-05-16 07:32:34,089 INFO    MainThread:24729 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-05-16 07:32:34,368 INFO    MainThread:24729 [wandb_init.py:init():1104] starting run threads in backend
+2025-05-16 07:32:34,755 INFO    MainThread:24729 [wandb_run.py:_console_start():2573] atexit reg
+2025-05-16 07:32:34,755 INFO    MainThread:24729 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-05-16 07:32:34,756 INFO    MainThread:24729 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-05-16 07:32:34,756 INFO    MainThread:24729 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-05-16 07:32:34,758 INFO    MainThread:24729 [wandb_init.py:init():1150] run started, returning control to user process
+2025-05-16 07:32:34,760 INFO    MainThread:24729 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj', 'down_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 
'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': 
{}, 'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-30-12_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 
'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
+2025-05-16 07:32:34,766 INFO    MainThread:24729 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7f0370dec550>>
+2025-05-16 07:32:34,766 INFO    MainThread:24729 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None
+2025-05-16 07:32:45,384 INFO    MsgRouterThr:24729 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 0 handles.
diff --git a/wandb/offline-run-20250516_073234-ywsmjz3f/run-ywsmjz3f.wandb b/wandb/offline-run-20250516_073234-ywsmjz3f/run-ywsmjz3f.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..b09c8d72a495b81524c1015c362d5dd85d4a09a3
Binary files /dev/null and b/wandb/offline-run-20250516_073234-ywsmjz3f/run-ywsmjz3f.wandb differ
diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/files/output.log b/wandb/offline-run-20250516_073527-tbn7d6q6/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/files/requirements.txt b/wandb/offline-run-20250516_073527-tbn7d6q6/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30272a20e03fdb784d0ddd410c9451a2fc8f06a3
--- /dev/null
+++ b/wandb/offline-run-20250516_073527-tbn7d6q6/files/requirements.txt
@@ -0,0 +1,359 @@
+huggingface-hub==0.31.2
+tokenizers==0.21.1
+bitsandbytes==0.45.5
+py-cpuinfo==9.0.0
+nvidia-ml-py==12.575.51
+hjson==3.1.0
+smmap==5.0.2
+setproctitle==1.3.6
+sentry-sdk==2.28.0
+ninja==1.11.1.4
+msgpack==1.1.0
+einops==0.8.1
+docker-pycreds==0.4.0
+gitdb==4.0.12
+GitPython==3.1.44
+wandb==0.19.11
+transformers==4.52.0.dev0
+deepspeed==0.16.7
+accelerate==1.8.0.dev0
+peft==0.15.2.dev0
+trl==0.17.0
+flash_attn==2.7.4.post1
+APScheduler==3.10.4
+Authlib==1.3.1
+Deprecated==1.2.18
+Flask-Cors==4.0.1
+Mako==1.3.8
+Markdown==3.6
+PyJWT==2.8.0
+PyMySQL==1.1.1
+PyPika==0.48.9
+RTFDE==0.1.2
+SQLAlchemy==2.0.31
+XlsxWriter==3.2.2
+aiohttp==3.9.5
+alembic==1.13.2
+annotated-types==0.7.0
+anthropic==0.45.2
+asgiref==3.8.1
+async-timeout==4.0.3
+av==12.3.0
+backoff==2.2.1
+bcrypt==4.1.3
+beautifulsoup4==4.12.3
+bidict==0.23.1
+black==24.8.0
+blinker==1.9.0
+boto3==1.34.153
+botocore==1.34.162
+build==1.2.2.post1
+cachetools==5.5.1
+chardet==5.2.0
+chroma-hnswlib==0.7.5
+chromadb==0.5.4
+click==8.1.8
+colorclass==2.2.2
+coloredlogs==15.0.1
+compressed-rtf==1.0.6
+cryptography==44.0.0
+ctranslate2==4.5.0
+dataclasses-json==0.6.7
+deepdiff==8.1.1
+distro==1.9.0
+dnspython==2.7.0
+docker==7.1.0
+docx2txt==0.8
+duckduckgo_search==6.2.13
+durationpy==0.9
+easygui==0.98.3
+ebcdic==1.1.1
+ecdsa==0.19.0
+email_validator==2.2.0
+emoji==2.14.1
+extract-msg==0.52.0
+fake-useragent==1.5.1
+fastapi==0.111.0
+fastapi-cli==0.0.7
+faster-whisper==1.0.2
+filetype==1.2.0
+Flask==3.0.3
+flatbuffers==25.1.24
+fonttools==4.55.8
+fpdf2==2.7.9
+google-ai-generativelanguage==0.6.6
+google-api-core==2.24.1
+google-api-python-client==2.160.0
+google-auth==2.38.0
+google-auth-httplib2==0.2.0
+google-generativeai==0.7.2
+googleapis-common-protos==1.66.0
+greenlet==3.1.1
+grpcio==1.70.0
+grpcio-status==1.62.3
+httplib2==0.22.0
+httptools==0.6.4
+humanfriendly==10.0
+importlib_metadata==8.4.0
+importlib_resources==6.5.2
+iniconfig==2.0.0
+itsdangerous==2.2.0
+jiter==0.8.2
+jmespath==1.0.1
+joblib==1.4.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+kubernetes==32.0.0
+langchain==0.2.11
+langchain-chroma==0.1.2
+langchain-community==0.2.10
+langchain-core==0.2.43
+langchain-text-splitters==0.2.4
+langdetect==1.0.9
+langfuse==2.39.2
+langsmith==0.1.147
+lark==1.1.9
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.26.0
+mdurl==0.1.2
+mmh3==5.1.0
+monotonic==1.6
+msoffcrypto-tool==5.4.2
+mypy-extensions==1.0.0
+nltk==3.9.1
+numpy==1.26.4
+oauthlib==3.2.2
+olefile==0.47
+oletools==0.60.2
+onnxruntime==1.20.1
+openai==1.61.0
+opencv-python==4.11.0.86
+opencv-python-headless==4.10.0.84
+opentelemetry-api==1.27.0
+opentelemetry-exporter-otlp-proto-common==1.27.0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-instrumentation==0.48b0
+opentelemetry-instrumentation-asgi==0.48b0
+opentelemetry-instrumentation-fastapi==0.48b0
+opentelemetry-proto==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-semantic-conventions==0.48b0
+opentelemetry-util-http==0.48b0
+orderly-set==5.2.3
+orjson==3.10.15
+packaging==23.2
+pandas==2.2.2
+passlib==1.7.4
+pathspec==0.12.1
+pcodedmp==1.2.6
+peewee==3.17.6
+peewee-migrate==1.12.2
+pillow==11.1.0
+pluggy==1.5.0
+posthog==3.11.0
+primp==0.11.0
+proto-plus==1.26.0
+protobuf==4.25.6
+psycopg2-binary==2.9.9
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pyclipper==1.3.0.post6
+pydantic==2.8.2
+pydantic_core==2.20.1
+pydub==0.25.1
+pymongo==4.11
+pypandoc==1.13
+pyparsing==3.2.1
+pypdf==4.3.1
+pyproject_hooks==1.2.0
+pytest==8.2.2
+pytest-docker==3.1.1
+python-dotenv==1.0.1
+python-engineio==4.11.2
+python-iso639==2025.1.28
+python-jose==3.3.0
+python-magic==0.4.27
+python-multipart==0.0.9
+python-pptx==1.0.0
+python-socketio==5.11.3
+pytube==15.0.0
+pyxlsb==1.0.10
+rank-bm25==0.2.2
+RapidFuzz==3.12.1
+rapidocr-onnxruntime==1.3.24
+red-black-tree-mod==1.20
+redis==5.2.1
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==13.9.4
+rich-toolkit==0.13.2
+rsa==4.9
+s3transfer==0.10.4
+scikit-learn==1.6.1
+scipy==1.15.1
+sentence-transformers==3.0.1
+shapely==2.0.7
+shellingham==1.5.4
+simple-websocket==1.1.0
+starlette==0.37.2
+tabulate==0.9.0
+tenacity==8.5.0
+threadpoolctl==3.5.0
+tiktoken==0.8.0
+typer==0.15.1
+typing-inspect==0.9.0
+tzlocal==5.2
+ujson==5.10.0
+unstructured==0.15.0
+unstructured-client==0.25.9
+uritemplate==4.1.1
+uvicorn==0.22.0
+uvloop==0.21.0
+validators==0.33.0
+watchfiles==1.0.4
+websockets==14.2
+Werkzeug==3.1.3
+wrapt==1.17.2
+wsproto==1.2.0
+xlrd==2.0.1
+youtube-transcript-api==0.6.2
+zipp==3.21.0
+aiohappyeyeballs==2.4.4
+aiosignal==1.3.2
+datasets==3.2.0
+dill==0.3.8
+et_xmlfile==2.0.0
+evaluate==0.4.3
+filelock==3.17.0
+frozenlist==1.5.0
+fsspec==2024.9.0
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+networkx==3.4.2
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openpyxl==3.1.5
+propcache==0.2.1
+pyarrow==19.0.0
+pytz==2025.1
+regex==2024.11.6
+safetensors==0.5.2
+sentencepiece==0.2.0
+sympy==1.13.1
+torch==2.6.0
+tqdm==4.67.1
+triton==3.2.0
+tzdata==2025.1
+xxhash==3.5.0
+yarl==1.18.3
+MarkupSafe==3.0.2
+PyYAML==6.0.2
+Send2Trash==1.8.3
+anyio==4.8.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==3.0.0
+async-lru==2.0.4
+attrs==25.1.0
+babel==2.17.0
+bleach==6.2.0
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+comm==0.2.2
+debugpy==1.8.12
+decorator==5.1.1
+defusedxml==0.7.1
+exceptiongroup==1.2.2
+executing==2.2.0
+fastjsonschema==2.21.1
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+idna==3.10
+ipykernel==6.29.5
+ipython==8.32.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.2
+Jinja2==3.1.5
+json5==0.10.0
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter_client==8.6.3
+jupyter-console==6.6.3
+jupyter_core==5.7.2
+jupyter-events==0.11.0
+jupyter-lsp==2.2.5
+jupyter_server==2.15.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.3.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+matplotlib-inline==0.1.7
+mistune==3.1.1
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook==7.3.2
+notebook_shim==0.2.4
+overrides==7.7.0
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.1
+prompt_toolkit==3.0.50
+psutil==6.1.1
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-json-logger==3.2.1
+pyzmq==26.2.1
+referencing==0.36.2
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.22.3
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.4.0
+tomli==2.2.1
+tornado==6.4.2
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20241206
+typing_extensions==4.12.2
+uri-template==1.3.0
+urllib3==2.3.0
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+pip==22.0.2
+setuptools==59.6.0
+wheel==0.37.1
diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/files/wandb-metadata.json b/wandb/offline-run-20250516_073527-tbn7d6q6/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..cdbd59cdd313d131b898c80fd5671300bab841ef
--- /dev/null
+++ b/wandb/offline-run-20250516_073527-tbn7d6q6/files/wandb-metadata.json
@@ -0,0 +1,162 @@
+{
+  "os": "Linux-5.10.236-228.935.amzn2.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.12",
+  "startedAt": "2025-05-16T07:35:27.841578Z",
+  "args": [
+    "--model_name_or_path",
+    "codellama/CodeLlama-7b-Instruct-hf",
+    "--dataset_name",
+    "smangrul/hug_stack",
+    "--splits",
+    "train",
+    "--max_seq_len",
+    "2048",
+    "--max_steps",
+    "2000",
+    "--save_steps",
+    "500",
+    "--eval_steps",
+    "100",
+    "--logging_steps",
+    "5",
+    "--log_level",
+    "info",
+    "--logging_strategy",
+    "steps",
+    "--save_strategy",
+    "steps",
+    "--push_to_hub",
+    "--hub_private_repo",
+    "True",
+    "--hub_strategy",
+    "every_save",
+    "--bf16",
+    "True",
+    "--learning_rate",
+    "3e-4",
+    "--lr_scheduler_type",
+    "cosine",
+    "--weight_decay",
+    "0.1",
+    "--warmup_ratio",
+    "0.1",
+    "--max_grad_norm",
+    "1.0",
+    "--output_dir",
+    "codellama-hugcoder",
+    "--per_device_train_batch_size",
+    "8",
+    "--per_device_eval_batch_size",
+    "8",
+    "--gradient_accumulation_steps",
+    "4",
+    "--gradient_checkpointing",
+    "True",
+    "--use_reentrant",
+    "True",
+    "--dataset_text_field",
+    "text",
+    "--test_size",
+    "0.1",
+    "--fim_rate",
+    "0.5",
+    "--fim_spm_rate",
+    "0.5",
+    "--use_peft_lora",
+    "True",
+    "--lora_r",
+    "32",
+    "--lora_alpha",
+    "64",
+    "--lora_dropout",
+    "0.1",
+    "--lora_target_modules",
+    "all-linear",
+    "--use_4bit_quantization",
+    "True",
+    "--use_nested_quant",
+    "True",
+    "--bnb_4bit_compute_dtype",
+    "bfloat16",
+    "--use_flash_attn",
+    "True"
+  ],
+  "program": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/train.py",
+  "codePath": "personal_copilot/training/train.py",
+  "git": {
+    "remote": "https://github.com/pacman100/LLM-Workshop.git",
+    "commit": "0ba41561ce6ea16d3993069c03ec1dca3ab6769d"
+  },
+  "root": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training",
+  "host": "project-finecode-65846d7984-mzlgr",
+  "executable": "/usr/bin/python3",
+  "codePathLocal": "train.py",
+  "cpu_count": 96,
+  "cpu_count_logical": 192,
+  "gpu": "NVIDIA L4",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "161048670208",
+      "used": "79644426240"
+    }
+  },
+  "memory": {
+    "total": "781916942336"
+  },
+  "cpu": {
+    "count": 96,
+    "countLogical": 192
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
\ No newline at end of file
diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-core.log b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-core.log
new file mode 100644
index 0000000000000000000000000000000000000000..08b937ac63e90b8036187a5ee0c734bc1a24a9e5
--- /dev/null
+++ b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-core.log
@@ -0,0 +1,13 @@
+{"time":"2025-05-16T07:35:23.964788517Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpru0m0q6i/port-27688.txt","pid":27688,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2025-05-16T07:35:23.968897511Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":27688}
+{"time":"2025-05-16T07:35:23.969782233Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":39579,"Zone":""}}
+{"time":"2025-05-16T07:35:24.077965245Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:48002"}
+{"time":"2025-05-16T07:35:27.860328522Z","level":"INFO","msg":"handleInformInit: received","streamId":"tbn7d6q6","id":"127.0.0.1:48002"}
+{"time":"2025-05-16T07:35:28.313655136Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"tbn7d6q6","id":"127.0.0.1:48002"}
+{"time":"2025-05-16T07:35:40.331700042Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:48002"}
+{"time":"2025-05-16T07:35:40.331802539Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2025-05-16T07:35:40.331789688Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:48002"}
+{"time":"2025-05-16T07:35:40.33196098Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:48002"}
+{"time":"2025-05-16T07:35:40.332470885Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:48002"}
+{"time":"2025-05-16T07:35:40.332500567Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:48002"}
+{"time":"2025-05-16T07:35:40.332516958Z","level":"INFO","msg":"server is closed"}
diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-internal.log b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..fca616e55a6dab66f5a227eadab2723bafa7f53a
--- /dev/null
+++ b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-internal.log
@@ -0,0 +1,15 @@
+{"time":"2025-05-16T07:35:27.893672998Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-core.log"}
+{"time":"2025-05-16T07:35:28.312975339Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"}
+{"time":"2025-05-16T07:35:28.313625704Z","level":"INFO","msg":"created new stream","id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:28.313648245Z","level":"INFO","msg":"stream: started","id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:28.313670627Z","level":"INFO","msg":"writer: Do: started","stream_id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:28.31370909Z","level":"INFO","msg":"handler: started","stream_id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:28.313708829Z","level":"INFO","msg":"sender: started","stream_id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:28.318320488Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-05-16T07:35:40.331838151Z","level":"INFO","msg":"stream: closing","id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:40.331893235Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-05-16T07:35:40.331940419Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-05-16T07:35:40.332018254Z","level":"INFO","msg":"handler: closed","stream_id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:40.332025334Z","level":"INFO","msg":"writer: Close: closed","stream_id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:40.332069427Z","level":"INFO","msg":"sender: closed","stream_id":"tbn7d6q6"}
+{"time":"2025-05-16T07:35:40.332288503Z","level":"INFO","msg":"stream: closed","id":"tbn7d6q6"}
diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug.log b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..403ba9e322b3620de5896b4a25ab7f9464c0115b
--- /dev/null
+++ b/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug.log
@@ -0,0 +1,26 @@
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_setup.py:_flush():70] Configure stats pid to 27688
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug.log
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073527-tbn7d6q6/logs/debug-internal.log
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_init.py:init():852] calling init triggers
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_init.py:init():893] starting backend
+2025-05-16 07:35:27,828 INFO    MainThread:27688 [wandb_init.py:init():897] sending inform_init request
+2025-05-16 07:35:27,841 INFO    MainThread:27688 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-05-16 07:35:27,841 INFO    MainThread:27688 [wandb_init.py:init():907] backend started and connected
+2025-05-16 07:35:27,842 INFO    MainThread:27688 [wandb_init.py:init():1005] updated telemetry
+2025-05-16 07:35:27,851 INFO    MainThread:27688 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-05-16 07:35:28,315 INFO    MainThread:27688 [wandb_init.py:init():1104] starting run threads in backend
+2025-05-16 07:35:28,703 INFO    MainThread:27688 [wandb_run.py:_console_start():2573] atexit reg
+2025-05-16 07:35:28,703 INFO    MainThread:27688 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-05-16 07:35:28,703 INFO    MainThread:27688 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-05-16 07:35:28,703 INFO    MainThread:27688 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-05-16 07:35:28,705 INFO    MainThread:27688 [wandb_init.py:init():1150] run started, returning control to user process
+2025-05-16 07:35:28,707 INFO    MainThread:27688 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'q_proj', 'o_proj', 'v_proj', 'up_proj', 'down_proj', 'gate_proj', 'k_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 
'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 
'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-34-08_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': 
False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
+2025-05-16 07:35:28,713 INFO    MainThread:27688 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7eff858c3640>>
+2025-05-16 07:35:28,713 INFO    MainThread:27688 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None
+2025-05-16 07:35:40,331 INFO    MsgRouterThr:27688 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 0 handles.
diff --git a/wandb/offline-run-20250516_073527-tbn7d6q6/run-tbn7d6q6.wandb b/wandb/offline-run-20250516_073527-tbn7d6q6/run-tbn7d6q6.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..7ebfb43e2a2f1440ad5f94590d197d68174a58fc
Binary files /dev/null and b/wandb/offline-run-20250516_073527-tbn7d6q6/run-tbn7d6q6.wandb differ
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/output-checkpoint.log b/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/output-checkpoint.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/requirements-checkpoint.txt b/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/requirements-checkpoint.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30272a20e03fdb784d0ddd410c9451a2fc8f06a3
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/files/.ipynb_checkpoints/requirements-checkpoint.txt
@@ -0,0 +1,359 @@
+huggingface-hub==0.31.2
+tokenizers==0.21.1
+bitsandbytes==0.45.5
+py-cpuinfo==9.0.0
+nvidia-ml-py==12.575.51
+hjson==3.1.0
+smmap==5.0.2
+setproctitle==1.3.6
+sentry-sdk==2.28.0
+ninja==1.11.1.4
+msgpack==1.1.0
+einops==0.8.1
+docker-pycreds==0.4.0
+gitdb==4.0.12
+GitPython==3.1.44
+wandb==0.19.11
+transformers==4.52.0.dev0
+deepspeed==0.16.7
+accelerate==1.8.0.dev0
+peft==0.15.2.dev0
+trl==0.17.0
+flash_attn==2.7.4.post1
+APScheduler==3.10.4
+Authlib==1.3.1
+Deprecated==1.2.18
+Flask-Cors==4.0.1
+Mako==1.3.8
+Markdown==3.6
+PyJWT==2.8.0
+PyMySQL==1.1.1
+PyPika==0.48.9
+RTFDE==0.1.2
+SQLAlchemy==2.0.31
+XlsxWriter==3.2.2
+aiohttp==3.9.5
+alembic==1.13.2
+annotated-types==0.7.0
+anthropic==0.45.2
+asgiref==3.8.1
+async-timeout==4.0.3
+av==12.3.0
+backoff==2.2.1
+bcrypt==4.1.3
+beautifulsoup4==4.12.3
+bidict==0.23.1
+black==24.8.0
+blinker==1.9.0
+boto3==1.34.153
+botocore==1.34.162
+build==1.2.2.post1
+cachetools==5.5.1
+chardet==5.2.0
+chroma-hnswlib==0.7.5
+chromadb==0.5.4
+click==8.1.8
+colorclass==2.2.2
+coloredlogs==15.0.1
+compressed-rtf==1.0.6
+cryptography==44.0.0
+ctranslate2==4.5.0
+dataclasses-json==0.6.7
+deepdiff==8.1.1
+distro==1.9.0
+dnspython==2.7.0
+docker==7.1.0
+docx2txt==0.8
+duckduckgo_search==6.2.13
+durationpy==0.9
+easygui==0.98.3
+ebcdic==1.1.1
+ecdsa==0.19.0
+email_validator==2.2.0
+emoji==2.14.1
+extract-msg==0.52.0
+fake-useragent==1.5.1
+fastapi==0.111.0
+fastapi-cli==0.0.7
+faster-whisper==1.0.2
+filetype==1.2.0
+Flask==3.0.3
+flatbuffers==25.1.24
+fonttools==4.55.8
+fpdf2==2.7.9
+google-ai-generativelanguage==0.6.6
+google-api-core==2.24.1
+google-api-python-client==2.160.0
+google-auth==2.38.0
+google-auth-httplib2==0.2.0
+google-generativeai==0.7.2
+googleapis-common-protos==1.66.0
+greenlet==3.1.1
+grpcio==1.70.0
+grpcio-status==1.62.3
+httplib2==0.22.0
+httptools==0.6.4
+humanfriendly==10.0
+importlib_metadata==8.4.0
+importlib_resources==6.5.2
+iniconfig==2.0.0
+itsdangerous==2.2.0
+jiter==0.8.2
+jmespath==1.0.1
+joblib==1.4.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+kubernetes==32.0.0
+langchain==0.2.11
+langchain-chroma==0.1.2
+langchain-community==0.2.10
+langchain-core==0.2.43
+langchain-text-splitters==0.2.4
+langdetect==1.0.9
+langfuse==2.39.2
+langsmith==0.1.147
+lark==1.1.9
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.26.0
+mdurl==0.1.2
+mmh3==5.1.0
+monotonic==1.6
+msoffcrypto-tool==5.4.2
+mypy-extensions==1.0.0
+nltk==3.9.1
+numpy==1.26.4
+oauthlib==3.2.2
+olefile==0.47
+oletools==0.60.2
+onnxruntime==1.20.1
+openai==1.61.0
+opencv-python==4.11.0.86
+opencv-python-headless==4.10.0.84
+opentelemetry-api==1.27.0
+opentelemetry-exporter-otlp-proto-common==1.27.0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-instrumentation==0.48b0
+opentelemetry-instrumentation-asgi==0.48b0
+opentelemetry-instrumentation-fastapi==0.48b0
+opentelemetry-proto==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-semantic-conventions==0.48b0
+opentelemetry-util-http==0.48b0
+orderly-set==5.2.3
+orjson==3.10.15
+packaging==23.2
+pandas==2.2.2
+passlib==1.7.4
+pathspec==0.12.1
+pcodedmp==1.2.6
+peewee==3.17.6
+peewee-migrate==1.12.2
+pillow==11.1.0
+pluggy==1.5.0
+posthog==3.11.0
+primp==0.11.0
+proto-plus==1.26.0
+protobuf==4.25.6
+psycopg2-binary==2.9.9
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pyclipper==1.3.0.post6
+pydantic==2.8.2
+pydantic_core==2.20.1
+pydub==0.25.1
+pymongo==4.11
+pypandoc==1.13
+pyparsing==3.2.1
+pypdf==4.3.1
+pyproject_hooks==1.2.0
+pytest==8.2.2
+pytest-docker==3.1.1
+python-dotenv==1.0.1
+python-engineio==4.11.2
+python-iso639==2025.1.28
+python-jose==3.3.0
+python-magic==0.4.27
+python-multipart==0.0.9
+python-pptx==1.0.0
+python-socketio==5.11.3
+pytube==15.0.0
+pyxlsb==1.0.10
+rank-bm25==0.2.2
+RapidFuzz==3.12.1
+rapidocr-onnxruntime==1.3.24
+red-black-tree-mod==1.20
+redis==5.2.1
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==13.9.4
+rich-toolkit==0.13.2
+rsa==4.9
+s3transfer==0.10.4
+scikit-learn==1.6.1
+scipy==1.15.1
+sentence-transformers==3.0.1
+shapely==2.0.7
+shellingham==1.5.4
+simple-websocket==1.1.0
+starlette==0.37.2
+tabulate==0.9.0
+tenacity==8.5.0
+threadpoolctl==3.5.0
+tiktoken==0.8.0
+typer==0.15.1
+typing-inspect==0.9.0
+tzlocal==5.2
+ujson==5.10.0
+unstructured==0.15.0
+unstructured-client==0.25.9
+uritemplate==4.1.1
+uvicorn==0.22.0
+uvloop==0.21.0
+validators==0.33.0
+watchfiles==1.0.4
+websockets==14.2
+Werkzeug==3.1.3
+wrapt==1.17.2
+wsproto==1.2.0
+xlrd==2.0.1
+youtube-transcript-api==0.6.2
+zipp==3.21.0
+aiohappyeyeballs==2.4.4
+aiosignal==1.3.2
+datasets==3.2.0
+dill==0.3.8
+et_xmlfile==2.0.0
+evaluate==0.4.3
+filelock==3.17.0
+frozenlist==1.5.0
+fsspec==2024.9.0
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+networkx==3.4.2
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openpyxl==3.1.5
+propcache==0.2.1
+pyarrow==19.0.0
+pytz==2025.1
+regex==2024.11.6
+safetensors==0.5.2
+sentencepiece==0.2.0
+sympy==1.13.1
+torch==2.6.0
+tqdm==4.67.1
+triton==3.2.0
+tzdata==2025.1
+xxhash==3.5.0
+yarl==1.18.3
+MarkupSafe==3.0.2
+PyYAML==6.0.2
+Send2Trash==1.8.3
+anyio==4.8.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==3.0.0
+async-lru==2.0.4
+attrs==25.1.0
+babel==2.17.0
+bleach==6.2.0
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+comm==0.2.2
+debugpy==1.8.12
+decorator==5.1.1
+defusedxml==0.7.1
+exceptiongroup==1.2.2
+executing==2.2.0
+fastjsonschema==2.21.1
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+idna==3.10
+ipykernel==6.29.5
+ipython==8.32.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.2
+Jinja2==3.1.5
+json5==0.10.0
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter_client==8.6.3
+jupyter-console==6.6.3
+jupyter_core==5.7.2
+jupyter-events==0.11.0
+jupyter-lsp==2.2.5
+jupyter_server==2.15.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.3.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+matplotlib-inline==0.1.7
+mistune==3.1.1
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook==7.3.2
+notebook_shim==0.2.4
+overrides==7.7.0
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.1
+prompt_toolkit==3.0.50
+psutil==6.1.1
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-json-logger==3.2.1
+pyzmq==26.2.1
+referencing==0.36.2
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.22.3
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.4.0
+tomli==2.2.1
+tornado==6.4.2
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20241206
+typing_extensions==4.12.2
+uri-template==1.3.0
+urllib3==2.3.0
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+pip==22.0.2
+setuptools==59.6.0
+wheel==0.37.1
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/output.log b/wandb/offline-run-20250516_073747-jc2tz43q/files/output.log
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/requirements.txt b/wandb/offline-run-20250516_073747-jc2tz43q/files/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30272a20e03fdb784d0ddd410c9451a2fc8f06a3
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/files/requirements.txt
@@ -0,0 +1,359 @@
+huggingface-hub==0.31.2
+tokenizers==0.21.1
+bitsandbytes==0.45.5
+py-cpuinfo==9.0.0
+nvidia-ml-py==12.575.51
+hjson==3.1.0
+smmap==5.0.2
+setproctitle==1.3.6
+sentry-sdk==2.28.0
+ninja==1.11.1.4
+msgpack==1.1.0
+einops==0.8.1
+docker-pycreds==0.4.0
+gitdb==4.0.12
+GitPython==3.1.44
+wandb==0.19.11
+transformers==4.52.0.dev0
+deepspeed==0.16.7
+accelerate==1.8.0.dev0
+peft==0.15.2.dev0
+trl==0.17.0
+flash_attn==2.7.4.post1
+APScheduler==3.10.4
+Authlib==1.3.1
+Deprecated==1.2.18
+Flask-Cors==4.0.1
+Mako==1.3.8
+Markdown==3.6
+PyJWT==2.8.0
+PyMySQL==1.1.1
+PyPika==0.48.9
+RTFDE==0.1.2
+SQLAlchemy==2.0.31
+XlsxWriter==3.2.2
+aiohttp==3.9.5
+alembic==1.13.2
+annotated-types==0.7.0
+anthropic==0.45.2
+asgiref==3.8.1
+async-timeout==4.0.3
+av==12.3.0
+backoff==2.2.1
+bcrypt==4.1.3
+beautifulsoup4==4.12.3
+bidict==0.23.1
+black==24.8.0
+blinker==1.9.0
+boto3==1.34.153
+botocore==1.34.162
+build==1.2.2.post1
+cachetools==5.5.1
+chardet==5.2.0
+chroma-hnswlib==0.7.5
+chromadb==0.5.4
+click==8.1.8
+colorclass==2.2.2
+coloredlogs==15.0.1
+compressed-rtf==1.0.6
+cryptography==44.0.0
+ctranslate2==4.5.0
+dataclasses-json==0.6.7
+deepdiff==8.1.1
+distro==1.9.0
+dnspython==2.7.0
+docker==7.1.0
+docx2txt==0.8
+duckduckgo_search==6.2.13
+durationpy==0.9
+easygui==0.98.3
+ebcdic==1.1.1
+ecdsa==0.19.0
+email_validator==2.2.0
+emoji==2.14.1
+extract-msg==0.52.0
+fake-useragent==1.5.1
+fastapi==0.111.0
+fastapi-cli==0.0.7
+faster-whisper==1.0.2
+filetype==1.2.0
+Flask==3.0.3
+flatbuffers==25.1.24
+fonttools==4.55.8
+fpdf2==2.7.9
+google-ai-generativelanguage==0.6.6
+google-api-core==2.24.1
+google-api-python-client==2.160.0
+google-auth==2.38.0
+google-auth-httplib2==0.2.0
+google-generativeai==0.7.2
+googleapis-common-protos==1.66.0
+greenlet==3.1.1
+grpcio==1.70.0
+grpcio-status==1.62.3
+httplib2==0.22.0
+httptools==0.6.4
+humanfriendly==10.0
+importlib_metadata==8.4.0
+importlib_resources==6.5.2
+iniconfig==2.0.0
+itsdangerous==2.2.0
+jiter==0.8.2
+jmespath==1.0.1
+joblib==1.4.2
+jsonpatch==1.33
+jsonpath-python==1.0.6
+kubernetes==32.0.0
+langchain==0.2.11
+langchain-chroma==0.1.2
+langchain-community==0.2.10
+langchain-core==0.2.43
+langchain-text-splitters==0.2.4
+langdetect==1.0.9
+langfuse==2.39.2
+langsmith==0.1.147
+lark==1.1.9
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.26.0
+mdurl==0.1.2
+mmh3==5.1.0
+monotonic==1.6
+msoffcrypto-tool==5.4.2
+mypy-extensions==1.0.0
+nltk==3.9.1
+numpy==1.26.4
+oauthlib==3.2.2
+olefile==0.47
+oletools==0.60.2
+onnxruntime==1.20.1
+openai==1.61.0
+opencv-python==4.11.0.86
+opencv-python-headless==4.10.0.84
+opentelemetry-api==1.27.0
+opentelemetry-exporter-otlp-proto-common==1.27.0
+opentelemetry-exporter-otlp-proto-grpc==1.27.0
+opentelemetry-instrumentation==0.48b0
+opentelemetry-instrumentation-asgi==0.48b0
+opentelemetry-instrumentation-fastapi==0.48b0
+opentelemetry-proto==1.27.0
+opentelemetry-sdk==1.27.0
+opentelemetry-semantic-conventions==0.48b0
+opentelemetry-util-http==0.48b0
+orderly-set==5.2.3
+orjson==3.10.15
+packaging==23.2
+pandas==2.2.2
+passlib==1.7.4
+pathspec==0.12.1
+pcodedmp==1.2.6
+peewee==3.17.6
+peewee-migrate==1.12.2
+pillow==11.1.0
+pluggy==1.5.0
+posthog==3.11.0
+primp==0.11.0
+proto-plus==1.26.0
+protobuf==4.25.6
+psycopg2-binary==2.9.9
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pyclipper==1.3.0.post6
+pydantic==2.8.2
+pydantic_core==2.20.1
+pydub==0.25.1
+pymongo==4.11
+pypandoc==1.13
+pyparsing==3.2.1
+pypdf==4.3.1
+pyproject_hooks==1.2.0
+pytest==8.2.2
+pytest-docker==3.1.1
+python-dotenv==1.0.1
+python-engineio==4.11.2
+python-iso639==2025.1.28
+python-jose==3.3.0
+python-magic==0.4.27
+python-multipart==0.0.9
+python-pptx==1.0.0
+python-socketio==5.11.3
+pytube==15.0.0
+pyxlsb==1.0.10
+rank-bm25==0.2.2
+RapidFuzz==3.12.1
+rapidocr-onnxruntime==1.3.24
+red-black-tree-mod==1.20
+redis==5.2.1
+requests-oauthlib==2.0.0
+requests-toolbelt==1.0.0
+rich==13.9.4
+rich-toolkit==0.13.2
+rsa==4.9
+s3transfer==0.10.4
+scikit-learn==1.6.1
+scipy==1.15.1
+sentence-transformers==3.0.1
+shapely==2.0.7
+shellingham==1.5.4
+simple-websocket==1.1.0
+starlette==0.37.2
+tabulate==0.9.0
+tenacity==8.5.0
+threadpoolctl==3.5.0
+tiktoken==0.8.0
+typer==0.15.1
+typing-inspect==0.9.0
+tzlocal==5.2
+ujson==5.10.0
+unstructured==0.15.0
+unstructured-client==0.25.9
+uritemplate==4.1.1
+uvicorn==0.22.0
+uvloop==0.21.0
+validators==0.33.0
+watchfiles==1.0.4
+websockets==14.2
+Werkzeug==3.1.3
+wrapt==1.17.2
+wsproto==1.2.0
+xlrd==2.0.1
+youtube-transcript-api==0.6.2
+zipp==3.21.0
+aiohappyeyeballs==2.4.4
+aiosignal==1.3.2
+datasets==3.2.0
+dill==0.3.8
+et_xmlfile==2.0.0
+evaluate==0.4.3
+filelock==3.17.0
+frozenlist==1.5.0
+fsspec==2024.9.0
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+networkx==3.4.2
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openpyxl==3.1.5
+propcache==0.2.1
+pyarrow==19.0.0
+pytz==2025.1
+regex==2024.11.6
+safetensors==0.5.2
+sentencepiece==0.2.0
+sympy==1.13.1
+torch==2.6.0
+tqdm==4.67.1
+triton==3.2.0
+tzdata==2025.1
+xxhash==3.5.0
+yarl==1.18.3
+MarkupSafe==3.0.2
+PyYAML==6.0.2
+Send2Trash==1.8.3
+anyio==4.8.0
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==3.0.0
+async-lru==2.0.4
+attrs==25.1.0
+babel==2.17.0
+bleach==6.2.0
+certifi==2025.1.31
+cffi==1.17.1
+charset-normalizer==3.4.1
+comm==0.2.2
+debugpy==1.8.12
+decorator==5.1.1
+defusedxml==0.7.1
+exceptiongroup==1.2.2
+executing==2.2.0
+fastjsonschema==2.21.1
+fqdn==1.5.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+idna==3.10
+ipykernel==6.29.5
+ipython==8.32.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.2
+Jinja2==3.1.5
+json5==0.10.0
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter_client==8.6.3
+jupyter-console==6.6.3
+jupyter_core==5.7.2
+jupyter-events==0.11.0
+jupyter-lsp==2.2.5
+jupyter_server==2.15.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.3.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+matplotlib-inline==0.1.7
+mistune==3.1.1
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook==7.3.2
+notebook_shim==0.2.4
+overrides==7.7.0
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+platformdirs==4.3.6
+prometheus_client==0.21.1
+prompt_toolkit==3.0.50
+psutil==6.1.1
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+Pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-json-logger==3.2.1
+pyzmq==26.2.1
+referencing==0.36.2
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rpds-py==0.22.3
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+terminado==0.18.1
+tinycss2==1.4.0
+tomli==2.2.1
+tornado==6.4.2
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20241206
+typing_extensions==4.12.2
+uri-template==1.3.0
+urllib3==2.3.0
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+pip==22.0.2
+setuptools==59.6.0
+wheel==0.37.1
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/files/wandb-metadata.json b/wandb/offline-run-20250516_073747-jc2tz43q/files/wandb-metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..7097317b98a945cbc73910836d4d7816918309c1
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/files/wandb-metadata.json
@@ -0,0 +1,162 @@
+{
+  "os": "Linux-5.10.236-228.935.amzn2.x86_64-x86_64-with-glibc2.35",
+  "python": "CPython 3.10.12",
+  "startedAt": "2025-05-16T07:37:47.236910Z",
+  "args": [
+    "--model_name_or_path",
+    "codellama/CodeLlama-7b-Instruct-hf",
+    "--dataset_name",
+    "smangrul/hug_stack",
+    "--splits",
+    "train",
+    "--max_seq_len",
+    "2048",
+    "--max_steps",
+    "2000",
+    "--save_steps",
+    "500",
+    "--eval_steps",
+    "100",
+    "--logging_steps",
+    "5",
+    "--log_level",
+    "info",
+    "--logging_strategy",
+    "steps",
+    "--save_strategy",
+    "steps",
+    "--push_to_hub",
+    "--hub_private_repo",
+    "True",
+    "--hub_strategy",
+    "every_save",
+    "--bf16",
+    "True",
+    "--learning_rate",
+    "3e-4",
+    "--lr_scheduler_type",
+    "cosine",
+    "--weight_decay",
+    "0.1",
+    "--warmup_ratio",
+    "0.1",
+    "--max_grad_norm",
+    "1.0",
+    "--output_dir",
+    "codellama-hugcoder",
+    "--per_device_train_batch_size",
+    "4",
+    "--per_device_eval_batch_size",
+    "4",
+    "--gradient_accumulation_steps",
+    "4",
+    "--gradient_checkpointing",
+    "True",
+    "--use_reentrant",
+    "True",
+    "--dataset_text_field",
+    "text",
+    "--test_size",
+    "0.1",
+    "--fim_rate",
+    "0.5",
+    "--fim_spm_rate",
+    "0.5",
+    "--use_peft_lora",
+    "True",
+    "--lora_r",
+    "32",
+    "--lora_alpha",
+    "64",
+    "--lora_dropout",
+    "0.1",
+    "--lora_target_modules",
+    "all-linear",
+    "--use_4bit_quantization",
+    "True",
+    "--use_nested_quant",
+    "True",
+    "--bnb_4bit_compute_dtype",
+    "bfloat16",
+    "--use_flash_attn",
+    "True"
+  ],
+  "program": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/train.py",
+  "codePath": "personal_copilot/training/train.py",
+  "git": {
+    "remote": "https://github.com/pacman100/LLM-Workshop.git",
+    "commit": "0ba41561ce6ea16d3993069c03ec1dca3ab6769d"
+  },
+  "root": "/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training",
+  "host": "project-finecode-65846d7984-mzlgr",
+  "executable": "/usr/bin/python3",
+  "codePathLocal": "train.py",
+  "cpu_count": 96,
+  "cpu_count_logical": 192,
+  "gpu": "NVIDIA L4",
+  "gpu_count": 8,
+  "disk": {
+    "/": {
+      "total": "161048670208",
+      "used": "79644385280"
+    }
+  },
+  "memory": {
+    "total": "781916942336"
+  },
+  "cpu": {
+    "count": 96,
+    "countLogical": 192
+  },
+  "gpu_nvidia": [
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    },
+    {
+      "name": "NVIDIA L4",
+      "memoryTotal": "24152899584",
+      "cudaCores": 7424,
+      "architecture": "Ada"
+    }
+  ],
+  "cudaVersion": "12.4"
+}
\ No newline at end of file
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-checkpoint.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-checkpoint.log
new file mode 100644
index 0000000000000000000000000000000000000000..d90128eade86bb07359f810edbe2e6d6c84de48d
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-checkpoint.log
@@ -0,0 +1,25 @@
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Configure stats pid to 29365
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():852] calling init triggers
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():893] starting backend
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():897] sending inform_init request
+2025-05-16 07:37:47,236 INFO    MainThread:29365 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-05-16 07:37:47,236 INFO    MainThread:29365 [wandb_init.py:init():907] backend started and connected
+2025-05-16 07:37:47,237 INFO    MainThread:29365 [wandb_init.py:init():1005] updated telemetry
+2025-05-16 07:37:47,244 INFO    MainThread:29365 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-05-16 07:37:47,473 INFO    MainThread:29365 [wandb_init.py:init():1104] starting run threads in backend
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_console_start():2573] atexit reg
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-05-16 07:37:47,857 INFO    MainThread:29365 [wandb_init.py:init():1150] run started, returning control to user process
+2025-05-16 07:37:47,859 INFO    MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'down_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj', 'gate_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 
'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 
'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-36-11_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': 
False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
+2025-05-16 07:37:47,866 INFO    MainThread:29365 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7fa638123580>>
+2025-05-16 07:37:47,866 INFO    MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-core-checkpoint.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-core-checkpoint.log
new file mode 100644
index 0000000000000000000000000000000000000000..d8651a3eaf5c4b99f83713269d923c485096f122
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-core-checkpoint.log
@@ -0,0 +1,6 @@
+{"time":"2025-05-16T07:37:38.620987457Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp_3cqd41s/port-29365.txt","pid":29365,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2025-05-16T07:37:38.627281182Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":29365}
+{"time":"2025-05-16T07:37:38.628474204Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43351,"Zone":""}}
+{"time":"2025-05-16T07:37:38.725900233Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57302"}
+{"time":"2025-05-16T07:37:47.242158486Z","level":"INFO","msg":"handleInformInit: received","streamId":"jc2tz43q","id":"127.0.0.1:57302"}
+{"time":"2025-05-16T07:37:47.471081329Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"jc2tz43q","id":"127.0.0.1:57302"}
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-internal-checkpoint.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-internal-checkpoint.log
new file mode 100644
index 0000000000000000000000000000000000000000..933f96ad797a96ef23bba6e0326388ce0cd85c7f
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/.ipynb_checkpoints/debug-internal-checkpoint.log
@@ -0,0 +1,8 @@
+{"time":"2025-05-16T07:37:47.253769219Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log"}
+{"time":"2025-05-16T07:37:47.470171926Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"}
+{"time":"2025-05-16T07:37:47.471043556Z","level":"INFO","msg":"created new stream","id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471071988Z","level":"INFO","msg":"stream: started","id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.47110475Z","level":"INFO","msg":"writer: Do: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471115831Z","level":"INFO","msg":"handler: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471180815Z","level":"INFO","msg":"sender: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.477686926Z","level":"INFO","msg":"Starting system monitor"}
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log
new file mode 100644
index 0000000000000000000000000000000000000000..a9fa499c890860ea95258f04260540e5177981f6
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log
@@ -0,0 +1,13 @@
+{"time":"2025-05-16T07:37:38.620987457Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp_3cqd41s/port-29365.txt","pid":29365,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
+{"time":"2025-05-16T07:37:38.627281182Z","level":"INFO","msg":"Will exit if parent process dies.","ppid":29365}
+{"time":"2025-05-16T07:37:38.628474204Z","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43351,"Zone":""}}
+{"time":"2025-05-16T07:37:38.725900233Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:57302"}
+{"time":"2025-05-16T07:37:47.242158486Z","level":"INFO","msg":"handleInformInit: received","streamId":"jc2tz43q","id":"127.0.0.1:57302"}
+{"time":"2025-05-16T07:37:47.471081329Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"jc2tz43q","id":"127.0.0.1:57302"}
+{"time":"2025-05-17T22:27:35.403652933Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:57302"}
+{"time":"2025-05-17T22:27:35.403810773Z","level":"INFO","msg":"server is shutting down"}
+{"time":"2025-05-17T22:27:35.403793492Z","level":"INFO","msg":"connection: closing","id":"127.0.0.1:57302"}
+{"time":"2025-05-17T22:27:35.404006127Z","level":"INFO","msg":"connection: closed successfully","id":"127.0.0.1:57302"}
+{"time":"2025-05-17T22:27:35.405135844Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:57302"}
+{"time":"2025-05-17T22:27:35.405151025Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:57302"}
+{"time":"2025-05-17T22:27:35.405161186Z","level":"INFO","msg":"server is closed"}
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log
new file mode 100644
index 0000000000000000000000000000000000000000..ee5bf05e14a33a7993620274bc5fffbfa49ca1f9
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log
@@ -0,0 +1,15 @@
+{"time":"2025-05-16T07:37:47.253769219Z","level":"INFO","msg":"stream: starting","core version":"0.19.11","symlink path":"/local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-core.log"}
+{"time":"2025-05-16T07:37:47.470171926Z","level":"WARN","msg":"GraphQL client is nil, skipping feature loading"}
+{"time":"2025-05-16T07:37:47.471043556Z","level":"INFO","msg":"created new stream","id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471071988Z","level":"INFO","msg":"stream: started","id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.47110475Z","level":"INFO","msg":"writer: Do: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471115831Z","level":"INFO","msg":"handler: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.471180815Z","level":"INFO","msg":"sender: started","stream_id":"jc2tz43q"}
+{"time":"2025-05-16T07:37:47.477686926Z","level":"INFO","msg":"Starting system monitor"}
+{"time":"2025-05-17T22:27:35.403843016Z","level":"INFO","msg":"stream: closing","id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.404804572Z","level":"INFO","msg":"Stopping system monitor"}
+{"time":"2025-05-17T22:27:35.404850555Z","level":"INFO","msg":"Stopped system monitor"}
+{"time":"2025-05-17T22:27:35.40493109Z","level":"INFO","msg":"handler: closed","stream_id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.404943001Z","level":"INFO","msg":"writer: Close: closed","stream_id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.404963782Z","level":"INFO","msg":"sender: closed","stream_id":"jc2tz43q"}
+{"time":"2025-05-17T22:27:35.405060219Z","level":"INFO","msg":"stream: closed","id":"jc2tz43q"}
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log
new file mode 100644
index 0000000000000000000000000000000000000000..067508921187c34e3dc0bd7885357818572fbf91
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log
@@ -0,0 +1,26 @@
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Current SDK version is 0.19.11
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Configure stats pid to 29365
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /root/.config/wandb/settings
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/settings
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_setup.py:_flush():70] Loading settings from environment variables
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:setup_run_log_directory():724] Logging user logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug.log
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:setup_run_log_directory():725] Logging internal logs to /local_storage/persistent_data/jupyterlab/lab/workspaces/training_script/LLM-Workshop/personal_copilot/training/wandb/offline-run-20250516_073747-jc2tz43q/logs/debug-internal.log
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():852] calling init triggers
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():857] wandb.init called with sweep_config: {}
+config: {'_wandb': {}}
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():893] starting backend
+2025-05-16 07:37:47,232 INFO    MainThread:29365 [wandb_init.py:init():897] sending inform_init request
+2025-05-16 07:37:47,236 INFO    MainThread:29365 [backend.py:_multiprocessing_setup():101] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+2025-05-16 07:37:47,236 INFO    MainThread:29365 [wandb_init.py:init():907] backend started and connected
+2025-05-16 07:37:47,237 INFO    MainThread:29365 [wandb_init.py:init():1005] updated telemetry
+2025-05-16 07:37:47,244 INFO    MainThread:29365 [wandb_init.py:init():1029] communicating run to backend with 90.0 second timeout
+2025-05-16 07:37:47,473 INFO    MainThread:29365 [wandb_init.py:init():1104] starting run threads in backend
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_console_start():2573] atexit reg
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2421] redirect: wrap_raw
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2490] Wrapping output streams.
+2025-05-16 07:37:47,854 INFO    MainThread:29365 [wandb_run.py:_redirect():2513] Redirects installed.
+2025-05-16 07:37:47,857 INFO    MainThread:29365 [wandb_init.py:init():1150] run started, returning control to user process
+2025-05-16 07:37:47,859 INFO    MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': <PeftType.LORA: 'LORA'>, 'auto_mapping': None, 'base_model_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'revision': None, 'inference_mode': False, 'r': 32, 'target_modules': {'down_proj', 'up_proj', 'k_proj', 'q_proj', 'v_proj', 'gate_proj', 'o_proj'}, 'exclude_modules': None, 'lora_alpha': 64, 'lora_dropout': 0.1, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'vocab_size': 32016, 'max_position_embeddings': 16384, 'hidden_size': 4096, 'intermediate_size': 11008, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 1000000, 'rope_scaling': None, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 128, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'float16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 
'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': None, 'eos_token_id': 2, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'codellama/CodeLlama-7b-Instruct-hf', 'transformers_version': '4.52.0.dev0', 'model_type': 'llama', 'quantization_config': {'quant_method': 'BITS_AND_BYTES', '_load_in_8bit': False, '_load_in_4bit': True, 'llm_int8_threshold': 6.0, 'llm_int8_skip_modules': None, 'llm_int8_enable_fp32_cpu_offload': False, 'llm_int8_has_fp16_weight': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_quant_storage': 'uint8', 'load_in_4bit': True, 'load_in_8bit': False}, 'output_dir': 'codellama-hugcoder', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 4, 'per_device_eval_batch_size': 4, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 4, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 0.0003, 'weight_decay': 0.1, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3.0, 'max_steps': 2000, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 
'warmup_ratio': 0.1, 'warmup_steps': 0, 'log_level': 'info', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'codellama-hugcoder/runs/May16_07-36-11_project-finecode-65846d7984-mzlgr', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 500, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 100.0, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': 'codellama-hugcoder', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': 
False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': True, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': True, 'hub_always_push': False, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': {'use_reentrant': True}, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False}
+2025-05-16 07:37:47,866 INFO    MainThread:29365 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 6818500608 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7fa638123580>>
+2025-05-16 07:37:47,866 INFO    MainThread:29365 [wandb_run.py:_config_callback():1436] config_cb model/num_parameters 6818500608 None
+2025-05-17 22:27:35,403 INFO    MsgRouterThr:29365 [mailbox.py:close():129] [no run ID] Closing mailbox, abandoning 0 handles.
diff --git a/wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb b/wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb
new file mode 100644
index 0000000000000000000000000000000000000000..be710865ae2cf80d025f72197b5b5bd35f92a90b
--- /dev/null
+++ b/wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17a73eb777948b78d05faf1e2809a681696b5c56ce0a7a7ada14ae168f5f3438
+size 33685295