limernyou committed on May 17, 2025

Commit

7140a44

verified ·

1 Parent(s): b79d2c4

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
.ipynb_checkpoints/fim-checkpoint.py +141 -0
.ipynb_checkpoints/requirements-checkpoint.txt +14 -0
.ipynb_checkpoints/run_peft-checkpoint.sh +40 -0
.ipynb_checkpoints/train-checkpoint.py +495 -0
__pycache__/fim.cpython-310.pyc +0 -0
codellama-hugcoder/README.md +57 -0
codellama-hugcoder/adapter_config.json +39 -0
codellama-hugcoder/adapter_model.safetensors +3 -0
codellama-hugcoder/checkpoint-1000/README.md +202 -0
codellama-hugcoder/checkpoint-1000/adapter_config.json +39 -0
codellama-hugcoder/checkpoint-1000/adapter_model.safetensors +3 -0
codellama-hugcoder/checkpoint-1000/optimizer.pt +3 -0
codellama-hugcoder/checkpoint-1000/rng_state.pth +3 -0
codellama-hugcoder/checkpoint-1000/scheduler.pt +3 -0
codellama-hugcoder/checkpoint-1000/trainer_state.json +1434 -0
codellama-hugcoder/checkpoint-1000/training_args.bin +3 -0
codellama-hugcoder/checkpoint-1500/README.md +202 -0
codellama-hugcoder/checkpoint-1500/adapter_config.json +39 -0
codellama-hugcoder/checkpoint-1500/adapter_model.safetensors +3 -0
codellama-hugcoder/checkpoint-1500/optimizer.pt +3 -0
codellama-hugcoder/checkpoint-1500/rng_state.pth +3 -0
codellama-hugcoder/checkpoint-1500/scheduler.pt +3 -0
codellama-hugcoder/checkpoint-1500/trainer_state.json +2134 -0
codellama-hugcoder/checkpoint-1500/training_args.bin +3 -0
codellama-hugcoder/checkpoint-2000/README.md +202 -0
codellama-hugcoder/checkpoint-2000/adapter_config.json +39 -0
codellama-hugcoder/checkpoint-2000/adapter_model.safetensors +3 -0
codellama-hugcoder/checkpoint-2000/optimizer.pt +3 -0
codellama-hugcoder/checkpoint-2000/rng_state.pth +3 -0
codellama-hugcoder/checkpoint-2000/scheduler.pt +3 -0
codellama-hugcoder/checkpoint-2000/trainer_state.json +2834 -0
codellama-hugcoder/checkpoint-2000/training_args.bin +3 -0
codellama-hugcoder/checkpoint-500/README.md +202 -0
codellama-hugcoder/checkpoint-500/adapter_config.json +39 -0
codellama-hugcoder/checkpoint-500/adapter_model.safetensors +3 -0
codellama-hugcoder/checkpoint-500/optimizer.pt +3 -0
codellama-hugcoder/checkpoint-500/rng_state.pth +3 -0
codellama-hugcoder/checkpoint-500/scheduler.pt +3 -0
codellama-hugcoder/checkpoint-500/trainer_state.json +734 -0
codellama-hugcoder/checkpoint-500/training_args.bin +3 -0
codellama-hugcoder/training_args.bin +3 -0
configs/deepspeed_config.yaml +22 -0
configs/fsdp_config.yaml +25 -0
fim.py +141 -0
requirements.txt +14 -0
run_deepspeed.sh +33 -0
run_fsdp.sh +33 -0
run_peft.sh +40 -0
run_unsloth_peft.sh +43 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb filter=lfs diff=lfs merge=lfs -text

.ipynb_checkpoints/fim-checkpoint.py ADDED Viewed

	@@ -0,0 +1,141 @@

+# coding=utf-8
+# Copyright 2024 Sourab Mangrulkar. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import numpy as np
+# this is expensive so we cache it
+@functools.lru_cache(maxsize=None)
+def get_fim_token_ids(tokenizer):
+    if "codellama" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.suffix_id,
+            tokenizer.prefix_id,
+            tokenizer.middle_id,
+            0,
+        )
+    elif "deepseek-coder" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.encode("<｜fim▁hole｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<｜fim▁begin｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<｜fim▁end｜>", add_special_tokens=False)[0],
+            tokenizer.encode("<pad>", add_special_tokens=False)[0],
+        )
+    elif "stable-code" in tokenizer.name_or_path:
+        return (
+            tokenizer.bos_token_id,
+            tokenizer.encode("<fim_suffix>")[0],
+            tokenizer.encode("<fim_prefix>")[0],
+            tokenizer.encode("<fim_middle>")[0],
+            tokenizer.encode("<fim_pad>")[0],
+        )
+    else:
+        bos_token_id = None
+        try:
+            FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[
+                "additional_special_tokens"
+            ][1:5]
+            suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
+                tokenizer.vocab[tok]
+                for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
+            )
+        except KeyError:
+            suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
+                None,
+                None,
+                None,
+                None,
+            )
+    return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id
+def _bos_token_processing(prefix_token_list, bos_token):
+    if bos_token is not None:
+        # add the BOS token to the beginning of the list
+        prefix_token_list.insert(0, bos_token)
+    return prefix_token_list
+## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
+def permute(
+    sample,
+    np_rng,
+    suffix_tok_id,
+    prefix_tok_id,
+    middle_tok_id,
+    pad_tok_id,
+    fim_rate=0.5,
+    fim_spm_rate=0.5,
+    truncate_or_pad=False,
+    bos_token_id=None,
+):
+    """
+    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
+    PSM and SPM (with a probability of fim_spm_rate).
+    """
+    if np_rng.binomial(1, fim_rate):
+        boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
+        boundaries.sort()
+        prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
+        middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
+        suffix = np.array(sample[boundaries[1] :], dtype=np.int64)
+        if truncate_or_pad:
+            new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
+            diff = new_length - len(sample)
+            if diff > 0:
+                if suffix.shape[0] <= diff:
+                    return sample, np_rng
+                suffix = suffix[: suffix.shape[0] - diff]
+            elif diff < 0:
+                suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])
+        if np_rng.binomial(1, fim_spm_rate):
+            prefix_special_tokens = _bos_token_processing(
+                [prefix_tok_id, suffix_tok_id], bos_token_id
+            )
+            # SPM (variant 2 from FIM paper)
+            new_sample = np.concatenate(
+                [
+                    prefix_special_tokens,
+                    suffix,
+                    [middle_tok_id],
+                    prefix,
+                    middle,
+                ]
+            )
+        else:
+            prefix_special_tokens = _bos_token_processing([prefix_tok_id], bos_token_id)
+            # PSM
+            new_sample = np.concatenate(
+                [
+                    prefix_special_tokens,
+                    prefix,
+                    [suffix_tok_id],
+                    suffix,
+                    [middle_tok_id],
+                    middle,
+                ]
+            )
+    else:
+        # don't do FIM preproc
+        new_sample = sample
+    return list(new_sample), np_rng

.ipynb_checkpoints/requirements-checkpoint.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+git+https://github.com/huggingface/transformers
+git+https://github.com/huggingface/accelerate
+git+https://github.com/huggingface/peft
+trl
+huggingface-hub
+bitsandbytes
+evaluate
+datasets
+einops
+wandb
+tiktoken
+deepspeed
+tqdm
+safetensors

.ipynb_checkpoints/run_peft-checkpoint.sh ADDED Viewed

	@@ -0,0 +1,40 @@

+CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python3 train.py \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_len 2048 \
+--max_steps 2000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 5 \
+--log_level "info" \
+--logging_strategy "steps" \
+--save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 3e-4 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder" \
+--per_device_train_batch_size 4 \
+--per_device_eval_batch_size 4 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant True \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.5 \
+--use_peft_lora True \
+--lora_r 32 \
+--lora_alpha 64 \
+--lora_dropout 0.1 \
+--lora_target_modules "all-linear" \
+--use_4bit_quantization True \
+--use_nested_quant True \
+--bnb_4bit_compute_dtype "bfloat16" \
+--use_flash_attn True

.ipynb_checkpoints/train-checkpoint.py ADDED Viewed

	@@ -0,0 +1,495 @@

+# coding=utf-8
+# Copyright 2024 Sourab Mangrulkar. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Continued pre-training/fine-tuning of code LLMs for code autocompletion.
+"""
+import gc
+import os
+import random
+import sys
+from typing import Optional
+from dataclasses import dataclass, field
+import numpy as np
+import torch
+from datasets import load_dataset
+from torch.utils.data import IterableDataset
+from tqdm import tqdm
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    Trainer,
+    TrainingArguments,
+    HfArgumentParser,
+    set_seed,
+    BitsAndBytesConfig,
+)
+from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, replace_lora_weights_loftq
+import fim
+# Define and parse arguments.
+# Each field below becomes a command-line flag of the same name; the three
+# dataclasses are handed to HfArgumentParser in the __main__ block.
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+    model_name_or_path: str = field(
+        metadata={
+            "help": "Path to pretrained model or model identifier from huggingface.co/models"
+        }
+    )
+    # LoRA hyper-parameters, forwarded as lora_alpha / lora_dropout / r when the
+    # adapter is created in create_and_prepare_model.
+    lora_alpha: Optional[int] = field(default=16)
+    lora_dropout: Optional[float] = field(default=0.1)
+    lora_r: Optional[int] = field(default=64)
+    # The literal value "all-linear" is passed through unsplit; anything else is
+    # split on commas.
+    lora_target_modules: Optional[str] = field(
+        default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj",
+        metadata={
+            "help": "comma separated list of target modules to apply LoRA layers to"
+        },
+    )
+    use_nested_quant: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Activate nested quantization for 4bit base models"},
+    )
+    # Name of a torch dtype attribute; resolved with getattr(torch, ...).
+    bnb_4bit_compute_dtype: Optional[str] = field(
+        default="float16",
+        metadata={"help": "Compute dtype for 4bit base models"},
+    )
+    bnb_4bit_quant_type: Optional[str] = field(
+        default="nf4",
+        metadata={"help": "Quantization type fp4 or nf4"},
+    )
+    use_flash_attn: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables Flash attention for training."},
+    )
+    use_peft_lora: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables PEFT LoRA for training."},
+    )
+    # NOTE: "qunatization" is misspelled, but the field name is the CLI flag and
+    # is read under this spelling elsewhere in the file, so it must not be
+    # renamed in isolation.
+    use_8bit_qunatization: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables loading model in 8bit."},
+    )
+    use_4bit_quantization: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables loading model in 4bit."},
+    )
+    use_reentrant: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Gradient Checkpointing param. Refer the related docs"},
+    )
+    use_unsloth: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables UnSloth for training."},
+    )
+    # Only consulted in main() when use_4bit_quantization is also set.
+    use_loftq: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables LoftQ init for the LoRA adapters when using QLoRA."},
+    )
+    use_loftq_callback: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enables LoftQ callback comparing logits of base model to the ones from LoftQ init. Provides better init."},
+    )
+# Arguments describing the training data and the FIM augmentation; consumed by
+# create_datasets and (for max_seq_length) by the model/LoftQ setup.
+@dataclass
+class DataTrainingArguments:
+    # NOTE(review): the help text says "preference dataset", but the value is
+    # simply passed to load_dataset and its text column is used for causal-LM
+    # training - presumably a leftover from another script.
+    dataset_name: Optional[str] = field(
+        default="smangrul/hug_stack",
+        metadata={"help": "The preference dataset to use."},
+    )
+    dataset_text_field: str = field(
+        default="text", metadata={"help": "Dataset field to use as input text."}
+    )
+    # Length (in tokens) of every chunk yielded by ConstantLengthDataset.
+    max_seq_length: Optional[int] = field(default=4096)
+    # Passed as test_size to train_test_split to carve out the validation set.
+    test_size: Optional[float] = field(default=0.1)
+    # Probability of applying FIM to a sample, and of choosing SPM over PSM
+    # once FIM is applied.
+    fim_rate: Optional[float] = field(default=0.5)
+    fim_spm_rate: Optional[float] = field(default=0.5)
+    # Forwarded unchanged as the split argument of load_dataset.
+    splits: Optional[str] = field(
+        default="train",
+        metadata={"help": "Comma separate list of the splits to use from the dataset."},
+    )
+def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
+    """
+    Estimate the average number of characters per token in the dataset.
+    """
+    total_characters, total_tokens = 0, 0
+    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
+        total_characters += len(example[data_column])
+        total_tokens += len(tokenizer(example[data_column]).tokens())
+    return total_characters / total_tokens
+class ConstantLengthDataset(IterableDataset):
+    """
+    Iterable dataset that returns constant length chunks of tokens from stream of text files.
+        Args:
+            tokenizer (Tokenizer): The processor used for proccessing the data.
+            dataset (dataset.Dataset): Dataset with text files.
+            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
+            seq_length (int): Length of token sequences to return.
+            num_of_sequences (int): Number of token sequences to keep in buffer.
+            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
+            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
+            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permuations that will use SPM.
+            seed (int): Seed for random number generator.
+    """
+    def __init__(
+        self,
+        tokenizer,
+        dataset,
+        infinite=False,
+        seq_length=1024,
+        num_of_sequences=1024,
+        chars_per_token=3.6,
+        content_field="content",
+        fim_rate=0.5,
+        fim_spm_rate=0.5,
+        seed=0,
+        shuffle=False,
+    ):
+        self.tokenizer = tokenizer
+        self.concat_token_id = tokenizer.eos_token_id
+        self.dataset = dataset
+        self.seq_length = seq_length
+        self.infinite = infinite
+        self.current_size = 0
+        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
+        self.content_field = content_field
+        self.fim_rate = fim_rate
+        self.fim_spm_rate = fim_spm_rate
+        self.seed = seed
+        self.shuffle = shuffle
+        (
+            self.bos_token_id,
+            self.suffix_tok_id,
+            self.prefix_tok_id,
+            self.middle_tok_id,
+            self.pad_tok_id,
+        ) = fim.get_fim_token_ids(self.tokenizer)
+        if not self.suffix_tok_id and self.fim_rate > 0:
+            print("FIM is not supported by tokenizer, disabling FIM")
+            self.fim_rate = 0
+    def __iter__(self):
+        iterator = iter(self.dataset)
+        more_examples = True
+        np_rng = np.random.RandomState(seed=self.seed)
+        while more_examples:
+            buffer, buffer_len = [], 0
+            while True:
+                if buffer_len >= self.max_buffer_size:
+                    break
+                try:
+                    buffer.append(next(iterator)[self.content_field])
+                    buffer_len += len(buffer[-1])
+                except StopIteration:
+                    if self.infinite:
+                        iterator = iter(self.dataset)
+                    else:
+                        more_examples = False
+                        break
+            tokenized_inputs = self.tokenizer(
+                buffer, truncation=False, add_special_tokens=False
+            )["input_ids"]
+            all_token_ids = []
+            for tokenized_input in tokenized_inputs:
+                # optionally do FIM permutations
+                if self.fim_rate > 0:
+                    tokenized_input, np_rng = fim.permute(
+                        tokenized_input,
+                        np_rng,
+                        self.suffix_tok_id,
+                        self.prefix_tok_id,
+                        self.middle_tok_id,
+                        self.pad_tok_id,
+                        fim_rate=self.fim_rate,
+                        fim_spm_rate=self.fim_spm_rate,
+                        truncate_or_pad=False,
+                        bos_token_id=self.bos_token_id,
+                    )
+                all_token_ids.extend(tokenized_input + [self.concat_token_id])
+            examples = []
+            for i in range(0, len(all_token_ids), self.seq_length):
+                input_ids = all_token_ids[i : i + self.seq_length]
+                if len(input_ids) == self.seq_length:
+                    examples.append(input_ids)
+            if self.shuffle:
+                random.shuffle(examples)
+            for example in examples:
+                self.current_size += 1
+                yield {
+                    "input_ids": torch.LongTensor(example),
+                    "labels": torch.LongTensor(example),
+                }
+def create_datasets(tokenizer, args, seed):
+    dataset = load_dataset(args.dataset_name, split=args.splits)
+    dataset = dataset.train_test_split(
+        test_size=args.test_size, seed=seed, shuffle=True
+    )
+    train_data = dataset["train"]
+    valid_data = dataset["test"]
+    print(
+        f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
+    )
+    chars_per_token = chars_token_ratio(train_data, tokenizer, args.dataset_text_field)
+    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
+    train_dataset = ConstantLengthDataset(
+        tokenizer,
+        train_data,
+        infinite=True,
+        seq_length=args.max_seq_length,
+        chars_per_token=chars_per_token,
+        content_field=args.dataset_text_field,
+        fim_rate=args.fim_rate,
+        fim_spm_rate=args.fim_spm_rate,
+        seed=seed,
+        shuffle=True,
+    )
+    valid_dataset = ConstantLengthDataset(
+        tokenizer,
+        valid_data,
+        infinite=False,
+        seq_length=args.max_seq_length,
+        chars_per_token=chars_per_token,
+        content_field=args.dataset_text_field,
+        fim_rate=args.fim_rate,
+        fim_spm_rate=args.fim_spm_rate,
+        seed=seed,
+    )
+    print(f"A sample of valid dataset: {next(iter(valid_dataset))}")
+    return train_dataset, valid_dataset
+def get_mae(x, y):
+    return (x - y).abs().mean()
+def get_mse(x, y):
+    return torch.pow(x - y, 2).mean()
+def error_report(x, y):
+    mae = get_mae(x, y)
+    mse = get_mse(x, y)
+    print(
+        f"Mean absolute error: {mae:>8.5f}\n"
+        f"Mean squared error:  {mse:>8.5f}"
+    )
+def loftq_init(model, tokenizer, train_dataset, max_seq_length, args):
+    if args.use_loftq_callback:
+        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
+        base_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=compute_dtype)
+        base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
+        random_input_ids = torch.randint(0, len(train_dataset), size=(1,)).numpy().tolist()
+        random_inputs = [train_dataset[i]['content'] for i in random_input_ids]
+        random_inputs = tokenizer(random_inputs, return_tensors="pt", padding=True, truncation="max_length", max_length=max_seq_length)
+        logits_base = base_model(**random_inputs).logits
+        del base_model
+        gc.collect()
+        def loftq_callback(model, module_name):
+            """Callable to replace weights with LoFTQ if the mse is lower than the current best one."""
+            global current_mse
+            logits = model(**random_inputs).logits
+            mse = get_mse(logits_base, logits)
+            if mse < current_mse:
+                current_mse = mse
+                print(f"MSE improved for module {module_name}")
+                return True
+            print(f"MSE did not improve for module {module_name}")
+            return False
+        replace_lora_weights_loftq(model, callback=loftq_callback)
+        logits_loftq_callback = model(**random_inputs).logits
+        error_report(logits_base, logits_loftq_callback)
+    else:
+        replace_lora_weights_loftq(model)
+def create_and_prepare_model(args, data_args, training_args):
+    device_map = None
+    bnb_config = None
+    load_in_8bit = args.use_8bit_qunatization
+    load_in_4bit = args.use_4bit_quantization
+    if args.use_unsloth:
+        from unsloth import FastLanguageModel
+    if args.use_4bit_quantization:
+        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=args.use_4bit_quantization,
+            bnb_4bit_quant_type=args.bnb_4bit_quant_type,
+            bnb_4bit_compute_dtype=compute_dtype,
+            bnb_4bit_use_double_quant=args.use_nested_quant,
+        )
+        if compute_dtype == torch.float16 and args.use_4bit_quantization:
+            major, _ = torch.cuda.get_device_capability()
+            if major >= 8:
+                print("=" * 80)
+                print(
+                    "Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
+                )
+                print("=" * 80)
+    if args.use_4bit_quantization or args.use_8bit_qunatization:
+        device_map = (
+            int(os.environ.get("LOCAL_RANK", -1))
+            if torch.distributed.is_available() and torch.distributed.is_initialized()
+            else "auto"
+        )  # {"": 0}
+    if args.use_unsloth:
+        # Load model
+        model, _ = FastLanguageModel.from_pretrained(
+            model_name=args.model_name_or_path,
+            max_seq_length=data_args.max_seq_length,
+            dtype=None,
+            load_in_4bit=load_in_4bit,
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            args.model_name_or_path,
+            load_in_8bit=load_in_8bit,
+            quantization_config=bnb_config,
+            device_map=device_map,
+            trust_remote_code=True,
+            attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
+        )
+    if (
+        (args.use_4bit_quantization or args.use_8bit_qunatization)
+        and args.use_peft_lora
+        and not args.use_unsloth
+    ):
+        model = prepare_model_for_kbit_training(
+            model,
+            use_gradient_checkpointing=training_args.gradient_checkpointing,
+            gradient_checkpointing_kwargs={"use_reentrant": model_args.use_reentrant},
+        )
+    if args.use_peft_lora and not args.use_unsloth:
+        peft_config = LoraConfig(
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            r=args.lora_r,
+            bias="none",
+            task_type="CAUSAL_LM",
+            target_modules=args.lora_target_modules.split(",")
+            if args.lora_target_modules != "all-linear"
+            else args.lora_target_modules,
+        )
+        model = get_peft_model(model, peft_config)
+    elif args.use_peft_lora and args.use_unsloth:
+        # Do model patching and add fast LoRA weights
+        model = FastLanguageModel.get_peft_model(
+            model,
+            lora_alpha=args.lora_alpha,
+            lora_dropout=args.lora_dropout,
+            r=args.lora_r,
+            target_modules=args.lora_target_modules.split(",")
+            if args.lora_target_modules != "all-linear"
+            else args.lora_target_modules,
+            use_gradient_checkpointing=training_args.gradient_checkpointing,
+            random_state=training_args.seed,
+            max_seq_length=data_args.max_seq_length,
+        )
+    return model
+def main(model_args, data_args, training_args):
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+    # load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
+    # load the datasets
+    train_dataset, eval_dataset = create_datasets(
+        tokenizer, data_args, training_args.seed
+    )
+    train_dataset.start_iteration = 0
+    model = create_and_prepare_model(model_args, data_args, training_args)
+    # gradient ckpt
+    model.config.use_cache = not training_args.gradient_checkpointing
+    training_args.gradient_checkpointing = (
+        training_args.gradient_checkpointing and not model_args.use_unsloth
+    )
+    if training_args.gradient_checkpointing:
+        training_args.gradient_checkpointing_kwargs = {
+            "use_reentrant": model_args.use_reentrant
+        }
+    # trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+    trainer.accelerator.print(f"{trainer.model}")
+    if model_args.use_peft_lora:
+        trainer.model.print_trainable_parameters()
+    # LoftQ initialization when using QLoRA
+    if model_args.use_4bit_quantization and model_args.use_loftq:
+        loftq_init(trainer.model, tokenizer, train_dataset, data_args.max_seq_length ,model_args)
+    # train
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    trainer.train(resume_from_checkpoint=checkpoint)
+    # saving final model
+    if trainer.is_fsdp_enabled:
+        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
+    trainer.save_model()
+if __name__ == "__main__":
+    parser = HfArgumentParser(
+        (ModelArguments, DataTrainingArguments, TrainingArguments)
+    )
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(
+            json_file=os.path.abspath(sys.argv[1])
+        )
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    main(model_args, data_args, training_args)

__pycache__/fim.cpython-310.pyc ADDED Viewed

Binary file (2.64 kB). View file

codellama-hugcoder/README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+library_name: peft
+license: llama2
+base_model: codellama/CodeLlama-7b-Instruct-hf
+tags:
+- generated_from_trainer
+model-index:
+- name: codellama-hugcoder
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# codellama-hugcoder
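+This model is a fine-tuned version of [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) on the [smangrul/hug_stack](https://huggingface.co/datasets/smangrul/hug_stack) dataset (the `--dataset_name` passed in `run_peft.sh`).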
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 4
+- eval_batch_size: 4
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 16
+- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- training_steps: 2000
+### Training results
+### Framework versions
+- PEFT 0.15.2.dev0
+- Transformers 4.52.0.dev0
+- Pytorch 2.6.0+cu124
+- Datasets 3.2.0
+- Tokenizers 0.21.1

codellama-hugcoder/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

codellama-hugcoder/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:456cbd6da326b2c6f27a85ab19d40e13bf3fb60689cbe5ec56653d42193963f8
+size 319876032

codellama-hugcoder/checkpoint-1000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: codellama/CodeLlama-7b-Instruct-hf
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.2.dev0

codellama-hugcoder/checkpoint-1000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

codellama-hugcoder/checkpoint-1000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a808764ea3b6733b0a7c7a6002b640b2b9246cabcd9ad2d940aa7f43c05d66e3
+size 319876032

codellama-hugcoder/checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4025993adcd424dc3d3b0c61b41a0e262786b3bb304e6a592a013e59b80a6b38
+size 640009682

codellama-hugcoder/checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8f822cfb134cd0b1f54ce227e6d11176dede74f86c94420156b0a49753efe3b7
+size 14244

codellama-hugcoder/checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3569157643c45495d0de4a184cdcaab0e6cab5317a8ad5f0b1bbb2d736dd80d4
+size 1064

codellama-hugcoder/checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1434 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5,
+  "eval_steps": 100.0,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.09379793703556061,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.1399833709001541,
+      "learning_rate": 1.3499999999999998e-05,
+      "loss": 0.6954,
+      "step": 10
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.08632303029298782,
+      "learning_rate": 2.1e-05,
+      "loss": 0.6921,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.10006701201200485,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 0.69,
+      "step": 20
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.07633858919143677,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.6722,
+      "step": 25
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.09399061650037766,
+      "learning_rate": 4.3499999999999993e-05,
+      "loss": 0.6453,
+      "step": 30
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.0843738541007042,
+      "learning_rate": 5.1e-05,
+      "loss": 0.6276,
+      "step": 35
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.08583351224660873,
+      "learning_rate": 5.85e-05,
+      "loss": 0.58,
+      "step": 40
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.09571370482444763,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.6355,
+      "step": 45
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.1083935871720314,
+      "learning_rate": 7.35e-05,
+      "loss": 0.589,
+      "step": 50
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.10387319326400757,
+      "learning_rate": 8.1e-05,
+      "loss": 0.6061,
+      "step": 55
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.11083361506462097,
+      "learning_rate": 8.849999999999998e-05,
+      "loss": 0.572,
+      "step": 60
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.12665686011314392,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.5442,
+      "step": 65
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.1308053582906723,
+      "learning_rate": 0.00010349999999999998,
+      "loss": 0.6524,
+      "step": 70
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.13535510003566742,
+      "learning_rate": 0.00011099999999999999,
+      "loss": 0.6404,
+      "step": 75
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12833671271800995,
+      "learning_rate": 0.0001185,
+      "loss": 0.5717,
+      "step": 80
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.11962099373340607,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.6098,
+      "step": 85
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.13898271322250366,
+      "learning_rate": 0.0001335,
+      "loss": 0.6099,
+      "step": 90
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.14486610889434814,
+      "learning_rate": 0.00014099999999999998,
+      "loss": 0.5744,
+      "step": 95
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1432138830423355,
+      "learning_rate": 0.00014849999999999998,
+      "loss": 0.5659,
+      "step": 100
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 0.13487878441810608,
+      "learning_rate": 0.000156,
+      "loss": 0.5622,
+      "step": 105
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.12495309859514236,
+      "learning_rate": 0.0001635,
+      "loss": 0.5951,
+      "step": 110
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.13011734187602997,
+      "learning_rate": 0.00017099999999999998,
+      "loss": 0.6249,
+      "step": 115
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.13987745344638824,
+      "learning_rate": 0.00017849999999999997,
+      "loss": 0.559,
+      "step": 120
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.13373605906963348,
+      "learning_rate": 0.000186,
+      "loss": 0.5475,
+      "step": 125
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.12433867901563644,
+      "learning_rate": 0.0001935,
+      "loss": 0.5274,
+      "step": 130
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.11097615957260132,
+      "learning_rate": 0.000201,
+      "loss": 0.678,
+      "step": 135
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1155027225613594,
+      "learning_rate": 0.00020849999999999997,
+      "loss": 0.5611,
+      "step": 140
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.11431068181991577,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.6054,
+      "step": 145
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.09796140342950821,
+      "learning_rate": 0.00022349999999999998,
+      "loss": 0.5472,
+      "step": 150
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.09489257633686066,
+      "learning_rate": 0.00023099999999999998,
+      "loss": 0.4636,
+      "step": 155
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.10787788033485413,
+      "learning_rate": 0.0002385,
+      "loss": 0.6164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.10261733084917068,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.5408,
+      "step": 165
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.11870352178812027,
+      "learning_rate": 0.0002535,
+      "loss": 0.5268,
+      "step": 170
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.11910569667816162,
+      "learning_rate": 0.000261,
+      "loss": 0.5461,
+      "step": 175
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.10083702206611633,
+      "learning_rate": 0.00026849999999999997,
+      "loss": 0.4794,
+      "step": 180
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.10453511029481888,
+      "learning_rate": 0.000276,
+      "loss": 0.5539,
+      "step": 185
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.101403146982193,
+      "learning_rate": 0.00028349999999999995,
+      "loss": 0.5346,
+      "step": 190
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.10724789649248123,
+      "learning_rate": 0.00029099999999999997,
+      "loss": 0.6026,
+      "step": 195
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1140277311205864,
+      "learning_rate": 0.0002985,
+      "loss": 0.5193,
+      "step": 200
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.09706108272075653,
+      "learning_rate": 0.0002999963446058092,
+      "loss": 0.54,
+      "step": 205
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.10003062337636948,
+      "learning_rate": 0.0002999814948722491,
+      "loss": 0.5365,
+      "step": 210
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.1078687533736229,
+      "learning_rate": 0.00029995522346717746,
+      "loss": 0.5889,
+      "step": 215
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.10538115352392197,
+      "learning_rate": 0.0002999175323912636,
+      "loss": 0.5611,
+      "step": 220
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.1020808294415474,
+      "learning_rate": 0.00029986842451482874,
+      "loss": 0.6103,
+      "step": 225
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.09635835886001587,
+      "learning_rate": 0.0002998079035776279,
+      "loss": 0.5229,
+      "step": 230
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.10287190228700638,
+      "learning_rate": 0.0002997359741885648,
+      "loss": 0.5312,
+      "step": 235
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.09160075336694717,
+      "learning_rate": 0.0002996526418253408,
+      "loss": 0.5673,
+      "step": 240
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.08691006153821945,
+      "learning_rate": 0.000299557912834038,
+      "loss": 0.5326,
+      "step": 245
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.10096988826990128,
+      "learning_rate": 0.00029945179442863594,
+      "loss": 0.6004,
+      "step": 250
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.09594204276800156,
+      "learning_rate": 0.000299334294690462,
+      "loss": 0.5516,
+      "step": 255
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.10281919687986374,
+      "learning_rate": 0.00029920542256757607,
+      "loss": 0.5515,
+      "step": 260
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.08547840267419815,
+      "learning_rate": 0.00029906518787408944,
+      "loss": 0.5243,
+      "step": 265
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.10161560773849487,
+      "learning_rate": 0.0002989136012894168,
+      "loss": 0.5096,
+      "step": 270
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.09101904183626175,
+      "learning_rate": 0.0002987506743574635,
+      "loss": 0.553,
+      "step": 275
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.09769442677497864,
+      "learning_rate": 0.0002985764194857463,
+      "loss": 0.4953,
+      "step": 280
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.10991579294204712,
+      "learning_rate": 0.00029839084994444826,
+      "loss": 0.5152,
+      "step": 285
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.09450916200876236,
+      "learning_rate": 0.00029819397986540836,
+      "loss": 0.5397,
+      "step": 290
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.10876069217920303,
+      "learning_rate": 0.0002979858242410454,
+      "loss": 0.4858,
+      "step": 295
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.097995825111866,
+      "learning_rate": 0.00029776639892321606,
+      "loss": 0.5566,
+      "step": 300
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.1145048514008522,
+      "learning_rate": 0.0002975357206220079,
+      "loss": 0.4531,
+      "step": 305
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.10271880775690079,
+      "learning_rate": 0.00029729380690446654,
+      "loss": 0.5199,
+      "step": 310
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 0.11095371842384338,
+      "learning_rate": 0.0002970406761932583,
+      "loss": 0.5416,
+      "step": 315
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09949438273906708,
+      "learning_rate": 0.00029677634776526673,
+      "loss": 0.4841,
+      "step": 320
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.1163724958896637,
+      "learning_rate": 0.00029650084175012517,
+      "loss": 0.4913,
+      "step": 325
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.10726840049028397,
+      "learning_rate": 0.00029621417912868323,
+      "loss": 0.5203,
+      "step": 330
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.09609931707382202,
+      "learning_rate": 0.00029591638173140947,
+      "loss": 0.5607,
+      "step": 335
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.10824442654848099,
+      "learning_rate": 0.0002956074722367286,
+      "loss": 0.6004,
+      "step": 340
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.10465679317712784,
+      "learning_rate": 0.00029528747416929463,
+      "loss": 0.5216,
+      "step": 345
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.10518354922533035,
+      "learning_rate": 0.0002949564118981994,
+      "loss": 0.499,
+      "step": 350
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 0.0955279991030693,
+      "learning_rate": 0.0002946143106351165,
+      "loss": 0.5607,
+      "step": 355
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.11159654706716537,
+      "learning_rate": 0.0002942611964323817,
+      "loss": 0.5204,
+      "step": 360
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 0.09571187198162079,
+      "learning_rate": 0.0002938970961810086,
+      "loss": 0.6113,
+      "step": 365
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.11854679882526398,
+      "learning_rate": 0.0002935220376086411,
+      "loss": 0.5639,
+      "step": 370
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.1050512045621872,
+      "learning_rate": 0.0002931360492774415,
+      "loss": 0.548,
+      "step": 375
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1053968220949173,
+      "learning_rate": 0.0002927391605819157,
+      "loss": 0.5507,
+      "step": 380
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.10567320138216019,
+      "learning_rate": 0.00029233140174667445,
+      "loss": 0.5312,
+      "step": 385
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.11914283782243729,
+      "learning_rate": 0.0002919128038241318,
+      "loss": 0.5961,
+      "step": 390
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.09915795922279358,
+      "learning_rate": 0.0002914833986921401,
+      "loss": 0.5086,
+      "step": 395
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10796502232551575,
+      "learning_rate": 0.0002910432190515628,
+      "loss": 0.5585,
+      "step": 400
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.10748997330665588,
+      "learning_rate": 0.00029059229842378373,
+      "loss": 0.5466,
+      "step": 405
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.10696308314800262,
+      "learning_rate": 0.0002901306711481544,
+      "loss": 0.5513,
+      "step": 410
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.10418657958507538,
+      "learning_rate": 0.0002896583723793792,
+      "loss": 0.5391,
+      "step": 415
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16421550512313843,
+      "learning_rate": 0.00028917543808483796,
+      "loss": 0.4699,
+      "step": 420
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.12929962575435638,
+      "learning_rate": 0.00028868190504184696,
+      "loss": 0.4984,
+      "step": 425
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 0.10469454526901245,
+      "learning_rate": 0.00028817781083485816,
+      "loss": 0.5119,
+      "step": 430
+    },
+    {
+      "epoch": 0.2175,
+      "grad_norm": 0.0964970663189888,
+      "learning_rate": 0.00028766319385259713,
+      "loss": 0.5167,
+      "step": 435
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12395574152469635,
+      "learning_rate": 0.00028713809328513953,
+      "loss": 0.5692,
+      "step": 440
+    },
+    {
+      "epoch": 0.2225,
+      "grad_norm": 0.10189738124608994,
+      "learning_rate": 0.0002866025491209265,
+      "loss": 0.4628,
+      "step": 445
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.10433454066514969,
+      "learning_rate": 0.0002860566021437197,
+      "loss": 0.4869,
+      "step": 450
+    },
+    {
+      "epoch": 0.2275,
+      "grad_norm": 0.13003456592559814,
+      "learning_rate": 0.0002855002939294951,
+      "loss": 0.5291,
+      "step": 455
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.11692202836275101,
+      "learning_rate": 0.000284933666843277,
+      "loss": 0.5229,
+      "step": 460
+    },
+    {
+      "epoch": 0.2325,
+      "grad_norm": 0.10757846385240555,
+      "learning_rate": 0.0002843567640359119,
+      "loss": 0.435,
+      "step": 465
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 0.10775501281023026,
+      "learning_rate": 0.00028376962944078206,
+      "loss": 0.4418,
+      "step": 470
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.11543692648410797,
+      "learning_rate": 0.00028317230777046015,
+      "loss": 0.4204,
+      "step": 475
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10946698486804962,
+      "learning_rate": 0.00028256484451330403,
+      "loss": 0.49,
+      "step": 480
+    },
+    {
+      "epoch": 0.2425,
+      "grad_norm": 0.11528221517801285,
+      "learning_rate": 0.00028194728592999247,
+      "loss": 0.4752,
+      "step": 485
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 0.10474205762147903,
+      "learning_rate": 0.0002813196790500027,
+      "loss": 0.4847,
+      "step": 490
+    },
+    {
+      "epoch": 0.2475,
+      "grad_norm": 0.10768820345401764,
+      "learning_rate": 0.00028068207166802837,
+      "loss": 0.4664,
+      "step": 495
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.12158560007810593,
+      "learning_rate": 0.00028003451234034037,
+      "loss": 0.4741,
+      "step": 500
+    },
+    {
+      "epoch": 0.2525,
+      "grad_norm": 0.11635497957468033,
+      "learning_rate": 0.0002793770503810886,
+      "loss": 0.4969,
+      "step": 505
+    },
+    {
+      "epoch": 0.255,
+      "grad_norm": 0.12205849587917328,
+      "learning_rate": 0.00027870973585854665,
+      "loss": 0.4798,
+      "step": 510
+    },
+    {
+      "epoch": 0.2575,
+      "grad_norm": 0.10270871222019196,
+      "learning_rate": 0.00027803261959129905,
+      "loss": 0.3888,
+      "step": 515
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.11313367635011673,
+      "learning_rate": 0.0002773457531443712,
+      "loss": 0.4759,
+      "step": 520
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 0.12905193865299225,
+      "learning_rate": 0.00027664918882530225,
+      "loss": 0.4442,
+      "step": 525
+    },
+    {
+      "epoch": 0.265,
+      "grad_norm": 0.11690939962863922,
+      "learning_rate": 0.00027594297968016197,
+      "loss": 0.5535,
+      "step": 530
+    },
+    {
+      "epoch": 0.2675,
+      "grad_norm": 0.10021405667066574,
+      "learning_rate": 0.00027522717948951094,
+      "loss": 0.4717,
+      "step": 535
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.10104178637266159,
+      "learning_rate": 0.0002745018427643051,
+      "loss": 0.4906,
+      "step": 540
+    },
+    {
+      "epoch": 0.2725,
+      "grad_norm": 0.12113891541957855,
+      "learning_rate": 0.00027376702474174425,
+      "loss": 0.5674,
+      "step": 545
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.11330476403236389,
+      "learning_rate": 0.0002730227813810658,
+      "loss": 0.5184,
+      "step": 550
+    },
+    {
+      "epoch": 0.2775,
+      "grad_norm": 0.1025850847363472,
+      "learning_rate": 0.0002722691693592831,
+      "loss": 0.4395,
+      "step": 555
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.11591499298810959,
+      "learning_rate": 0.0002715062460668694,
+      "loss": 0.5003,
+      "step": 560
+    },
+    {
+      "epoch": 0.2825,
+      "grad_norm": 0.11281153559684753,
+      "learning_rate": 0.0002707340696033871,
+      "loss": 0.4672,
+      "step": 565
+    },
+    {
+      "epoch": 0.285,
+      "grad_norm": 0.1123538464307785,
+      "learning_rate": 0.00026995269877306356,
+      "loss": 0.513,
+      "step": 570
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.10776390135288239,
+      "learning_rate": 0.0002691621930803127,
+      "loss": 0.4572,
+      "step": 575
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.10008667409420013,
+      "learning_rate": 0.0002683626127252036,
+      "loss": 0.4618,
+      "step": 580
+    },
+    {
+      "epoch": 0.2925,
+      "grad_norm": 0.13961340487003326,
+      "learning_rate": 0.00026755401859887595,
+      "loss": 0.4819,
+      "step": 585
+    },
+    {
+      "epoch": 0.295,
+      "grad_norm": 0.1476685106754303,
+      "learning_rate": 0.00026673647227890316,
+      "loss": 0.4964,
+      "step": 590
+    },
+    {
+      "epoch": 0.2975,
+      "grad_norm": 0.09795507788658142,
+      "learning_rate": 0.00026591003602460263,
+      "loss": 0.4796,
+      "step": 595
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.10903532058000565,
+      "learning_rate": 0.00026507477277229496,
+      "loss": 0.4775,
+      "step": 600
+    },
+    {
+      "epoch": 0.3025,
+      "grad_norm": 0.10258448123931885,
+      "learning_rate": 0.0002642307461305105,
+      "loss": 0.4519,
+      "step": 605
+    },
+    {
+      "epoch": 0.305,
+      "grad_norm": 0.11204435676336288,
+      "learning_rate": 0.0002633780203751459,
+      "loss": 0.4451,
+      "step": 610
+    },
+    {
+      "epoch": 0.3075,
+      "grad_norm": 0.10147629678249359,
+      "learning_rate": 0.0002625166604445689,
+      "loss": 0.4256,
+      "step": 615
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.10481107234954834,
+      "learning_rate": 0.00026164673193467306,
+      "loss": 0.4381,
+      "step": 620
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.10856641829013824,
+      "learning_rate": 0.00026076830109388255,
+      "loss": 0.4958,
+      "step": 625
+    },
+    {
+      "epoch": 0.315,
+      "grad_norm": 0.09918677806854248,
+      "learning_rate": 0.0002598814348181068,
+      "loss": 0.4335,
+      "step": 630
+    },
+    {
+      "epoch": 0.3175,
+      "grad_norm": 0.10417389869689941,
+      "learning_rate": 0.00025898620064564637,
+      "loss": 0.4603,
+      "step": 635
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0903329998254776,
+      "learning_rate": 0.00025808266675204954,
+      "loss": 0.3932,
+      "step": 640
+    },
+    {
+      "epoch": 0.3225,
+      "grad_norm": 0.11511855572462082,
+      "learning_rate": 0.0002571709019449205,
+      "loss": 0.4169,
+      "step": 645
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.11355557292699814,
+      "learning_rate": 0.0002562509756586793,
+      "loss": 0.4455,
+      "step": 650
+    },
+    {
+      "epoch": 0.3275,
+      "grad_norm": 0.1271187961101532,
+      "learning_rate": 0.00025532295794927437,
+      "loss": 0.4902,
+      "step": 655
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.11936645954847336,
+      "learning_rate": 0.0002543869194888471,
+      "loss": 0.4843,
+      "step": 660
+    },
+    {
+      "epoch": 0.3325,
+      "grad_norm": 0.11935465037822723,
+      "learning_rate": 0.00025344293156035044,
+      "loss": 0.4402,
+      "step": 665
+    },
+    {
+      "epoch": 0.335,
+      "grad_norm": 0.13073407113552094,
+      "learning_rate": 0.00025249106605211986,
+      "loss": 0.467,
+      "step": 670
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 0.10340435802936554,
+      "learning_rate": 0.0002515313954523991,
+      "loss": 0.4827,
+      "step": 675
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.11634550243616104,
+      "learning_rate": 0.00025056399284381983,
+      "loss": 0.466,
+      "step": 680
+    },
+    {
+      "epoch": 0.3425,
+      "grad_norm": 0.10582319647073746,
+      "learning_rate": 0.0002495889318978362,
+      "loss": 0.4751,
+      "step": 685
+    },
+    {
+      "epoch": 0.345,
+      "grad_norm": 0.16781780123710632,
+      "learning_rate": 0.00024860628686911436,
+      "loss": 0.4717,
+      "step": 690
+    },
+    {
+      "epoch": 0.3475,
+      "grad_norm": 0.11522196233272552,
+      "learning_rate": 0.0002476161325898776,
+      "loss": 0.4687,
+      "step": 695
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.11830449104309082,
+      "learning_rate": 0.000246618544464208,
+      "loss": 0.436,
+      "step": 700
+    },
+    {
+      "epoch": 0.3525,
+      "grad_norm": 0.17485427856445312,
+      "learning_rate": 0.0002456135984623034,
+      "loss": 0.4284,
+      "step": 705
+    },
+    {
+      "epoch": 0.355,
+      "grad_norm": 0.12288108468055725,
+      "learning_rate": 0.00024460137111469296,
+      "loss": 0.4261,
+      "step": 710
+    },
+    {
+      "epoch": 0.3575,
+      "grad_norm": 0.11587081104516983,
+      "learning_rate": 0.0002435819395064079,
+      "loss": 0.4493,
+      "step": 715
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.10690271109342575,
+      "learning_rate": 0.0002425553812711123,
+      "loss": 0.4648,
+      "step": 720
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 0.10404397547245026,
+      "learning_rate": 0.00024152177458519014,
+      "loss": 0.4634,
+      "step": 725
+    },
+    {
+      "epoch": 0.365,
+      "grad_norm": 0.11986954510211945,
+      "learning_rate": 0.00024048119816179236,
+      "loss": 0.4525,
+      "step": 730
+    },
+    {
+      "epoch": 0.3675,
+      "grad_norm": 0.10243026167154312,
+      "learning_rate": 0.00023943373124484234,
+      "loss": 0.4572,
+      "step": 735
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.10386748611927032,
+      "learning_rate": 0.00023837945360300129,
+      "loss": 0.3884,
+      "step": 740
+    },
+    {
+      "epoch": 0.3725,
+      "grad_norm": 0.11165735125541687,
+      "learning_rate": 0.0002373184455235934,
+      "loss": 0.4902,
+      "step": 745
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.09951601922512054,
+      "learning_rate": 0.00023625078780649178,
+      "loss": 0.4541,
+      "step": 750
+    },
+    {
+      "epoch": 0.3775,
+      "grad_norm": 0.10347504913806915,
+      "learning_rate": 0.00023517656175796518,
+      "loss": 0.3871,
+      "step": 755
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.10478132963180542,
+      "learning_rate": 0.00023409584918448627,
+      "loss": 0.4329,
+      "step": 760
+    },
+    {
+      "epoch": 0.3825,
+      "grad_norm": 0.1198212131857872,
+      "learning_rate": 0.00023300873238650159,
+      "loss": 0.425,
+      "step": 765
+    },
+    {
+      "epoch": 0.385,
+      "grad_norm": 0.1103711724281311,
+      "learning_rate": 0.00023191529415216434,
+      "loss": 0.4274,
+      "step": 770
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.09940385073423386,
+      "learning_rate": 0.00023081561775102944,
+      "loss": 0.4368,
+      "step": 775
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.11599268019199371,
+      "learning_rate": 0.00022970978692771242,
+      "loss": 0.4386,
+      "step": 780
+    },
+    {
+      "epoch": 0.3925,
+      "grad_norm": 0.10101296752691269,
+      "learning_rate": 0.00022859788589551188,
+      "loss": 0.4696,
+      "step": 785
+    },
+    {
+      "epoch": 0.395,
+      "grad_norm": 0.10112808644771576,
+      "learning_rate": 0.00022747999932999624,
+      "loss": 0.4066,
+      "step": 790
+    },
+    {
+      "epoch": 0.3975,
+      "grad_norm": 0.09595459699630737,
+      "learning_rate": 0.00022635621236255567,
+      "loss": 0.4837,
+      "step": 795
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.10761380940675735,
+      "learning_rate": 0.00022522661057391857,
+      "loss": 0.5446,
+      "step": 800
+    },
+    {
+      "epoch": 0.4025,
+      "grad_norm": 0.11919954419136047,
+      "learning_rate": 0.00022409127998763463,
+      "loss": 0.5027,
+      "step": 805
+    },
+    {
+      "epoch": 0.405,
+      "grad_norm": 0.10851597785949707,
+      "learning_rate": 0.00022295030706352356,
+      "loss": 0.4481,
+      "step": 810
+    },
+    {
+      "epoch": 0.4075,
+      "grad_norm": 0.10030311346054077,
+      "learning_rate": 0.00022180377869109104,
+      "loss": 0.4709,
+      "step": 815
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.111280657351017,
+      "learning_rate": 0.00022065178218291147,
+      "loss": 0.4423,
+      "step": 820
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 0.11253602802753448,
+      "learning_rate": 0.00021949440526797926,
+      "loss": 0.4136,
+      "step": 825
+    },
+    {
+      "epoch": 0.415,
+      "grad_norm": 0.10805424302816391,
+      "learning_rate": 0.00021833173608502732,
+      "loss": 0.4656,
+      "step": 830
+    },
+    {
+      "epoch": 0.4175,
+      "grad_norm": 0.10983198881149292,
+      "learning_rate": 0.00021716386317581542,
+      "loss": 0.3687,
+      "step": 835
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.10653118044137955,
+      "learning_rate": 0.00021599087547838727,
+      "loss": 0.4654,
+      "step": 840
+    },
+    {
+      "epoch": 0.4225,
+      "grad_norm": 0.10856354981660843,
+      "learning_rate": 0.00021481286232029735,
+      "loss": 0.4298,
+      "step": 845
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.11233706772327423,
+      "learning_rate": 0.0002136299134118085,
+      "loss": 0.4484,
+      "step": 850
+    },
+    {
+      "epoch": 0.4275,
+      "grad_norm": 0.1085442528128624,
+      "learning_rate": 0.00021244211883906017,
+      "loss": 0.4776,
+      "step": 855
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.12297824025154114,
+      "learning_rate": 0.0002112495690572077,
+      "loss": 0.4029,
+      "step": 860
+    },
+    {
+      "epoch": 0.4325,
+      "grad_norm": 0.10838114470243454,
+      "learning_rate": 0.00021005235488353428,
+      "loss": 0.4848,
+      "step": 865
+    },
+    {
+      "epoch": 0.435,
+      "grad_norm": 0.10273341834545135,
+      "learning_rate": 0.0002088505674905342,
+      "loss": 0.3989,
+      "step": 870
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.11189126968383789,
+      "learning_rate": 0.0002076442983989705,
+      "loss": 0.438,
+      "step": 875
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.11592905968427658,
+      "learning_rate": 0.0002064336394709048,
+      "loss": 0.4786,
+      "step": 880
+    },
+    {
+      "epoch": 0.4425,
+      "grad_norm": 0.11230389773845673,
+      "learning_rate": 0.0002052186829027017,
+      "loss": 0.3999,
+      "step": 885
+    },
+    {
+      "epoch": 0.445,
+      "grad_norm": 0.12455113977193832,
+      "learning_rate": 0.00020399952121800767,
+      "loss": 0.4856,
+      "step": 890
+    },
+    {
+      "epoch": 0.4475,
+      "grad_norm": 0.1001812294125557,
+      "learning_rate": 0.00020277624726070526,
+      "loss": 0.4689,
+      "step": 895
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.11319112777709961,
+      "learning_rate": 0.00020154895418784242,
+      "loss": 0.3998,
+      "step": 900
+    },
+    {
+      "epoch": 0.4525,
+      "grad_norm": 0.11322236061096191,
+      "learning_rate": 0.00020031773546253824,
+      "loss": 0.4321,
+      "step": 905
+    },
+    {
+      "epoch": 0.455,
+      "grad_norm": 0.12924689054489136,
+      "learning_rate": 0.00019908268484686558,
+      "loss": 0.4208,
+      "step": 910
+    },
+    {
+      "epoch": 0.4575,
+      "grad_norm": 0.11435618251562119,
+      "learning_rate": 0.00019784389639471048,
+      "loss": 0.4682,
+      "step": 915
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.10801081359386444,
+      "learning_rate": 0.00019660146444460975,
+      "loss": 0.428,
+      "step": 920
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.10906939953565598,
+      "learning_rate": 0.0001953554836125667,
+      "loss": 0.4455,
+      "step": 925
+    },
+    {
+      "epoch": 0.465,
+      "grad_norm": 0.10790123790502548,
+      "learning_rate": 0.00019410604878484556,
+      "loss": 0.4544,
+      "step": 930
+    },
+    {
+      "epoch": 0.4675,
+      "grad_norm": 0.10536376386880875,
+      "learning_rate": 0.000192853255110746,
+      "loss": 0.376,
+      "step": 935
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.11744682490825653,
+      "learning_rate": 0.00019159719799535668,
+      "loss": 0.3887,
+      "step": 940
+    },
+    {
+      "epoch": 0.4725,
+      "grad_norm": 0.12954068183898926,
+      "learning_rate": 0.00019033797309228983,
+      "loss": 0.4075,
+      "step": 945
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.1401606798171997,
+      "learning_rate": 0.00018907567629639725,
+      "loss": 0.4454,
+      "step": 950
+    },
+    {
+      "epoch": 0.4775,
+      "grad_norm": 0.12059322744607925,
+      "learning_rate": 0.00018781040373646706,
+      "loss": 0.4339,
+      "step": 955
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.11798987537622452,
+      "learning_rate": 0.00018654225176790336,
+      "loss": 0.4405,
+      "step": 960
+    },
+    {
+      "epoch": 0.4825,
+      "grad_norm": 0.11344211548566818,
+      "learning_rate": 0.00018527131696538846,
+      "loss": 0.4124,
+      "step": 965
+    },
+    {
+      "epoch": 0.485,
+      "grad_norm": 0.10373330116271973,
+      "learning_rate": 0.00018399769611552824,
+      "loss": 0.4329,
+      "step": 970
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.12053704261779785,
+      "learning_rate": 0.0001827214862094814,
+      "loss": 0.4944,
+      "step": 975
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.141033336520195,
+      "learning_rate": 0.00018144278443557328,
+      "loss": 0.4569,
+      "step": 980
+    },
+    {
+      "epoch": 0.4925,
+      "grad_norm": 0.10922867804765701,
+      "learning_rate": 0.0001801616881718947,
+      "loss": 0.3879,
+      "step": 985
+    },
+    {
+      "epoch": 0.495,
+      "grad_norm": 0.09843657910823822,
+      "learning_rate": 0.00017887829497888612,
+      "loss": 0.4106,
+      "step": 990
+    },
+    {
+      "epoch": 0.4975,
+      "grad_norm": 0.12131062150001526,
+      "learning_rate": 0.000177592702591908,
+      "loss": 0.4023,
+      "step": 995
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.11343283206224442,
+      "learning_rate": 0.00017630500891379806,
+      "loss": 0.4824,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.314789078859776e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

codellama-hugcoder/checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304

codellama-hugcoder/checkpoint-1500/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: codellama/CodeLlama-7b-Instruct-hf
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.2.dev0

codellama-hugcoder/checkpoint-1500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

codellama-hugcoder/checkpoint-1500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:954883169196fec3dbbf2581acd2ff6690fa789729045bb04113f1bb36637c46
+size 319876032

codellama-hugcoder/checkpoint-1500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:135e0fda5af04719269dc4cca8199c95f610932728fc80b6e63f3d656098bd57
+size 640009682

codellama-hugcoder/checkpoint-1500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fda3c0b12e2631264746b16f7dd8a85fd763004a3c1d20e136ad6fae01987d26
+size 14244

codellama-hugcoder/checkpoint-1500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:046c4144f3d3e450ad1c1129a3ed6e680f6f65f10c488eeb2fd00b8cd376efa0
+size 1064

codellama-hugcoder/checkpoint-1500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2134 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.75,
+  "eval_steps": 100.0,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.09379793703556061,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.1399833709001541,
+      "learning_rate": 1.3499999999999998e-05,
+      "loss": 0.6954,
+      "step": 10
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.08632303029298782,
+      "learning_rate": 2.1e-05,
+      "loss": 0.6921,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.10006701201200485,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 0.69,
+      "step": 20
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.07633858919143677,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.6722,
+      "step": 25
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.09399061650037766,
+      "learning_rate": 4.3499999999999993e-05,
+      "loss": 0.6453,
+      "step": 30
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.0843738541007042,
+      "learning_rate": 5.1e-05,
+      "loss": 0.6276,
+      "step": 35
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.08583351224660873,
+      "learning_rate": 5.85e-05,
+      "loss": 0.58,
+      "step": 40
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.09571370482444763,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.6355,
+      "step": 45
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.1083935871720314,
+      "learning_rate": 7.35e-05,
+      "loss": 0.589,
+      "step": 50
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.10387319326400757,
+      "learning_rate": 8.1e-05,
+      "loss": 0.6061,
+      "step": 55
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.11083361506462097,
+      "learning_rate": 8.849999999999998e-05,
+      "loss": 0.572,
+      "step": 60
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.12665686011314392,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.5442,
+      "step": 65
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.1308053582906723,
+      "learning_rate": 0.00010349999999999998,
+      "loss": 0.6524,
+      "step": 70
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.13535510003566742,
+      "learning_rate": 0.00011099999999999999,
+      "loss": 0.6404,
+      "step": 75
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12833671271800995,
+      "learning_rate": 0.0001185,
+      "loss": 0.5717,
+      "step": 80
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.11962099373340607,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.6098,
+      "step": 85
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.13898271322250366,
+      "learning_rate": 0.0001335,
+      "loss": 0.6099,
+      "step": 90
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.14486610889434814,
+      "learning_rate": 0.00014099999999999998,
+      "loss": 0.5744,
+      "step": 95
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1432138830423355,
+      "learning_rate": 0.00014849999999999998,
+      "loss": 0.5659,
+      "step": 100
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 0.13487878441810608,
+      "learning_rate": 0.000156,
+      "loss": 0.5622,
+      "step": 105
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.12495309859514236,
+      "learning_rate": 0.0001635,
+      "loss": 0.5951,
+      "step": 110
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.13011734187602997,
+      "learning_rate": 0.00017099999999999998,
+      "loss": 0.6249,
+      "step": 115
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.13987745344638824,
+      "learning_rate": 0.00017849999999999997,
+      "loss": 0.559,
+      "step": 120
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.13373605906963348,
+      "learning_rate": 0.000186,
+      "loss": 0.5475,
+      "step": 125
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.12433867901563644,
+      "learning_rate": 0.0001935,
+      "loss": 0.5274,
+      "step": 130
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.11097615957260132,
+      "learning_rate": 0.000201,
+      "loss": 0.678,
+      "step": 135
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1155027225613594,
+      "learning_rate": 0.00020849999999999997,
+      "loss": 0.5611,
+      "step": 140
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.11431068181991577,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.6054,
+      "step": 145
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.09796140342950821,
+      "learning_rate": 0.00022349999999999998,
+      "loss": 0.5472,
+      "step": 150
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.09489257633686066,
+      "learning_rate": 0.00023099999999999998,
+      "loss": 0.4636,
+      "step": 155
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.10787788033485413,
+      "learning_rate": 0.0002385,
+      "loss": 0.6164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.10261733084917068,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.5408,
+      "step": 165
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.11870352178812027,
+      "learning_rate": 0.0002535,
+      "loss": 0.5268,
+      "step": 170
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.11910569667816162,
+      "learning_rate": 0.000261,
+      "loss": 0.5461,
+      "step": 175
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.10083702206611633,
+      "learning_rate": 0.00026849999999999997,
+      "loss": 0.4794,
+      "step": 180
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.10453511029481888,
+      "learning_rate": 0.000276,
+      "loss": 0.5539,
+      "step": 185
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.101403146982193,
+      "learning_rate": 0.00028349999999999995,
+      "loss": 0.5346,
+      "step": 190
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.10724789649248123,
+      "learning_rate": 0.00029099999999999997,
+      "loss": 0.6026,
+      "step": 195
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1140277311205864,
+      "learning_rate": 0.0002985,
+      "loss": 0.5193,
+      "step": 200
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.09706108272075653,
+      "learning_rate": 0.0002999963446058092,
+      "loss": 0.54,
+      "step": 205
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.10003062337636948,
+      "learning_rate": 0.0002999814948722491,
+      "loss": 0.5365,
+      "step": 210
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.1078687533736229,
+      "learning_rate": 0.00029995522346717746,
+      "loss": 0.5889,
+      "step": 215
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.10538115352392197,
+      "learning_rate": 0.0002999175323912636,
+      "loss": 0.5611,
+      "step": 220
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.1020808294415474,
+      "learning_rate": 0.00029986842451482874,
+      "loss": 0.6103,
+      "step": 225
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.09635835886001587,
+      "learning_rate": 0.0002998079035776279,
+      "loss": 0.5229,
+      "step": 230
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.10287190228700638,
+      "learning_rate": 0.0002997359741885648,
+      "loss": 0.5312,
+      "step": 235
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.09160075336694717,
+      "learning_rate": 0.0002996526418253408,
+      "loss": 0.5673,
+      "step": 240
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.08691006153821945,
+      "learning_rate": 0.000299557912834038,
+      "loss": 0.5326,
+      "step": 245
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.10096988826990128,
+      "learning_rate": 0.00029945179442863594,
+      "loss": 0.6004,
+      "step": 250
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.09594204276800156,
+      "learning_rate": 0.000299334294690462,
+      "loss": 0.5516,
+      "step": 255
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.10281919687986374,
+      "learning_rate": 0.00029920542256757607,
+      "loss": 0.5515,
+      "step": 260
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.08547840267419815,
+      "learning_rate": 0.00029906518787408944,
+      "loss": 0.5243,
+      "step": 265
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.10161560773849487,
+      "learning_rate": 0.0002989136012894168,
+      "loss": 0.5096,
+      "step": 270
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.09101904183626175,
+      "learning_rate": 0.0002987506743574635,
+      "loss": 0.553,
+      "step": 275
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.09769442677497864,
+      "learning_rate": 0.0002985764194857463,
+      "loss": 0.4953,
+      "step": 280
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.10991579294204712,
+      "learning_rate": 0.00029839084994444826,
+      "loss": 0.5152,
+      "step": 285
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.09450916200876236,
+      "learning_rate": 0.00029819397986540836,
+      "loss": 0.5397,
+      "step": 290
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.10876069217920303,
+      "learning_rate": 0.0002979858242410454,
+      "loss": 0.4858,
+      "step": 295
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.097995825111866,
+      "learning_rate": 0.00029776639892321606,
+      "loss": 0.5566,
+      "step": 300
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.1145048514008522,
+      "learning_rate": 0.0002975357206220079,
+      "loss": 0.4531,
+      "step": 305
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.10271880775690079,
+      "learning_rate": 0.00029729380690446654,
+      "loss": 0.5199,
+      "step": 310
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 0.11095371842384338,
+      "learning_rate": 0.0002970406761932583,
+      "loss": 0.5416,
+      "step": 315
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09949438273906708,
+      "learning_rate": 0.00029677634776526673,
+      "loss": 0.4841,
+      "step": 320
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.1163724958896637,
+      "learning_rate": 0.00029650084175012517,
+      "loss": 0.4913,
+      "step": 325
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.10726840049028397,
+      "learning_rate": 0.00029621417912868323,
+      "loss": 0.5203,
+      "step": 330
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.09609931707382202,
+      "learning_rate": 0.00029591638173140947,
+      "loss": 0.5607,
+      "step": 335
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.10824442654848099,
+      "learning_rate": 0.0002956074722367286,
+      "loss": 0.6004,
+      "step": 340
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.10465679317712784,
+      "learning_rate": 0.00029528747416929463,
+      "loss": 0.5216,
+      "step": 345
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.10518354922533035,
+      "learning_rate": 0.0002949564118981994,
+      "loss": 0.499,
+      "step": 350
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 0.0955279991030693,
+      "learning_rate": 0.0002946143106351165,
+      "loss": 0.5607,
+      "step": 355
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.11159654706716537,
+      "learning_rate": 0.0002942611964323817,
+      "loss": 0.5204,
+      "step": 360
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 0.09571187198162079,
+      "learning_rate": 0.0002938970961810086,
+      "loss": 0.6113,
+      "step": 365
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.11854679882526398,
+      "learning_rate": 0.0002935220376086411,
+      "loss": 0.5639,
+      "step": 370
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.1050512045621872,
+      "learning_rate": 0.0002931360492774415,
+      "loss": 0.548,
+      "step": 375
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1053968220949173,
+      "learning_rate": 0.0002927391605819157,
+      "loss": 0.5507,
+      "step": 380
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.10567320138216019,
+      "learning_rate": 0.00029233140174667445,
+      "loss": 0.5312,
+      "step": 385
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.11914283782243729,
+      "learning_rate": 0.0002919128038241318,
+      "loss": 0.5961,
+      "step": 390
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.09915795922279358,
+      "learning_rate": 0.0002914833986921401,
+      "loss": 0.5086,
+      "step": 395
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10796502232551575,
+      "learning_rate": 0.0002910432190515628,
+      "loss": 0.5585,
+      "step": 400
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.10748997330665588,
+      "learning_rate": 0.00029059229842378373,
+      "loss": 0.5466,
+      "step": 405
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.10696308314800262,
+      "learning_rate": 0.0002901306711481544,
+      "loss": 0.5513,
+      "step": 410
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.10418657958507538,
+      "learning_rate": 0.0002896583723793792,
+      "loss": 0.5391,
+      "step": 415
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16421550512313843,
+      "learning_rate": 0.00028917543808483796,
+      "loss": 0.4699,
+      "step": 420
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.12929962575435638,
+      "learning_rate": 0.00028868190504184696,
+      "loss": 0.4984,
+      "step": 425
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 0.10469454526901245,
+      "learning_rate": 0.00028817781083485816,
+      "loss": 0.5119,
+      "step": 430
+    },
+    {
+      "epoch": 0.2175,
+      "grad_norm": 0.0964970663189888,
+      "learning_rate": 0.00028766319385259713,
+      "loss": 0.5167,
+      "step": 435
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12395574152469635,
+      "learning_rate": 0.00028713809328513953,
+      "loss": 0.5692,
+      "step": 440
+    },
+    {
+      "epoch": 0.2225,
+      "grad_norm": 0.10189738124608994,
+      "learning_rate": 0.0002866025491209265,
+      "loss": 0.4628,
+      "step": 445
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.10433454066514969,
+      "learning_rate": 0.0002860566021437197,
+      "loss": 0.4869,
+      "step": 450
+    },
+    {
+      "epoch": 0.2275,
+      "grad_norm": 0.13003456592559814,
+      "learning_rate": 0.0002855002939294951,
+      "loss": 0.5291,
+      "step": 455
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.11692202836275101,
+      "learning_rate": 0.000284933666843277,
+      "loss": 0.5229,
+      "step": 460
+    },
+    {
+      "epoch": 0.2325,
+      "grad_norm": 0.10757846385240555,
+      "learning_rate": 0.0002843567640359119,
+      "loss": 0.435,
+      "step": 465
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 0.10775501281023026,
+      "learning_rate": 0.00028376962944078206,
+      "loss": 0.4418,
+      "step": 470
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.11543692648410797,
+      "learning_rate": 0.00028317230777046015,
+      "loss": 0.4204,
+      "step": 475
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10946698486804962,
+      "learning_rate": 0.00028256484451330403,
+      "loss": 0.49,
+      "step": 480
+    },
+    {
+      "epoch": 0.2425,
+      "grad_norm": 0.11528221517801285,
+      "learning_rate": 0.00028194728592999247,
+      "loss": 0.4752,
+      "step": 485
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 0.10474205762147903,
+      "learning_rate": 0.0002813196790500027,
+      "loss": 0.4847,
+      "step": 490
+    },
+    {
+      "epoch": 0.2475,
+      "grad_norm": 0.10768820345401764,
+      "learning_rate": 0.00028068207166802837,
+      "loss": 0.4664,
+      "step": 495
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.12158560007810593,
+      "learning_rate": 0.00028003451234034037,
+      "loss": 0.4741,
+      "step": 500
+    },
+    {
+      "epoch": 0.2525,
+      "grad_norm": 0.11635497957468033,
+      "learning_rate": 0.0002793770503810886,
+      "loss": 0.4969,
+      "step": 505
+    },
+    {
+      "epoch": 0.255,
+      "grad_norm": 0.12205849587917328,
+      "learning_rate": 0.00027870973585854665,
+      "loss": 0.4798,
+      "step": 510
+    },
+    {
+      "epoch": 0.2575,
+      "grad_norm": 0.10270871222019196,
+      "learning_rate": 0.00027803261959129905,
+      "loss": 0.3888,
+      "step": 515
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.11313367635011673,
+      "learning_rate": 0.0002773457531443712,
+      "loss": 0.4759,
+      "step": 520
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 0.12905193865299225,
+      "learning_rate": 0.00027664918882530225,
+      "loss": 0.4442,
+      "step": 525
+    },
+    {
+      "epoch": 0.265,
+      "grad_norm": 0.11690939962863922,
+      "learning_rate": 0.00027594297968016197,
+      "loss": 0.5535,
+      "step": 530
+    },
+    {
+      "epoch": 0.2675,
+      "grad_norm": 0.10021405667066574,
+      "learning_rate": 0.00027522717948951094,
+      "loss": 0.4717,
+      "step": 535
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.10104178637266159,
+      "learning_rate": 0.0002745018427643051,
+      "loss": 0.4906,
+      "step": 540
+    },
+    {
+      "epoch": 0.2725,
+      "grad_norm": 0.12113891541957855,
+      "learning_rate": 0.00027376702474174425,
+      "loss": 0.5674,
+      "step": 545
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.11330476403236389,
+      "learning_rate": 0.0002730227813810658,
+      "loss": 0.5184,
+      "step": 550
+    },
+    {
+      "epoch": 0.2775,
+      "grad_norm": 0.1025850847363472,
+      "learning_rate": 0.0002722691693592831,
+      "loss": 0.4395,
+      "step": 555
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.11591499298810959,
+      "learning_rate": 0.0002715062460668694,
+      "loss": 0.5003,
+      "step": 560
+    },
+    {
+      "epoch": 0.2825,
+      "grad_norm": 0.11281153559684753,
+      "learning_rate": 0.0002707340696033871,
+      "loss": 0.4672,
+      "step": 565
+    },
+    {
+      "epoch": 0.285,
+      "grad_norm": 0.1123538464307785,
+      "learning_rate": 0.00026995269877306356,
+      "loss": 0.513,
+      "step": 570
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.10776390135288239,
+      "learning_rate": 0.0002691621930803127,
+      "loss": 0.4572,
+      "step": 575
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.10008667409420013,
+      "learning_rate": 0.0002683626127252036,
+      "loss": 0.4618,
+      "step": 580
+    },
+    {
+      "epoch": 0.2925,
+      "grad_norm": 0.13961340487003326,
+      "learning_rate": 0.00026755401859887595,
+      "loss": 0.4819,
+      "step": 585
+    },
+    {
+      "epoch": 0.295,
+      "grad_norm": 0.1476685106754303,
+      "learning_rate": 0.00026673647227890316,
+      "loss": 0.4964,
+      "step": 590
+    },
+    {
+      "epoch": 0.2975,
+      "grad_norm": 0.09795507788658142,
+      "learning_rate": 0.00026591003602460263,
+      "loss": 0.4796,
+      "step": 595
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.10903532058000565,
+      "learning_rate": 0.00026507477277229496,
+      "loss": 0.4775,
+      "step": 600
+    },
+    {
+      "epoch": 0.3025,
+      "grad_norm": 0.10258448123931885,
+      "learning_rate": 0.0002642307461305105,
+      "loss": 0.4519,
+      "step": 605
+    },
+    {
+      "epoch": 0.305,
+      "grad_norm": 0.11204435676336288,
+      "learning_rate": 0.0002633780203751459,
+      "loss": 0.4451,
+      "step": 610
+    },
+    {
+      "epoch": 0.3075,
+      "grad_norm": 0.10147629678249359,
+      "learning_rate": 0.0002625166604445689,
+      "loss": 0.4256,
+      "step": 615
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.10481107234954834,
+      "learning_rate": 0.00026164673193467306,
+      "loss": 0.4381,
+      "step": 620
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.10856641829013824,
+      "learning_rate": 0.00026076830109388255,
+      "loss": 0.4958,
+      "step": 625
+    },
+    {
+      "epoch": 0.315,
+      "grad_norm": 0.09918677806854248,
+      "learning_rate": 0.0002598814348181068,
+      "loss": 0.4335,
+      "step": 630
+    },
+    {
+      "epoch": 0.3175,
+      "grad_norm": 0.10417389869689941,
+      "learning_rate": 0.00025898620064564637,
+      "loss": 0.4603,
+      "step": 635
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0903329998254776,
+      "learning_rate": 0.00025808266675204954,
+      "loss": 0.3932,
+      "step": 640
+    },
+    {
+      "epoch": 0.3225,
+      "grad_norm": 0.11511855572462082,
+      "learning_rate": 0.0002571709019449205,
+      "loss": 0.4169,
+      "step": 645
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.11355557292699814,
+      "learning_rate": 0.0002562509756586793,
+      "loss": 0.4455,
+      "step": 650
+    },
+    {
+      "epoch": 0.3275,
+      "grad_norm": 0.1271187961101532,
+      "learning_rate": 0.00025532295794927437,
+      "loss": 0.4902,
+      "step": 655
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.11936645954847336,
+      "learning_rate": 0.0002543869194888471,
+      "loss": 0.4843,
+      "step": 660
+    },
+    {
+      "epoch": 0.3325,
+      "grad_norm": 0.11935465037822723,
+      "learning_rate": 0.00025344293156035044,
+      "loss": 0.4402,
+      "step": 665
+    },
+    {
+      "epoch": 0.335,
+      "grad_norm": 0.13073407113552094,
+      "learning_rate": 0.00025249106605211986,
+      "loss": 0.467,
+      "step": 670
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 0.10340435802936554,
+      "learning_rate": 0.0002515313954523991,
+      "loss": 0.4827,
+      "step": 675
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.11634550243616104,
+      "learning_rate": 0.00025056399284381983,
+      "loss": 0.466,
+      "step": 680
+    },
+    {
+      "epoch": 0.3425,
+      "grad_norm": 0.10582319647073746,
+      "learning_rate": 0.0002495889318978362,
+      "loss": 0.4751,
+      "step": 685
+    },
+    {
+      "epoch": 0.345,
+      "grad_norm": 0.16781780123710632,
+      "learning_rate": 0.00024860628686911436,
+      "loss": 0.4717,
+      "step": 690
+    },
+    {
+      "epoch": 0.3475,
+      "grad_norm": 0.11522196233272552,
+      "learning_rate": 0.0002476161325898776,
+      "loss": 0.4687,
+      "step": 695
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.11830449104309082,
+      "learning_rate": 0.000246618544464208,
+      "loss": 0.436,
+      "step": 700
+    },
+    {
+      "epoch": 0.3525,
+      "grad_norm": 0.17485427856445312,
+      "learning_rate": 0.0002456135984623034,
+      "loss": 0.4284,
+      "step": 705
+    },
+    {
+      "epoch": 0.355,
+      "grad_norm": 0.12288108468055725,
+      "learning_rate": 0.00024460137111469296,
+      "loss": 0.4261,
+      "step": 710
+    },
+    {
+      "epoch": 0.3575,
+      "grad_norm": 0.11587081104516983,
+      "learning_rate": 0.0002435819395064079,
+      "loss": 0.4493,
+      "step": 715
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.10690271109342575,
+      "learning_rate": 0.0002425553812711123,
+      "loss": 0.4648,
+      "step": 720
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 0.10404397547245026,
+      "learning_rate": 0.00024152177458519014,
+      "loss": 0.4634,
+      "step": 725
+    },
+    {
+      "epoch": 0.365,
+      "grad_norm": 0.11986954510211945,
+      "learning_rate": 0.00024048119816179236,
+      "loss": 0.4525,
+      "step": 730
+    },
+    {
+      "epoch": 0.3675,
+      "grad_norm": 0.10243026167154312,
+      "learning_rate": 0.00023943373124484234,
+      "loss": 0.4572,
+      "step": 735
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.10386748611927032,
+      "learning_rate": 0.00023837945360300129,
+      "loss": 0.3884,
+      "step": 740
+    },
+    {
+      "epoch": 0.3725,
+      "grad_norm": 0.11165735125541687,
+      "learning_rate": 0.0002373184455235934,
+      "loss": 0.4902,
+      "step": 745
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.09951601922512054,
+      "learning_rate": 0.00023625078780649178,
+      "loss": 0.4541,
+      "step": 750
+    },
+    {
+      "epoch": 0.3775,
+      "grad_norm": 0.10347504913806915,
+      "learning_rate": 0.00023517656175796518,
+      "loss": 0.3871,
+      "step": 755
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.10478132963180542,
+      "learning_rate": 0.00023409584918448627,
+      "loss": 0.4329,
+      "step": 760
+    },
+    {
+      "epoch": 0.3825,
+      "grad_norm": 0.1198212131857872,
+      "learning_rate": 0.00023300873238650159,
+      "loss": 0.425,
+      "step": 765
+    },
+    {
+      "epoch": 0.385,
+      "grad_norm": 0.1103711724281311,
+      "learning_rate": 0.00023191529415216434,
+      "loss": 0.4274,
+      "step": 770
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.09940385073423386,
+      "learning_rate": 0.00023081561775102944,
+      "loss": 0.4368,
+      "step": 775
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.11599268019199371,
+      "learning_rate": 0.00022970978692771242,
+      "loss": 0.4386,
+      "step": 780
+    },
+    {
+      "epoch": 0.3925,
+      "grad_norm": 0.10101296752691269,
+      "learning_rate": 0.00022859788589551188,
+      "loss": 0.4696,
+      "step": 785
+    },
+    {
+      "epoch": 0.395,
+      "grad_norm": 0.10112808644771576,
+      "learning_rate": 0.00022747999932999624,
+      "loss": 0.4066,
+      "step": 790
+    },
+    {
+      "epoch": 0.3975,
+      "grad_norm": 0.09595459699630737,
+      "learning_rate": 0.00022635621236255567,
+      "loss": 0.4837,
+      "step": 795
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.10761380940675735,
+      "learning_rate": 0.00022522661057391857,
+      "loss": 0.5446,
+      "step": 800
+    },
+    {
+      "epoch": 0.4025,
+      "grad_norm": 0.11919954419136047,
+      "learning_rate": 0.00022409127998763463,
+      "loss": 0.5027,
+      "step": 805
+    },
+    {
+      "epoch": 0.405,
+      "grad_norm": 0.10851597785949707,
+      "learning_rate": 0.00022295030706352356,
+      "loss": 0.4481,
+      "step": 810
+    },
+    {
+      "epoch": 0.4075,
+      "grad_norm": 0.10030311346054077,
+      "learning_rate": 0.00022180377869109104,
+      "loss": 0.4709,
+      "step": 815
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.111280657351017,
+      "learning_rate": 0.00022065178218291147,
+      "loss": 0.4423,
+      "step": 820
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 0.11253602802753448,
+      "learning_rate": 0.00021949440526797926,
+      "loss": 0.4136,
+      "step": 825
+    },
+    {
+      "epoch": 0.415,
+      "grad_norm": 0.10805424302816391,
+      "learning_rate": 0.00021833173608502732,
+      "loss": 0.4656,
+      "step": 830
+    },
+    {
+      "epoch": 0.4175,
+      "grad_norm": 0.10983198881149292,
+      "learning_rate": 0.00021716386317581542,
+      "loss": 0.3687,
+      "step": 835
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.10653118044137955,
+      "learning_rate": 0.00021599087547838727,
+      "loss": 0.4654,
+      "step": 840
+    },
+    {
+      "epoch": 0.4225,
+      "grad_norm": 0.10856354981660843,
+      "learning_rate": 0.00021481286232029735,
+      "loss": 0.4298,
+      "step": 845
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.11233706772327423,
+      "learning_rate": 0.0002136299134118085,
+      "loss": 0.4484,
+      "step": 850
+    },
+    {
+      "epoch": 0.4275,
+      "grad_norm": 0.1085442528128624,
+      "learning_rate": 0.00021244211883906017,
+      "loss": 0.4776,
+      "step": 855
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.12297824025154114,
+      "learning_rate": 0.0002112495690572077,
+      "loss": 0.4029,
+      "step": 860
+    },
+    {
+      "epoch": 0.4325,
+      "grad_norm": 0.10838114470243454,
+      "learning_rate": 0.00021005235488353428,
+      "loss": 0.4848,
+      "step": 865
+    },
+    {
+      "epoch": 0.435,
+      "grad_norm": 0.10273341834545135,
+      "learning_rate": 0.0002088505674905342,
+      "loss": 0.3989,
+      "step": 870
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.11189126968383789,
+      "learning_rate": 0.0002076442983989705,
+      "loss": 0.438,
+      "step": 875
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.11592905968427658,
+      "learning_rate": 0.0002064336394709048,
+      "loss": 0.4786,
+      "step": 880
+    },
+    {
+      "epoch": 0.4425,
+      "grad_norm": 0.11230389773845673,
+      "learning_rate": 0.0002052186829027017,
+      "loss": 0.3999,
+      "step": 885
+    },
+    {
+      "epoch": 0.445,
+      "grad_norm": 0.12455113977193832,
+      "learning_rate": 0.00020399952121800767,
+      "loss": 0.4856,
+      "step": 890
+    },
+    {
+      "epoch": 0.4475,
+      "grad_norm": 0.1001812294125557,
+      "learning_rate": 0.00020277624726070526,
+      "loss": 0.4689,
+      "step": 895
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.11319112777709961,
+      "learning_rate": 0.00020154895418784242,
+      "loss": 0.3998,
+      "step": 900
+    },
+    {
+      "epoch": 0.4525,
+      "grad_norm": 0.11322236061096191,
+      "learning_rate": 0.00020031773546253824,
+      "loss": 0.4321,
+      "step": 905
+    },
+    {
+      "epoch": 0.455,
+      "grad_norm": 0.12924689054489136,
+      "learning_rate": 0.00019908268484686558,
+      "loss": 0.4208,
+      "step": 910
+    },
+    {
+      "epoch": 0.4575,
+      "grad_norm": 0.11435618251562119,
+      "learning_rate": 0.00019784389639471048,
+      "loss": 0.4682,
+      "step": 915
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.10801081359386444,
+      "learning_rate": 0.00019660146444460975,
+      "loss": 0.428,
+      "step": 920
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.10906939953565598,
+      "learning_rate": 0.0001953554836125667,
+      "loss": 0.4455,
+      "step": 925
+    },
+    {
+      "epoch": 0.465,
+      "grad_norm": 0.10790123790502548,
+      "learning_rate": 0.00019410604878484556,
+      "loss": 0.4544,
+      "step": 930
+    },
+    {
+      "epoch": 0.4675,
+      "grad_norm": 0.10536376386880875,
+      "learning_rate": 0.000192853255110746,
+      "loss": 0.376,
+      "step": 935
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.11744682490825653,
+      "learning_rate": 0.00019159719799535668,
+      "loss": 0.3887,
+      "step": 940
+    },
+    {
+      "epoch": 0.4725,
+      "grad_norm": 0.12954068183898926,
+      "learning_rate": 0.00019033797309228983,
+      "loss": 0.4075,
+      "step": 945
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.1401606798171997,
+      "learning_rate": 0.00018907567629639725,
+      "loss": 0.4454,
+      "step": 950
+    },
+    {
+      "epoch": 0.4775,
+      "grad_norm": 0.12059322744607925,
+      "learning_rate": 0.00018781040373646706,
+      "loss": 0.4339,
+      "step": 955
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.11798987537622452,
+      "learning_rate": 0.00018654225176790336,
+      "loss": 0.4405,
+      "step": 960
+    },
+    {
+      "epoch": 0.4825,
+      "grad_norm": 0.11344211548566818,
+      "learning_rate": 0.00018527131696538846,
+      "loss": 0.4124,
+      "step": 965
+    },
+    {
+      "epoch": 0.485,
+      "grad_norm": 0.10373330116271973,
+      "learning_rate": 0.00018399769611552824,
+      "loss": 0.4329,
+      "step": 970
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.12053704261779785,
+      "learning_rate": 0.0001827214862094814,
+      "loss": 0.4944,
+      "step": 975
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.141033336520195,
+      "learning_rate": 0.00018144278443557328,
+      "loss": 0.4569,
+      "step": 980
+    },
+    {
+      "epoch": 0.4925,
+      "grad_norm": 0.10922867804765701,
+      "learning_rate": 0.0001801616881718947,
+      "loss": 0.3879,
+      "step": 985
+    },
+    {
+      "epoch": 0.495,
+      "grad_norm": 0.09843657910823822,
+      "learning_rate": 0.00017887829497888612,
+      "loss": 0.4106,
+      "step": 990
+    },
+    {
+      "epoch": 0.4975,
+      "grad_norm": 0.12131062150001526,
+      "learning_rate": 0.000177592702591908,
+      "loss": 0.4023,
+      "step": 995
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.11343283206224442,
+      "learning_rate": 0.00017630500891379806,
+      "loss": 0.4824,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5025,
+      "grad_norm": 0.11050508171319962,
+      "learning_rate": 0.00017501531200741534,
+      "loss": 0.4098,
+      "step": 1005
+    },
+    {
+      "epoch": 0.505,
+      "grad_norm": 0.11737144738435745,
+      "learning_rate": 0.00017372371008817256,
+      "loss": 0.3943,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5075,
+      "grad_norm": 0.11473528295755386,
+      "learning_rate": 0.00017243030151655643,
+      "loss": 0.3796,
+      "step": 1015
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.13086555898189545,
+      "learning_rate": 0.00017113518479063738,
+      "loss": 0.4367,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 0.11752833425998688,
+      "learning_rate": 0.00016983845853856837,
+      "loss": 0.4097,
+      "step": 1025
+    },
+    {
+      "epoch": 0.515,
+      "grad_norm": 0.11596900969743729,
+      "learning_rate": 0.0001685402215110739,
+      "loss": 0.3812,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5175,
+      "grad_norm": 0.11850260943174362,
+      "learning_rate": 0.00016724057257392998,
+      "loss": 0.4354,
+      "step": 1035
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.12466365844011307,
+      "learning_rate": 0.00016593961070043498,
+      "loss": 0.4317,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5225,
+      "grad_norm": 0.11178991943597794,
+      "learning_rate": 0.0001646374349638724,
+      "loss": 0.3936,
+      "step": 1045
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 0.11252165585756302,
+      "learning_rate": 0.00016333414452996623,
+      "loss": 0.386,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5275,
+      "grad_norm": 0.12886975705623627,
+      "learning_rate": 0.0001620298386493288,
+      "loss": 0.3965,
+      "step": 1055
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.11716549098491669,
+      "learning_rate": 0.00016072461664990288,
+      "loss": 0.3924,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5325,
+      "grad_norm": 0.11604485660791397,
+      "learning_rate": 0.000159418577929397,
+      "loss": 0.3624,
+      "step": 1065
+    },
+    {
+      "epoch": 0.535,
+      "grad_norm": 0.11538460850715637,
+      "learning_rate": 0.00015811182194771633,
+      "loss": 0.4338,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 0.11618762463331223,
+      "learning_rate": 0.00015680444821938804,
+      "loss": 0.4058,
+      "step": 1075
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.11750835925340652,
+      "learning_rate": 0.00015549655630598343,
+      "loss": 0.4422,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5425,
+      "grad_norm": 0.12725204229354858,
+      "learning_rate": 0.00015418824580853535,
+      "loss": 0.4422,
+      "step": 1085
+    },
+    {
+      "epoch": 0.545,
+      "grad_norm": 0.11274927109479904,
+      "learning_rate": 0.00015287961635995347,
+      "loss": 0.4229,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5475,
+      "grad_norm": 0.11833129078149796,
+      "learning_rate": 0.00015157076761743686,
+      "loss": 0.4442,
+      "step": 1095
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.11384794861078262,
+      "learning_rate": 0.00015026179925488475,
+      "loss": 0.4528,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5525,
+      "grad_norm": 0.11864661425352097,
+      "learning_rate": 0.00014895281095530575,
+      "loss": 0.3988,
+      "step": 1105
+    },
+    {
+      "epoch": 0.555,
+      "grad_norm": 0.11673832684755325,
+      "learning_rate": 0.00014764390240322691,
+      "loss": 0.3544,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5575,
+      "grad_norm": 0.1174502745270729,
+      "learning_rate": 0.00014633517327710202,
+      "loss": 0.4034,
+      "step": 1115
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.12685547769069672,
+      "learning_rate": 0.00014502672324172107,
+      "loss": 0.3595,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.12368053942918777,
+      "learning_rate": 0.00014371865194062007,
+      "loss": 0.3395,
+      "step": 1125
+    },
+    {
+      "epoch": 0.565,
+      "grad_norm": 0.1077839657664299,
+      "learning_rate": 0.000142411058988493,
+      "loss": 0.4199,
+      "step": 1130
+    },
+    {
+      "epoch": 0.5675,
+      "grad_norm": 0.11699855327606201,
+      "learning_rate": 0.00014110404396360576,
+      "loss": 0.3443,
+      "step": 1135
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.13238464295864105,
+      "learning_rate": 0.0001397977064002128,
+      "loss": 0.3499,
+      "step": 1140
+    },
+    {
+      "epoch": 0.5725,
+      "grad_norm": 0.11482933163642883,
+      "learning_rate": 0.0001384921457809772,
+      "loss": 0.3619,
+      "step": 1145
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.13390353322029114,
+      "learning_rate": 0.00013718746152939487,
+      "loss": 0.3684,
+      "step": 1150
+    },
+    {
+      "epoch": 0.5775,
+      "grad_norm": 0.11464900523424149,
+      "learning_rate": 0.00013588375300222283,
+      "loss": 0.3313,
+      "step": 1155
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.10367871820926666,
+      "learning_rate": 0.00013458111948191296,
+      "loss": 0.3323,
+      "step": 1160
+    },
+    {
+      "epoch": 0.5825,
+      "grad_norm": 0.12259294092655182,
+      "learning_rate": 0.0001332796601690512,
+      "loss": 0.3986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.585,
+      "grad_norm": 0.10923358052968979,
+      "learning_rate": 0.00013197947417480292,
+      "loss": 0.3808,
+      "step": 1170
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.12479504942893982,
+      "learning_rate": 0.0001306806605133656,
+      "loss": 0.4429,
+      "step": 1175
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.11521733552217484,
+      "learning_rate": 0.000129383318094428,
+      "loss": 0.4778,
+      "step": 1180
+    },
+    {
+      "epoch": 0.5925,
+      "grad_norm": 0.14112086594104767,
+      "learning_rate": 0.00012808754571563827,
+      "loss": 0.4634,
+      "step": 1185
+    },
+    {
+      "epoch": 0.595,
+      "grad_norm": 0.12947902083396912,
+      "learning_rate": 0.00012679344205507981,
+      "loss": 0.4439,
+      "step": 1190
+    },
+    {
+      "epoch": 0.5975,
+      "grad_norm": 0.13288578391075134,
+      "learning_rate": 0.0001255011056637567,
+      "loss": 0.4402,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.1216069906949997,
+      "learning_rate": 0.00012421063495808853,
+      "loss": 0.4203,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6025,
+      "grad_norm": 0.11649637669324875,
+      "learning_rate": 0.000122922128212416,
+      "loss": 0.4512,
+      "step": 1205
+    },
+    {
+      "epoch": 0.605,
+      "grad_norm": 0.1201406940817833,
+      "learning_rate": 0.00012163568355151628,
+      "loss": 0.3725,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6075,
+      "grad_norm": 0.12117727100849152,
+      "learning_rate": 0.00012035139894313107,
+      "loss": 0.4352,
+      "step": 1215
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.11709322035312653,
+      "learning_rate": 0.00011906937219050556,
+      "loss": 0.4189,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 0.11865726858377457,
+      "learning_rate": 0.0001177897009249405,
+      "loss": 0.3796,
+      "step": 1225
+    },
+    {
+      "epoch": 0.615,
+      "grad_norm": 0.10807759314775467,
+      "learning_rate": 0.0001165124825983573,
+      "loss": 0.4465,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6175,
+      "grad_norm": 0.13788209855556488,
+      "learning_rate": 0.00011523781447587641,
+      "loss": 0.4994,
+      "step": 1235
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.12921364605426788,
+      "learning_rate": 0.00011396579362841044,
+      "loss": 0.4251,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6225,
+      "grad_norm": 0.12162365019321442,
+      "learning_rate": 0.0001126965169252718,
+      "loss": 0.3864,
+      "step": 1245
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.12897826731204987,
+      "learning_rate": 0.00011143008102679559,
+      "loss": 0.3753,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6275,
+      "grad_norm": 0.116109699010849,
+      "learning_rate": 0.00011016658237697866,
+      "loss": 0.3296,
+      "step": 1255
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.12935414910316467,
+      "learning_rate": 0.00010890611719613512,
+      "loss": 0.3797,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6325,
+      "grad_norm": 0.13730891048908234,
+      "learning_rate": 0.0001076487814735685,
+      "loss": 0.3711,
+      "step": 1265
+    },
+    {
+      "epoch": 0.635,
+      "grad_norm": 0.13870631158351898,
+      "learning_rate": 0.00010639467096026211,
+      "loss": 0.4328,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 0.11644043773412704,
+      "learning_rate": 0.00010514388116158701,
+      "loss": 0.3283,
+      "step": 1275
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.12221091985702515,
+      "learning_rate": 0.00010389650733002894,
+      "loss": 0.3898,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6425,
+      "grad_norm": 0.12048634141683578,
+      "learning_rate": 0.00010265264445793464,
+      "loss": 0.3256,
+      "step": 1285
+    },
+    {
+      "epoch": 0.645,
+      "grad_norm": 0.1250566840171814,
+      "learning_rate": 0.00010141238727027761,
+      "loss": 0.408,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6475,
+      "grad_norm": 0.13518592715263367,
+      "learning_rate": 0.00010017583021744454,
+      "loss": 0.3763,
+      "step": 1295
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.13047736883163452,
+      "learning_rate": 9.89430674680425e-05,
+      "loss": 0.3989,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6525,
+      "grad_norm": 0.11474955826997757,
+      "learning_rate": 9.771419290172773e-05,
+      "loss": 0.3374,
+      "step": 1305
+    },
+    {
+      "epoch": 0.655,
+      "grad_norm": 0.11670063436031342,
+      "learning_rate": 9.648930010205619e-05,
+      "loss": 0.3343,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6575,
+      "grad_norm": 0.15385080873966217,
+      "learning_rate": 9.526848234935704e-05,
+      "loss": 0.3432,
+      "step": 1315
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13441519439220428,
+      "learning_rate": 9.405183261362863e-05,
+      "loss": 0.3116,
+      "step": 1320
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.14772167801856995,
+      "learning_rate": 9.283944354745888e-05,
+      "loss": 0.3613,
+      "step": 1325
+    },
+    {
+      "epoch": 0.665,
+      "grad_norm": 0.12146154791116714,
+      "learning_rate": 9.163140747896907e-05,
+      "loss": 0.3411,
+      "step": 1330
+    },
+    {
+      "epoch": 0.6675,
+      "grad_norm": 0.1333102583885193,
+      "learning_rate": 9.042781640478291e-05,
+      "loss": 0.396,
+      "step": 1335
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.12051521986722946,
+      "learning_rate": 8.922876198302062e-05,
+      "loss": 0.3837,
+      "step": 1340
+    },
+    {
+      "epoch": 0.6725,
+      "grad_norm": 0.12071400880813599,
+      "learning_rate": 8.803433552631874e-05,
+      "loss": 0.354,
+      "step": 1345
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.11258620023727417,
+      "learning_rate": 8.684462799487635e-05,
+      "loss": 0.3197,
+      "step": 1350
+    },
+    {
+      "epoch": 0.6775,
+      "grad_norm": 0.11908067762851715,
+      "learning_rate": 8.565972998952814e-05,
+      "loss": 0.377,
+      "step": 1355
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.1252991259098053,
+      "learning_rate": 8.447973174484469e-05,
+      "loss": 0.3438,
+      "step": 1360
+    },
+    {
+      "epoch": 0.6825,
+      "grad_norm": 0.12832245230674744,
+      "learning_rate": 8.330472312226091e-05,
+      "loss": 0.346,
+      "step": 1365
+    },
+    {
+      "epoch": 0.685,
+      "grad_norm": 0.1396942287683487,
+      "learning_rate": 8.213479360323258e-05,
+      "loss": 0.3886,
+      "step": 1370
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.12938210368156433,
+      "learning_rate": 8.097003228242225e-05,
+      "loss": 0.3699,
+      "step": 1375
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.12459377944469452,
+      "learning_rate": 7.9810527860914e-05,
+      "loss": 0.3892,
+      "step": 1380
+    },
+    {
+      "epoch": 0.6925,
+      "grad_norm": 0.1360333263874054,
+      "learning_rate": 7.86563686394587e-05,
+      "loss": 0.3423,
+      "step": 1385
+    },
+    {
+      "epoch": 0.695,
+      "grad_norm": 0.1357765644788742,
+      "learning_rate": 7.750764251174963e-05,
+      "loss": 0.408,
+      "step": 1390
+    },
+    {
+      "epoch": 0.6975,
+      "grad_norm": 0.14453718066215515,
+      "learning_rate": 7.636443695772887e-05,
+      "loss": 0.3398,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.11541519314050674,
+      "learning_rate": 7.522683903692547e-05,
+      "loss": 0.4203,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7025,
+      "grad_norm": 0.13344840705394745,
+      "learning_rate": 7.409493538182545e-05,
+      "loss": 0.3694,
+      "step": 1405
+    },
+    {
+      "epoch": 0.705,
+      "grad_norm": 0.13069866597652435,
+      "learning_rate": 7.296881219127452e-05,
+      "loss": 0.3889,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7075,
+      "grad_norm": 0.12457838654518127,
+      "learning_rate": 7.184855522391359e-05,
+      "loss": 0.3342,
+      "step": 1415
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.11990659683942795,
+      "learning_rate": 7.073424979164794e-05,
+      "loss": 0.3855,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 0.1389523446559906,
+      "learning_rate": 6.962598075315046e-05,
+      "loss": 0.3943,
+      "step": 1425
+    },
+    {
+      "epoch": 0.715,
+      "grad_norm": 0.14108599722385406,
+      "learning_rate": 6.852383250739938e-05,
+      "loss": 0.388,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7175,
+      "grad_norm": 0.1342005580663681,
+      "learning_rate": 6.742788898725065e-05,
+      "loss": 0.3602,
+      "step": 1435
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.13516324758529663,
+      "learning_rate": 6.633823365304648e-05,
+      "loss": 0.3935,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7225,
+      "grad_norm": 0.1302197426557541,
+      "learning_rate": 6.52549494862593e-05,
+      "loss": 0.3618,
+      "step": 1445
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 0.12428996711969376,
+      "learning_rate": 6.417811898317259e-05,
+      "loss": 0.3338,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7275,
+      "grad_norm": 0.11249776184558868,
+      "learning_rate": 6.31078241485982e-05,
+      "loss": 0.3819,
+      "step": 1455
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.1359994113445282,
+      "learning_rate": 6.204414648963159e-05,
+      "loss": 0.3356,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7325,
+      "grad_norm": 0.1118568629026413,
+      "learning_rate": 6.098716700944479e-05,
+      "loss": 0.3223,
+      "step": 1465
+    },
+    {
+      "epoch": 0.735,
+      "grad_norm": 0.12038140743970871,
+      "learning_rate": 5.993696620111741e-05,
+      "loss": 0.3481,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 0.12787550687789917,
+      "learning_rate": 5.889362404150703e-05,
+      "loss": 0.3766,
+      "step": 1475
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.12134893983602524,
+      "learning_rate": 5.7857219985158506e-05,
+      "loss": 0.2916,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7425,
+      "grad_norm": 0.1274223029613495,
+      "learning_rate": 5.682783295825345e-05,
+      "loss": 0.3095,
+      "step": 1485
+    },
+    {
+      "epoch": 0.745,
+      "grad_norm": 0.11817299574613571,
+      "learning_rate": 5.580554135259932e-05,
+      "loss": 0.3422,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7475,
+      "grad_norm": 0.1348387748003006,
+      "learning_rate": 5.479042301965987e-05,
+      "loss": 0.4044,
+      "step": 1495
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.14032681286334991,
+      "learning_rate": 5.378255526462631e-05,
+      "loss": 0.337,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.972183618289664e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

codellama-hugcoder/checkpoint-1500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304

codellama-hugcoder/checkpoint-2000/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: codellama/CodeLlama-7b-Instruct-hf
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.2.dev0

codellama-hugcoder/checkpoint-2000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

codellama-hugcoder/checkpoint-2000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:456cbd6da326b2c6f27a85ab19d40e13bf3fb60689cbe5ec56653d42193963f8
+size 319876032

codellama-hugcoder/checkpoint-2000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:688ec5889a6aa6b6675276da1e991b1ffaf231ca0b9db550ca1055ee967ab484
+size 640009682

codellama-hugcoder/checkpoint-2000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d88eee16810615d69e99ef0af6ae2767f80f0c756dab6f8b6315f916e0a2772d
+size 14180

codellama-hugcoder/checkpoint-2000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0af176d761d71fce3fbce7001f4850782b022af8f40338e8e88b22363a32018f
+size 1064

codellama-hugcoder/checkpoint-2000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2834 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 100.0,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.09379793703556061,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.1399833709001541,
+      "learning_rate": 1.3499999999999998e-05,
+      "loss": 0.6954,
+      "step": 10
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.08632303029298782,
+      "learning_rate": 2.1e-05,
+      "loss": 0.6921,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.10006701201200485,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 0.69,
+      "step": 20
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.07633858919143677,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.6722,
+      "step": 25
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.09399061650037766,
+      "learning_rate": 4.3499999999999993e-05,
+      "loss": 0.6453,
+      "step": 30
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.0843738541007042,
+      "learning_rate": 5.1e-05,
+      "loss": 0.6276,
+      "step": 35
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.08583351224660873,
+      "learning_rate": 5.85e-05,
+      "loss": 0.58,
+      "step": 40
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.09571370482444763,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.6355,
+      "step": 45
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.1083935871720314,
+      "learning_rate": 7.35e-05,
+      "loss": 0.589,
+      "step": 50
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.10387319326400757,
+      "learning_rate": 8.1e-05,
+      "loss": 0.6061,
+      "step": 55
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.11083361506462097,
+      "learning_rate": 8.849999999999998e-05,
+      "loss": 0.572,
+      "step": 60
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.12665686011314392,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.5442,
+      "step": 65
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.1308053582906723,
+      "learning_rate": 0.00010349999999999998,
+      "loss": 0.6524,
+      "step": 70
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.13535510003566742,
+      "learning_rate": 0.00011099999999999999,
+      "loss": 0.6404,
+      "step": 75
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12833671271800995,
+      "learning_rate": 0.0001185,
+      "loss": 0.5717,
+      "step": 80
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.11962099373340607,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.6098,
+      "step": 85
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.13898271322250366,
+      "learning_rate": 0.0001335,
+      "loss": 0.6099,
+      "step": 90
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.14486610889434814,
+      "learning_rate": 0.00014099999999999998,
+      "loss": 0.5744,
+      "step": 95
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1432138830423355,
+      "learning_rate": 0.00014849999999999998,
+      "loss": 0.5659,
+      "step": 100
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 0.13487878441810608,
+      "learning_rate": 0.000156,
+      "loss": 0.5622,
+      "step": 105
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.12495309859514236,
+      "learning_rate": 0.0001635,
+      "loss": 0.5951,
+      "step": 110
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.13011734187602997,
+      "learning_rate": 0.00017099999999999998,
+      "loss": 0.6249,
+      "step": 115
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.13987745344638824,
+      "learning_rate": 0.00017849999999999997,
+      "loss": 0.559,
+      "step": 120
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.13373605906963348,
+      "learning_rate": 0.000186,
+      "loss": 0.5475,
+      "step": 125
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.12433867901563644,
+      "learning_rate": 0.0001935,
+      "loss": 0.5274,
+      "step": 130
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.11097615957260132,
+      "learning_rate": 0.000201,
+      "loss": 0.678,
+      "step": 135
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1155027225613594,
+      "learning_rate": 0.00020849999999999997,
+      "loss": 0.5611,
+      "step": 140
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.11431068181991577,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.6054,
+      "step": 145
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.09796140342950821,
+      "learning_rate": 0.00022349999999999998,
+      "loss": 0.5472,
+      "step": 150
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.09489257633686066,
+      "learning_rate": 0.00023099999999999998,
+      "loss": 0.4636,
+      "step": 155
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.10787788033485413,
+      "learning_rate": 0.0002385,
+      "loss": 0.6164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.10261733084917068,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.5408,
+      "step": 165
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.11870352178812027,
+      "learning_rate": 0.0002535,
+      "loss": 0.5268,
+      "step": 170
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.11910569667816162,
+      "learning_rate": 0.000261,
+      "loss": 0.5461,
+      "step": 175
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.10083702206611633,
+      "learning_rate": 0.00026849999999999997,
+      "loss": 0.4794,
+      "step": 180
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.10453511029481888,
+      "learning_rate": 0.000276,
+      "loss": 0.5539,
+      "step": 185
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.101403146982193,
+      "learning_rate": 0.00028349999999999995,
+      "loss": 0.5346,
+      "step": 190
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.10724789649248123,
+      "learning_rate": 0.00029099999999999997,
+      "loss": 0.6026,
+      "step": 195
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1140277311205864,
+      "learning_rate": 0.0002985,
+      "loss": 0.5193,
+      "step": 200
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.09706108272075653,
+      "learning_rate": 0.0002999963446058092,
+      "loss": 0.54,
+      "step": 205
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.10003062337636948,
+      "learning_rate": 0.0002999814948722491,
+      "loss": 0.5365,
+      "step": 210
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.1078687533736229,
+      "learning_rate": 0.00029995522346717746,
+      "loss": 0.5889,
+      "step": 215
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.10538115352392197,
+      "learning_rate": 0.0002999175323912636,
+      "loss": 0.5611,
+      "step": 220
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.1020808294415474,
+      "learning_rate": 0.00029986842451482874,
+      "loss": 0.6103,
+      "step": 225
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.09635835886001587,
+      "learning_rate": 0.0002998079035776279,
+      "loss": 0.5229,
+      "step": 230
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.10287190228700638,
+      "learning_rate": 0.0002997359741885648,
+      "loss": 0.5312,
+      "step": 235
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.09160075336694717,
+      "learning_rate": 0.0002996526418253408,
+      "loss": 0.5673,
+      "step": 240
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.08691006153821945,
+      "learning_rate": 0.000299557912834038,
+      "loss": 0.5326,
+      "step": 245
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.10096988826990128,
+      "learning_rate": 0.00029945179442863594,
+      "loss": 0.6004,
+      "step": 250
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.09594204276800156,
+      "learning_rate": 0.000299334294690462,
+      "loss": 0.5516,
+      "step": 255
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.10281919687986374,
+      "learning_rate": 0.00029920542256757607,
+      "loss": 0.5515,
+      "step": 260
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.08547840267419815,
+      "learning_rate": 0.00029906518787408944,
+      "loss": 0.5243,
+      "step": 265
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.10161560773849487,
+      "learning_rate": 0.0002989136012894168,
+      "loss": 0.5096,
+      "step": 270
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.09101904183626175,
+      "learning_rate": 0.0002987506743574635,
+      "loss": 0.553,
+      "step": 275
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.09769442677497864,
+      "learning_rate": 0.0002985764194857463,
+      "loss": 0.4953,
+      "step": 280
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.10991579294204712,
+      "learning_rate": 0.00029839084994444826,
+      "loss": 0.5152,
+      "step": 285
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.09450916200876236,
+      "learning_rate": 0.00029819397986540836,
+      "loss": 0.5397,
+      "step": 290
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.10876069217920303,
+      "learning_rate": 0.0002979858242410454,
+      "loss": 0.4858,
+      "step": 295
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.097995825111866,
+      "learning_rate": 0.00029776639892321606,
+      "loss": 0.5566,
+      "step": 300
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.1145048514008522,
+      "learning_rate": 0.0002975357206220079,
+      "loss": 0.4531,
+      "step": 305
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.10271880775690079,
+      "learning_rate": 0.00029729380690446654,
+      "loss": 0.5199,
+      "step": 310
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 0.11095371842384338,
+      "learning_rate": 0.0002970406761932583,
+      "loss": 0.5416,
+      "step": 315
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09949438273906708,
+      "learning_rate": 0.00029677634776526673,
+      "loss": 0.4841,
+      "step": 320
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.1163724958896637,
+      "learning_rate": 0.00029650084175012517,
+      "loss": 0.4913,
+      "step": 325
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.10726840049028397,
+      "learning_rate": 0.00029621417912868323,
+      "loss": 0.5203,
+      "step": 330
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.09609931707382202,
+      "learning_rate": 0.00029591638173140947,
+      "loss": 0.5607,
+      "step": 335
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.10824442654848099,
+      "learning_rate": 0.0002956074722367286,
+      "loss": 0.6004,
+      "step": 340
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.10465679317712784,
+      "learning_rate": 0.00029528747416929463,
+      "loss": 0.5216,
+      "step": 345
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.10518354922533035,
+      "learning_rate": 0.0002949564118981994,
+      "loss": 0.499,
+      "step": 350
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 0.0955279991030693,
+      "learning_rate": 0.0002946143106351165,
+      "loss": 0.5607,
+      "step": 355
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.11159654706716537,
+      "learning_rate": 0.0002942611964323817,
+      "loss": 0.5204,
+      "step": 360
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 0.09571187198162079,
+      "learning_rate": 0.0002938970961810086,
+      "loss": 0.6113,
+      "step": 365
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.11854679882526398,
+      "learning_rate": 0.0002935220376086411,
+      "loss": 0.5639,
+      "step": 370
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.1050512045621872,
+      "learning_rate": 0.0002931360492774415,
+      "loss": 0.548,
+      "step": 375
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1053968220949173,
+      "learning_rate": 0.0002927391605819157,
+      "loss": 0.5507,
+      "step": 380
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.10567320138216019,
+      "learning_rate": 0.00029233140174667445,
+      "loss": 0.5312,
+      "step": 385
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.11914283782243729,
+      "learning_rate": 0.0002919128038241318,
+      "loss": 0.5961,
+      "step": 390
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.09915795922279358,
+      "learning_rate": 0.0002914833986921401,
+      "loss": 0.5086,
+      "step": 395
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10796502232551575,
+      "learning_rate": 0.0002910432190515628,
+      "loss": 0.5585,
+      "step": 400
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.10748997330665588,
+      "learning_rate": 0.00029059229842378373,
+      "loss": 0.5466,
+      "step": 405
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.10696308314800262,
+      "learning_rate": 0.0002901306711481544,
+      "loss": 0.5513,
+      "step": 410
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.10418657958507538,
+      "learning_rate": 0.0002896583723793792,
+      "loss": 0.5391,
+      "step": 415
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16421550512313843,
+      "learning_rate": 0.00028917543808483796,
+      "loss": 0.4699,
+      "step": 420
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.12929962575435638,
+      "learning_rate": 0.00028868190504184696,
+      "loss": 0.4984,
+      "step": 425
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 0.10469454526901245,
+      "learning_rate": 0.00028817781083485816,
+      "loss": 0.5119,
+      "step": 430
+    },
+    {
+      "epoch": 0.2175,
+      "grad_norm": 0.0964970663189888,
+      "learning_rate": 0.00028766319385259713,
+      "loss": 0.5167,
+      "step": 435
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12395574152469635,
+      "learning_rate": 0.00028713809328513953,
+      "loss": 0.5692,
+      "step": 440
+    },
+    {
+      "epoch": 0.2225,
+      "grad_norm": 0.10189738124608994,
+      "learning_rate": 0.0002866025491209265,
+      "loss": 0.4628,
+      "step": 445
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.10433454066514969,
+      "learning_rate": 0.0002860566021437197,
+      "loss": 0.4869,
+      "step": 450
+    },
+    {
+      "epoch": 0.2275,
+      "grad_norm": 0.13003456592559814,
+      "learning_rate": 0.0002855002939294951,
+      "loss": 0.5291,
+      "step": 455
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.11692202836275101,
+      "learning_rate": 0.000284933666843277,
+      "loss": 0.5229,
+      "step": 460
+    },
+    {
+      "epoch": 0.2325,
+      "grad_norm": 0.10757846385240555,
+      "learning_rate": 0.0002843567640359119,
+      "loss": 0.435,
+      "step": 465
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 0.10775501281023026,
+      "learning_rate": 0.00028376962944078206,
+      "loss": 0.4418,
+      "step": 470
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.11543692648410797,
+      "learning_rate": 0.00028317230777046015,
+      "loss": 0.4204,
+      "step": 475
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10946698486804962,
+      "learning_rate": 0.00028256484451330403,
+      "loss": 0.49,
+      "step": 480
+    },
+    {
+      "epoch": 0.2425,
+      "grad_norm": 0.11528221517801285,
+      "learning_rate": 0.00028194728592999247,
+      "loss": 0.4752,
+      "step": 485
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 0.10474205762147903,
+      "learning_rate": 0.0002813196790500027,
+      "loss": 0.4847,
+      "step": 490
+    },
+    {
+      "epoch": 0.2475,
+      "grad_norm": 0.10768820345401764,
+      "learning_rate": 0.00028068207166802837,
+      "loss": 0.4664,
+      "step": 495
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.12158560007810593,
+      "learning_rate": 0.00028003451234034037,
+      "loss": 0.4741,
+      "step": 500
+    },
+    {
+      "epoch": 0.2525,
+      "grad_norm": 0.11635497957468033,
+      "learning_rate": 0.0002793770503810886,
+      "loss": 0.4969,
+      "step": 505
+    },
+    {
+      "epoch": 0.255,
+      "grad_norm": 0.12205849587917328,
+      "learning_rate": 0.00027870973585854665,
+      "loss": 0.4798,
+      "step": 510
+    },
+    {
+      "epoch": 0.2575,
+      "grad_norm": 0.10270871222019196,
+      "learning_rate": 0.00027803261959129905,
+      "loss": 0.3888,
+      "step": 515
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.11313367635011673,
+      "learning_rate": 0.0002773457531443712,
+      "loss": 0.4759,
+      "step": 520
+    },
+    {
+      "epoch": 0.2625,
+      "grad_norm": 0.12905193865299225,
+      "learning_rate": 0.00027664918882530225,
+      "loss": 0.4442,
+      "step": 525
+    },
+    {
+      "epoch": 0.265,
+      "grad_norm": 0.11690939962863922,
+      "learning_rate": 0.00027594297968016197,
+      "loss": 0.5535,
+      "step": 530
+    },
+    {
+      "epoch": 0.2675,
+      "grad_norm": 0.10021405667066574,
+      "learning_rate": 0.00027522717948951094,
+      "loss": 0.4717,
+      "step": 535
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.10104178637266159,
+      "learning_rate": 0.0002745018427643051,
+      "loss": 0.4906,
+      "step": 540
+    },
+    {
+      "epoch": 0.2725,
+      "grad_norm": 0.12113891541957855,
+      "learning_rate": 0.00027376702474174425,
+      "loss": 0.5674,
+      "step": 545
+    },
+    {
+      "epoch": 0.275,
+      "grad_norm": 0.11330476403236389,
+      "learning_rate": 0.0002730227813810658,
+      "loss": 0.5184,
+      "step": 550
+    },
+    {
+      "epoch": 0.2775,
+      "grad_norm": 0.1025850847363472,
+      "learning_rate": 0.0002722691693592831,
+      "loss": 0.4395,
+      "step": 555
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.11591499298810959,
+      "learning_rate": 0.0002715062460668694,
+      "loss": 0.5003,
+      "step": 560
+    },
+    {
+      "epoch": 0.2825,
+      "grad_norm": 0.11281153559684753,
+      "learning_rate": 0.0002707340696033871,
+      "loss": 0.4672,
+      "step": 565
+    },
+    {
+      "epoch": 0.285,
+      "grad_norm": 0.1123538464307785,
+      "learning_rate": 0.00026995269877306356,
+      "loss": 0.513,
+      "step": 570
+    },
+    {
+      "epoch": 0.2875,
+      "grad_norm": 0.10776390135288239,
+      "learning_rate": 0.0002691621930803127,
+      "loss": 0.4572,
+      "step": 575
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.10008667409420013,
+      "learning_rate": 0.0002683626127252036,
+      "loss": 0.4618,
+      "step": 580
+    },
+    {
+      "epoch": 0.2925,
+      "grad_norm": 0.13961340487003326,
+      "learning_rate": 0.00026755401859887595,
+      "loss": 0.4819,
+      "step": 585
+    },
+    {
+      "epoch": 0.295,
+      "grad_norm": 0.1476685106754303,
+      "learning_rate": 0.00026673647227890316,
+      "loss": 0.4964,
+      "step": 590
+    },
+    {
+      "epoch": 0.2975,
+      "grad_norm": 0.09795507788658142,
+      "learning_rate": 0.00026591003602460263,
+      "loss": 0.4796,
+      "step": 595
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.10903532058000565,
+      "learning_rate": 0.00026507477277229496,
+      "loss": 0.4775,
+      "step": 600
+    },
+    {
+      "epoch": 0.3025,
+      "grad_norm": 0.10258448123931885,
+      "learning_rate": 0.0002642307461305105,
+      "loss": 0.4519,
+      "step": 605
+    },
+    {
+      "epoch": 0.305,
+      "grad_norm": 0.11204435676336288,
+      "learning_rate": 0.0002633780203751459,
+      "loss": 0.4451,
+      "step": 610
+    },
+    {
+      "epoch": 0.3075,
+      "grad_norm": 0.10147629678249359,
+      "learning_rate": 0.0002625166604445689,
+      "loss": 0.4256,
+      "step": 615
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.10481107234954834,
+      "learning_rate": 0.00026164673193467306,
+      "loss": 0.4381,
+      "step": 620
+    },
+    {
+      "epoch": 0.3125,
+      "grad_norm": 0.10856641829013824,
+      "learning_rate": 0.00026076830109388255,
+      "loss": 0.4958,
+      "step": 625
+    },
+    {
+      "epoch": 0.315,
+      "grad_norm": 0.09918677806854248,
+      "learning_rate": 0.0002598814348181068,
+      "loss": 0.4335,
+      "step": 630
+    },
+    {
+      "epoch": 0.3175,
+      "grad_norm": 0.10417389869689941,
+      "learning_rate": 0.00025898620064564637,
+      "loss": 0.4603,
+      "step": 635
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.0903329998254776,
+      "learning_rate": 0.00025808266675204954,
+      "loss": 0.3932,
+      "step": 640
+    },
+    {
+      "epoch": 0.3225,
+      "grad_norm": 0.11511855572462082,
+      "learning_rate": 0.0002571709019449205,
+      "loss": 0.4169,
+      "step": 645
+    },
+    {
+      "epoch": 0.325,
+      "grad_norm": 0.11355557292699814,
+      "learning_rate": 0.0002562509756586793,
+      "loss": 0.4455,
+      "step": 650
+    },
+    {
+      "epoch": 0.3275,
+      "grad_norm": 0.1271187961101532,
+      "learning_rate": 0.00025532295794927437,
+      "loss": 0.4902,
+      "step": 655
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.11936645954847336,
+      "learning_rate": 0.0002543869194888471,
+      "loss": 0.4843,
+      "step": 660
+    },
+    {
+      "epoch": 0.3325,
+      "grad_norm": 0.11935465037822723,
+      "learning_rate": 0.00025344293156035044,
+      "loss": 0.4402,
+      "step": 665
+    },
+    {
+      "epoch": 0.335,
+      "grad_norm": 0.13073407113552094,
+      "learning_rate": 0.00025249106605211986,
+      "loss": 0.467,
+      "step": 670
+    },
+    {
+      "epoch": 0.3375,
+      "grad_norm": 0.10340435802936554,
+      "learning_rate": 0.0002515313954523991,
+      "loss": 0.4827,
+      "step": 675
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.11634550243616104,
+      "learning_rate": 0.00025056399284381983,
+      "loss": 0.466,
+      "step": 680
+    },
+    {
+      "epoch": 0.3425,
+      "grad_norm": 0.10582319647073746,
+      "learning_rate": 0.0002495889318978362,
+      "loss": 0.4751,
+      "step": 685
+    },
+    {
+      "epoch": 0.345,
+      "grad_norm": 0.16781780123710632,
+      "learning_rate": 0.00024860628686911436,
+      "loss": 0.4717,
+      "step": 690
+    },
+    {
+      "epoch": 0.3475,
+      "grad_norm": 0.11522196233272552,
+      "learning_rate": 0.0002476161325898776,
+      "loss": 0.4687,
+      "step": 695
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.11830449104309082,
+      "learning_rate": 0.000246618544464208,
+      "loss": 0.436,
+      "step": 700
+    },
+    {
+      "epoch": 0.3525,
+      "grad_norm": 0.17485427856445312,
+      "learning_rate": 0.0002456135984623034,
+      "loss": 0.4284,
+      "step": 705
+    },
+    {
+      "epoch": 0.355,
+      "grad_norm": 0.12288108468055725,
+      "learning_rate": 0.00024460137111469296,
+      "loss": 0.4261,
+      "step": 710
+    },
+    {
+      "epoch": 0.3575,
+      "grad_norm": 0.11587081104516983,
+      "learning_rate": 0.0002435819395064079,
+      "loss": 0.4493,
+      "step": 715
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.10690271109342575,
+      "learning_rate": 0.0002425553812711123,
+      "loss": 0.4648,
+      "step": 720
+    },
+    {
+      "epoch": 0.3625,
+      "grad_norm": 0.10404397547245026,
+      "learning_rate": 0.00024152177458519014,
+      "loss": 0.4634,
+      "step": 725
+    },
+    {
+      "epoch": 0.365,
+      "grad_norm": 0.11986954510211945,
+      "learning_rate": 0.00024048119816179236,
+      "loss": 0.4525,
+      "step": 730
+    },
+    {
+      "epoch": 0.3675,
+      "grad_norm": 0.10243026167154312,
+      "learning_rate": 0.00023943373124484234,
+      "loss": 0.4572,
+      "step": 735
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.10386748611927032,
+      "learning_rate": 0.00023837945360300129,
+      "loss": 0.3884,
+      "step": 740
+    },
+    {
+      "epoch": 0.3725,
+      "grad_norm": 0.11165735125541687,
+      "learning_rate": 0.0002373184455235934,
+      "loss": 0.4902,
+      "step": 745
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.09951601922512054,
+      "learning_rate": 0.00023625078780649178,
+      "loss": 0.4541,
+      "step": 750
+    },
+    {
+      "epoch": 0.3775,
+      "grad_norm": 0.10347504913806915,
+      "learning_rate": 0.00023517656175796518,
+      "loss": 0.3871,
+      "step": 755
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.10478132963180542,
+      "learning_rate": 0.00023409584918448627,
+      "loss": 0.4329,
+      "step": 760
+    },
+    {
+      "epoch": 0.3825,
+      "grad_norm": 0.1198212131857872,
+      "learning_rate": 0.00023300873238650159,
+      "loss": 0.425,
+      "step": 765
+    },
+    {
+      "epoch": 0.385,
+      "grad_norm": 0.1103711724281311,
+      "learning_rate": 0.00023191529415216434,
+      "loss": 0.4274,
+      "step": 770
+    },
+    {
+      "epoch": 0.3875,
+      "grad_norm": 0.09940385073423386,
+      "learning_rate": 0.00023081561775102944,
+      "loss": 0.4368,
+      "step": 775
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.11599268019199371,
+      "learning_rate": 0.00022970978692771242,
+      "loss": 0.4386,
+      "step": 780
+    },
+    {
+      "epoch": 0.3925,
+      "grad_norm": 0.10101296752691269,
+      "learning_rate": 0.00022859788589551188,
+      "loss": 0.4696,
+      "step": 785
+    },
+    {
+      "epoch": 0.395,
+      "grad_norm": 0.10112808644771576,
+      "learning_rate": 0.00022747999932999624,
+      "loss": 0.4066,
+      "step": 790
+    },
+    {
+      "epoch": 0.3975,
+      "grad_norm": 0.09595459699630737,
+      "learning_rate": 0.00022635621236255567,
+      "loss": 0.4837,
+      "step": 795
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.10761380940675735,
+      "learning_rate": 0.00022522661057391857,
+      "loss": 0.5446,
+      "step": 800
+    },
+    {
+      "epoch": 0.4025,
+      "grad_norm": 0.11919954419136047,
+      "learning_rate": 0.00022409127998763463,
+      "loss": 0.5027,
+      "step": 805
+    },
+    {
+      "epoch": 0.405,
+      "grad_norm": 0.10851597785949707,
+      "learning_rate": 0.00022295030706352356,
+      "loss": 0.4481,
+      "step": 810
+    },
+    {
+      "epoch": 0.4075,
+      "grad_norm": 0.10030311346054077,
+      "learning_rate": 0.00022180377869109104,
+      "loss": 0.4709,
+      "step": 815
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.111280657351017,
+      "learning_rate": 0.00022065178218291147,
+      "loss": 0.4423,
+      "step": 820
+    },
+    {
+      "epoch": 0.4125,
+      "grad_norm": 0.11253602802753448,
+      "learning_rate": 0.00021949440526797926,
+      "loss": 0.4136,
+      "step": 825
+    },
+    {
+      "epoch": 0.415,
+      "grad_norm": 0.10805424302816391,
+      "learning_rate": 0.00021833173608502732,
+      "loss": 0.4656,
+      "step": 830
+    },
+    {
+      "epoch": 0.4175,
+      "grad_norm": 0.10983198881149292,
+      "learning_rate": 0.00021716386317581542,
+      "loss": 0.3687,
+      "step": 835
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.10653118044137955,
+      "learning_rate": 0.00021599087547838727,
+      "loss": 0.4654,
+      "step": 840
+    },
+    {
+      "epoch": 0.4225,
+      "grad_norm": 0.10856354981660843,
+      "learning_rate": 0.00021481286232029735,
+      "loss": 0.4298,
+      "step": 845
+    },
+    {
+      "epoch": 0.425,
+      "grad_norm": 0.11233706772327423,
+      "learning_rate": 0.0002136299134118085,
+      "loss": 0.4484,
+      "step": 850
+    },
+    {
+      "epoch": 0.4275,
+      "grad_norm": 0.1085442528128624,
+      "learning_rate": 0.00021244211883906017,
+      "loss": 0.4776,
+      "step": 855
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.12297824025154114,
+      "learning_rate": 0.0002112495690572077,
+      "loss": 0.4029,
+      "step": 860
+    },
+    {
+      "epoch": 0.4325,
+      "grad_norm": 0.10838114470243454,
+      "learning_rate": 0.00021005235488353428,
+      "loss": 0.4848,
+      "step": 865
+    },
+    {
+      "epoch": 0.435,
+      "grad_norm": 0.10273341834545135,
+      "learning_rate": 0.0002088505674905342,
+      "loss": 0.3989,
+      "step": 870
+    },
+    {
+      "epoch": 0.4375,
+      "grad_norm": 0.11189126968383789,
+      "learning_rate": 0.0002076442983989705,
+      "loss": 0.438,
+      "step": 875
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.11592905968427658,
+      "learning_rate": 0.0002064336394709048,
+      "loss": 0.4786,
+      "step": 880
+    },
+    {
+      "epoch": 0.4425,
+      "grad_norm": 0.11230389773845673,
+      "learning_rate": 0.0002052186829027017,
+      "loss": 0.3999,
+      "step": 885
+    },
+    {
+      "epoch": 0.445,
+      "grad_norm": 0.12455113977193832,
+      "learning_rate": 0.00020399952121800767,
+      "loss": 0.4856,
+      "step": 890
+    },
+    {
+      "epoch": 0.4475,
+      "grad_norm": 0.1001812294125557,
+      "learning_rate": 0.00020277624726070526,
+      "loss": 0.4689,
+      "step": 895
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.11319112777709961,
+      "learning_rate": 0.00020154895418784242,
+      "loss": 0.3998,
+      "step": 900
+    },
+    {
+      "epoch": 0.4525,
+      "grad_norm": 0.11322236061096191,
+      "learning_rate": 0.00020031773546253824,
+      "loss": 0.4321,
+      "step": 905
+    },
+    {
+      "epoch": 0.455,
+      "grad_norm": 0.12924689054489136,
+      "learning_rate": 0.00019908268484686558,
+      "loss": 0.4208,
+      "step": 910
+    },
+    {
+      "epoch": 0.4575,
+      "grad_norm": 0.11435618251562119,
+      "learning_rate": 0.00019784389639471048,
+      "loss": 0.4682,
+      "step": 915
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.10801081359386444,
+      "learning_rate": 0.00019660146444460975,
+      "loss": 0.428,
+      "step": 920
+    },
+    {
+      "epoch": 0.4625,
+      "grad_norm": 0.10906939953565598,
+      "learning_rate": 0.0001953554836125667,
+      "loss": 0.4455,
+      "step": 925
+    },
+    {
+      "epoch": 0.465,
+      "grad_norm": 0.10790123790502548,
+      "learning_rate": 0.00019410604878484556,
+      "loss": 0.4544,
+      "step": 930
+    },
+    {
+      "epoch": 0.4675,
+      "grad_norm": 0.10536376386880875,
+      "learning_rate": 0.000192853255110746,
+      "loss": 0.376,
+      "step": 935
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.11744682490825653,
+      "learning_rate": 0.00019159719799535668,
+      "loss": 0.3887,
+      "step": 940
+    },
+    {
+      "epoch": 0.4725,
+      "grad_norm": 0.12954068183898926,
+      "learning_rate": 0.00019033797309228983,
+      "loss": 0.4075,
+      "step": 945
+    },
+    {
+      "epoch": 0.475,
+      "grad_norm": 0.1401606798171997,
+      "learning_rate": 0.00018907567629639725,
+      "loss": 0.4454,
+      "step": 950
+    },
+    {
+      "epoch": 0.4775,
+      "grad_norm": 0.12059322744607925,
+      "learning_rate": 0.00018781040373646706,
+      "loss": 0.4339,
+      "step": 955
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.11798987537622452,
+      "learning_rate": 0.00018654225176790336,
+      "loss": 0.4405,
+      "step": 960
+    },
+    {
+      "epoch": 0.4825,
+      "grad_norm": 0.11344211548566818,
+      "learning_rate": 0.00018527131696538846,
+      "loss": 0.4124,
+      "step": 965
+    },
+    {
+      "epoch": 0.485,
+      "grad_norm": 0.10373330116271973,
+      "learning_rate": 0.00018399769611552824,
+      "loss": 0.4329,
+      "step": 970
+    },
+    {
+      "epoch": 0.4875,
+      "grad_norm": 0.12053704261779785,
+      "learning_rate": 0.0001827214862094814,
+      "loss": 0.4944,
+      "step": 975
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.141033336520195,
+      "learning_rate": 0.00018144278443557328,
+      "loss": 0.4569,
+      "step": 980
+    },
+    {
+      "epoch": 0.4925,
+      "grad_norm": 0.10922867804765701,
+      "learning_rate": 0.0001801616881718947,
+      "loss": 0.3879,
+      "step": 985
+    },
+    {
+      "epoch": 0.495,
+      "grad_norm": 0.09843657910823822,
+      "learning_rate": 0.00017887829497888612,
+      "loss": 0.4106,
+      "step": 990
+    },
+    {
+      "epoch": 0.4975,
+      "grad_norm": 0.12131062150001526,
+      "learning_rate": 0.000177592702591908,
+      "loss": 0.4023,
+      "step": 995
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.11343283206224442,
+      "learning_rate": 0.00017630500891379806,
+      "loss": 0.4824,
+      "step": 1000
+    },
+    {
+      "epoch": 0.5025,
+      "grad_norm": 0.11050508171319962,
+      "learning_rate": 0.00017501531200741534,
+      "loss": 0.4098,
+      "step": 1005
+    },
+    {
+      "epoch": 0.505,
+      "grad_norm": 0.11737144738435745,
+      "learning_rate": 0.00017372371008817256,
+      "loss": 0.3943,
+      "step": 1010
+    },
+    {
+      "epoch": 0.5075,
+      "grad_norm": 0.11473528295755386,
+      "learning_rate": 0.00017243030151655643,
+      "loss": 0.3796,
+      "step": 1015
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.13086555898189545,
+      "learning_rate": 0.00017113518479063738,
+      "loss": 0.4367,
+      "step": 1020
+    },
+    {
+      "epoch": 0.5125,
+      "grad_norm": 0.11752833425998688,
+      "learning_rate": 0.00016983845853856837,
+      "loss": 0.4097,
+      "step": 1025
+    },
+    {
+      "epoch": 0.515,
+      "grad_norm": 0.11596900969743729,
+      "learning_rate": 0.0001685402215110739,
+      "loss": 0.3812,
+      "step": 1030
+    },
+    {
+      "epoch": 0.5175,
+      "grad_norm": 0.11850260943174362,
+      "learning_rate": 0.00016724057257392998,
+      "loss": 0.4354,
+      "step": 1035
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.12466365844011307,
+      "learning_rate": 0.00016593961070043498,
+      "loss": 0.4317,
+      "step": 1040
+    },
+    {
+      "epoch": 0.5225,
+      "grad_norm": 0.11178991943597794,
+      "learning_rate": 0.0001646374349638724,
+      "loss": 0.3936,
+      "step": 1045
+    },
+    {
+      "epoch": 0.525,
+      "grad_norm": 0.11252165585756302,
+      "learning_rate": 0.00016333414452996623,
+      "loss": 0.386,
+      "step": 1050
+    },
+    {
+      "epoch": 0.5275,
+      "grad_norm": 0.12886975705623627,
+      "learning_rate": 0.0001620298386493288,
+      "loss": 0.3965,
+      "step": 1055
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.11716549098491669,
+      "learning_rate": 0.00016072461664990288,
+      "loss": 0.3924,
+      "step": 1060
+    },
+    {
+      "epoch": 0.5325,
+      "grad_norm": 0.11604485660791397,
+      "learning_rate": 0.000159418577929397,
+      "loss": 0.3624,
+      "step": 1065
+    },
+    {
+      "epoch": 0.535,
+      "grad_norm": 0.11538460850715637,
+      "learning_rate": 0.00015811182194771633,
+      "loss": 0.4338,
+      "step": 1070
+    },
+    {
+      "epoch": 0.5375,
+      "grad_norm": 0.11618762463331223,
+      "learning_rate": 0.00015680444821938804,
+      "loss": 0.4058,
+      "step": 1075
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.11750835925340652,
+      "learning_rate": 0.00015549655630598343,
+      "loss": 0.4422,
+      "step": 1080
+    },
+    {
+      "epoch": 0.5425,
+      "grad_norm": 0.12725204229354858,
+      "learning_rate": 0.00015418824580853535,
+      "loss": 0.4422,
+      "step": 1085
+    },
+    {
+      "epoch": 0.545,
+      "grad_norm": 0.11274927109479904,
+      "learning_rate": 0.00015287961635995347,
+      "loss": 0.4229,
+      "step": 1090
+    },
+    {
+      "epoch": 0.5475,
+      "grad_norm": 0.11833129078149796,
+      "learning_rate": 0.00015157076761743686,
+      "loss": 0.4442,
+      "step": 1095
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.11384794861078262,
+      "learning_rate": 0.00015026179925488475,
+      "loss": 0.4528,
+      "step": 1100
+    },
+    {
+      "epoch": 0.5525,
+      "grad_norm": 0.11864661425352097,
+      "learning_rate": 0.00014895281095530575,
+      "loss": 0.3988,
+      "step": 1105
+    },
+    {
+      "epoch": 0.555,
+      "grad_norm": 0.11673832684755325,
+      "learning_rate": 0.00014764390240322691,
+      "loss": 0.3544,
+      "step": 1110
+    },
+    {
+      "epoch": 0.5575,
+      "grad_norm": 0.1174502745270729,
+      "learning_rate": 0.00014633517327710202,
+      "loss": 0.4034,
+      "step": 1115
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.12685547769069672,
+      "learning_rate": 0.00014502672324172107,
+      "loss": 0.3595,
+      "step": 1120
+    },
+    {
+      "epoch": 0.5625,
+      "grad_norm": 0.12368053942918777,
+      "learning_rate": 0.00014371865194062007,
+      "loss": 0.3395,
+      "step": 1125
+    },
+    {
+      "epoch": 0.565,
+      "grad_norm": 0.1077839657664299,
+      "learning_rate": 0.000142411058988493,
+      "loss": 0.4199,
+      "step": 1130
+    },
+    {
+      "epoch": 0.5675,
+      "grad_norm": 0.11699855327606201,
+      "learning_rate": 0.00014110404396360576,
+      "loss": 0.3443,
+      "step": 1135
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.13238464295864105,
+      "learning_rate": 0.0001397977064002128,
+      "loss": 0.3499,
+      "step": 1140
+    },
+    {
+      "epoch": 0.5725,
+      "grad_norm": 0.11482933163642883,
+      "learning_rate": 0.0001384921457809772,
+      "loss": 0.3619,
+      "step": 1145
+    },
+    {
+      "epoch": 0.575,
+      "grad_norm": 0.13390353322029114,
+      "learning_rate": 0.00013718746152939487,
+      "loss": 0.3684,
+      "step": 1150
+    },
+    {
+      "epoch": 0.5775,
+      "grad_norm": 0.11464900523424149,
+      "learning_rate": 0.00013588375300222283,
+      "loss": 0.3313,
+      "step": 1155
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.10367871820926666,
+      "learning_rate": 0.00013458111948191296,
+      "loss": 0.3323,
+      "step": 1160
+    },
+    {
+      "epoch": 0.5825,
+      "grad_norm": 0.12259294092655182,
+      "learning_rate": 0.0001332796601690512,
+      "loss": 0.3986,
+      "step": 1165
+    },
+    {
+      "epoch": 0.585,
+      "grad_norm": 0.10923358052968979,
+      "learning_rate": 0.00013197947417480292,
+      "loss": 0.3808,
+      "step": 1170
+    },
+    {
+      "epoch": 0.5875,
+      "grad_norm": 0.12479504942893982,
+      "learning_rate": 0.0001306806605133656,
+      "loss": 0.4429,
+      "step": 1175
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.11521733552217484,
+      "learning_rate": 0.000129383318094428,
+      "loss": 0.4778,
+      "step": 1180
+    },
+    {
+      "epoch": 0.5925,
+      "grad_norm": 0.14112086594104767,
+      "learning_rate": 0.00012808754571563827,
+      "loss": 0.4634,
+      "step": 1185
+    },
+    {
+      "epoch": 0.595,
+      "grad_norm": 0.12947902083396912,
+      "learning_rate": 0.00012679344205507981,
+      "loss": 0.4439,
+      "step": 1190
+    },
+    {
+      "epoch": 0.5975,
+      "grad_norm": 0.13288578391075134,
+      "learning_rate": 0.0001255011056637567,
+      "loss": 0.4402,
+      "step": 1195
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.1216069906949997,
+      "learning_rate": 0.00012421063495808853,
+      "loss": 0.4203,
+      "step": 1200
+    },
+    {
+      "epoch": 0.6025,
+      "grad_norm": 0.11649637669324875,
+      "learning_rate": 0.000122922128212416,
+      "loss": 0.4512,
+      "step": 1205
+    },
+    {
+      "epoch": 0.605,
+      "grad_norm": 0.1201406940817833,
+      "learning_rate": 0.00012163568355151628,
+      "loss": 0.3725,
+      "step": 1210
+    },
+    {
+      "epoch": 0.6075,
+      "grad_norm": 0.12117727100849152,
+      "learning_rate": 0.00012035139894313107,
+      "loss": 0.4352,
+      "step": 1215
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.11709322035312653,
+      "learning_rate": 0.00011906937219050556,
+      "loss": 0.4189,
+      "step": 1220
+    },
+    {
+      "epoch": 0.6125,
+      "grad_norm": 0.11865726858377457,
+      "learning_rate": 0.0001177897009249405,
+      "loss": 0.3796,
+      "step": 1225
+    },
+    {
+      "epoch": 0.615,
+      "grad_norm": 0.10807759314775467,
+      "learning_rate": 0.0001165124825983573,
+      "loss": 0.4465,
+      "step": 1230
+    },
+    {
+      "epoch": 0.6175,
+      "grad_norm": 0.13788209855556488,
+      "learning_rate": 0.00011523781447587641,
+      "loss": 0.4994,
+      "step": 1235
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.12921364605426788,
+      "learning_rate": 0.00011396579362841044,
+      "loss": 0.4251,
+      "step": 1240
+    },
+    {
+      "epoch": 0.6225,
+      "grad_norm": 0.12162365019321442,
+      "learning_rate": 0.0001126965169252718,
+      "loss": 0.3864,
+      "step": 1245
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.12897826731204987,
+      "learning_rate": 0.00011143008102679559,
+      "loss": 0.3753,
+      "step": 1250
+    },
+    {
+      "epoch": 0.6275,
+      "grad_norm": 0.116109699010849,
+      "learning_rate": 0.00011016658237697866,
+      "loss": 0.3296,
+      "step": 1255
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.12935414910316467,
+      "learning_rate": 0.00010890611719613512,
+      "loss": 0.3797,
+      "step": 1260
+    },
+    {
+      "epoch": 0.6325,
+      "grad_norm": 0.13730891048908234,
+      "learning_rate": 0.0001076487814735685,
+      "loss": 0.3711,
+      "step": 1265
+    },
+    {
+      "epoch": 0.635,
+      "grad_norm": 0.13870631158351898,
+      "learning_rate": 0.00010639467096026211,
+      "loss": 0.4328,
+      "step": 1270
+    },
+    {
+      "epoch": 0.6375,
+      "grad_norm": 0.11644043773412704,
+      "learning_rate": 0.00010514388116158701,
+      "loss": 0.3283,
+      "step": 1275
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.12221091985702515,
+      "learning_rate": 0.00010389650733002894,
+      "loss": 0.3898,
+      "step": 1280
+    },
+    {
+      "epoch": 0.6425,
+      "grad_norm": 0.12048634141683578,
+      "learning_rate": 0.00010265264445793464,
+      "loss": 0.3256,
+      "step": 1285
+    },
+    {
+      "epoch": 0.645,
+      "grad_norm": 0.1250566840171814,
+      "learning_rate": 0.00010141238727027761,
+      "loss": 0.408,
+      "step": 1290
+    },
+    {
+      "epoch": 0.6475,
+      "grad_norm": 0.13518592715263367,
+      "learning_rate": 0.00010017583021744454,
+      "loss": 0.3763,
+      "step": 1295
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.13047736883163452,
+      "learning_rate": 9.89430674680425e-05,
+      "loss": 0.3989,
+      "step": 1300
+    },
+    {
+      "epoch": 0.6525,
+      "grad_norm": 0.11474955826997757,
+      "learning_rate": 9.771419290172773e-05,
+      "loss": 0.3374,
+      "step": 1305
+    },
+    {
+      "epoch": 0.655,
+      "grad_norm": 0.11670063436031342,
+      "learning_rate": 9.648930010205619e-05,
+      "loss": 0.3343,
+      "step": 1310
+    },
+    {
+      "epoch": 0.6575,
+      "grad_norm": 0.15385080873966217,
+      "learning_rate": 9.526848234935704e-05,
+      "loss": 0.3432,
+      "step": 1315
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.13441519439220428,
+      "learning_rate": 9.405183261362863e-05,
+      "loss": 0.3116,
+      "step": 1320
+    },
+    {
+      "epoch": 0.6625,
+      "grad_norm": 0.14772167801856995,
+      "learning_rate": 9.283944354745888e-05,
+      "loss": 0.3613,
+      "step": 1325
+    },
+    {
+      "epoch": 0.665,
+      "grad_norm": 0.12146154791116714,
+      "learning_rate": 9.163140747896907e-05,
+      "loss": 0.3411,
+      "step": 1330
+    },
+    {
+      "epoch": 0.6675,
+      "grad_norm": 0.1333102583885193,
+      "learning_rate": 9.042781640478291e-05,
+      "loss": 0.396,
+      "step": 1335
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.12051521986722946,
+      "learning_rate": 8.922876198302062e-05,
+      "loss": 0.3837,
+      "step": 1340
+    },
+    {
+      "epoch": 0.6725,
+      "grad_norm": 0.12071400880813599,
+      "learning_rate": 8.803433552631874e-05,
+      "loss": 0.354,
+      "step": 1345
+    },
+    {
+      "epoch": 0.675,
+      "grad_norm": 0.11258620023727417,
+      "learning_rate": 8.684462799487635e-05,
+      "loss": 0.3197,
+      "step": 1350
+    },
+    {
+      "epoch": 0.6775,
+      "grad_norm": 0.11908067762851715,
+      "learning_rate": 8.565972998952814e-05,
+      "loss": 0.377,
+      "step": 1355
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.1252991259098053,
+      "learning_rate": 8.447973174484469e-05,
+      "loss": 0.3438,
+      "step": 1360
+    },
+    {
+      "epoch": 0.6825,
+      "grad_norm": 0.12832245230674744,
+      "learning_rate": 8.330472312226091e-05,
+      "loss": 0.346,
+      "step": 1365
+    },
+    {
+      "epoch": 0.685,
+      "grad_norm": 0.1396942287683487,
+      "learning_rate": 8.213479360323258e-05,
+      "loss": 0.3886,
+      "step": 1370
+    },
+    {
+      "epoch": 0.6875,
+      "grad_norm": 0.12938210368156433,
+      "learning_rate": 8.097003228242225e-05,
+      "loss": 0.3699,
+      "step": 1375
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.12459377944469452,
+      "learning_rate": 7.9810527860914e-05,
+      "loss": 0.3892,
+      "step": 1380
+    },
+    {
+      "epoch": 0.6925,
+      "grad_norm": 0.1360333263874054,
+      "learning_rate": 7.86563686394587e-05,
+      "loss": 0.3423,
+      "step": 1385
+    },
+    {
+      "epoch": 0.695,
+      "grad_norm": 0.1357765644788742,
+      "learning_rate": 7.750764251174963e-05,
+      "loss": 0.408,
+      "step": 1390
+    },
+    {
+      "epoch": 0.6975,
+      "grad_norm": 0.14453718066215515,
+      "learning_rate": 7.636443695772887e-05,
+      "loss": 0.3398,
+      "step": 1395
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.11541519314050674,
+      "learning_rate": 7.522683903692547e-05,
+      "loss": 0.4203,
+      "step": 1400
+    },
+    {
+      "epoch": 0.7025,
+      "grad_norm": 0.13344840705394745,
+      "learning_rate": 7.409493538182545e-05,
+      "loss": 0.3694,
+      "step": 1405
+    },
+    {
+      "epoch": 0.705,
+      "grad_norm": 0.13069866597652435,
+      "learning_rate": 7.296881219127452e-05,
+      "loss": 0.3889,
+      "step": 1410
+    },
+    {
+      "epoch": 0.7075,
+      "grad_norm": 0.12457838654518127,
+      "learning_rate": 7.184855522391359e-05,
+      "loss": 0.3342,
+      "step": 1415
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.11990659683942795,
+      "learning_rate": 7.073424979164794e-05,
+      "loss": 0.3855,
+      "step": 1420
+    },
+    {
+      "epoch": 0.7125,
+      "grad_norm": 0.1389523446559906,
+      "learning_rate": 6.962598075315046e-05,
+      "loss": 0.3943,
+      "step": 1425
+    },
+    {
+      "epoch": 0.715,
+      "grad_norm": 0.14108599722385406,
+      "learning_rate": 6.852383250739938e-05,
+      "loss": 0.388,
+      "step": 1430
+    },
+    {
+      "epoch": 0.7175,
+      "grad_norm": 0.1342005580663681,
+      "learning_rate": 6.742788898725065e-05,
+      "loss": 0.3602,
+      "step": 1435
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.13516324758529663,
+      "learning_rate": 6.633823365304648e-05,
+      "loss": 0.3935,
+      "step": 1440
+    },
+    {
+      "epoch": 0.7225,
+      "grad_norm": 0.1302197426557541,
+      "learning_rate": 6.52549494862593e-05,
+      "loss": 0.3618,
+      "step": 1445
+    },
+    {
+      "epoch": 0.725,
+      "grad_norm": 0.12428996711969376,
+      "learning_rate": 6.417811898317259e-05,
+      "loss": 0.3338,
+      "step": 1450
+    },
+    {
+      "epoch": 0.7275,
+      "grad_norm": 0.11249776184558868,
+      "learning_rate": 6.31078241485982e-05,
+      "loss": 0.3819,
+      "step": 1455
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.1359994113445282,
+      "learning_rate": 6.204414648963159e-05,
+      "loss": 0.3356,
+      "step": 1460
+    },
+    {
+      "epoch": 0.7325,
+      "grad_norm": 0.1118568629026413,
+      "learning_rate": 6.098716700944479e-05,
+      "loss": 0.3223,
+      "step": 1465
+    },
+    {
+      "epoch": 0.735,
+      "grad_norm": 0.12038140743970871,
+      "learning_rate": 5.993696620111741e-05,
+      "loss": 0.3481,
+      "step": 1470
+    },
+    {
+      "epoch": 0.7375,
+      "grad_norm": 0.12787550687789917,
+      "learning_rate": 5.889362404150703e-05,
+      "loss": 0.3766,
+      "step": 1475
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.12134893983602524,
+      "learning_rate": 5.7857219985158506e-05,
+      "loss": 0.2916,
+      "step": 1480
+    },
+    {
+      "epoch": 0.7425,
+      "grad_norm": 0.1274223029613495,
+      "learning_rate": 5.682783295825345e-05,
+      "loss": 0.3095,
+      "step": 1485
+    },
+    {
+      "epoch": 0.745,
+      "grad_norm": 0.11817299574613571,
+      "learning_rate": 5.580554135259932e-05,
+      "loss": 0.3422,
+      "step": 1490
+    },
+    {
+      "epoch": 0.7475,
+      "grad_norm": 0.1348387748003006,
+      "learning_rate": 5.479042301965987e-05,
+      "loss": 0.4044,
+      "step": 1495
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.14032681286334991,
+      "learning_rate": 5.378255526462631e-05,
+      "loss": 0.337,
+      "step": 1500
+    },
+    {
+      "epoch": 0.7525,
+      "grad_norm": 0.1196574866771698,
+      "learning_rate": 5.2782014840530366e-05,
+      "loss": 0.3638,
+      "step": 1505
+    },
+    {
+      "epoch": 0.755,
+      "grad_norm": 0.1307535171508789,
+      "learning_rate": 5.178887794239904e-05,
+      "loss": 0.3514,
+      "step": 1510
+    },
+    {
+      "epoch": 0.7575,
+      "grad_norm": 0.12303224951028824,
+      "learning_rate": 5.080322020145224e-05,
+      "loss": 0.3825,
+      "step": 1515
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.11517804116010666,
+      "learning_rate": 4.9825116679343025e-05,
+      "loss": 0.3474,
+      "step": 1520
+    },
+    {
+      "epoch": 0.7625,
+      "grad_norm": 0.1276445835828781,
+      "learning_rate": 4.885464186244154e-05,
+      "loss": 0.3084,
+      "step": 1525
+    },
+    {
+      "epoch": 0.765,
+      "grad_norm": 0.12166495621204376,
+      "learning_rate": 4.789186965616232e-05,
+      "loss": 0.2949,
+      "step": 1530
+    },
+    {
+      "epoch": 0.7675,
+      "grad_norm": 0.13007108867168427,
+      "learning_rate": 4.6936873379336564e-05,
+      "loss": 0.3336,
+      "step": 1535
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.12368687242269516,
+      "learning_rate": 4.598972575862803e-05,
+      "loss": 0.3443,
+      "step": 1540
+    },
+    {
+      "epoch": 0.7725,
+      "grad_norm": 0.11817432940006256,
+      "learning_rate": 4.5050498922995166e-05,
+      "loss": 0.3198,
+      "step": 1545
+    },
+    {
+      "epoch": 0.775,
+      "grad_norm": 0.13239014148712158,
+      "learning_rate": 4.4119264398197843e-05,
+      "loss": 0.3145,
+      "step": 1550
+    },
+    {
+      "epoch": 0.7775,
+      "grad_norm": 0.12305855751037598,
+      "learning_rate": 4.319609310135054e-05,
+      "loss": 0.3276,
+      "step": 1555
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.13063360750675201,
+      "learning_rate": 4.228105533552169e-05,
+      "loss": 0.4115,
+      "step": 1560
+    },
+    {
+      "epoch": 0.7825,
+      "grad_norm": 0.12751415371894836,
+      "learning_rate": 4.137422078437991e-05,
+      "loss": 0.4113,
+      "step": 1565
+    },
+    {
+      "epoch": 0.785,
+      "grad_norm": 0.1429520696401596,
+      "learning_rate": 4.0475658506887136e-05,
+      "loss": 0.3634,
+      "step": 1570
+    },
+    {
+      "epoch": 0.7875,
+      "grad_norm": 0.13072626292705536,
+      "learning_rate": 3.9585436932039846e-05,
+      "loss": 0.3914,
+      "step": 1575
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.13076546788215637,
+      "learning_rate": 3.870362385365755e-05,
+      "loss": 0.3153,
+      "step": 1580
+    },
+    {
+      "epoch": 0.7925,
+      "grad_norm": 0.11764945089817047,
+      "learning_rate": 3.7830286425220234e-05,
+      "loss": 0.331,
+      "step": 1585
+    },
+    {
+      "epoch": 0.795,
+      "grad_norm": 0.12469421327114105,
+      "learning_rate": 3.696549115475434e-05,
+      "loss": 0.3667,
+      "step": 1590
+    },
+    {
+      "epoch": 0.7975,
+      "grad_norm": 0.13257570564746857,
+      "learning_rate": 3.6109303899767875e-05,
+      "loss": 0.3775,
+      "step": 1595
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.1399105191230774,
+      "learning_rate": 3.5261789862235235e-05,
+      "loss": 0.3786,
+      "step": 1600
+    },
+    {
+      "epoch": 0.8025,
+      "grad_norm": 0.1299823671579361,
+      "learning_rate": 3.442301358363163e-05,
+      "loss": 0.3984,
+      "step": 1605
+    },
+    {
+      "epoch": 0.805,
+      "grad_norm": 0.12068431079387665,
+      "learning_rate": 3.359303894001809e-05,
+      "loss": 0.3416,
+      "step": 1610
+    },
+    {
+      "epoch": 0.8075,
+      "grad_norm": 0.12825050950050354,
+      "learning_rate": 3.277192913717717e-05,
+      "loss": 0.3973,
+      "step": 1615
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.12794139981269836,
+      "learning_rate": 3.195974670579941e-05,
+      "loss": 0.3942,
+      "step": 1620
+    },
+    {
+      "epoch": 0.8125,
+      "grad_norm": 0.1178906112909317,
+      "learning_rate": 3.115655349672141e-05,
+      "loss": 0.3549,
+      "step": 1625
+    },
+    {
+      "epoch": 0.815,
+      "grad_norm": 0.11859016120433807,
+      "learning_rate": 3.036241067621575e-05,
+      "loss": 0.3113,
+      "step": 1630
+    },
+    {
+      "epoch": 0.8175,
+      "grad_norm": 0.12508928775787354,
+      "learning_rate": 2.9577378721332843e-05,
+      "loss": 0.3802,
+      "step": 1635
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.1293668895959854,
+      "learning_rate": 2.8801517415295455e-05,
+      "loss": 0.3098,
+      "step": 1640
+    },
+    {
+      "epoch": 0.8225,
+      "grad_norm": 0.12039236724376678,
+      "learning_rate": 2.8034885842945865e-05,
+      "loss": 0.2876,
+      "step": 1645
+    },
+    {
+      "epoch": 0.825,
+      "grad_norm": 0.14805036783218384,
+      "learning_rate": 2.7277542386246454e-05,
+      "loss": 0.3618,
+      "step": 1650
+    },
+    {
+      "epoch": 0.8275,
+      "grad_norm": 0.12638579308986664,
+      "learning_rate": 2.6529544719833706e-05,
+      "loss": 0.3328,
+      "step": 1655
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.12427478283643723,
+      "learning_rate": 2.5790949806625838e-05,
+      "loss": 0.3394,
+      "step": 1660
+    },
+    {
+      "epoch": 0.8325,
+      "grad_norm": 0.1283419132232666,
+      "learning_rate": 2.5061813893485085e-05,
+      "loss": 0.3392,
+      "step": 1665
+    },
+    {
+      "epoch": 0.835,
+      "grad_norm": 0.12487384676933289,
+      "learning_rate": 2.434219250693419e-05,
+      "loss": 0.3592,
+      "step": 1670
+    },
+    {
+      "epoch": 0.8375,
+      "grad_norm": 0.14032793045043945,
+      "learning_rate": 2.363214044892788e-05,
+      "loss": 0.4099,
+      "step": 1675
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.10917101800441742,
+      "learning_rate": 2.293171179267946e-05,
+      "loss": 0.3204,
+      "step": 1680
+    },
+    {
+      "epoch": 0.8425,
+      "grad_norm": 0.1253073364496231,
+      "learning_rate": 2.2240959878542848e-05,
+      "loss": 0.3378,
+      "step": 1685
+    },
+    {
+      "epoch": 0.845,
+      "grad_norm": 0.14096981287002563,
+      "learning_rate": 2.155993730995077e-05,
+      "loss": 0.378,
+      "step": 1690
+    },
+    {
+      "epoch": 0.8475,
+      "grad_norm": 0.12039178609848022,
+      "learning_rate": 2.0888695949408468e-05,
+      "loss": 0.3197,
+      "step": 1695
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.12723132967948914,
+      "learning_rate": 2.0227286914544353e-05,
+      "loss": 0.3241,
+      "step": 1700
+    },
+    {
+      "epoch": 0.8525,
+      "grad_norm": 0.1309029906988144,
+      "learning_rate": 1.9575760574217147e-05,
+      "loss": 0.3743,
+      "step": 1705
+    },
+    {
+      "epoch": 0.855,
+      "grad_norm": 0.1324499100446701,
+      "learning_rate": 1.893416654468022e-05,
+      "loss": 0.345,
+      "step": 1710
+    },
+    {
+      "epoch": 0.8575,
+      "grad_norm": 0.11905783414840698,
+      "learning_rate": 1.8302553685802917e-05,
+      "loss": 0.3514,
+      "step": 1715
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.12570443749427795,
+      "learning_rate": 1.768097009734985e-05,
+      "loss": 0.3791,
+      "step": 1720
+    },
+    {
+      "epoch": 0.8625,
+      "grad_norm": 0.13414913415908813,
+      "learning_rate": 1.7069463115317788e-05,
+      "loss": 0.3575,
+      "step": 1725
+    },
+    {
+      "epoch": 0.865,
+      "grad_norm": 0.1283785104751587,
+      "learning_rate": 1.6468079308331023e-05,
+      "loss": 0.3496,
+      "step": 1730
+    },
+    {
+      "epoch": 0.8675,
+      "grad_norm": 0.11180217564105988,
+      "learning_rate": 1.587686447409478e-05,
+      "loss": 0.3245,
+      "step": 1735
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.13804157078266144,
+      "learning_rate": 1.5295863635907667e-05,
+      "loss": 0.367,
+      "step": 1740
+    },
+    {
+      "epoch": 0.8725,
+      "grad_norm": 0.12629055976867676,
+      "learning_rate": 1.4725121039232945e-05,
+      "loss": 0.293,
+      "step": 1745
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 0.12774884700775146,
+      "learning_rate": 1.4164680148329088e-05,
+      "loss": 0.3798,
+      "step": 1750
+    },
+    {
+      "epoch": 0.8775,
+      "grad_norm": 0.11681339889764786,
+      "learning_rate": 1.3614583642939718e-05,
+      "loss": 0.3474,
+      "step": 1755
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.14510560035705566,
+      "learning_rate": 1.3074873415043591e-05,
+      "loss": 0.3999,
+      "step": 1760
+    },
+    {
+      "epoch": 0.8825,
+      "grad_norm": 0.1168401762843132,
+      "learning_rate": 1.2545590565664054e-05,
+      "loss": 0.3398,
+      "step": 1765
+    },
+    {
+      "epoch": 0.885,
+      "grad_norm": 0.1411600410938263,
+      "learning_rate": 1.2026775401739348e-05,
+      "loss": 0.3346,
+      "step": 1770
+    },
+    {
+      "epoch": 0.8875,
+      "grad_norm": 0.12797729671001434,
+      "learning_rate": 1.1518467433052863e-05,
+      "loss": 0.3742,
+      "step": 1775
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.12946921586990356,
+      "learning_rate": 1.1020705369224414e-05,
+      "loss": 0.3436,
+      "step": 1780
+    },
+    {
+      "epoch": 0.8925,
+      "grad_norm": 0.13285613059997559,
+      "learning_rate": 1.0533527116762296e-05,
+      "loss": 0.3186,
+      "step": 1785
+    },
+    {
+      "epoch": 0.895,
+      "grad_norm": 0.15213604271411896,
+      "learning_rate": 1.005696977617666e-05,
+      "loss": 0.3629,
+      "step": 1790
+    },
+    {
+      "epoch": 0.8975,
+      "grad_norm": 0.12391404062509537,
+      "learning_rate": 9.591069639154008e-06,
+      "loss": 0.3421,
+      "step": 1795
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.11592845618724823,
+      "learning_rate": 9.135862185793636e-06,
+      "loss": 0.3107,
+      "step": 1800
+    },
+    {
+      "epoch": 0.9025,
+      "grad_norm": 0.12540902197360992,
+      "learning_rate": 8.691382081905496e-06,
+      "loss": 0.3605,
+      "step": 1805
+    },
+    {
+      "epoch": 0.905,
+      "grad_norm": 0.14459215104579926,
+      "learning_rate": 8.257663176370389e-06,
+      "loss": 0.3884,
+      "step": 1810
+    },
+    {
+      "epoch": 0.9075,
+      "grad_norm": 0.14139464497566223,
+      "learning_rate": 7.834738498562165e-06,
+      "loss": 0.3728,
+      "step": 1815
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.12125397473573685,
+      "learning_rate": 7.422640255832446e-06,
+      "loss": 0.3237,
+      "step": 1820
+    },
+    {
+      "epoch": 0.9125,
+      "grad_norm": 0.13039612770080566,
+      "learning_rate": 7.021399831057961e-06,
+      "loss": 0.3055,
+      "step": 1825
+    },
+    {
+      "epoch": 0.915,
+      "grad_norm": 0.1337701678276062,
+      "learning_rate": 6.631047780250481e-06,
+      "loss": 0.368,
+      "step": 1830
+    },
+    {
+      "epoch": 0.9175,
+      "grad_norm": 0.13020606338977814,
+      "learning_rate": 6.251613830230013e-06,
+      "loss": 0.3262,
+      "step": 1835
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.12915077805519104,
+      "learning_rate": 5.883126876360872e-06,
+      "loss": 0.3428,
+      "step": 1840
+    },
+    {
+      "epoch": 0.9225,
+      "grad_norm": 0.12774400413036346,
+      "learning_rate": 5.525614980351284e-06,
+      "loss": 0.3735,
+      "step": 1845
+    },
+    {
+      "epoch": 0.925,
+      "grad_norm": 0.12587039172649384,
+      "learning_rate": 5.1791053681162545e-06,
+      "loss": 0.3402,
+      "step": 1850
+    },
+    {
+      "epoch": 0.9275,
+      "grad_norm": 0.12152459472417831,
+      "learning_rate": 4.843624427704329e-06,
+      "loss": 0.2968,
+      "step": 1855
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.11444247514009476,
+      "learning_rate": 4.519197707287986e-06,
+      "loss": 0.3448,
+      "step": 1860
+    },
+    {
+      "epoch": 0.9325,
+      "grad_norm": 0.12532518804073334,
+      "learning_rate": 4.2058499132180734e-06,
+      "loss": 0.3613,
+      "step": 1865
+    },
+    {
+      "epoch": 0.935,
+      "grad_norm": 0.14186476171016693,
+      "learning_rate": 3.903604908142266e-06,
+      "loss": 0.2887,
+      "step": 1870
+    },
+    {
+      "epoch": 0.9375,
+      "grad_norm": 0.13014192879199982,
+      "learning_rate": 3.6124857091878845e-06,
+      "loss": 0.2679,
+      "step": 1875
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.1259031891822815,
+      "learning_rate": 3.3325144862090648e-06,
+      "loss": 0.2993,
+      "step": 1880
+    },
+    {
+      "epoch": 0.9425,
+      "grad_norm": 0.12168288230895996,
+      "learning_rate": 3.0637125600983916e-06,
+      "loss": 0.3317,
+      "step": 1885
+    },
+    {
+      "epoch": 0.945,
+      "grad_norm": 0.12291324138641357,
+      "learning_rate": 2.8061004011632302e-06,
+      "loss": 0.3311,
+      "step": 1890
+    },
+    {
+      "epoch": 0.9475,
+      "grad_norm": 0.13629783689975739,
+      "learning_rate": 2.5596976275668757e-06,
+      "loss": 0.3456,
+      "step": 1895
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.17415851354599,
+      "learning_rate": 2.324523003834511e-06,
+      "loss": 0.3589,
+      "step": 1900
+    },
+    {
+      "epoch": 0.9525,
+      "grad_norm": 0.1330641210079193,
+      "learning_rate": 2.100594439424269e-06,
+      "loss": 0.3826,
+      "step": 1905
+    },
+    {
+      "epoch": 0.955,
+      "grad_norm": 0.14203837513923645,
+      "learning_rate": 1.8879289873632907e-06,
+      "loss": 0.3807,
+      "step": 1910
+    },
+    {
+      "epoch": 0.9575,
+      "grad_norm": 0.1222100704908371,
+      "learning_rate": 1.686542842949129e-06,
+      "loss": 0.3084,
+      "step": 1915
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.1441483348608017,
+      "learning_rate": 1.4964513425163694e-06,
+      "loss": 0.3871,
+      "step": 1920
+    },
+    {
+      "epoch": 0.9625,
+      "grad_norm": 0.1402144581079483,
+      "learning_rate": 1.3176689622687474e-06,
+      "loss": 0.3192,
+      "step": 1925
+    },
+    {
+      "epoch": 0.965,
+      "grad_norm": 0.13284745812416077,
+      "learning_rate": 1.1502093171766979e-06,
+      "loss": 0.359,
+      "step": 1930
+    },
+    {
+      "epoch": 0.9675,
+      "grad_norm": 0.1253402829170227,
+      "learning_rate": 9.94085159940533e-07,
+      "loss": 0.3214,
+      "step": 1935
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.13589312136173248,
+      "learning_rate": 8.493083800193034e-07,
+      "loss": 0.3524,
+      "step": 1940
+    },
+    {
+      "epoch": 0.9725,
+      "grad_norm": 0.13623379170894623,
+      "learning_rate": 7.158900027253223e-07,
+      "loss": 0.3711,
+      "step": 1945
+    },
+    {
+      "epoch": 0.975,
+      "grad_norm": 0.12516111135482788,
+      "learning_rate": 5.9384018838457e-07,
+      "loss": 0.3487,
+      "step": 1950
+    },
+    {
+      "epoch": 0.9775,
+      "grad_norm": 0.1211727038025856,
+      "learning_rate": 4.831682315629304e-07,
+      "loss": 0.3079,
+      "step": 1955
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.1348896622657776,
+      "learning_rate": 3.8388256035840615e-07,
+      "loss": 0.322,
+      "step": 1960
+    },
+    {
+      "epoch": 0.9825,
+      "grad_norm": 0.12953124940395355,
+      "learning_rate": 2.959907357592661e-07,
+      "loss": 0.3054,
+      "step": 1965
+    },
+    {
+      "epoch": 0.985,
+      "grad_norm": 0.12745600938796997,
+      "learning_rate": 2.1949945106823909e-07,
+      "loss": 0.3208,
+      "step": 1970
+    },
+    {
+      "epoch": 0.9875,
+      "grad_norm": 0.13108642399311066,
+      "learning_rate": 1.544145313928047e-07,
+      "loss": 0.3641,
+      "step": 1975
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.12415596097707748,
+      "learning_rate": 1.0074093320156517e-07,
+      "loss": 0.3141,
+      "step": 1980
+    },
+    {
+      "epoch": 0.9925,
+      "grad_norm": 0.12116590887308121,
+      "learning_rate": 5.8482743946847153e-08,
+      "loss": 0.3085,
+      "step": 1985
+    },
+    {
+      "epoch": 0.995,
+      "grad_norm": 0.12617753446102142,
+      "learning_rate": 2.764318175336733e-08,
+      "loss": 0.316,
+      "step": 1990
+    },
+    {
+      "epoch": 0.9975,
+      "grad_norm": 0.13097520172595978,
+      "learning_rate": 8.224595173178527e-09,
+      "loss": 0.2772,
+      "step": 1995
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.1454041749238968,
+      "learning_rate": 2.284630068460913e-10,
+      "loss": 0.3226,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.629578157719552e+18,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

codellama-hugcoder/checkpoint-2000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304

codellama-hugcoder/checkpoint-500/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: codellama/CodeLlama-7b-Instruct-hf
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.2.dev0

codellama-hugcoder/checkpoint-500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "up_proj",
+    "k_proj",
+    "q_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

codellama-hugcoder/checkpoint-500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba0a03baab18f0cdae4dfc77bf7b41f7d1435807efac74517b5672e9ef8bedf1
+size 319876032

codellama-hugcoder/checkpoint-500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2dad4d0839af192a8e721c020748fcd5843aa02d4b867cd03a6da416f3b15a8e
+size 640009682

codellama-hugcoder/checkpoint-500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b3fe293b4ac5ae1cf2f114644c15f2a8317440ebc1144a8065f3fe94c0e32b8
+size 14244

codellama-hugcoder/checkpoint-500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12f207d7fee0843ba3ccc634c56e770b9b0bfb3e3b7ef4379b8fc405b4c45a03
+size 1064

codellama-hugcoder/checkpoint-500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,734 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.25,
+  "eval_steps": 100.0,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0025,
+      "grad_norm": 0.09379793703556061,
+      "learning_rate": 5.999999999999999e-06,
+      "loss": 0.6799,
+      "step": 5
+    },
+    {
+      "epoch": 0.005,
+      "grad_norm": 0.1399833709001541,
+      "learning_rate": 1.3499999999999998e-05,
+      "loss": 0.6954,
+      "step": 10
+    },
+    {
+      "epoch": 0.0075,
+      "grad_norm": 0.08632303029298782,
+      "learning_rate": 2.1e-05,
+      "loss": 0.6921,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.10006701201200485,
+      "learning_rate": 2.8499999999999998e-05,
+      "loss": 0.69,
+      "step": 20
+    },
+    {
+      "epoch": 0.0125,
+      "grad_norm": 0.07633858919143677,
+      "learning_rate": 3.5999999999999994e-05,
+      "loss": 0.6722,
+      "step": 25
+    },
+    {
+      "epoch": 0.015,
+      "grad_norm": 0.09399061650037766,
+      "learning_rate": 4.3499999999999993e-05,
+      "loss": 0.6453,
+      "step": 30
+    },
+    {
+      "epoch": 0.0175,
+      "grad_norm": 0.0843738541007042,
+      "learning_rate": 5.1e-05,
+      "loss": 0.6276,
+      "step": 35
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.08583351224660873,
+      "learning_rate": 5.85e-05,
+      "loss": 0.58,
+      "step": 40
+    },
+    {
+      "epoch": 0.0225,
+      "grad_norm": 0.09571370482444763,
+      "learning_rate": 6.599999999999999e-05,
+      "loss": 0.6355,
+      "step": 45
+    },
+    {
+      "epoch": 0.025,
+      "grad_norm": 0.1083935871720314,
+      "learning_rate": 7.35e-05,
+      "loss": 0.589,
+      "step": 50
+    },
+    {
+      "epoch": 0.0275,
+      "grad_norm": 0.10387319326400757,
+      "learning_rate": 8.1e-05,
+      "loss": 0.6061,
+      "step": 55
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.11083361506462097,
+      "learning_rate": 8.849999999999998e-05,
+      "loss": 0.572,
+      "step": 60
+    },
+    {
+      "epoch": 0.0325,
+      "grad_norm": 0.12665686011314392,
+      "learning_rate": 9.599999999999999e-05,
+      "loss": 0.5442,
+      "step": 65
+    },
+    {
+      "epoch": 0.035,
+      "grad_norm": 0.1308053582906723,
+      "learning_rate": 0.00010349999999999998,
+      "loss": 0.6524,
+      "step": 70
+    },
+    {
+      "epoch": 0.0375,
+      "grad_norm": 0.13535510003566742,
+      "learning_rate": 0.00011099999999999999,
+      "loss": 0.6404,
+      "step": 75
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.12833671271800995,
+      "learning_rate": 0.0001185,
+      "loss": 0.5717,
+      "step": 80
+    },
+    {
+      "epoch": 0.0425,
+      "grad_norm": 0.11962099373340607,
+      "learning_rate": 0.00012599999999999997,
+      "loss": 0.6098,
+      "step": 85
+    },
+    {
+      "epoch": 0.045,
+      "grad_norm": 0.13898271322250366,
+      "learning_rate": 0.0001335,
+      "loss": 0.6099,
+      "step": 90
+    },
+    {
+      "epoch": 0.0475,
+      "grad_norm": 0.14486610889434814,
+      "learning_rate": 0.00014099999999999998,
+      "loss": 0.5744,
+      "step": 95
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1432138830423355,
+      "learning_rate": 0.00014849999999999998,
+      "loss": 0.5659,
+      "step": 100
+    },
+    {
+      "epoch": 0.0525,
+      "grad_norm": 0.13487878441810608,
+      "learning_rate": 0.000156,
+      "loss": 0.5622,
+      "step": 105
+    },
+    {
+      "epoch": 0.055,
+      "grad_norm": 0.12495309859514236,
+      "learning_rate": 0.0001635,
+      "loss": 0.5951,
+      "step": 110
+    },
+    {
+      "epoch": 0.0575,
+      "grad_norm": 0.13011734187602997,
+      "learning_rate": 0.00017099999999999998,
+      "loss": 0.6249,
+      "step": 115
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.13987745344638824,
+      "learning_rate": 0.00017849999999999997,
+      "loss": 0.559,
+      "step": 120
+    },
+    {
+      "epoch": 0.0625,
+      "grad_norm": 0.13373605906963348,
+      "learning_rate": 0.000186,
+      "loss": 0.5475,
+      "step": 125
+    },
+    {
+      "epoch": 0.065,
+      "grad_norm": 0.12433867901563644,
+      "learning_rate": 0.0001935,
+      "loss": 0.5274,
+      "step": 130
+    },
+    {
+      "epoch": 0.0675,
+      "grad_norm": 0.11097615957260132,
+      "learning_rate": 0.000201,
+      "loss": 0.678,
+      "step": 135
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.1155027225613594,
+      "learning_rate": 0.00020849999999999997,
+      "loss": 0.5611,
+      "step": 140
+    },
+    {
+      "epoch": 0.0725,
+      "grad_norm": 0.11431068181991577,
+      "learning_rate": 0.00021599999999999996,
+      "loss": 0.6054,
+      "step": 145
+    },
+    {
+      "epoch": 0.075,
+      "grad_norm": 0.09796140342950821,
+      "learning_rate": 0.00022349999999999998,
+      "loss": 0.5472,
+      "step": 150
+    },
+    {
+      "epoch": 0.0775,
+      "grad_norm": 0.09489257633686066,
+      "learning_rate": 0.00023099999999999998,
+      "loss": 0.4636,
+      "step": 155
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.10787788033485413,
+      "learning_rate": 0.0002385,
+      "loss": 0.6164,
+      "step": 160
+    },
+    {
+      "epoch": 0.0825,
+      "grad_norm": 0.10261733084917068,
+      "learning_rate": 0.00024599999999999996,
+      "loss": 0.5408,
+      "step": 165
+    },
+    {
+      "epoch": 0.085,
+      "grad_norm": 0.11870352178812027,
+      "learning_rate": 0.0002535,
+      "loss": 0.5268,
+      "step": 170
+    },
+    {
+      "epoch": 0.0875,
+      "grad_norm": 0.11910569667816162,
+      "learning_rate": 0.000261,
+      "loss": 0.5461,
+      "step": 175
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.10083702206611633,
+      "learning_rate": 0.00026849999999999997,
+      "loss": 0.4794,
+      "step": 180
+    },
+    {
+      "epoch": 0.0925,
+      "grad_norm": 0.10453511029481888,
+      "learning_rate": 0.000276,
+      "loss": 0.5539,
+      "step": 185
+    },
+    {
+      "epoch": 0.095,
+      "grad_norm": 0.101403146982193,
+      "learning_rate": 0.00028349999999999995,
+      "loss": 0.5346,
+      "step": 190
+    },
+    {
+      "epoch": 0.0975,
+      "grad_norm": 0.10724789649248123,
+      "learning_rate": 0.00029099999999999997,
+      "loss": 0.6026,
+      "step": 195
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1140277311205864,
+      "learning_rate": 0.0002985,
+      "loss": 0.5193,
+      "step": 200
+    },
+    {
+      "epoch": 0.1025,
+      "grad_norm": 0.09706108272075653,
+      "learning_rate": 0.0002999963446058092,
+      "loss": 0.54,
+      "step": 205
+    },
+    {
+      "epoch": 0.105,
+      "grad_norm": 0.10003062337636948,
+      "learning_rate": 0.0002999814948722491,
+      "loss": 0.5365,
+      "step": 210
+    },
+    {
+      "epoch": 0.1075,
+      "grad_norm": 0.1078687533736229,
+      "learning_rate": 0.00029995522346717746,
+      "loss": 0.5889,
+      "step": 215
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.10538115352392197,
+      "learning_rate": 0.0002999175323912636,
+      "loss": 0.5611,
+      "step": 220
+    },
+    {
+      "epoch": 0.1125,
+      "grad_norm": 0.1020808294415474,
+      "learning_rate": 0.00029986842451482874,
+      "loss": 0.6103,
+      "step": 225
+    },
+    {
+      "epoch": 0.115,
+      "grad_norm": 0.09635835886001587,
+      "learning_rate": 0.0002998079035776279,
+      "loss": 0.5229,
+      "step": 230
+    },
+    {
+      "epoch": 0.1175,
+      "grad_norm": 0.10287190228700638,
+      "learning_rate": 0.0002997359741885648,
+      "loss": 0.5312,
+      "step": 235
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.09160075336694717,
+      "learning_rate": 0.0002996526418253408,
+      "loss": 0.5673,
+      "step": 240
+    },
+    {
+      "epoch": 0.1225,
+      "grad_norm": 0.08691006153821945,
+      "learning_rate": 0.000299557912834038,
+      "loss": 0.5326,
+      "step": 245
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.10096988826990128,
+      "learning_rate": 0.00029945179442863594,
+      "loss": 0.6004,
+      "step": 250
+    },
+    {
+      "epoch": 0.1275,
+      "grad_norm": 0.09594204276800156,
+      "learning_rate": 0.000299334294690462,
+      "loss": 0.5516,
+      "step": 255
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.10281919687986374,
+      "learning_rate": 0.00029920542256757607,
+      "loss": 0.5515,
+      "step": 260
+    },
+    {
+      "epoch": 0.1325,
+      "grad_norm": 0.08547840267419815,
+      "learning_rate": 0.00029906518787408944,
+      "loss": 0.5243,
+      "step": 265
+    },
+    {
+      "epoch": 0.135,
+      "grad_norm": 0.10161560773849487,
+      "learning_rate": 0.0002989136012894168,
+      "loss": 0.5096,
+      "step": 270
+    },
+    {
+      "epoch": 0.1375,
+      "grad_norm": 0.09101904183626175,
+      "learning_rate": 0.0002987506743574635,
+      "loss": 0.553,
+      "step": 275
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.09769442677497864,
+      "learning_rate": 0.0002985764194857463,
+      "loss": 0.4953,
+      "step": 280
+    },
+    {
+      "epoch": 0.1425,
+      "grad_norm": 0.10991579294204712,
+      "learning_rate": 0.00029839084994444826,
+      "loss": 0.5152,
+      "step": 285
+    },
+    {
+      "epoch": 0.145,
+      "grad_norm": 0.09450916200876236,
+      "learning_rate": 0.00029819397986540836,
+      "loss": 0.5397,
+      "step": 290
+    },
+    {
+      "epoch": 0.1475,
+      "grad_norm": 0.10876069217920303,
+      "learning_rate": 0.0002979858242410454,
+      "loss": 0.4858,
+      "step": 295
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.097995825111866,
+      "learning_rate": 0.00029776639892321606,
+      "loss": 0.5566,
+      "step": 300
+    },
+    {
+      "epoch": 0.1525,
+      "grad_norm": 0.1145048514008522,
+      "learning_rate": 0.0002975357206220079,
+      "loss": 0.4531,
+      "step": 305
+    },
+    {
+      "epoch": 0.155,
+      "grad_norm": 0.10271880775690079,
+      "learning_rate": 0.00029729380690446654,
+      "loss": 0.5199,
+      "step": 310
+    },
+    {
+      "epoch": 0.1575,
+      "grad_norm": 0.11095371842384338,
+      "learning_rate": 0.0002970406761932583,
+      "loss": 0.5416,
+      "step": 315
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.09949438273906708,
+      "learning_rate": 0.00029677634776526673,
+      "loss": 0.4841,
+      "step": 320
+    },
+    {
+      "epoch": 0.1625,
+      "grad_norm": 0.1163724958896637,
+      "learning_rate": 0.00029650084175012517,
+      "loss": 0.4913,
+      "step": 325
+    },
+    {
+      "epoch": 0.165,
+      "grad_norm": 0.10726840049028397,
+      "learning_rate": 0.00029621417912868323,
+      "loss": 0.5203,
+      "step": 330
+    },
+    {
+      "epoch": 0.1675,
+      "grad_norm": 0.09609931707382202,
+      "learning_rate": 0.00029591638173140947,
+      "loss": 0.5607,
+      "step": 335
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.10824442654848099,
+      "learning_rate": 0.0002956074722367286,
+      "loss": 0.6004,
+      "step": 340
+    },
+    {
+      "epoch": 0.1725,
+      "grad_norm": 0.10465679317712784,
+      "learning_rate": 0.00029528747416929463,
+      "loss": 0.5216,
+      "step": 345
+    },
+    {
+      "epoch": 0.175,
+      "grad_norm": 0.10518354922533035,
+      "learning_rate": 0.0002949564118981994,
+      "loss": 0.499,
+      "step": 350
+    },
+    {
+      "epoch": 0.1775,
+      "grad_norm": 0.0955279991030693,
+      "learning_rate": 0.0002946143106351165,
+      "loss": 0.5607,
+      "step": 355
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.11159654706716537,
+      "learning_rate": 0.0002942611964323817,
+      "loss": 0.5204,
+      "step": 360
+    },
+    {
+      "epoch": 0.1825,
+      "grad_norm": 0.09571187198162079,
+      "learning_rate": 0.0002938970961810086,
+      "loss": 0.6113,
+      "step": 365
+    },
+    {
+      "epoch": 0.185,
+      "grad_norm": 0.11854679882526398,
+      "learning_rate": 0.0002935220376086411,
+      "loss": 0.5639,
+      "step": 370
+    },
+    {
+      "epoch": 0.1875,
+      "grad_norm": 0.1050512045621872,
+      "learning_rate": 0.0002931360492774415,
+      "loss": 0.548,
+      "step": 375
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.1053968220949173,
+      "learning_rate": 0.0002927391605819157,
+      "loss": 0.5507,
+      "step": 380
+    },
+    {
+      "epoch": 0.1925,
+      "grad_norm": 0.10567320138216019,
+      "learning_rate": 0.00029233140174667445,
+      "loss": 0.5312,
+      "step": 385
+    },
+    {
+      "epoch": 0.195,
+      "grad_norm": 0.11914283782243729,
+      "learning_rate": 0.0002919128038241318,
+      "loss": 0.5961,
+      "step": 390
+    },
+    {
+      "epoch": 0.1975,
+      "grad_norm": 0.09915795922279358,
+      "learning_rate": 0.0002914833986921401,
+      "loss": 0.5086,
+      "step": 395
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.10796502232551575,
+      "learning_rate": 0.0002910432190515628,
+      "loss": 0.5585,
+      "step": 400
+    },
+    {
+      "epoch": 0.2025,
+      "grad_norm": 0.10748997330665588,
+      "learning_rate": 0.00029059229842378373,
+      "loss": 0.5466,
+      "step": 405
+    },
+    {
+      "epoch": 0.205,
+      "grad_norm": 0.10696308314800262,
+      "learning_rate": 0.0002901306711481544,
+      "loss": 0.5513,
+      "step": 410
+    },
+    {
+      "epoch": 0.2075,
+      "grad_norm": 0.10418657958507538,
+      "learning_rate": 0.0002896583723793792,
+      "loss": 0.5391,
+      "step": 415
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.16421550512313843,
+      "learning_rate": 0.00028917543808483796,
+      "loss": 0.4699,
+      "step": 420
+    },
+    {
+      "epoch": 0.2125,
+      "grad_norm": 0.12929962575435638,
+      "learning_rate": 0.00028868190504184696,
+      "loss": 0.4984,
+      "step": 425
+    },
+    {
+      "epoch": 0.215,
+      "grad_norm": 0.10469454526901245,
+      "learning_rate": 0.00028817781083485816,
+      "loss": 0.5119,
+      "step": 430
+    },
+    {
+      "epoch": 0.2175,
+      "grad_norm": 0.0964970663189888,
+      "learning_rate": 0.00028766319385259713,
+      "loss": 0.5167,
+      "step": 435
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.12395574152469635,
+      "learning_rate": 0.00028713809328513953,
+      "loss": 0.5692,
+      "step": 440
+    },
+    {
+      "epoch": 0.2225,
+      "grad_norm": 0.10189738124608994,
+      "learning_rate": 0.0002866025491209265,
+      "loss": 0.4628,
+      "step": 445
+    },
+    {
+      "epoch": 0.225,
+      "grad_norm": 0.10433454066514969,
+      "learning_rate": 0.0002860566021437197,
+      "loss": 0.4869,
+      "step": 450
+    },
+    {
+      "epoch": 0.2275,
+      "grad_norm": 0.13003456592559814,
+      "learning_rate": 0.0002855002939294951,
+      "loss": 0.5291,
+      "step": 455
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.11692202836275101,
+      "learning_rate": 0.000284933666843277,
+      "loss": 0.5229,
+      "step": 460
+    },
+    {
+      "epoch": 0.2325,
+      "grad_norm": 0.10757846385240555,
+      "learning_rate": 0.0002843567640359119,
+      "loss": 0.435,
+      "step": 465
+    },
+    {
+      "epoch": 0.235,
+      "grad_norm": 0.10775501281023026,
+      "learning_rate": 0.00028376962944078206,
+      "loss": 0.4418,
+      "step": 470
+    },
+    {
+      "epoch": 0.2375,
+      "grad_norm": 0.11543692648410797,
+      "learning_rate": 0.00028317230777046015,
+      "loss": 0.4204,
+      "step": 475
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.10946698486804962,
+      "learning_rate": 0.00028256484451330403,
+      "loss": 0.49,
+      "step": 480
+    },
+    {
+      "epoch": 0.2425,
+      "grad_norm": 0.11528221517801285,
+      "learning_rate": 0.00028194728592999247,
+      "loss": 0.4752,
+      "step": 485
+    },
+    {
+      "epoch": 0.245,
+      "grad_norm": 0.10474205762147903,
+      "learning_rate": 0.0002813196790500027,
+      "loss": 0.4847,
+      "step": 490
+    },
+    {
+      "epoch": 0.2475,
+      "grad_norm": 0.10768820345401764,
+      "learning_rate": 0.00028068207166802837,
+      "loss": 0.4664,
+      "step": 495
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.12158560007810593,
+      "learning_rate": 0.00028003451234034037,
+      "loss": 0.4741,
+      "step": 500
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 2000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9223372036854775807,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.57394539429888e+17,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

codellama-hugcoder/checkpoint-500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304

codellama-hugcoder/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
+size 5304

configs/deepspeed_config.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

configs/fsdp_config.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: FSDP
+downcast_bf16: 'no'
+fsdp_config:
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_backward_prefetch_policy: BACKWARD_PRE
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_forward_prefetch: false
+  fsdp_offload_params: false
+  fsdp_sharding_strategy: 1
+  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_sync_module_states: true
+  fsdp_use_orig_params: true
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false

fim.py ADDED Viewed

	@@ -0,0 +1,141 @@

+# coding=utf-8
+# Copyright 2024 Sourab Mangrulkar. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import numpy as np
# Resolving the ids means tokenizer lookups / encode calls, so the result is
# memoized per tokenizer object (the tokenizer must therefore be hashable).
@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    """Return the special-token ids used for fill-in-the-middle (FIM) sampling.

    The lookup strategy is chosen from ``tokenizer.name_or_path``:

    * ``"codellama"``: ids are read from the tokenizer's ``suffix_id``,
      ``prefix_id`` and ``middle_id`` attributes; the pad id is hard-coded to 0.
    * ``"deepseek-coder"`` / ``"stable-code"``: ids are the first id produced
      by encoding the model-specific sentinel strings.
    * anything else: the sentinels are taken from entries 1..4 of
      ``special_tokens_map["additional_special_tokens"]`` (in the order
      prefix, middle, suffix, pad) and resolved through ``tokenizer.vocab``.
      No BOS id is reported for this case.

    Args:
        tokenizer: Tokenizer object exposing ``name_or_path`` plus whatever
            attributes the selected branch reads.

    Returns:
        A 5-tuple ``(bos_token_id, suffix_tok_id, prefix_tok_id,
        middle_tok_id, pad_tok_id)``. In the generic branch every entry is
        ``None`` when the tokenizer does not provide the FIM tokens.
    """
    name = tokenizer.name_or_path

    if "codellama" in name:
        return (
            tokenizer.bos_token_id,
            tokenizer.suffix_id,
            tokenizer.prefix_id,
            tokenizer.middle_id,
            0,
        )

    if "deepseek-coder" in name:
        return (
            tokenizer.bos_token_id,
            tokenizer.encode("<｜fim▁hole｜>", add_special_tokens=False)[0],
            tokenizer.encode("<｜fim▁begin｜>", add_special_tokens=False)[0],
            tokenizer.encode("<｜fim▁end｜>", add_special_tokens=False)[0],
            tokenizer.encode("<pad>", add_special_tokens=False)[0],
        )

    if "stable-code" in name:
        # NOTE(review): unlike the deepseek branch these calls do not pass
        # add_special_tokens=False; confirm this tokenizer never prepends BOS.
        return (
            tokenizer.bos_token_id,
            tokenizer.encode("<fim_suffix>")[0],
            tokenizer.encode("<fim_prefix>")[0],
            tokenizer.encode("<fim_middle>")[0],
            tokenizer.encode("<fim_pad>")[0],
        )

    bos_token_id = None
    try:
        # ValueError: fewer than four tokens available in positions 1..4.
        # KeyError: no "additional_special_tokens" entry, or a token that is
        # missing from the vocabulary.
        FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[
            "additional_special_tokens"
        ][1:5]
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
            tokenizer.vocab[tok]
            for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
        )
    except (KeyError, ValueError):
        # Tokenizer has no usable FIM tokens: signal that with None ids
        # instead of crashing.
        suffix_tok_id = prefix_tok_id = middle_tok_id = pad_tok_id = None
    return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id
def _bos_token_processing(prefix_token_list, bos_token):
    """Prepend ``bos_token`` to ``prefix_token_list`` when a BOS token is given.

    The list is modified in place and also returned; with ``bos_token=None``
    it is returned unchanged.
    """
    if bos_token is not None:
        # add the BOS token to the beginning of the list
        prefix_token_list.insert(0, bos_token)
    return prefix_token_list


## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
def permute(
    sample,
    np_rng,
    suffix_tok_id,
    prefix_tok_id,
    middle_tok_id,
    pad_tok_id,
    fim_rate=0.5,
    fim_spm_rate=0.5,
    truncate_or_pad=False,
    bos_token_id=None,
):
    """
    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
    PSM and SPM (with a probability of fim_spm_rate).

    Args:
        sample: Sequence of token ids.
        np_rng: Random generator providing ``binomial`` and ``randint``
            (e.g. ``numpy.random.RandomState``). It is advanced by the draws
            made here and returned to the caller.
        suffix_tok_id, prefix_tok_id, middle_tok_id: FIM sentinel token ids.
        pad_tok_id: Kept for interface compatibility. The prefix/middle/suffix
            split always covers the whole sample, so adding sentinels can only
            lengthen it and padding is never required.
        fim_rate: Probability of applying the FIM transformation at all.
        fim_spm_rate: Probability of using SPM instead of PSM ordering.
        truncate_or_pad: When true, the suffix is shortened so that the result
            has exactly ``len(sample)`` tokens, sentinels and BOS included.
            If the suffix is too short to absorb them, the sample is returned
            untransformed.
        bos_token_id: Optional BOS id placed in front of the transformed
            sample.

    Returns:
        ``(tokens, np_rng)`` where ``tokens`` is always a ``list``.
    """
    if not np_rng.binomial(1, fim_rate):
        # don't do FIM preproc
        return list(sample), np_rng

    # Two random cut points split the sample into prefix | middle | suffix.
    boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
    boundaries.sort()

    prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
    middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
    suffix = np.array(sample[boundaries[1] :], dtype=np.int64)

    if truncate_or_pad:
        # Tokens added on top of the original content: the three FIM sentinels
        # plus the BOS token when one is requested.
        num_added = 3 + (1 if bos_token_id is not None else 0)
        if suffix.shape[0] <= num_added:
            # Not enough suffix to make room: give up on FIM for this sample.
            return list(sample), np_rng
        suffix = suffix[: suffix.shape[0] - num_added]

    if np_rng.binomial(1, fim_spm_rate):
        prefix_special_tokens = _bos_token_processing(
            [prefix_tok_id, suffix_tok_id], bos_token_id
        )
        # SPM (variant 2 from FIM paper)
        new_sample = np.concatenate(
            [
                prefix_special_tokens,
                suffix,
                [middle_tok_id],
                prefix,
                middle,
            ]
        )
    else:
        prefix_special_tokens = _bos_token_processing([prefix_tok_id], bos_token_id)
        # PSM
        new_sample = np.concatenate(
            [
                prefix_special_tokens,
                prefix,
                [suffix_tok_id],
                suffix,
                [middle_tok_id],
                middle,
            ]
        )
    return list(new_sample), np_rng

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+git+https://github.com/huggingface/transformers
+git+https://github.com/huggingface/accelerate
+git+https://github.com/huggingface/peft
+trl
+huggingface-hub
+bitsandbytes
+evaluate
+datasets
+einops
+wandb
+tiktoken
+deepspeed
+tqdm
+safetensors

run_deepspeed.sh ADDED Viewed

	@@ -0,0 +1,33 @@

+accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_len 2048 \
+--max_steps 2000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 5 \
+--log_level "info" \
+--logging_strategy "steps" \
+--evaluation_strategy "steps" \
+--save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 2e-5 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder-df" \
+--per_device_train_batch_size 16 \
+--per_device_eval_batch_size 16 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant False \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.5 \
+--use_flash_attn True

run_fsdp.sh ADDED Viewed

	@@ -0,0 +1,33 @@

# Launch train.py under accelerate with the FSDP configuration.
# The model is selected with --model_name_or_path, the same flag used by the
# other launch scripts for this train.py.
accelerate launch --config_file "configs/fsdp_config.yaml" train.py \
--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
--dataset_name "smangrul/hug_stack" \
--splits "train" \
--max_seq_len 2048 \
--max_steps 1000 \
--save_steps 500 \
--eval_steps 100 \
--logging_steps 25 \
--log_level "info" \
--logging_strategy "steps" \
--evaluation_strategy "steps" \
--save_strategy "steps" \
--push_to_hub \
--hub_private_repo True \
--hub_strategy "every_save" \
--bf16 True \
--learning_rate 1e-4 \
--lr_scheduler_type "cosine" \
--weight_decay 0.1 \
--warmup_ratio 0.1 \
--max_grad_norm 1.0 \
--output_dir "codellama-hugcoder-fsdp" \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 16 \
--gradient_accumulation_steps 4 \
--gradient_checkpointing True \
--use_reentrant True \
--dataset_text_field "text" \
--test_size 0.1 \
--fim_rate 0.5 \
--fim_spm_rate 0.5 \
--use_flash_attn True

run_peft.sh ADDED Viewed

	@@ -0,0 +1,40 @@

+CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python3 train.py \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_len 2048 \
+--max_steps 2000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 5 \
+--log_level "info" \
+--logging_strategy "steps" \
+--save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 3e-4 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder" \
+--per_device_train_batch_size 4 \
+--per_device_eval_batch_size 4 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant True \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.5 \
+--use_peft_lora True \
+--lora_r 32 \
+--lora_alpha 64 \
+--lora_dropout 0.1 \
+--lora_target_modules "all-linear" \
+--use_4bit_quantization True \
+--use_nested_quant True \
+--bnb_4bit_compute_dtype "bfloat16" \
+--use_flash_attn True

run_unsloth_peft.sh ADDED Viewed

	@@ -0,0 +1,43 @@

+CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python train.py \
+--seed 11 \
+--model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
+--dataset_name "smangrul/hug_stack" \
+--splits "train" \
+--max_seq_len 2048 \
+--max_steps 2000 \
+--save_steps 500 \
+--eval_steps 100 \
+--logging_steps 5 \
+--log_level "info" \
+--logging_strategy "steps" \
+--evaluation_strategy "steps" \
+--save_strategy "steps" \
+--push_to_hub \
+--hub_private_repo True \
+--hub_strategy "every_save" \
+--bf16 True \
+--learning_rate 2e-4 \
+--lr_scheduler_type "cosine" \
+--weight_decay 0.1 \
+--warmup_ratio 0.1 \
+--max_grad_norm 1.0 \
+--output_dir "codellama-hugcoder" \
+--per_device_train_batch_size 16 \
+--per_device_eval_batch_size 16 \
+--gradient_accumulation_steps 4 \
+--gradient_checkpointing True \
+--use_reentrant True \
+--dataset_text_field "text" \
+--test_size 0.1 \
+--fim_rate 0.5 \
+--fim_spm_rate 0.0 \
+--use_peft_lora True \
+--lora_r 16 \
+--lora_alpha 16 \
+--lora_dropout 0.1 \
+--lora_target_modules "q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj" \
+--use_4bit_quantization True \
+--use_nested_quant True \
+--bnb_4bit_compute_dtype "bfloat16" \
+--use_flash_attn True \
+--use_unsloth True