limernyou committed on
Commit
7140a44
·
verified ·
1 Parent(s): b79d2c4

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .ipynb_checkpoints/fim-checkpoint.py +141 -0
  3. .ipynb_checkpoints/requirements-checkpoint.txt +14 -0
  4. .ipynb_checkpoints/run_peft-checkpoint.sh +40 -0
  5. .ipynb_checkpoints/train-checkpoint.py +495 -0
  6. __pycache__/fim.cpython-310.pyc +0 -0
  7. codellama-hugcoder/README.md +57 -0
  8. codellama-hugcoder/adapter_config.json +39 -0
  9. codellama-hugcoder/adapter_model.safetensors +3 -0
  10. codellama-hugcoder/checkpoint-1000/README.md +202 -0
  11. codellama-hugcoder/checkpoint-1000/adapter_config.json +39 -0
  12. codellama-hugcoder/checkpoint-1000/adapter_model.safetensors +3 -0
  13. codellama-hugcoder/checkpoint-1000/optimizer.pt +3 -0
  14. codellama-hugcoder/checkpoint-1000/rng_state.pth +3 -0
  15. codellama-hugcoder/checkpoint-1000/scheduler.pt +3 -0
  16. codellama-hugcoder/checkpoint-1000/trainer_state.json +1434 -0
  17. codellama-hugcoder/checkpoint-1000/training_args.bin +3 -0
  18. codellama-hugcoder/checkpoint-1500/README.md +202 -0
  19. codellama-hugcoder/checkpoint-1500/adapter_config.json +39 -0
  20. codellama-hugcoder/checkpoint-1500/adapter_model.safetensors +3 -0
  21. codellama-hugcoder/checkpoint-1500/optimizer.pt +3 -0
  22. codellama-hugcoder/checkpoint-1500/rng_state.pth +3 -0
  23. codellama-hugcoder/checkpoint-1500/scheduler.pt +3 -0
  24. codellama-hugcoder/checkpoint-1500/trainer_state.json +2134 -0
  25. codellama-hugcoder/checkpoint-1500/training_args.bin +3 -0
  26. codellama-hugcoder/checkpoint-2000/README.md +202 -0
  27. codellama-hugcoder/checkpoint-2000/adapter_config.json +39 -0
  28. codellama-hugcoder/checkpoint-2000/adapter_model.safetensors +3 -0
  29. codellama-hugcoder/checkpoint-2000/optimizer.pt +3 -0
  30. codellama-hugcoder/checkpoint-2000/rng_state.pth +3 -0
  31. codellama-hugcoder/checkpoint-2000/scheduler.pt +3 -0
  32. codellama-hugcoder/checkpoint-2000/trainer_state.json +2834 -0
  33. codellama-hugcoder/checkpoint-2000/training_args.bin +3 -0
  34. codellama-hugcoder/checkpoint-500/README.md +202 -0
  35. codellama-hugcoder/checkpoint-500/adapter_config.json +39 -0
  36. codellama-hugcoder/checkpoint-500/adapter_model.safetensors +3 -0
  37. codellama-hugcoder/checkpoint-500/optimizer.pt +3 -0
  38. codellama-hugcoder/checkpoint-500/rng_state.pth +3 -0
  39. codellama-hugcoder/checkpoint-500/scheduler.pt +3 -0
  40. codellama-hugcoder/checkpoint-500/trainer_state.json +734 -0
  41. codellama-hugcoder/checkpoint-500/training_args.bin +3 -0
  42. codellama-hugcoder/training_args.bin +3 -0
  43. configs/deepspeed_config.yaml +22 -0
  44. configs/fsdp_config.yaml +25 -0
  45. fim.py +141 -0
  46. requirements.txt +14 -0
  47. run_deepspeed.sh +33 -0
  48. run_fsdp.sh +33 -0
  49. run_peft.sh +40 -0
  50. run_unsloth_peft.sh +43 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ wandb/offline-run-20250516_073747-jc2tz43q/run-jc2tz43q.wandb filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/fim-checkpoint.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Sourab Mangrulkar. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import functools
17
+ import numpy as np
18
+
19
+
20
+ # this is expensive so we cache it
21
@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    """Look up the special token ids used for fill-in-the-middle (FIM).

    The lookup strategy is chosen from ``tokenizer.name_or_path``:
    "codellama", "deepseek-coder" and "stable-code" tokenizers have dedicated
    branches; any other tokenizer is expected to list its FIM tokens at
    positions 1-4 of ``special_tokens_map["additional_special_tokens"]`` in
    the order prefix, middle, suffix, pad.

    Args:
        tokenizer: Tokenizer object. It must be hashable because the result
            is memoised per tokenizer with ``functools.lru_cache``.

    Returns:
        tuple: ``(bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id,
        pad_tok_id)``. In the generic branch ``bos_token_id`` is always
        ``None``, and the four FIM ids are all ``None`` when the tokenizer
        does not expose the expected tokens.
    """
    if "codellama" in tokenizer.name_or_path:
        return (
            tokenizer.bos_token_id,
            tokenizer.suffix_id,
            tokenizer.prefix_id,
            tokenizer.middle_id,
            0,
        )
    elif "deepseek-coder" in tokenizer.name_or_path:
        return (
            tokenizer.bos_token_id,
            tokenizer.encode("<|fim▁hole|>", add_special_tokens=False)[0],
            tokenizer.encode("<|fim▁begin|>", add_special_tokens=False)[0],
            tokenizer.encode("<|fim▁end|>", add_special_tokens=False)[0],
            tokenizer.encode("<pad>", add_special_tokens=False)[0],
        )
    elif "stable-code" in tokenizer.name_or_path:
        return (
            tokenizer.bos_token_id,
            tokenizer.encode("<fim_suffix>")[0],
            tokenizer.encode("<fim_prefix>")[0],
            tokenizer.encode("<fim_middle>")[0],
            tokenizer.encode("<fim_pad>")[0],
        )
    else:
        bos_token_id = None
        try:
            FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map[
                "additional_special_tokens"
            ][1:5]
            suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
                tokenizer.vocab[tok]
                for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
            )
        except (KeyError, ValueError):
            # KeyError: no "additional_special_tokens" entry, or a token that
            # is missing from the vocab. ValueError: fewer than five
            # additional special tokens, so the slice cannot be unpacked into
            # four names. Either way the tokenizer does not support FIM.
            suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
                None,
                None,
                None,
                None,
            )
        return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id
65
+
66
+
67
+ def _bos_token_processing(prefix_token_list, bos_token):
68
+ if bos_token is not None:
69
+ # add the BOS token to the beginning of the list
70
+ prefix_token_list.insert(0, bos_token)
71
+
72
+ return prefix_token_list
73
+
74
+
75
+ ## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
76
+ def permute(
77
+ sample,
78
+ np_rng,
79
+ suffix_tok_id,
80
+ prefix_tok_id,
81
+ middle_tok_id,
82
+ pad_tok_id,
83
+ fim_rate=0.5,
84
+ fim_spm_rate=0.5,
85
+ truncate_or_pad=False,
86
+ bos_token_id=None,
87
+ ):
88
+ """
89
+ Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
90
+ PSM and SPM (with a probability of fim_spm_rate).
91
+ """
92
+
93
+ if np_rng.binomial(1, fim_rate):
94
+ boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
95
+ boundaries.sort()
96
+
97
+ prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
98
+ middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
99
+ suffix = np.array(sample[boundaries[1] :], dtype=np.int64)
100
+
101
+ if truncate_or_pad:
102
+ new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
103
+ diff = new_length - len(sample)
104
+ if diff > 0:
105
+ if suffix.shape[0] <= diff:
106
+ return sample, np_rng
107
+ suffix = suffix[: suffix.shape[0] - diff]
108
+ elif diff < 0:
109
+ suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])
110
+
111
+ if np_rng.binomial(1, fim_spm_rate):
112
+ prefix_special_tokens = _bos_token_processing(
113
+ [prefix_tok_id, suffix_tok_id], bos_token_id
114
+ )
115
+ # SPM (variant 2 from FIM paper)
116
+ new_sample = np.concatenate(
117
+ [
118
+ prefix_special_tokens,
119
+ suffix,
120
+ [middle_tok_id],
121
+ prefix,
122
+ middle,
123
+ ]
124
+ )
125
+ else:
126
+ prefix_special_tokens = _bos_token_processing([prefix_tok_id], bos_token_id)
127
+ # PSM
128
+ new_sample = np.concatenate(
129
+ [
130
+ prefix_special_tokens,
131
+ prefix,
132
+ [suffix_tok_id],
133
+ suffix,
134
+ [middle_tok_id],
135
+ middle,
136
+ ]
137
+ )
138
+ else:
139
+ # don't do FIM preproc
140
+ new_sample = sample
141
+ return list(new_sample), np_rng
.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers
2
+ git+https://github.com/huggingface/accelerate
3
+ git+https://github.com/huggingface/peft
4
+ trl
5
+ huggingface-hub
6
+ bitsandbytes
7
+ evaluate
8
+ datasets
9
+ einops
10
+ wandb
11
+ tiktoken
12
+ deepspeed
13
+ tqdm
14
+ safetensors
.ipynb_checkpoints/run_peft-checkpoint.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single-GPU QLoRA (4-bit + LoRA) fine-tuning of CodeLlama-7b-Instruct on smangrul/hug_stack.
# The sequence-length flag is spelled out in full (--max_seq_length) so it does not
# depend on argparse prefix abbreviation.
# NOTE(review): --eval_steps presumably only takes effect when an eval strategy of
# "steps" is also configured -- confirm against the installed transformers version.
CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python3 train.py \
  --model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
  --dataset_name "smangrul/hug_stack" \
  --splits "train" \
  --max_seq_length 2048 \
  --max_steps 2000 \
  --save_steps 500 \
  --eval_steps 100 \
  --logging_steps 5 \
  --log_level "info" \
  --logging_strategy "steps" \
  --save_strategy "steps" \
  --push_to_hub \
  --hub_private_repo True \
  --hub_strategy "every_save" \
  --bf16 True \
  --learning_rate 3e-4 \
  --lr_scheduler_type "cosine" \
  --weight_decay 0.1 \
  --warmup_ratio 0.1 \
  --max_grad_norm 1.0 \
  --output_dir "codellama-hugcoder" \
  --per_device_train_batch_size 4 \
  --per_device_eval_batch_size 4 \
  --gradient_accumulation_steps 4 \
  --gradient_checkpointing True \
  --use_reentrant True \
  --dataset_text_field "text" \
  --test_size 0.1 \
  --fim_rate 0.5 \
  --fim_spm_rate 0.5 \
  --use_peft_lora True \
  --lora_r 32 \
  --lora_alpha 64 \
  --lora_dropout 0.1 \
  --lora_target_modules "all-linear" \
  --use_4bit_quantization True \
  --use_nested_quant True \
  --bnb_4bit_compute_dtype "bfloat16" \
  --use_flash_attn True
.ipynb_checkpoints/train-checkpoint.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Sourab Mangrulkar. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Continued pre-training/fine-tuning of code LLMs for code autocompletion.
18
+ """
19
+
20
+ import gc
21
+ import os
22
+ import random
23
+ import sys
24
+ from typing import Optional
25
+ from dataclasses import dataclass, field
26
+
27
+ import numpy as np
28
+ import torch
29
+ from datasets import load_dataset
30
+ from torch.utils.data import IterableDataset
31
+ from tqdm import tqdm
32
+ from transformers import (
33
+ AutoModelForCausalLM,
34
+ AutoTokenizer,
35
+ Trainer,
36
+ TrainingArguments,
37
+ HfArgumentParser,
38
+ set_seed,
39
+ BitsAndBytesConfig,
40
+ )
41
+
42
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, replace_lora_weights_loftq
43
+ import fim
44
+
45
+
46
+ # Define and parse arguments.
47
@dataclass
class ModelArguments:
    """
    Command-line options describing the base model and how it is adapted:
    LoRA hyper-parameters, quantization switches and training back-end toggles.
    """

    # --- base model -------------------------------------------------------
    model_name_or_path: str = field(
        metadata=dict(
            help="Path to pretrained model or model identifier from huggingface.co/models"
        )
    )

    # --- LoRA hyper-parameters --------------------------------------------
    lora_alpha: Optional[int] = field(default=16)
    lora_dropout: Optional[float] = field(default=0.1)
    lora_r: Optional[int] = field(default=64)
    lora_target_modules: Optional[str] = field(
        default="q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj",
        metadata=dict(
            help="comma separated list of target modules to apply LoRA layers to"
        ),
    )

    # --- bitsandbytes 4-bit settings --------------------------------------
    use_nested_quant: Optional[bool] = field(
        default=False,
        metadata=dict(help="Activate nested quantization for 4bit base models"),
    )
    bnb_4bit_compute_dtype: Optional[str] = field(
        default="float16",
        metadata=dict(help="Compute dtype for 4bit base models"),
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4",
        metadata=dict(help="Quantization type fp4 or nf4"),
    )

    # --- feature switches -------------------------------------------------
    use_flash_attn: Optional[bool] = field(
        default=False,
        metadata=dict(help="Enables Flash attention for training."),
    )
    use_peft_lora: Optional[bool] = field(
        default=False,
        metadata=dict(help="Enables PEFT LoRA for training."),
    )
    # The misspelt field name is the public CLI flag and is kept as-is.
    use_8bit_qunatization: Optional[bool] = field(
        default=False,
        metadata=dict(help="Enables loading model in 8bit."),
    )
    use_4bit_quantization: Optional[bool] = field(
        default=False,
        metadata=dict(help="Enables loading model in 4bit."),
    )
    use_reentrant: Optional[bool] = field(
        default=False,
        metadata=dict(help="Gradient Checkpointing param. Refer the related docs"),
    )
    use_unsloth: Optional[bool] = field(
        default=False,
        metadata=dict(help="Enables UnSloth for training."),
    )
    use_loftq: Optional[bool] = field(
        default=False,
        metadata=dict(
            help="Enables LoftQ init for the LoRA adapters when using QLoRA."
        ),
    )
    use_loftq_callback: Optional[bool] = field(
        default=False,
        metadata=dict(
            help="Enables LoftQ callback comparing logits of base model to the ones from LoftQ init. Provides better init."
        ),
    )
111
+
112
+
113
@dataclass
class DataTrainingArguments:
    """
    Command-line options describing the training data: which dataset and
    split to load, which column holds the text, the packed sequence length,
    the validation fraction and the fill-in-the-middle sampling rates.
    """

    dataset_name: Optional[str] = field(
        default="smangrul/hug_stack",
        metadata=dict(help="The preference dataset to use."),
    )
    dataset_text_field: str = field(
        default="text",
        metadata=dict(help="Dataset field to use as input text."),
    )
    max_seq_length: Optional[int] = field(default=4096)
    test_size: Optional[float] = field(default=0.1)
    fim_rate: Optional[float] = field(default=0.5)
    fim_spm_rate: Optional[float] = field(default=0.5)
    splits: Optional[str] = field(
        default="train",
        metadata=dict(
            help="Comma separate list of the splits to use from the dataset."
        ),
    )
130
+
131
+
132
def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Return the mean number of characters per token, measured on at most the
    first ``nb_examples`` rows of ``dataset`` (text taken from ``data_column``).
    """
    char_count = 0
    token_count = 0
    # zip() against a range caps the scan at nb_examples rows.
    sampled_rows = zip(range(nb_examples), iter(dataset))
    for _, row in tqdm(sampled_rows, total=nb_examples):
        char_count += len(row[data_column])
        token_count += len(tokenizer(row[data_column]).tokens())

    return char_count / token_count
142
+
143
+
144
class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for processing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (float): Number of characters per token used to estimate number of tokens in text buffer.
            content_field (str): Key of the text entry in each dataset example.
            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permutations that will use SPM.
            seed (int): Seed for random number generator.
            shuffle (bool): If True the sequences built from each buffer are shuffled before being yielded.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        content_field="content",
        fim_rate=0.5,
        fim_spm_rate=0.5,
        seed=0,
        shuffle=False,
    ):
        self.tokenizer = tokenizer
        # Documents are joined with the EOS token when packed into sequences.
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        # Buffer budget in characters, estimated from the chars-per-token ratio.
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field
        self.fim_rate = fim_rate
        self.fim_spm_rate = fim_spm_rate
        self.seed = seed
        self.shuffle = shuffle

        (
            self.bos_token_id,
            self.suffix_tok_id,
            self.prefix_tok_id,
            self.middle_tok_id,
            self.pad_tok_id,
        ) = fim.get_fim_token_ids(self.tokenizer)
        # Compare against None explicitly: 0 is a valid token id and must not
        # be mistaken for "tokenizer has no FIM tokens".
        if self.suffix_tok_id is None and self.fim_rate > 0:
            print("FIM is not supported by tokenizer, disabling FIM")
            self.fim_rate = 0

    def __iter__(self):
        """Yield dicts with identical ``input_ids`` and ``labels`` tensors of length ``seq_length``."""
        iterator = iter(self.dataset)
        more_examples = True
        # The FIM permutations use a dedicated generator seeded from self.seed;
        # the optional shuffle below uses the global `random` module instead.
        np_rng = np.random.RandomState(seed=self.seed)
        while more_examples:
            # Fill a text buffer until the character budget is reached or the
            # dataset is exhausted.
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(
                buffer, truncation=False, add_special_tokens=False
            )["input_ids"]
            all_token_ids = []

            for tokenized_input in tokenized_inputs:
                # optionally do FIM permutations
                if self.fim_rate > 0:
                    tokenized_input, np_rng = fim.permute(
                        tokenized_input,
                        np_rng,
                        self.suffix_tok_id,
                        self.prefix_tok_id,
                        self.middle_tok_id,
                        self.pad_tok_id,
                        fim_rate=self.fim_rate,
                        fim_spm_rate=self.fim_spm_rate,
                        truncate_or_pad=False,
                        bos_token_id=self.bos_token_id,
                    )

                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            # Cut the token stream into fixed-size sequences; a trailing
            # partial sequence is dropped.
            examples = []
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    examples.append(input_ids)
            if self.shuffle:
                random.shuffle(examples)
            for example in examples:
                self.current_size += 1
                yield {
                    "input_ids": torch.LongTensor(example),
                    "labels": torch.LongTensor(example),
                }
250
+
251
+
252
def create_datasets(tokenizer, args, seed):
    """Load the dataset, split it, and wrap both parts as packed token streams.

    Args:
        tokenizer: Tokenizer used both for the chars-per-token estimate and
            by the returned datasets.
        args: Object providing ``dataset_name``, ``splits``, ``test_size``,
            ``dataset_text_field``, ``max_seq_length``, ``fim_rate`` and
            ``fim_spm_rate``.
        seed: Seed for the train/test split and for the datasets' FIM sampling.

    Returns:
        tuple: ``(train_dataset, valid_dataset)`` as ``ConstantLengthDataset``
        instances; the training one is infinite and shuffled.
    """
    dataset = load_dataset(args.dataset_name, split=args.splits)
    dataset = dataset.train_test_split(
        test_size=args.test_size, seed=seed, shuffle=True
    )
    train_data = dataset["train"]
    valid_data = dataset["test"]
    print(
        f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
    )
    chars_per_token = chars_token_ratio(train_data, tokenizer, args.dataset_text_field)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        infinite=True,
        seq_length=args.max_seq_length,
        chars_per_token=chars_per_token,
        content_field=args.dataset_text_field,
        fim_rate=args.fim_rate,
        fim_spm_rate=args.fim_spm_rate,
        seed=seed,
        shuffle=True,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        infinite=False,
        seq_length=args.max_seq_length,
        chars_per_token=chars_per_token,
        content_field=args.dataset_text_field,
        fim_rate=args.fim_rate,
        fim_spm_rate=args.fim_spm_rate,
        seed=seed,
    )
    # The validation stream yields nothing when its text cannot fill a single
    # sequence; use a default so the preview does not raise StopIteration.
    first_valid = next(iter(valid_dataset), None)
    if first_valid is None:
        print(
            "The validation split does not contain enough tokens for one full sequence."
        )
    else:
        print(f"A sample of valid dataset: {first_valid}")
    return train_dataset, valid_dataset
289
+
290
def get_mae(x, y):
    """Return the mean absolute difference between ``x`` and ``y``."""
    abs_diff = (x - y).abs()
    return abs_diff.mean()
292
+
293
+
294
def get_mse(x, y):
    """Return the mean squared difference between ``x`` and ``y``."""
    squared_diff = torch.pow(x - y, 2)
    return squared_diff.mean()
296
+
297
+
298
def error_report(x, y):
    """Print the mean absolute and mean squared error between ``x`` and ``y``."""
    mean_abs_err, mean_sq_err = get_mae(x, y), get_mse(x, y)
    print(
        f"Mean absolute error: {mean_abs_err:>8.5f}\n"
        f"Mean squared error: {mean_sq_err:>8.5f}"
    )
305
+
306
+
307
def loftq_init(model, tokenizer, train_dataset, max_seq_length, args):
    """Replace the LoRA weights of ``model`` with a LoftQ initialisation.

    Without ``args.use_loftq_callback`` the replacement is applied
    unconditionally. With it, a full-precision copy of the base model is
    loaded, its logits on one randomly chosen training text are recorded, and
    a per-module replacement is only accepted when it lowers the mean squared
    error between the model's logits and those reference logits.

    Args:
        model: PEFT model whose LoRA weights are replaced in place.
        tokenizer: Tokenizer used to encode the probe text.
        train_dataset: Training dataset. Either a map-style dataset, or a
            wrapper exposing the underlying one as ``.dataset`` and the text
            column as ``.content_field`` (as ``ConstantLengthDataset`` does).
        max_seq_length: Maximum token length of the probe input.
        args: Object providing ``use_loftq_callback``,
            ``bnb_4bit_compute_dtype`` and ``model_name_or_path``.
    """
    if not args.use_loftq_callback:
        replace_lora_weights_loftq(model)
        return

    compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)
    base_model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=compute_dtype)
    base_model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)

    # The streaming wrapper supports neither len() nor indexing, so draw the
    # probe text from the map-style dataset it wraps, using its text column.
    raw_dataset = getattr(train_dataset, "dataset", train_dataset)
    text_field = getattr(train_dataset, "content_field", "content")
    random_input_ids = torch.randint(0, len(raw_dataset), size=(1,)).numpy().tolist()
    random_inputs = [raw_dataset[i][text_field] for i in random_input_ids]
    # "max_length" is a padding strategy, not a truncation strategy;
    # truncation=True cuts the probe to max_length tokens.
    random_inputs = tokenizer(random_inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_seq_length)
    # NOTE(review): inputs and reference logits stay on the tokenizer's default
    # device; confirm this matches the device of the quantized model.
    with torch.no_grad():
        logits_base = base_model(**random_inputs).logits
    del base_model
    gc.collect()

    # Best error seen so far; starts at infinity so the first replacement is
    # always accepted.
    current_mse = float("inf")

    def loftq_callback(model, module_name):
        """Callable to replace weights with LoFTQ if the mse is lower than the current best one."""
        nonlocal current_mse
        logits = model(**random_inputs).logits
        mse = get_mse(logits_base, logits)
        if mse < current_mse:
            current_mse = mse
            print(f"MSE improved for module {module_name}")
            return True
        print(f"MSE did not improve for module {module_name}")
        return False

    replace_lora_weights_loftq(model, callback=loftq_callback)
    logits_loftq_callback = model(**random_inputs).logits
    error_report(logits_base, logits_loftq_callback)
336
+
337
+
338
def create_and_prepare_model(args, data_args, training_args):
    """Load the base model and optionally quantize it and attach LoRA adapters.

    Args:
        args: Model options (``ModelArguments``): model id, quantization
            switches, LoRA hyper-parameters, Unsloth / flash-attention toggles.
        data_args: Data options; only ``max_seq_length`` is read (Unsloth path).
        training_args: Training options; ``gradient_checkpointing`` and
            ``seed`` are read.

    Returns:
        The loaded model, wrapped with LoRA adapters when
        ``args.use_peft_lora`` is set.
    """
    device_map = None
    bnb_config = None

    load_in_8bit = args.use_8bit_qunatization
    load_in_4bit = args.use_4bit_quantization

    if args.use_unsloth:
        # Imported lazily so the dependency is only needed when requested.
        from unsloth import FastLanguageModel

    if args.use_4bit_quantization:
        compute_dtype = getattr(torch, args.bnb_4bit_compute_dtype)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=args.use_4bit_quantization,
            bnb_4bit_quant_type=args.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=args.use_nested_quant,
        )

        # This hint lives inside the 4-bit branch because compute_dtype only
        # exists here; checking it unconditionally raised UnboundLocalError
        # whenever 4-bit quantization was disabled.
        if compute_dtype == torch.float16:
            major, _ = torch.cuda.get_device_capability()
            if major >= 8:
                print("=" * 80)
                print(
                    "Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
                )
                print("=" * 80)

    if args.use_4bit_quantization or args.use_8bit_qunatization:
        device_map = (
            int(os.environ.get("LOCAL_RANK", -1))
            if torch.distributed.is_available() and torch.distributed.is_initialized()
            else "auto"
        )  # {"": 0}

    if args.use_unsloth:
        # Load model
        model, _ = FastLanguageModel.from_pretrained(
            model_name=args.model_name_or_path,
            max_seq_length=data_args.max_seq_length,
            dtype=None,
            load_in_4bit=load_in_4bit,
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            load_in_8bit=load_in_8bit,
            quantization_config=bnb_config,
            device_map=device_map,
            trust_remote_code=True,
            attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
        )

    if (
        (args.use_4bit_quantization or args.use_8bit_qunatization)
        and args.use_peft_lora
        and not args.use_unsloth
    ):
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=training_args.gradient_checkpointing,
            # Read the flag from the `args` parameter rather than from a
            # module-level `model_args` that only exists when run as a script.
            gradient_checkpointing_kwargs={"use_reentrant": args.use_reentrant},
        )

    if args.use_peft_lora:
        # "all-linear" is passed through as a string; anything else is a
        # comma separated list of module names.
        if args.lora_target_modules != "all-linear":
            target_modules = args.lora_target_modules.split(",")
        else:
            target_modules = args.lora_target_modules

        if not args.use_unsloth:
            peft_config = LoraConfig(
                lora_alpha=args.lora_alpha,
                lora_dropout=args.lora_dropout,
                r=args.lora_r,
                bias="none",
                task_type="CAUSAL_LM",
                target_modules=target_modules,
            )
            model = get_peft_model(model, peft_config)
        else:
            # Do model patching and add fast LoRA weights
            model = FastLanguageModel.get_peft_model(
                model,
                lora_alpha=args.lora_alpha,
                lora_dropout=args.lora_dropout,
                r=args.lora_r,
                target_modules=target_modules,
                use_gradient_checkpointing=training_args.gradient_checkpointing,
                random_state=training_args.seed,
                max_seq_length=data_args.max_seq_length,
            )
    return model
430
+
431
+
432
def main(model_args, data_args, training_args):
    """Run the fine-tuning pipeline: seed, tokenizer, datasets, model, Trainer, train, save."""
    # Set seed for reproducibility
    set_seed(training_args.seed)

    # Tokenizer and packed train / eval streams.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    train_dataset, eval_dataset = create_datasets(
        tokenizer, data_args, training_args.seed
    )
    train_dataset.start_iteration = 0

    model = create_and_prepare_model(model_args, data_args, training_args)

    # The model's cache is disabled whenever gradient checkpointing was
    # requested; the Trainer-level flag is then cleared for Unsloth runs.
    requested_grad_ckpt = training_args.gradient_checkpointing
    model.config.use_cache = not requested_grad_ckpt
    training_args.gradient_checkpointing = (
        requested_grad_ckpt and not model_args.use_unsloth
    )
    if training_args.gradient_checkpointing:
        training_args.gradient_checkpointing_kwargs = {
            "use_reentrant": model_args.use_reentrant
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.accelerator.print(f"{trainer.model}")
    if model_args.use_peft_lora:
        trainer.model.print_trainable_parameters()

    # LoftQ initialization when using QLoRA
    if model_args.use_4bit_quantization and model_args.use_loftq:
        loftq_init(trainer.model, tokenizer, train_dataset, data_args.max_seq_length, model_args)

    # Train, resuming from a checkpoint when one was configured (None otherwise).
    trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)

    # Save the final model; under FSDP the state dict type is switched to
    # FULL_STATE_DICT first.
    if trainer.is_fsdp_enabled:
        trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
    trainer.save_model()
481
+
482
+
483
if __name__ == "__main__":
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    # A lone argument ending in ".json" is treated as a config file holding
    # every option; otherwise the regular command line is parsed.
    single_json_arg = len(sys.argv) == 2 and sys.argv[1].endswith(".json")
    if single_json_arg:
        config_path = os.path.abspath(sys.argv[1])
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=config_path
        )
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    main(model_args, data_args, training_args)
__pycache__/fim.cpython-310.pyc ADDED
Binary file (2.64 kB). View file
 
codellama-hugcoder/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: llama2
4
+ base_model: codellama/CodeLlama-7b-Instruct-hf
5
+ tags:
6
+ - generated_from_trainer
7
+ model-index:
8
+ - name: codellama-hugcoder
9
+ results: []
10
+ ---
11
+
12
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
+ should probably proofread and complete it, then remove this comment. -->
14
+
15
+ # codellama-hugcoder
16
+
17
+ This model is a fine-tuned version of [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf) on the [smangrul/hug_stack](https://huggingface.co/datasets/smangrul/hug_stack) dataset.
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 0.0003
37
+ - train_batch_size: 4
38
+ - eval_batch_size: 4
39
+ - seed: 42
40
+ - gradient_accumulation_steps: 4
41
+ - total_train_batch_size: 16
42
+ - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
43
+ - lr_scheduler_type: cosine
44
+ - lr_scheduler_warmup_ratio: 0.1
45
+ - training_steps: 2000
46
+
47
+ ### Training results
48
+
49
+
50
+
51
+ ### Framework versions
52
+
53
+ - PEFT 0.15.2.dev0
54
+ - Transformers 4.52.0.dev0
55
+ - Pytorch 2.6.0+cu124
56
+ - Datasets 3.2.0
57
+ - Tokenizers 0.21.1
codellama-hugcoder/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "down_proj",
28
+ "up_proj",
29
+ "k_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "gate_proj",
33
+ "o_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
codellama-hugcoder/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:456cbd6da326b2c6f27a85ab19d40e13bf3fb60689cbe5ec56653d42193963f8
3
+ size 319876032
codellama-hugcoder/checkpoint-1000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: codellama/CodeLlama-7b-Instruct-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2.dev0
codellama-hugcoder/checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "down_proj",
28
+ "up_proj",
29
+ "k_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "gate_proj",
33
+ "o_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
codellama-hugcoder/checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a808764ea3b6733b0a7c7a6002b640b2b9246cabcd9ad2d940aa7f43c05d66e3
3
+ size 319876032
codellama-hugcoder/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4025993adcd424dc3d3b0c61b41a0e262786b3bb304e6a592a013e59b80a6b38
3
+ size 640009682
codellama-hugcoder/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f822cfb134cd0b1f54ce227e6d11176dede74f86c94420156b0a49753efe3b7
3
+ size 14244
codellama-hugcoder/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3569157643c45495d0de4a184cdcaab0e6cab5317a8ad5f0b1bbb2d736dd80d4
3
+ size 1064
codellama-hugcoder/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5,
6
+ "eval_steps": 100.0,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0025,
14
+ "grad_norm": 0.09379793703556061,
15
+ "learning_rate": 5.999999999999999e-06,
16
+ "loss": 0.6799,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.005,
21
+ "grad_norm": 0.1399833709001541,
22
+ "learning_rate": 1.3499999999999998e-05,
23
+ "loss": 0.6954,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.0075,
28
+ "grad_norm": 0.08632303029298782,
29
+ "learning_rate": 2.1e-05,
30
+ "loss": 0.6921,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.01,
35
+ "grad_norm": 0.10006701201200485,
36
+ "learning_rate": 2.8499999999999998e-05,
37
+ "loss": 0.69,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.0125,
42
+ "grad_norm": 0.07633858919143677,
43
+ "learning_rate": 3.5999999999999994e-05,
44
+ "loss": 0.6722,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.015,
49
+ "grad_norm": 0.09399061650037766,
50
+ "learning_rate": 4.3499999999999993e-05,
51
+ "loss": 0.6453,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.0175,
56
+ "grad_norm": 0.0843738541007042,
57
+ "learning_rate": 5.1e-05,
58
+ "loss": 0.6276,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.02,
63
+ "grad_norm": 0.08583351224660873,
64
+ "learning_rate": 5.85e-05,
65
+ "loss": 0.58,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.0225,
70
+ "grad_norm": 0.09571370482444763,
71
+ "learning_rate": 6.599999999999999e-05,
72
+ "loss": 0.6355,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.025,
77
+ "grad_norm": 0.1083935871720314,
78
+ "learning_rate": 7.35e-05,
79
+ "loss": 0.589,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.0275,
84
+ "grad_norm": 0.10387319326400757,
85
+ "learning_rate": 8.1e-05,
86
+ "loss": 0.6061,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.03,
91
+ "grad_norm": 0.11083361506462097,
92
+ "learning_rate": 8.849999999999998e-05,
93
+ "loss": 0.572,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.0325,
98
+ "grad_norm": 0.12665686011314392,
99
+ "learning_rate": 9.599999999999999e-05,
100
+ "loss": 0.5442,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.035,
105
+ "grad_norm": 0.1308053582906723,
106
+ "learning_rate": 0.00010349999999999998,
107
+ "loss": 0.6524,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.0375,
112
+ "grad_norm": 0.13535510003566742,
113
+ "learning_rate": 0.00011099999999999999,
114
+ "loss": 0.6404,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.04,
119
+ "grad_norm": 0.12833671271800995,
120
+ "learning_rate": 0.0001185,
121
+ "loss": 0.5717,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.0425,
126
+ "grad_norm": 0.11962099373340607,
127
+ "learning_rate": 0.00012599999999999997,
128
+ "loss": 0.6098,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.045,
133
+ "grad_norm": 0.13898271322250366,
134
+ "learning_rate": 0.0001335,
135
+ "loss": 0.6099,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.0475,
140
+ "grad_norm": 0.14486610889434814,
141
+ "learning_rate": 0.00014099999999999998,
142
+ "loss": 0.5744,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.05,
147
+ "grad_norm": 0.1432138830423355,
148
+ "learning_rate": 0.00014849999999999998,
149
+ "loss": 0.5659,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.0525,
154
+ "grad_norm": 0.13487878441810608,
155
+ "learning_rate": 0.000156,
156
+ "loss": 0.5622,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.055,
161
+ "grad_norm": 0.12495309859514236,
162
+ "learning_rate": 0.0001635,
163
+ "loss": 0.5951,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.0575,
168
+ "grad_norm": 0.13011734187602997,
169
+ "learning_rate": 0.00017099999999999998,
170
+ "loss": 0.6249,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.06,
175
+ "grad_norm": 0.13987745344638824,
176
+ "learning_rate": 0.00017849999999999997,
177
+ "loss": 0.559,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.0625,
182
+ "grad_norm": 0.13373605906963348,
183
+ "learning_rate": 0.000186,
184
+ "loss": 0.5475,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.065,
189
+ "grad_norm": 0.12433867901563644,
190
+ "learning_rate": 0.0001935,
191
+ "loss": 0.5274,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.0675,
196
+ "grad_norm": 0.11097615957260132,
197
+ "learning_rate": 0.000201,
198
+ "loss": 0.678,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.07,
203
+ "grad_norm": 0.1155027225613594,
204
+ "learning_rate": 0.00020849999999999997,
205
+ "loss": 0.5611,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.0725,
210
+ "grad_norm": 0.11431068181991577,
211
+ "learning_rate": 0.00021599999999999996,
212
+ "loss": 0.6054,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.075,
217
+ "grad_norm": 0.09796140342950821,
218
+ "learning_rate": 0.00022349999999999998,
219
+ "loss": 0.5472,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.0775,
224
+ "grad_norm": 0.09489257633686066,
225
+ "learning_rate": 0.00023099999999999998,
226
+ "loss": 0.4636,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.08,
231
+ "grad_norm": 0.10787788033485413,
232
+ "learning_rate": 0.0002385,
233
+ "loss": 0.6164,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.0825,
238
+ "grad_norm": 0.10261733084917068,
239
+ "learning_rate": 0.00024599999999999996,
240
+ "loss": 0.5408,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.085,
245
+ "grad_norm": 0.11870352178812027,
246
+ "learning_rate": 0.0002535,
247
+ "loss": 0.5268,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.0875,
252
+ "grad_norm": 0.11910569667816162,
253
+ "learning_rate": 0.000261,
254
+ "loss": 0.5461,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.09,
259
+ "grad_norm": 0.10083702206611633,
260
+ "learning_rate": 0.00026849999999999997,
261
+ "loss": 0.4794,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.0925,
266
+ "grad_norm": 0.10453511029481888,
267
+ "learning_rate": 0.000276,
268
+ "loss": 0.5539,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.095,
273
+ "grad_norm": 0.101403146982193,
274
+ "learning_rate": 0.00028349999999999995,
275
+ "loss": 0.5346,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.0975,
280
+ "grad_norm": 0.10724789649248123,
281
+ "learning_rate": 0.00029099999999999997,
282
+ "loss": 0.6026,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.1,
287
+ "grad_norm": 0.1140277311205864,
288
+ "learning_rate": 0.0002985,
289
+ "loss": 0.5193,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.1025,
294
+ "grad_norm": 0.09706108272075653,
295
+ "learning_rate": 0.0002999963446058092,
296
+ "loss": 0.54,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.105,
301
+ "grad_norm": 0.10003062337636948,
302
+ "learning_rate": 0.0002999814948722491,
303
+ "loss": 0.5365,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.1075,
308
+ "grad_norm": 0.1078687533736229,
309
+ "learning_rate": 0.00029995522346717746,
310
+ "loss": 0.5889,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.11,
315
+ "grad_norm": 0.10538115352392197,
316
+ "learning_rate": 0.0002999175323912636,
317
+ "loss": 0.5611,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.1125,
322
+ "grad_norm": 0.1020808294415474,
323
+ "learning_rate": 0.00029986842451482874,
324
+ "loss": 0.6103,
325
+ "step": 225
326
+ },
327
+ {
328
+ "epoch": 0.115,
329
+ "grad_norm": 0.09635835886001587,
330
+ "learning_rate": 0.0002998079035776279,
331
+ "loss": 0.5229,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 0.1175,
336
+ "grad_norm": 0.10287190228700638,
337
+ "learning_rate": 0.0002997359741885648,
338
+ "loss": 0.5312,
339
+ "step": 235
340
+ },
341
+ {
342
+ "epoch": 0.12,
343
+ "grad_norm": 0.09160075336694717,
344
+ "learning_rate": 0.0002996526418253408,
345
+ "loss": 0.5673,
346
+ "step": 240
347
+ },
348
+ {
349
+ "epoch": 0.1225,
350
+ "grad_norm": 0.08691006153821945,
351
+ "learning_rate": 0.000299557912834038,
352
+ "loss": 0.5326,
353
+ "step": 245
354
+ },
355
+ {
356
+ "epoch": 0.125,
357
+ "grad_norm": 0.10096988826990128,
358
+ "learning_rate": 0.00029945179442863594,
359
+ "loss": 0.6004,
360
+ "step": 250
361
+ },
362
+ {
363
+ "epoch": 0.1275,
364
+ "grad_norm": 0.09594204276800156,
365
+ "learning_rate": 0.000299334294690462,
366
+ "loss": 0.5516,
367
+ "step": 255
368
+ },
369
+ {
370
+ "epoch": 0.13,
371
+ "grad_norm": 0.10281919687986374,
372
+ "learning_rate": 0.00029920542256757607,
373
+ "loss": 0.5515,
374
+ "step": 260
375
+ },
376
+ {
377
+ "epoch": 0.1325,
378
+ "grad_norm": 0.08547840267419815,
379
+ "learning_rate": 0.00029906518787408944,
380
+ "loss": 0.5243,
381
+ "step": 265
382
+ },
383
+ {
384
+ "epoch": 0.135,
385
+ "grad_norm": 0.10161560773849487,
386
+ "learning_rate": 0.0002989136012894168,
387
+ "loss": 0.5096,
388
+ "step": 270
389
+ },
390
+ {
391
+ "epoch": 0.1375,
392
+ "grad_norm": 0.09101904183626175,
393
+ "learning_rate": 0.0002987506743574635,
394
+ "loss": 0.553,
395
+ "step": 275
396
+ },
397
+ {
398
+ "epoch": 0.14,
399
+ "grad_norm": 0.09769442677497864,
400
+ "learning_rate": 0.0002985764194857463,
401
+ "loss": 0.4953,
402
+ "step": 280
403
+ },
404
+ {
405
+ "epoch": 0.1425,
406
+ "grad_norm": 0.10991579294204712,
407
+ "learning_rate": 0.00029839084994444826,
408
+ "loss": 0.5152,
409
+ "step": 285
410
+ },
411
+ {
412
+ "epoch": 0.145,
413
+ "grad_norm": 0.09450916200876236,
414
+ "learning_rate": 0.00029819397986540836,
415
+ "loss": 0.5397,
416
+ "step": 290
417
+ },
418
+ {
419
+ "epoch": 0.1475,
420
+ "grad_norm": 0.10876069217920303,
421
+ "learning_rate": 0.0002979858242410454,
422
+ "loss": 0.4858,
423
+ "step": 295
424
+ },
425
+ {
426
+ "epoch": 0.15,
427
+ "grad_norm": 0.097995825111866,
428
+ "learning_rate": 0.00029776639892321606,
429
+ "loss": 0.5566,
430
+ "step": 300
431
+ },
432
+ {
433
+ "epoch": 0.1525,
434
+ "grad_norm": 0.1145048514008522,
435
+ "learning_rate": 0.0002975357206220079,
436
+ "loss": 0.4531,
437
+ "step": 305
438
+ },
439
+ {
440
+ "epoch": 0.155,
441
+ "grad_norm": 0.10271880775690079,
442
+ "learning_rate": 0.00029729380690446654,
443
+ "loss": 0.5199,
444
+ "step": 310
445
+ },
446
+ {
447
+ "epoch": 0.1575,
448
+ "grad_norm": 0.11095371842384338,
449
+ "learning_rate": 0.0002970406761932583,
450
+ "loss": 0.5416,
451
+ "step": 315
452
+ },
453
+ {
454
+ "epoch": 0.16,
455
+ "grad_norm": 0.09949438273906708,
456
+ "learning_rate": 0.00029677634776526673,
457
+ "loss": 0.4841,
458
+ "step": 320
459
+ },
460
+ {
461
+ "epoch": 0.1625,
462
+ "grad_norm": 0.1163724958896637,
463
+ "learning_rate": 0.00029650084175012517,
464
+ "loss": 0.4913,
465
+ "step": 325
466
+ },
467
+ {
468
+ "epoch": 0.165,
469
+ "grad_norm": 0.10726840049028397,
470
+ "learning_rate": 0.00029621417912868323,
471
+ "loss": 0.5203,
472
+ "step": 330
473
+ },
474
+ {
475
+ "epoch": 0.1675,
476
+ "grad_norm": 0.09609931707382202,
477
+ "learning_rate": 0.00029591638173140947,
478
+ "loss": 0.5607,
479
+ "step": 335
480
+ },
481
+ {
482
+ "epoch": 0.17,
483
+ "grad_norm": 0.10824442654848099,
484
+ "learning_rate": 0.0002956074722367286,
485
+ "loss": 0.6004,
486
+ "step": 340
487
+ },
488
+ {
489
+ "epoch": 0.1725,
490
+ "grad_norm": 0.10465679317712784,
491
+ "learning_rate": 0.00029528747416929463,
492
+ "loss": 0.5216,
493
+ "step": 345
494
+ },
495
+ {
496
+ "epoch": 0.175,
497
+ "grad_norm": 0.10518354922533035,
498
+ "learning_rate": 0.0002949564118981994,
499
+ "loss": 0.499,
500
+ "step": 350
501
+ },
502
+ {
503
+ "epoch": 0.1775,
504
+ "grad_norm": 0.0955279991030693,
505
+ "learning_rate": 0.0002946143106351165,
506
+ "loss": 0.5607,
507
+ "step": 355
508
+ },
509
+ {
510
+ "epoch": 0.18,
511
+ "grad_norm": 0.11159654706716537,
512
+ "learning_rate": 0.0002942611964323817,
513
+ "loss": 0.5204,
514
+ "step": 360
515
+ },
516
+ {
517
+ "epoch": 0.1825,
518
+ "grad_norm": 0.09571187198162079,
519
+ "learning_rate": 0.0002938970961810086,
520
+ "loss": 0.6113,
521
+ "step": 365
522
+ },
523
+ {
524
+ "epoch": 0.185,
525
+ "grad_norm": 0.11854679882526398,
526
+ "learning_rate": 0.0002935220376086411,
527
+ "loss": 0.5639,
528
+ "step": 370
529
+ },
530
+ {
531
+ "epoch": 0.1875,
532
+ "grad_norm": 0.1050512045621872,
533
+ "learning_rate": 0.0002931360492774415,
534
+ "loss": 0.548,
535
+ "step": 375
536
+ },
537
+ {
538
+ "epoch": 0.19,
539
+ "grad_norm": 0.1053968220949173,
540
+ "learning_rate": 0.0002927391605819157,
541
+ "loss": 0.5507,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 0.1925,
546
+ "grad_norm": 0.10567320138216019,
547
+ "learning_rate": 0.00029233140174667445,
548
+ "loss": 0.5312,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 0.195,
553
+ "grad_norm": 0.11914283782243729,
554
+ "learning_rate": 0.0002919128038241318,
555
+ "loss": 0.5961,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 0.1975,
560
+ "grad_norm": 0.09915795922279358,
561
+ "learning_rate": 0.0002914833986921401,
562
+ "loss": 0.5086,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 0.2,
567
+ "grad_norm": 0.10796502232551575,
568
+ "learning_rate": 0.0002910432190515628,
569
+ "loss": 0.5585,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 0.2025,
574
+ "grad_norm": 0.10748997330665588,
575
+ "learning_rate": 0.00029059229842378373,
576
+ "loss": 0.5466,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 0.205,
581
+ "grad_norm": 0.10696308314800262,
582
+ "learning_rate": 0.0002901306711481544,
583
+ "loss": 0.5513,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 0.2075,
588
+ "grad_norm": 0.10418657958507538,
589
+ "learning_rate": 0.0002896583723793792,
590
+ "loss": 0.5391,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.21,
595
+ "grad_norm": 0.16421550512313843,
596
+ "learning_rate": 0.00028917543808483796,
597
+ "loss": 0.4699,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.2125,
602
+ "grad_norm": 0.12929962575435638,
603
+ "learning_rate": 0.00028868190504184696,
604
+ "loss": 0.4984,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.215,
609
+ "grad_norm": 0.10469454526901245,
610
+ "learning_rate": 0.00028817781083485816,
611
+ "loss": 0.5119,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.2175,
616
+ "grad_norm": 0.0964970663189888,
617
+ "learning_rate": 0.00028766319385259713,
618
+ "loss": 0.5167,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.22,
623
+ "grad_norm": 0.12395574152469635,
624
+ "learning_rate": 0.00028713809328513953,
625
+ "loss": 0.5692,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.2225,
630
+ "grad_norm": 0.10189738124608994,
631
+ "learning_rate": 0.0002866025491209265,
632
+ "loss": 0.4628,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.225,
637
+ "grad_norm": 0.10433454066514969,
638
+ "learning_rate": 0.0002860566021437197,
639
+ "loss": 0.4869,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.2275,
644
+ "grad_norm": 0.13003456592559814,
645
+ "learning_rate": 0.0002855002939294951,
646
+ "loss": 0.5291,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.23,
651
+ "grad_norm": 0.11692202836275101,
652
+ "learning_rate": 0.000284933666843277,
653
+ "loss": 0.5229,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.2325,
658
+ "grad_norm": 0.10757846385240555,
659
+ "learning_rate": 0.0002843567640359119,
660
+ "loss": 0.435,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.235,
665
+ "grad_norm": 0.10775501281023026,
666
+ "learning_rate": 0.00028376962944078206,
667
+ "loss": 0.4418,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.2375,
672
+ "grad_norm": 0.11543692648410797,
673
+ "learning_rate": 0.00028317230777046015,
674
+ "loss": 0.4204,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.24,
679
+ "grad_norm": 0.10946698486804962,
680
+ "learning_rate": 0.00028256484451330403,
681
+ "loss": 0.49,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.2425,
686
+ "grad_norm": 0.11528221517801285,
687
+ "learning_rate": 0.00028194728592999247,
688
+ "loss": 0.4752,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.245,
693
+ "grad_norm": 0.10474205762147903,
694
+ "learning_rate": 0.0002813196790500027,
695
+ "loss": 0.4847,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.2475,
700
+ "grad_norm": 0.10768820345401764,
701
+ "learning_rate": 0.00028068207166802837,
702
+ "loss": 0.4664,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.25,
707
+ "grad_norm": 0.12158560007810593,
708
+ "learning_rate": 0.00028003451234034037,
709
+ "loss": 0.4741,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 0.2525,
714
+ "grad_norm": 0.11635497957468033,
715
+ "learning_rate": 0.0002793770503810886,
716
+ "loss": 0.4969,
717
+ "step": 505
718
+ },
719
+ {
720
+ "epoch": 0.255,
721
+ "grad_norm": 0.12205849587917328,
722
+ "learning_rate": 0.00027870973585854665,
723
+ "loss": 0.4798,
724
+ "step": 510
725
+ },
726
+ {
727
+ "epoch": 0.2575,
728
+ "grad_norm": 0.10270871222019196,
729
+ "learning_rate": 0.00027803261959129905,
730
+ "loss": 0.3888,
731
+ "step": 515
732
+ },
733
+ {
734
+ "epoch": 0.26,
735
+ "grad_norm": 0.11313367635011673,
736
+ "learning_rate": 0.0002773457531443712,
737
+ "loss": 0.4759,
738
+ "step": 520
739
+ },
740
+ {
741
+ "epoch": 0.2625,
742
+ "grad_norm": 0.12905193865299225,
743
+ "learning_rate": 0.00027664918882530225,
744
+ "loss": 0.4442,
745
+ "step": 525
746
+ },
747
+ {
748
+ "epoch": 0.265,
749
+ "grad_norm": 0.11690939962863922,
750
+ "learning_rate": 0.00027594297968016197,
751
+ "loss": 0.5535,
752
+ "step": 530
753
+ },
754
+ {
755
+ "epoch": 0.2675,
756
+ "grad_norm": 0.10021405667066574,
757
+ "learning_rate": 0.00027522717948951094,
758
+ "loss": 0.4717,
759
+ "step": 535
760
+ },
761
+ {
762
+ "epoch": 0.27,
763
+ "grad_norm": 0.10104178637266159,
764
+ "learning_rate": 0.0002745018427643051,
765
+ "loss": 0.4906,
766
+ "step": 540
767
+ },
768
+ {
769
+ "epoch": 0.2725,
770
+ "grad_norm": 0.12113891541957855,
771
+ "learning_rate": 0.00027376702474174425,
772
+ "loss": 0.5674,
773
+ "step": 545
774
+ },
775
+ {
776
+ "epoch": 0.275,
777
+ "grad_norm": 0.11330476403236389,
778
+ "learning_rate": 0.0002730227813810658,
779
+ "loss": 0.5184,
780
+ "step": 550
781
+ },
782
+ {
783
+ "epoch": 0.2775,
784
+ "grad_norm": 0.1025850847363472,
785
+ "learning_rate": 0.0002722691693592831,
786
+ "loss": 0.4395,
787
+ "step": 555
788
+ },
789
+ {
790
+ "epoch": 0.28,
791
+ "grad_norm": 0.11591499298810959,
792
+ "learning_rate": 0.0002715062460668694,
793
+ "loss": 0.5003,
794
+ "step": 560
795
+ },
796
+ {
797
+ "epoch": 0.2825,
798
+ "grad_norm": 0.11281153559684753,
799
+ "learning_rate": 0.0002707340696033871,
800
+ "loss": 0.4672,
801
+ "step": 565
802
+ },
803
+ {
804
+ "epoch": 0.285,
805
+ "grad_norm": 0.1123538464307785,
806
+ "learning_rate": 0.00026995269877306356,
807
+ "loss": 0.513,
808
+ "step": 570
809
+ },
810
+ {
811
+ "epoch": 0.2875,
812
+ "grad_norm": 0.10776390135288239,
813
+ "learning_rate": 0.0002691621930803127,
814
+ "loss": 0.4572,
815
+ "step": 575
816
+ },
817
+ {
818
+ "epoch": 0.29,
819
+ "grad_norm": 0.10008667409420013,
820
+ "learning_rate": 0.0002683626127252036,
821
+ "loss": 0.4618,
822
+ "step": 580
823
+ },
824
+ {
825
+ "epoch": 0.2925,
826
+ "grad_norm": 0.13961340487003326,
827
+ "learning_rate": 0.00026755401859887595,
828
+ "loss": 0.4819,
829
+ "step": 585
830
+ },
831
+ {
832
+ "epoch": 0.295,
833
+ "grad_norm": 0.1476685106754303,
834
+ "learning_rate": 0.00026673647227890316,
835
+ "loss": 0.4964,
836
+ "step": 590
837
+ },
838
+ {
839
+ "epoch": 0.2975,
840
+ "grad_norm": 0.09795507788658142,
841
+ "learning_rate": 0.00026591003602460263,
842
+ "loss": 0.4796,
843
+ "step": 595
844
+ },
845
+ {
846
+ "epoch": 0.3,
847
+ "grad_norm": 0.10903532058000565,
848
+ "learning_rate": 0.00026507477277229496,
849
+ "loss": 0.4775,
850
+ "step": 600
851
+ },
852
+ {
853
+ "epoch": 0.3025,
854
+ "grad_norm": 0.10258448123931885,
855
+ "learning_rate": 0.0002642307461305105,
856
+ "loss": 0.4519,
857
+ "step": 605
858
+ },
859
+ {
860
+ "epoch": 0.305,
861
+ "grad_norm": 0.11204435676336288,
862
+ "learning_rate": 0.0002633780203751459,
863
+ "loss": 0.4451,
864
+ "step": 610
865
+ },
866
+ {
867
+ "epoch": 0.3075,
868
+ "grad_norm": 0.10147629678249359,
869
+ "learning_rate": 0.0002625166604445689,
870
+ "loss": 0.4256,
871
+ "step": 615
872
+ },
873
+ {
874
+ "epoch": 0.31,
875
+ "grad_norm": 0.10481107234954834,
876
+ "learning_rate": 0.00026164673193467306,
877
+ "loss": 0.4381,
878
+ "step": 620
879
+ },
880
+ {
881
+ "epoch": 0.3125,
882
+ "grad_norm": 0.10856641829013824,
883
+ "learning_rate": 0.00026076830109388255,
884
+ "loss": 0.4958,
885
+ "step": 625
886
+ },
887
+ {
888
+ "epoch": 0.315,
889
+ "grad_norm": 0.09918677806854248,
890
+ "learning_rate": 0.0002598814348181068,
891
+ "loss": 0.4335,
892
+ "step": 630
893
+ },
894
+ {
895
+ "epoch": 0.3175,
896
+ "grad_norm": 0.10417389869689941,
897
+ "learning_rate": 0.00025898620064564637,
898
+ "loss": 0.4603,
899
+ "step": 635
900
+ },
901
+ {
902
+ "epoch": 0.32,
903
+ "grad_norm": 0.0903329998254776,
904
+ "learning_rate": 0.00025808266675204954,
905
+ "loss": 0.3932,
906
+ "step": 640
907
+ },
908
+ {
909
+ "epoch": 0.3225,
910
+ "grad_norm": 0.11511855572462082,
911
+ "learning_rate": 0.0002571709019449205,
912
+ "loss": 0.4169,
913
+ "step": 645
914
+ },
915
+ {
916
+ "epoch": 0.325,
917
+ "grad_norm": 0.11355557292699814,
918
+ "learning_rate": 0.0002562509756586793,
919
+ "loss": 0.4455,
920
+ "step": 650
921
+ },
922
+ {
923
+ "epoch": 0.3275,
924
+ "grad_norm": 0.1271187961101532,
925
+ "learning_rate": 0.00025532295794927437,
926
+ "loss": 0.4902,
927
+ "step": 655
928
+ },
929
+ {
930
+ "epoch": 0.33,
931
+ "grad_norm": 0.11936645954847336,
932
+ "learning_rate": 0.0002543869194888471,
933
+ "loss": 0.4843,
934
+ "step": 660
935
+ },
936
+ {
937
+ "epoch": 0.3325,
938
+ "grad_norm": 0.11935465037822723,
939
+ "learning_rate": 0.00025344293156035044,
940
+ "loss": 0.4402,
941
+ "step": 665
942
+ },
943
+ {
944
+ "epoch": 0.335,
945
+ "grad_norm": 0.13073407113552094,
946
+ "learning_rate": 0.00025249106605211986,
947
+ "loss": 0.467,
948
+ "step": 670
949
+ },
950
+ {
951
+ "epoch": 0.3375,
952
+ "grad_norm": 0.10340435802936554,
953
+ "learning_rate": 0.0002515313954523991,
954
+ "loss": 0.4827,
955
+ "step": 675
956
+ },
957
+ {
958
+ "epoch": 0.34,
959
+ "grad_norm": 0.11634550243616104,
960
+ "learning_rate": 0.00025056399284381983,
961
+ "loss": 0.466,
962
+ "step": 680
963
+ },
964
+ {
965
+ "epoch": 0.3425,
966
+ "grad_norm": 0.10582319647073746,
967
+ "learning_rate": 0.0002495889318978362,
968
+ "loss": 0.4751,
969
+ "step": 685
970
+ },
971
+ {
972
+ "epoch": 0.345,
973
+ "grad_norm": 0.16781780123710632,
974
+ "learning_rate": 0.00024860628686911436,
975
+ "loss": 0.4717,
976
+ "step": 690
977
+ },
978
+ {
979
+ "epoch": 0.3475,
980
+ "grad_norm": 0.11522196233272552,
981
+ "learning_rate": 0.0002476161325898776,
982
+ "loss": 0.4687,
983
+ "step": 695
984
+ },
985
+ {
986
+ "epoch": 0.35,
987
+ "grad_norm": 0.11830449104309082,
988
+ "learning_rate": 0.000246618544464208,
989
+ "loss": 0.436,
990
+ "step": 700
991
+ },
992
+ {
993
+ "epoch": 0.3525,
994
+ "grad_norm": 0.17485427856445312,
995
+ "learning_rate": 0.0002456135984623034,
996
+ "loss": 0.4284,
997
+ "step": 705
998
+ },
999
+ {
1000
+ "epoch": 0.355,
1001
+ "grad_norm": 0.12288108468055725,
1002
+ "learning_rate": 0.00024460137111469296,
1003
+ "loss": 0.4261,
1004
+ "step": 710
1005
+ },
1006
+ {
1007
+ "epoch": 0.3575,
1008
+ "grad_norm": 0.11587081104516983,
1009
+ "learning_rate": 0.0002435819395064079,
1010
+ "loss": 0.4493,
1011
+ "step": 715
1012
+ },
1013
+ {
1014
+ "epoch": 0.36,
1015
+ "grad_norm": 0.10690271109342575,
1016
+ "learning_rate": 0.0002425553812711123,
1017
+ "loss": 0.4648,
1018
+ "step": 720
1019
+ },
1020
+ {
1021
+ "epoch": 0.3625,
1022
+ "grad_norm": 0.10404397547245026,
1023
+ "learning_rate": 0.00024152177458519014,
1024
+ "loss": 0.4634,
1025
+ "step": 725
1026
+ },
1027
+ {
1028
+ "epoch": 0.365,
1029
+ "grad_norm": 0.11986954510211945,
1030
+ "learning_rate": 0.00024048119816179236,
1031
+ "loss": 0.4525,
1032
+ "step": 730
1033
+ },
1034
+ {
1035
+ "epoch": 0.3675,
1036
+ "grad_norm": 0.10243026167154312,
1037
+ "learning_rate": 0.00023943373124484234,
1038
+ "loss": 0.4572,
1039
+ "step": 735
1040
+ },
1041
+ {
1042
+ "epoch": 0.37,
1043
+ "grad_norm": 0.10386748611927032,
1044
+ "learning_rate": 0.00023837945360300129,
1045
+ "loss": 0.3884,
1046
+ "step": 740
1047
+ },
1048
+ {
1049
+ "epoch": 0.3725,
1050
+ "grad_norm": 0.11165735125541687,
1051
+ "learning_rate": 0.0002373184455235934,
1052
+ "loss": 0.4902,
1053
+ "step": 745
1054
+ },
1055
+ {
1056
+ "epoch": 0.375,
1057
+ "grad_norm": 0.09951601922512054,
1058
+ "learning_rate": 0.00023625078780649178,
1059
+ "loss": 0.4541,
1060
+ "step": 750
1061
+ },
1062
+ {
1063
+ "epoch": 0.3775,
1064
+ "grad_norm": 0.10347504913806915,
1065
+ "learning_rate": 0.00023517656175796518,
1066
+ "loss": 0.3871,
1067
+ "step": 755
1068
+ },
1069
+ {
1070
+ "epoch": 0.38,
1071
+ "grad_norm": 0.10478132963180542,
1072
+ "learning_rate": 0.00023409584918448627,
1073
+ "loss": 0.4329,
1074
+ "step": 760
1075
+ },
1076
+ {
1077
+ "epoch": 0.3825,
1078
+ "grad_norm": 0.1198212131857872,
1079
+ "learning_rate": 0.00023300873238650159,
1080
+ "loss": 0.425,
1081
+ "step": 765
1082
+ },
1083
+ {
1084
+ "epoch": 0.385,
1085
+ "grad_norm": 0.1103711724281311,
1086
+ "learning_rate": 0.00023191529415216434,
1087
+ "loss": 0.4274,
1088
+ "step": 770
1089
+ },
1090
+ {
1091
+ "epoch": 0.3875,
1092
+ "grad_norm": 0.09940385073423386,
1093
+ "learning_rate": 0.00023081561775102944,
1094
+ "loss": 0.4368,
1095
+ "step": 775
1096
+ },
1097
+ {
1098
+ "epoch": 0.39,
1099
+ "grad_norm": 0.11599268019199371,
1100
+ "learning_rate": 0.00022970978692771242,
1101
+ "loss": 0.4386,
1102
+ "step": 780
1103
+ },
1104
+ {
1105
+ "epoch": 0.3925,
1106
+ "grad_norm": 0.10101296752691269,
1107
+ "learning_rate": 0.00022859788589551188,
1108
+ "loss": 0.4696,
1109
+ "step": 785
1110
+ },
1111
+ {
1112
+ "epoch": 0.395,
1113
+ "grad_norm": 0.10112808644771576,
1114
+ "learning_rate": 0.00022747999932999624,
1115
+ "loss": 0.4066,
1116
+ "step": 790
1117
+ },
1118
+ {
1119
+ "epoch": 0.3975,
1120
+ "grad_norm": 0.09595459699630737,
1121
+ "learning_rate": 0.00022635621236255567,
1122
+ "loss": 0.4837,
1123
+ "step": 795
1124
+ },
1125
+ {
1126
+ "epoch": 0.4,
1127
+ "grad_norm": 0.10761380940675735,
1128
+ "learning_rate": 0.00022522661057391857,
1129
+ "loss": 0.5446,
1130
+ "step": 800
1131
+ },
1132
+ {
1133
+ "epoch": 0.4025,
1134
+ "grad_norm": 0.11919954419136047,
1135
+ "learning_rate": 0.00022409127998763463,
1136
+ "loss": 0.5027,
1137
+ "step": 805
1138
+ },
1139
+ {
1140
+ "epoch": 0.405,
1141
+ "grad_norm": 0.10851597785949707,
1142
+ "learning_rate": 0.00022295030706352356,
1143
+ "loss": 0.4481,
1144
+ "step": 810
1145
+ },
1146
+ {
1147
+ "epoch": 0.4075,
1148
+ "grad_norm": 0.10030311346054077,
1149
+ "learning_rate": 0.00022180377869109104,
1150
+ "loss": 0.4709,
1151
+ "step": 815
1152
+ },
1153
+ {
1154
+ "epoch": 0.41,
1155
+ "grad_norm": 0.111280657351017,
1156
+ "learning_rate": 0.00022065178218291147,
1157
+ "loss": 0.4423,
1158
+ "step": 820
1159
+ },
1160
+ {
1161
+ "epoch": 0.4125,
1162
+ "grad_norm": 0.11253602802753448,
1163
+ "learning_rate": 0.00021949440526797926,
1164
+ "loss": 0.4136,
1165
+ "step": 825
1166
+ },
1167
+ {
1168
+ "epoch": 0.415,
1169
+ "grad_norm": 0.10805424302816391,
1170
+ "learning_rate": 0.00021833173608502732,
1171
+ "loss": 0.4656,
1172
+ "step": 830
1173
+ },
1174
+ {
1175
+ "epoch": 0.4175,
1176
+ "grad_norm": 0.10983198881149292,
1177
+ "learning_rate": 0.00021716386317581542,
1178
+ "loss": 0.3687,
1179
+ "step": 835
1180
+ },
1181
+ {
1182
+ "epoch": 0.42,
1183
+ "grad_norm": 0.10653118044137955,
1184
+ "learning_rate": 0.00021599087547838727,
1185
+ "loss": 0.4654,
1186
+ "step": 840
1187
+ },
1188
+ {
1189
+ "epoch": 0.4225,
1190
+ "grad_norm": 0.10856354981660843,
1191
+ "learning_rate": 0.00021481286232029735,
1192
+ "loss": 0.4298,
1193
+ "step": 845
1194
+ },
1195
+ {
1196
+ "epoch": 0.425,
1197
+ "grad_norm": 0.11233706772327423,
1198
+ "learning_rate": 0.0002136299134118085,
1199
+ "loss": 0.4484,
1200
+ "step": 850
1201
+ },
1202
+ {
1203
+ "epoch": 0.4275,
1204
+ "grad_norm": 0.1085442528128624,
1205
+ "learning_rate": 0.00021244211883906017,
1206
+ "loss": 0.4776,
1207
+ "step": 855
1208
+ },
1209
+ {
1210
+ "epoch": 0.43,
1211
+ "grad_norm": 0.12297824025154114,
1212
+ "learning_rate": 0.0002112495690572077,
1213
+ "loss": 0.4029,
1214
+ "step": 860
1215
+ },
1216
+ {
1217
+ "epoch": 0.4325,
1218
+ "grad_norm": 0.10838114470243454,
1219
+ "learning_rate": 0.00021005235488353428,
1220
+ "loss": 0.4848,
1221
+ "step": 865
1222
+ },
1223
+ {
1224
+ "epoch": 0.435,
1225
+ "grad_norm": 0.10273341834545135,
1226
+ "learning_rate": 0.0002088505674905342,
1227
+ "loss": 0.3989,
1228
+ "step": 870
1229
+ },
1230
+ {
1231
+ "epoch": 0.4375,
1232
+ "grad_norm": 0.11189126968383789,
1233
+ "learning_rate": 0.0002076442983989705,
1234
+ "loss": 0.438,
1235
+ "step": 875
1236
+ },
1237
+ {
1238
+ "epoch": 0.44,
1239
+ "grad_norm": 0.11592905968427658,
1240
+ "learning_rate": 0.0002064336394709048,
1241
+ "loss": 0.4786,
1242
+ "step": 880
1243
+ },
1244
+ {
1245
+ "epoch": 0.4425,
1246
+ "grad_norm": 0.11230389773845673,
1247
+ "learning_rate": 0.0002052186829027017,
1248
+ "loss": 0.3999,
1249
+ "step": 885
1250
+ },
1251
+ {
1252
+ "epoch": 0.445,
1253
+ "grad_norm": 0.12455113977193832,
1254
+ "learning_rate": 0.00020399952121800767,
1255
+ "loss": 0.4856,
1256
+ "step": 890
1257
+ },
1258
+ {
1259
+ "epoch": 0.4475,
1260
+ "grad_norm": 0.1001812294125557,
1261
+ "learning_rate": 0.00020277624726070526,
1262
+ "loss": 0.4689,
1263
+ "step": 895
1264
+ },
1265
+ {
1266
+ "epoch": 0.45,
1267
+ "grad_norm": 0.11319112777709961,
1268
+ "learning_rate": 0.00020154895418784242,
1269
+ "loss": 0.3998,
1270
+ "step": 900
1271
+ },
1272
+ {
1273
+ "epoch": 0.4525,
1274
+ "grad_norm": 0.11322236061096191,
1275
+ "learning_rate": 0.00020031773546253824,
1276
+ "loss": 0.4321,
1277
+ "step": 905
1278
+ },
1279
+ {
1280
+ "epoch": 0.455,
1281
+ "grad_norm": 0.12924689054489136,
1282
+ "learning_rate": 0.00019908268484686558,
1283
+ "loss": 0.4208,
1284
+ "step": 910
1285
+ },
1286
+ {
1287
+ "epoch": 0.4575,
1288
+ "grad_norm": 0.11435618251562119,
1289
+ "learning_rate": 0.00019784389639471048,
1290
+ "loss": 0.4682,
1291
+ "step": 915
1292
+ },
1293
+ {
1294
+ "epoch": 0.46,
1295
+ "grad_norm": 0.10801081359386444,
1296
+ "learning_rate": 0.00019660146444460975,
1297
+ "loss": 0.428,
1298
+ "step": 920
1299
+ },
1300
+ {
1301
+ "epoch": 0.4625,
1302
+ "grad_norm": 0.10906939953565598,
1303
+ "learning_rate": 0.0001953554836125667,
1304
+ "loss": 0.4455,
1305
+ "step": 925
1306
+ },
1307
+ {
1308
+ "epoch": 0.465,
1309
+ "grad_norm": 0.10790123790502548,
1310
+ "learning_rate": 0.00019410604878484556,
1311
+ "loss": 0.4544,
1312
+ "step": 930
1313
+ },
1314
+ {
1315
+ "epoch": 0.4675,
1316
+ "grad_norm": 0.10536376386880875,
1317
+ "learning_rate": 0.000192853255110746,
1318
+ "loss": 0.376,
1319
+ "step": 935
1320
+ },
1321
+ {
1322
+ "epoch": 0.47,
1323
+ "grad_norm": 0.11744682490825653,
1324
+ "learning_rate": 0.00019159719799535668,
1325
+ "loss": 0.3887,
1326
+ "step": 940
1327
+ },
1328
+ {
1329
+ "epoch": 0.4725,
1330
+ "grad_norm": 0.12954068183898926,
1331
+ "learning_rate": 0.00019033797309228983,
1332
+ "loss": 0.4075,
1333
+ "step": 945
1334
+ },
1335
+ {
1336
+ "epoch": 0.475,
1337
+ "grad_norm": 0.1401606798171997,
1338
+ "learning_rate": 0.00018907567629639725,
1339
+ "loss": 0.4454,
1340
+ "step": 950
1341
+ },
1342
+ {
1343
+ "epoch": 0.4775,
1344
+ "grad_norm": 0.12059322744607925,
1345
+ "learning_rate": 0.00018781040373646706,
1346
+ "loss": 0.4339,
1347
+ "step": 955
1348
+ },
1349
+ {
1350
+ "epoch": 0.48,
1351
+ "grad_norm": 0.11798987537622452,
1352
+ "learning_rate": 0.00018654225176790336,
1353
+ "loss": 0.4405,
1354
+ "step": 960
1355
+ },
1356
+ {
1357
+ "epoch": 0.4825,
1358
+ "grad_norm": 0.11344211548566818,
1359
+ "learning_rate": 0.00018527131696538846,
1360
+ "loss": 0.4124,
1361
+ "step": 965
1362
+ },
1363
+ {
1364
+ "epoch": 0.485,
1365
+ "grad_norm": 0.10373330116271973,
1366
+ "learning_rate": 0.00018399769611552824,
1367
+ "loss": 0.4329,
1368
+ "step": 970
1369
+ },
1370
+ {
1371
+ "epoch": 0.4875,
1372
+ "grad_norm": 0.12053704261779785,
1373
+ "learning_rate": 0.0001827214862094814,
1374
+ "loss": 0.4944,
1375
+ "step": 975
1376
+ },
1377
+ {
1378
+ "epoch": 0.49,
1379
+ "grad_norm": 0.141033336520195,
1380
+ "learning_rate": 0.00018144278443557328,
1381
+ "loss": 0.4569,
1382
+ "step": 980
1383
+ },
1384
+ {
1385
+ "epoch": 0.4925,
1386
+ "grad_norm": 0.10922867804765701,
1387
+ "learning_rate": 0.0001801616881718947,
1388
+ "loss": 0.3879,
1389
+ "step": 985
1390
+ },
1391
+ {
1392
+ "epoch": 0.495,
1393
+ "grad_norm": 0.09843657910823822,
1394
+ "learning_rate": 0.00017887829497888612,
1395
+ "loss": 0.4106,
1396
+ "step": 990
1397
+ },
1398
+ {
1399
+ "epoch": 0.4975,
1400
+ "grad_norm": 0.12131062150001526,
1401
+ "learning_rate": 0.000177592702591908,
1402
+ "loss": 0.4023,
1403
+ "step": 995
1404
+ },
1405
+ {
1406
+ "epoch": 0.5,
1407
+ "grad_norm": 0.11343283206224442,
1408
+ "learning_rate": 0.00017630500891379806,
1409
+ "loss": 0.4824,
1410
+ "step": 1000
1411
+ }
1412
+ ],
1413
+ "logging_steps": 5,
1414
+ "max_steps": 2000,
1415
+ "num_input_tokens_seen": 0,
1416
+ "num_train_epochs": 9223372036854775807,
1417
+ "save_steps": 500,
1418
+ "stateful_callbacks": {
1419
+ "TrainerControl": {
1420
+ "args": {
1421
+ "should_epoch_stop": false,
1422
+ "should_evaluate": false,
1423
+ "should_log": false,
1424
+ "should_save": true,
1425
+ "should_training_stop": false
1426
+ },
1427
+ "attributes": {}
1428
+ }
1429
+ },
1430
+ "total_flos": 1.314789078859776e+18,
1431
+ "train_batch_size": 4,
1432
+ "trial_name": null,
1433
+ "trial_params": null
1434
+ }
codellama-hugcoder/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
3
+ size 5304
codellama-hugcoder/checkpoint-1500/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: codellama/CodeLlama-7b-Instruct-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2.dev0
codellama-hugcoder/checkpoint-1500/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "down_proj",
28
+ "up_proj",
29
+ "k_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "gate_proj",
33
+ "o_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
codellama-hugcoder/checkpoint-1500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:954883169196fec3dbbf2581acd2ff6690fa789729045bb04113f1bb36637c46
3
+ size 319876032
codellama-hugcoder/checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:135e0fda5af04719269dc4cca8199c95f610932728fc80b6e63f3d656098bd57
3
+ size 640009682
codellama-hugcoder/checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fda3c0b12e2631264746b16f7dd8a85fd763004a3c1d20e136ad6fae01987d26
3
+ size 14244
codellama-hugcoder/checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:046c4144f3d3e450ad1c1129a3ed6e680f6f65f10c488eeb2fd00b8cd376efa0
3
+ size 1064
codellama-hugcoder/checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,2134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.75,
6
+ "eval_steps": 100.0,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0025,
14
+ "grad_norm": 0.09379793703556061,
15
+ "learning_rate": 5.999999999999999e-06,
16
+ "loss": 0.6799,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.005,
21
+ "grad_norm": 0.1399833709001541,
22
+ "learning_rate": 1.3499999999999998e-05,
23
+ "loss": 0.6954,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.0075,
28
+ "grad_norm": 0.08632303029298782,
29
+ "learning_rate": 2.1e-05,
30
+ "loss": 0.6921,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.01,
35
+ "grad_norm": 0.10006701201200485,
36
+ "learning_rate": 2.8499999999999998e-05,
37
+ "loss": 0.69,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.0125,
42
+ "grad_norm": 0.07633858919143677,
43
+ "learning_rate": 3.5999999999999994e-05,
44
+ "loss": 0.6722,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.015,
49
+ "grad_norm": 0.09399061650037766,
50
+ "learning_rate": 4.3499999999999993e-05,
51
+ "loss": 0.6453,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.0175,
56
+ "grad_norm": 0.0843738541007042,
57
+ "learning_rate": 5.1e-05,
58
+ "loss": 0.6276,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.02,
63
+ "grad_norm": 0.08583351224660873,
64
+ "learning_rate": 5.85e-05,
65
+ "loss": 0.58,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.0225,
70
+ "grad_norm": 0.09571370482444763,
71
+ "learning_rate": 6.599999999999999e-05,
72
+ "loss": 0.6355,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.025,
77
+ "grad_norm": 0.1083935871720314,
78
+ "learning_rate": 7.35e-05,
79
+ "loss": 0.589,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.0275,
84
+ "grad_norm": 0.10387319326400757,
85
+ "learning_rate": 8.1e-05,
86
+ "loss": 0.6061,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.03,
91
+ "grad_norm": 0.11083361506462097,
92
+ "learning_rate": 8.849999999999998e-05,
93
+ "loss": 0.572,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.0325,
98
+ "grad_norm": 0.12665686011314392,
99
+ "learning_rate": 9.599999999999999e-05,
100
+ "loss": 0.5442,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.035,
105
+ "grad_norm": 0.1308053582906723,
106
+ "learning_rate": 0.00010349999999999998,
107
+ "loss": 0.6524,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.0375,
112
+ "grad_norm": 0.13535510003566742,
113
+ "learning_rate": 0.00011099999999999999,
114
+ "loss": 0.6404,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.04,
119
+ "grad_norm": 0.12833671271800995,
120
+ "learning_rate": 0.0001185,
121
+ "loss": 0.5717,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.0425,
126
+ "grad_norm": 0.11962099373340607,
127
+ "learning_rate": 0.00012599999999999997,
128
+ "loss": 0.6098,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.045,
133
+ "grad_norm": 0.13898271322250366,
134
+ "learning_rate": 0.0001335,
135
+ "loss": 0.6099,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.0475,
140
+ "grad_norm": 0.14486610889434814,
141
+ "learning_rate": 0.00014099999999999998,
142
+ "loss": 0.5744,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.05,
147
+ "grad_norm": 0.1432138830423355,
148
+ "learning_rate": 0.00014849999999999998,
149
+ "loss": 0.5659,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.0525,
154
+ "grad_norm": 0.13487878441810608,
155
+ "learning_rate": 0.000156,
156
+ "loss": 0.5622,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.055,
161
+ "grad_norm": 0.12495309859514236,
162
+ "learning_rate": 0.0001635,
163
+ "loss": 0.5951,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.0575,
168
+ "grad_norm": 0.13011734187602997,
169
+ "learning_rate": 0.00017099999999999998,
170
+ "loss": 0.6249,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.06,
175
+ "grad_norm": 0.13987745344638824,
176
+ "learning_rate": 0.00017849999999999997,
177
+ "loss": 0.559,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.0625,
182
+ "grad_norm": 0.13373605906963348,
183
+ "learning_rate": 0.000186,
184
+ "loss": 0.5475,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.065,
189
+ "grad_norm": 0.12433867901563644,
190
+ "learning_rate": 0.0001935,
191
+ "loss": 0.5274,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.0675,
196
+ "grad_norm": 0.11097615957260132,
197
+ "learning_rate": 0.000201,
198
+ "loss": 0.678,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.07,
203
+ "grad_norm": 0.1155027225613594,
204
+ "learning_rate": 0.00020849999999999997,
205
+ "loss": 0.5611,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.0725,
210
+ "grad_norm": 0.11431068181991577,
211
+ "learning_rate": 0.00021599999999999996,
212
+ "loss": 0.6054,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.075,
217
+ "grad_norm": 0.09796140342950821,
218
+ "learning_rate": 0.00022349999999999998,
219
+ "loss": 0.5472,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.0775,
224
+ "grad_norm": 0.09489257633686066,
225
+ "learning_rate": 0.00023099999999999998,
226
+ "loss": 0.4636,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.08,
231
+ "grad_norm": 0.10787788033485413,
232
+ "learning_rate": 0.0002385,
233
+ "loss": 0.6164,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.0825,
238
+ "grad_norm": 0.10261733084917068,
239
+ "learning_rate": 0.00024599999999999996,
240
+ "loss": 0.5408,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.085,
245
+ "grad_norm": 0.11870352178812027,
246
+ "learning_rate": 0.0002535,
247
+ "loss": 0.5268,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.0875,
252
+ "grad_norm": 0.11910569667816162,
253
+ "learning_rate": 0.000261,
254
+ "loss": 0.5461,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.09,
259
+ "grad_norm": 0.10083702206611633,
260
+ "learning_rate": 0.00026849999999999997,
261
+ "loss": 0.4794,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.0925,
266
+ "grad_norm": 0.10453511029481888,
267
+ "learning_rate": 0.000276,
268
+ "loss": 0.5539,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.095,
273
+ "grad_norm": 0.101403146982193,
274
+ "learning_rate": 0.00028349999999999995,
275
+ "loss": 0.5346,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.0975,
280
+ "grad_norm": 0.10724789649248123,
281
+ "learning_rate": 0.00029099999999999997,
282
+ "loss": 0.6026,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.1,
287
+ "grad_norm": 0.1140277311205864,
288
+ "learning_rate": 0.0002985,
289
+ "loss": 0.5193,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.1025,
294
+ "grad_norm": 0.09706108272075653,
295
+ "learning_rate": 0.0002999963446058092,
296
+ "loss": 0.54,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.105,
301
+ "grad_norm": 0.10003062337636948,
302
+ "learning_rate": 0.0002999814948722491,
303
+ "loss": 0.5365,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.1075,
308
+ "grad_norm": 0.1078687533736229,
309
+ "learning_rate": 0.00029995522346717746,
310
+ "loss": 0.5889,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.11,
315
+ "grad_norm": 0.10538115352392197,
316
+ "learning_rate": 0.0002999175323912636,
317
+ "loss": 0.5611,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.1125,
322
+ "grad_norm": 0.1020808294415474,
323
+ "learning_rate": 0.00029986842451482874,
324
+ "loss": 0.6103,
325
+ "step": 225
326
+ },
327
+ {
328
+ "epoch": 0.115,
329
+ "grad_norm": 0.09635835886001587,
330
+ "learning_rate": 0.0002998079035776279,
331
+ "loss": 0.5229,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 0.1175,
336
+ "grad_norm": 0.10287190228700638,
337
+ "learning_rate": 0.0002997359741885648,
338
+ "loss": 0.5312,
339
+ "step": 235
340
+ },
341
+ {
342
+ "epoch": 0.12,
343
+ "grad_norm": 0.09160075336694717,
344
+ "learning_rate": 0.0002996526418253408,
345
+ "loss": 0.5673,
346
+ "step": 240
347
+ },
348
+ {
349
+ "epoch": 0.1225,
350
+ "grad_norm": 0.08691006153821945,
351
+ "learning_rate": 0.000299557912834038,
352
+ "loss": 0.5326,
353
+ "step": 245
354
+ },
355
+ {
356
+ "epoch": 0.125,
357
+ "grad_norm": 0.10096988826990128,
358
+ "learning_rate": 0.00029945179442863594,
359
+ "loss": 0.6004,
360
+ "step": 250
361
+ },
362
+ {
363
+ "epoch": 0.1275,
364
+ "grad_norm": 0.09594204276800156,
365
+ "learning_rate": 0.000299334294690462,
366
+ "loss": 0.5516,
367
+ "step": 255
368
+ },
369
+ {
370
+ "epoch": 0.13,
371
+ "grad_norm": 0.10281919687986374,
372
+ "learning_rate": 0.00029920542256757607,
373
+ "loss": 0.5515,
374
+ "step": 260
375
+ },
376
+ {
377
+ "epoch": 0.1325,
378
+ "grad_norm": 0.08547840267419815,
379
+ "learning_rate": 0.00029906518787408944,
380
+ "loss": 0.5243,
381
+ "step": 265
382
+ },
383
+ {
384
+ "epoch": 0.135,
385
+ "grad_norm": 0.10161560773849487,
386
+ "learning_rate": 0.0002989136012894168,
387
+ "loss": 0.5096,
388
+ "step": 270
389
+ },
390
+ {
391
+ "epoch": 0.1375,
392
+ "grad_norm": 0.09101904183626175,
393
+ "learning_rate": 0.0002987506743574635,
394
+ "loss": 0.553,
395
+ "step": 275
396
+ },
397
+ {
398
+ "epoch": 0.14,
399
+ "grad_norm": 0.09769442677497864,
400
+ "learning_rate": 0.0002985764194857463,
401
+ "loss": 0.4953,
402
+ "step": 280
403
+ },
404
+ {
405
+ "epoch": 0.1425,
406
+ "grad_norm": 0.10991579294204712,
407
+ "learning_rate": 0.00029839084994444826,
408
+ "loss": 0.5152,
409
+ "step": 285
410
+ },
411
+ {
412
+ "epoch": 0.145,
413
+ "grad_norm": 0.09450916200876236,
414
+ "learning_rate": 0.00029819397986540836,
415
+ "loss": 0.5397,
416
+ "step": 290
417
+ },
418
+ {
419
+ "epoch": 0.1475,
420
+ "grad_norm": 0.10876069217920303,
421
+ "learning_rate": 0.0002979858242410454,
422
+ "loss": 0.4858,
423
+ "step": 295
424
+ },
425
+ {
426
+ "epoch": 0.15,
427
+ "grad_norm": 0.097995825111866,
428
+ "learning_rate": 0.00029776639892321606,
429
+ "loss": 0.5566,
430
+ "step": 300
431
+ },
432
+ {
433
+ "epoch": 0.1525,
434
+ "grad_norm": 0.1145048514008522,
435
+ "learning_rate": 0.0002975357206220079,
436
+ "loss": 0.4531,
437
+ "step": 305
438
+ },
439
+ {
440
+ "epoch": 0.155,
441
+ "grad_norm": 0.10271880775690079,
442
+ "learning_rate": 0.00029729380690446654,
443
+ "loss": 0.5199,
444
+ "step": 310
445
+ },
446
+ {
447
+ "epoch": 0.1575,
448
+ "grad_norm": 0.11095371842384338,
449
+ "learning_rate": 0.0002970406761932583,
450
+ "loss": 0.5416,
451
+ "step": 315
452
+ },
453
+ {
454
+ "epoch": 0.16,
455
+ "grad_norm": 0.09949438273906708,
456
+ "learning_rate": 0.00029677634776526673,
457
+ "loss": 0.4841,
458
+ "step": 320
459
+ },
460
+ {
461
+ "epoch": 0.1625,
462
+ "grad_norm": 0.1163724958896637,
463
+ "learning_rate": 0.00029650084175012517,
464
+ "loss": 0.4913,
465
+ "step": 325
466
+ },
467
+ {
468
+ "epoch": 0.165,
469
+ "grad_norm": 0.10726840049028397,
470
+ "learning_rate": 0.00029621417912868323,
471
+ "loss": 0.5203,
472
+ "step": 330
473
+ },
474
+ {
475
+ "epoch": 0.1675,
476
+ "grad_norm": 0.09609931707382202,
477
+ "learning_rate": 0.00029591638173140947,
478
+ "loss": 0.5607,
479
+ "step": 335
480
+ },
481
+ {
482
+ "epoch": 0.17,
483
+ "grad_norm": 0.10824442654848099,
484
+ "learning_rate": 0.0002956074722367286,
485
+ "loss": 0.6004,
486
+ "step": 340
487
+ },
488
+ {
489
+ "epoch": 0.1725,
490
+ "grad_norm": 0.10465679317712784,
491
+ "learning_rate": 0.00029528747416929463,
492
+ "loss": 0.5216,
493
+ "step": 345
494
+ },
495
+ {
496
+ "epoch": 0.175,
497
+ "grad_norm": 0.10518354922533035,
498
+ "learning_rate": 0.0002949564118981994,
499
+ "loss": 0.499,
500
+ "step": 350
501
+ },
502
+ {
503
+ "epoch": 0.1775,
504
+ "grad_norm": 0.0955279991030693,
505
+ "learning_rate": 0.0002946143106351165,
506
+ "loss": 0.5607,
507
+ "step": 355
508
+ },
509
+ {
510
+ "epoch": 0.18,
511
+ "grad_norm": 0.11159654706716537,
512
+ "learning_rate": 0.0002942611964323817,
513
+ "loss": 0.5204,
514
+ "step": 360
515
+ },
516
+ {
517
+ "epoch": 0.1825,
518
+ "grad_norm": 0.09571187198162079,
519
+ "learning_rate": 0.0002938970961810086,
520
+ "loss": 0.6113,
521
+ "step": 365
522
+ },
523
+ {
524
+ "epoch": 0.185,
525
+ "grad_norm": 0.11854679882526398,
526
+ "learning_rate": 0.0002935220376086411,
527
+ "loss": 0.5639,
528
+ "step": 370
529
+ },
530
+ {
531
+ "epoch": 0.1875,
532
+ "grad_norm": 0.1050512045621872,
533
+ "learning_rate": 0.0002931360492774415,
534
+ "loss": 0.548,
535
+ "step": 375
536
+ },
537
+ {
538
+ "epoch": 0.19,
539
+ "grad_norm": 0.1053968220949173,
540
+ "learning_rate": 0.0002927391605819157,
541
+ "loss": 0.5507,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 0.1925,
546
+ "grad_norm": 0.10567320138216019,
547
+ "learning_rate": 0.00029233140174667445,
548
+ "loss": 0.5312,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 0.195,
553
+ "grad_norm": 0.11914283782243729,
554
+ "learning_rate": 0.0002919128038241318,
555
+ "loss": 0.5961,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 0.1975,
560
+ "grad_norm": 0.09915795922279358,
561
+ "learning_rate": 0.0002914833986921401,
562
+ "loss": 0.5086,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 0.2,
567
+ "grad_norm": 0.10796502232551575,
568
+ "learning_rate": 0.0002910432190515628,
569
+ "loss": 0.5585,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 0.2025,
574
+ "grad_norm": 0.10748997330665588,
575
+ "learning_rate": 0.00029059229842378373,
576
+ "loss": 0.5466,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 0.205,
581
+ "grad_norm": 0.10696308314800262,
582
+ "learning_rate": 0.0002901306711481544,
583
+ "loss": 0.5513,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 0.2075,
588
+ "grad_norm": 0.10418657958507538,
589
+ "learning_rate": 0.0002896583723793792,
590
+ "loss": 0.5391,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.21,
595
+ "grad_norm": 0.16421550512313843,
596
+ "learning_rate": 0.00028917543808483796,
597
+ "loss": 0.4699,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.2125,
602
+ "grad_norm": 0.12929962575435638,
603
+ "learning_rate": 0.00028868190504184696,
604
+ "loss": 0.4984,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.215,
609
+ "grad_norm": 0.10469454526901245,
610
+ "learning_rate": 0.00028817781083485816,
611
+ "loss": 0.5119,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.2175,
616
+ "grad_norm": 0.0964970663189888,
617
+ "learning_rate": 0.00028766319385259713,
618
+ "loss": 0.5167,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.22,
623
+ "grad_norm": 0.12395574152469635,
624
+ "learning_rate": 0.00028713809328513953,
625
+ "loss": 0.5692,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.2225,
630
+ "grad_norm": 0.10189738124608994,
631
+ "learning_rate": 0.0002866025491209265,
632
+ "loss": 0.4628,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.225,
637
+ "grad_norm": 0.10433454066514969,
638
+ "learning_rate": 0.0002860566021437197,
639
+ "loss": 0.4869,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.2275,
644
+ "grad_norm": 0.13003456592559814,
645
+ "learning_rate": 0.0002855002939294951,
646
+ "loss": 0.5291,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.23,
651
+ "grad_norm": 0.11692202836275101,
652
+ "learning_rate": 0.000284933666843277,
653
+ "loss": 0.5229,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.2325,
658
+ "grad_norm": 0.10757846385240555,
659
+ "learning_rate": 0.0002843567640359119,
660
+ "loss": 0.435,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.235,
665
+ "grad_norm": 0.10775501281023026,
666
+ "learning_rate": 0.00028376962944078206,
667
+ "loss": 0.4418,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.2375,
672
+ "grad_norm": 0.11543692648410797,
673
+ "learning_rate": 0.00028317230777046015,
674
+ "loss": 0.4204,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.24,
679
+ "grad_norm": 0.10946698486804962,
680
+ "learning_rate": 0.00028256484451330403,
681
+ "loss": 0.49,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.2425,
686
+ "grad_norm": 0.11528221517801285,
687
+ "learning_rate": 0.00028194728592999247,
688
+ "loss": 0.4752,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.245,
693
+ "grad_norm": 0.10474205762147903,
694
+ "learning_rate": 0.0002813196790500027,
695
+ "loss": 0.4847,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.2475,
700
+ "grad_norm": 0.10768820345401764,
701
+ "learning_rate": 0.00028068207166802837,
702
+ "loss": 0.4664,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.25,
707
+ "grad_norm": 0.12158560007810593,
708
+ "learning_rate": 0.00028003451234034037,
709
+ "loss": 0.4741,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 0.2525,
714
+ "grad_norm": 0.11635497957468033,
715
+ "learning_rate": 0.0002793770503810886,
716
+ "loss": 0.4969,
717
+ "step": 505
718
+ },
719
+ {
720
+ "epoch": 0.255,
721
+ "grad_norm": 0.12205849587917328,
722
+ "learning_rate": 0.00027870973585854665,
723
+ "loss": 0.4798,
724
+ "step": 510
725
+ },
726
+ {
727
+ "epoch": 0.2575,
728
+ "grad_norm": 0.10270871222019196,
729
+ "learning_rate": 0.00027803261959129905,
730
+ "loss": 0.3888,
731
+ "step": 515
732
+ },
733
+ {
734
+ "epoch": 0.26,
735
+ "grad_norm": 0.11313367635011673,
736
+ "learning_rate": 0.0002773457531443712,
737
+ "loss": 0.4759,
738
+ "step": 520
739
+ },
740
+ {
741
+ "epoch": 0.2625,
742
+ "grad_norm": 0.12905193865299225,
743
+ "learning_rate": 0.00027664918882530225,
744
+ "loss": 0.4442,
745
+ "step": 525
746
+ },
747
+ {
748
+ "epoch": 0.265,
749
+ "grad_norm": 0.11690939962863922,
750
+ "learning_rate": 0.00027594297968016197,
751
+ "loss": 0.5535,
752
+ "step": 530
753
+ },
754
+ {
755
+ "epoch": 0.2675,
756
+ "grad_norm": 0.10021405667066574,
757
+ "learning_rate": 0.00027522717948951094,
758
+ "loss": 0.4717,
759
+ "step": 535
760
+ },
761
+ {
762
+ "epoch": 0.27,
763
+ "grad_norm": 0.10104178637266159,
764
+ "learning_rate": 0.0002745018427643051,
765
+ "loss": 0.4906,
766
+ "step": 540
767
+ },
768
+ {
769
+ "epoch": 0.2725,
770
+ "grad_norm": 0.12113891541957855,
771
+ "learning_rate": 0.00027376702474174425,
772
+ "loss": 0.5674,
773
+ "step": 545
774
+ },
775
+ {
776
+ "epoch": 0.275,
777
+ "grad_norm": 0.11330476403236389,
778
+ "learning_rate": 0.0002730227813810658,
779
+ "loss": 0.5184,
780
+ "step": 550
781
+ },
782
+ {
783
+ "epoch": 0.2775,
784
+ "grad_norm": 0.1025850847363472,
785
+ "learning_rate": 0.0002722691693592831,
786
+ "loss": 0.4395,
787
+ "step": 555
788
+ },
789
+ {
790
+ "epoch": 0.28,
791
+ "grad_norm": 0.11591499298810959,
792
+ "learning_rate": 0.0002715062460668694,
793
+ "loss": 0.5003,
794
+ "step": 560
795
+ },
796
+ {
797
+ "epoch": 0.2825,
798
+ "grad_norm": 0.11281153559684753,
799
+ "learning_rate": 0.0002707340696033871,
800
+ "loss": 0.4672,
801
+ "step": 565
802
+ },
803
+ {
804
+ "epoch": 0.285,
805
+ "grad_norm": 0.1123538464307785,
806
+ "learning_rate": 0.00026995269877306356,
807
+ "loss": 0.513,
808
+ "step": 570
809
+ },
810
+ {
811
+ "epoch": 0.2875,
812
+ "grad_norm": 0.10776390135288239,
813
+ "learning_rate": 0.0002691621930803127,
814
+ "loss": 0.4572,
815
+ "step": 575
816
+ },
817
+ {
818
+ "epoch": 0.29,
819
+ "grad_norm": 0.10008667409420013,
820
+ "learning_rate": 0.0002683626127252036,
821
+ "loss": 0.4618,
822
+ "step": 580
823
+ },
824
+ {
825
+ "epoch": 0.2925,
826
+ "grad_norm": 0.13961340487003326,
827
+ "learning_rate": 0.00026755401859887595,
828
+ "loss": 0.4819,
829
+ "step": 585
830
+ },
831
+ {
832
+ "epoch": 0.295,
833
+ "grad_norm": 0.1476685106754303,
834
+ "learning_rate": 0.00026673647227890316,
835
+ "loss": 0.4964,
836
+ "step": 590
837
+ },
838
+ {
839
+ "epoch": 0.2975,
840
+ "grad_norm": 0.09795507788658142,
841
+ "learning_rate": 0.00026591003602460263,
842
+ "loss": 0.4796,
843
+ "step": 595
844
+ },
845
+ {
846
+ "epoch": 0.3,
847
+ "grad_norm": 0.10903532058000565,
848
+ "learning_rate": 0.00026507477277229496,
849
+ "loss": 0.4775,
850
+ "step": 600
851
+ },
852
+ {
853
+ "epoch": 0.3025,
854
+ "grad_norm": 0.10258448123931885,
855
+ "learning_rate": 0.0002642307461305105,
856
+ "loss": 0.4519,
857
+ "step": 605
858
+ },
859
+ {
860
+ "epoch": 0.305,
861
+ "grad_norm": 0.11204435676336288,
862
+ "learning_rate": 0.0002633780203751459,
863
+ "loss": 0.4451,
864
+ "step": 610
865
+ },
866
+ {
867
+ "epoch": 0.3075,
868
+ "grad_norm": 0.10147629678249359,
869
+ "learning_rate": 0.0002625166604445689,
870
+ "loss": 0.4256,
871
+ "step": 615
872
+ },
873
+ {
874
+ "epoch": 0.31,
875
+ "grad_norm": 0.10481107234954834,
876
+ "learning_rate": 0.00026164673193467306,
877
+ "loss": 0.4381,
878
+ "step": 620
879
+ },
880
+ {
881
+ "epoch": 0.3125,
882
+ "grad_norm": 0.10856641829013824,
883
+ "learning_rate": 0.00026076830109388255,
884
+ "loss": 0.4958,
885
+ "step": 625
886
+ },
887
+ {
888
+ "epoch": 0.315,
889
+ "grad_norm": 0.09918677806854248,
890
+ "learning_rate": 0.0002598814348181068,
891
+ "loss": 0.4335,
892
+ "step": 630
893
+ },
894
+ {
895
+ "epoch": 0.3175,
896
+ "grad_norm": 0.10417389869689941,
897
+ "learning_rate": 0.00025898620064564637,
898
+ "loss": 0.4603,
899
+ "step": 635
900
+ },
901
+ {
902
+ "epoch": 0.32,
903
+ "grad_norm": 0.0903329998254776,
904
+ "learning_rate": 0.00025808266675204954,
905
+ "loss": 0.3932,
906
+ "step": 640
907
+ },
908
+ {
909
+ "epoch": 0.3225,
910
+ "grad_norm": 0.11511855572462082,
911
+ "learning_rate": 0.0002571709019449205,
912
+ "loss": 0.4169,
913
+ "step": 645
914
+ },
915
+ {
916
+ "epoch": 0.325,
917
+ "grad_norm": 0.11355557292699814,
918
+ "learning_rate": 0.0002562509756586793,
919
+ "loss": 0.4455,
920
+ "step": 650
921
+ },
922
+ {
923
+ "epoch": 0.3275,
924
+ "grad_norm": 0.1271187961101532,
925
+ "learning_rate": 0.00025532295794927437,
926
+ "loss": 0.4902,
927
+ "step": 655
928
+ },
929
+ {
930
+ "epoch": 0.33,
931
+ "grad_norm": 0.11936645954847336,
932
+ "learning_rate": 0.0002543869194888471,
933
+ "loss": 0.4843,
934
+ "step": 660
935
+ },
936
+ {
937
+ "epoch": 0.3325,
938
+ "grad_norm": 0.11935465037822723,
939
+ "learning_rate": 0.00025344293156035044,
940
+ "loss": 0.4402,
941
+ "step": 665
942
+ },
943
+ {
944
+ "epoch": 0.335,
945
+ "grad_norm": 0.13073407113552094,
946
+ "learning_rate": 0.00025249106605211986,
947
+ "loss": 0.467,
948
+ "step": 670
949
+ },
950
+ {
951
+ "epoch": 0.3375,
952
+ "grad_norm": 0.10340435802936554,
953
+ "learning_rate": 0.0002515313954523991,
954
+ "loss": 0.4827,
955
+ "step": 675
956
+ },
957
+ {
958
+ "epoch": 0.34,
959
+ "grad_norm": 0.11634550243616104,
960
+ "learning_rate": 0.00025056399284381983,
961
+ "loss": 0.466,
962
+ "step": 680
963
+ },
964
+ {
965
+ "epoch": 0.3425,
966
+ "grad_norm": 0.10582319647073746,
967
+ "learning_rate": 0.0002495889318978362,
968
+ "loss": 0.4751,
969
+ "step": 685
970
+ },
971
+ {
972
+ "epoch": 0.345,
973
+ "grad_norm": 0.16781780123710632,
974
+ "learning_rate": 0.00024860628686911436,
975
+ "loss": 0.4717,
976
+ "step": 690
977
+ },
978
+ {
979
+ "epoch": 0.3475,
980
+ "grad_norm": 0.11522196233272552,
981
+ "learning_rate": 0.0002476161325898776,
982
+ "loss": 0.4687,
983
+ "step": 695
984
+ },
985
+ {
986
+ "epoch": 0.35,
987
+ "grad_norm": 0.11830449104309082,
988
+ "learning_rate": 0.000246618544464208,
989
+ "loss": 0.436,
990
+ "step": 700
991
+ },
992
+ {
993
+ "epoch": 0.3525,
994
+ "grad_norm": 0.17485427856445312,
995
+ "learning_rate": 0.0002456135984623034,
996
+ "loss": 0.4284,
997
+ "step": 705
998
+ },
999
+ {
1000
+ "epoch": 0.355,
1001
+ "grad_norm": 0.12288108468055725,
1002
+ "learning_rate": 0.00024460137111469296,
1003
+ "loss": 0.4261,
1004
+ "step": 710
1005
+ },
1006
+ {
1007
+ "epoch": 0.3575,
1008
+ "grad_norm": 0.11587081104516983,
1009
+ "learning_rate": 0.0002435819395064079,
1010
+ "loss": 0.4493,
1011
+ "step": 715
1012
+ },
1013
+ {
1014
+ "epoch": 0.36,
1015
+ "grad_norm": 0.10690271109342575,
1016
+ "learning_rate": 0.0002425553812711123,
1017
+ "loss": 0.4648,
1018
+ "step": 720
1019
+ },
1020
+ {
1021
+ "epoch": 0.3625,
1022
+ "grad_norm": 0.10404397547245026,
1023
+ "learning_rate": 0.00024152177458519014,
1024
+ "loss": 0.4634,
1025
+ "step": 725
1026
+ },
1027
+ {
1028
+ "epoch": 0.365,
1029
+ "grad_norm": 0.11986954510211945,
1030
+ "learning_rate": 0.00024048119816179236,
1031
+ "loss": 0.4525,
1032
+ "step": 730
1033
+ },
1034
+ {
1035
+ "epoch": 0.3675,
1036
+ "grad_norm": 0.10243026167154312,
1037
+ "learning_rate": 0.00023943373124484234,
1038
+ "loss": 0.4572,
1039
+ "step": 735
1040
+ },
1041
+ {
1042
+ "epoch": 0.37,
1043
+ "grad_norm": 0.10386748611927032,
1044
+ "learning_rate": 0.00023837945360300129,
1045
+ "loss": 0.3884,
1046
+ "step": 740
1047
+ },
1048
+ {
1049
+ "epoch": 0.3725,
1050
+ "grad_norm": 0.11165735125541687,
1051
+ "learning_rate": 0.0002373184455235934,
1052
+ "loss": 0.4902,
1053
+ "step": 745
1054
+ },
1055
+ {
1056
+ "epoch": 0.375,
1057
+ "grad_norm": 0.09951601922512054,
1058
+ "learning_rate": 0.00023625078780649178,
1059
+ "loss": 0.4541,
1060
+ "step": 750
1061
+ },
1062
+ {
1063
+ "epoch": 0.3775,
1064
+ "grad_norm": 0.10347504913806915,
1065
+ "learning_rate": 0.00023517656175796518,
1066
+ "loss": 0.3871,
1067
+ "step": 755
1068
+ },
1069
+ {
1070
+ "epoch": 0.38,
1071
+ "grad_norm": 0.10478132963180542,
1072
+ "learning_rate": 0.00023409584918448627,
1073
+ "loss": 0.4329,
1074
+ "step": 760
1075
+ },
1076
+ {
1077
+ "epoch": 0.3825,
1078
+ "grad_norm": 0.1198212131857872,
1079
+ "learning_rate": 0.00023300873238650159,
1080
+ "loss": 0.425,
1081
+ "step": 765
1082
+ },
1083
+ {
1084
+ "epoch": 0.385,
1085
+ "grad_norm": 0.1103711724281311,
1086
+ "learning_rate": 0.00023191529415216434,
1087
+ "loss": 0.4274,
1088
+ "step": 770
1089
+ },
1090
+ {
1091
+ "epoch": 0.3875,
1092
+ "grad_norm": 0.09940385073423386,
1093
+ "learning_rate": 0.00023081561775102944,
1094
+ "loss": 0.4368,
1095
+ "step": 775
1096
+ },
1097
+ {
1098
+ "epoch": 0.39,
1099
+ "grad_norm": 0.11599268019199371,
1100
+ "learning_rate": 0.00022970978692771242,
1101
+ "loss": 0.4386,
1102
+ "step": 780
1103
+ },
1104
+ {
1105
+ "epoch": 0.3925,
1106
+ "grad_norm": 0.10101296752691269,
1107
+ "learning_rate": 0.00022859788589551188,
1108
+ "loss": 0.4696,
1109
+ "step": 785
1110
+ },
1111
+ {
1112
+ "epoch": 0.395,
1113
+ "grad_norm": 0.10112808644771576,
1114
+ "learning_rate": 0.00022747999932999624,
1115
+ "loss": 0.4066,
1116
+ "step": 790
1117
+ },
1118
+ {
1119
+ "epoch": 0.3975,
1120
+ "grad_norm": 0.09595459699630737,
1121
+ "learning_rate": 0.00022635621236255567,
1122
+ "loss": 0.4837,
1123
+ "step": 795
1124
+ },
1125
+ {
1126
+ "epoch": 0.4,
1127
+ "grad_norm": 0.10761380940675735,
1128
+ "learning_rate": 0.00022522661057391857,
1129
+ "loss": 0.5446,
1130
+ "step": 800
1131
+ },
1132
+ {
1133
+ "epoch": 0.4025,
1134
+ "grad_norm": 0.11919954419136047,
1135
+ "learning_rate": 0.00022409127998763463,
1136
+ "loss": 0.5027,
1137
+ "step": 805
1138
+ },
1139
+ {
1140
+ "epoch": 0.405,
1141
+ "grad_norm": 0.10851597785949707,
1142
+ "learning_rate": 0.00022295030706352356,
1143
+ "loss": 0.4481,
1144
+ "step": 810
1145
+ },
1146
+ {
1147
+ "epoch": 0.4075,
1148
+ "grad_norm": 0.10030311346054077,
1149
+ "learning_rate": 0.00022180377869109104,
1150
+ "loss": 0.4709,
1151
+ "step": 815
1152
+ },
1153
+ {
1154
+ "epoch": 0.41,
1155
+ "grad_norm": 0.111280657351017,
1156
+ "learning_rate": 0.00022065178218291147,
1157
+ "loss": 0.4423,
1158
+ "step": 820
1159
+ },
1160
+ {
1161
+ "epoch": 0.4125,
1162
+ "grad_norm": 0.11253602802753448,
1163
+ "learning_rate": 0.00021949440526797926,
1164
+ "loss": 0.4136,
1165
+ "step": 825
1166
+ },
1167
+ {
1168
+ "epoch": 0.415,
1169
+ "grad_norm": 0.10805424302816391,
1170
+ "learning_rate": 0.00021833173608502732,
1171
+ "loss": 0.4656,
1172
+ "step": 830
1173
+ },
1174
+ {
1175
+ "epoch": 0.4175,
1176
+ "grad_norm": 0.10983198881149292,
1177
+ "learning_rate": 0.00021716386317581542,
1178
+ "loss": 0.3687,
1179
+ "step": 835
1180
+ },
1181
+ {
1182
+ "epoch": 0.42,
1183
+ "grad_norm": 0.10653118044137955,
1184
+ "learning_rate": 0.00021599087547838727,
1185
+ "loss": 0.4654,
1186
+ "step": 840
1187
+ },
1188
+ {
1189
+ "epoch": 0.4225,
1190
+ "grad_norm": 0.10856354981660843,
1191
+ "learning_rate": 0.00021481286232029735,
1192
+ "loss": 0.4298,
1193
+ "step": 845
1194
+ },
1195
+ {
1196
+ "epoch": 0.425,
1197
+ "grad_norm": 0.11233706772327423,
1198
+ "learning_rate": 0.0002136299134118085,
1199
+ "loss": 0.4484,
1200
+ "step": 850
1201
+ },
1202
+ {
1203
+ "epoch": 0.4275,
1204
+ "grad_norm": 0.1085442528128624,
1205
+ "learning_rate": 0.00021244211883906017,
1206
+ "loss": 0.4776,
1207
+ "step": 855
1208
+ },
1209
+ {
1210
+ "epoch": 0.43,
1211
+ "grad_norm": 0.12297824025154114,
1212
+ "learning_rate": 0.0002112495690572077,
1213
+ "loss": 0.4029,
1214
+ "step": 860
1215
+ },
1216
+ {
1217
+ "epoch": 0.4325,
1218
+ "grad_norm": 0.10838114470243454,
1219
+ "learning_rate": 0.00021005235488353428,
1220
+ "loss": 0.4848,
1221
+ "step": 865
1222
+ },
1223
+ {
1224
+ "epoch": 0.435,
1225
+ "grad_norm": 0.10273341834545135,
1226
+ "learning_rate": 0.0002088505674905342,
1227
+ "loss": 0.3989,
1228
+ "step": 870
1229
+ },
1230
+ {
1231
+ "epoch": 0.4375,
1232
+ "grad_norm": 0.11189126968383789,
1233
+ "learning_rate": 0.0002076442983989705,
1234
+ "loss": 0.438,
1235
+ "step": 875
1236
+ },
1237
+ {
1238
+ "epoch": 0.44,
1239
+ "grad_norm": 0.11592905968427658,
1240
+ "learning_rate": 0.0002064336394709048,
1241
+ "loss": 0.4786,
1242
+ "step": 880
1243
+ },
1244
+ {
1245
+ "epoch": 0.4425,
1246
+ "grad_norm": 0.11230389773845673,
1247
+ "learning_rate": 0.0002052186829027017,
1248
+ "loss": 0.3999,
1249
+ "step": 885
1250
+ },
1251
+ {
1252
+ "epoch": 0.445,
1253
+ "grad_norm": 0.12455113977193832,
1254
+ "learning_rate": 0.00020399952121800767,
1255
+ "loss": 0.4856,
1256
+ "step": 890
1257
+ },
1258
+ {
1259
+ "epoch": 0.4475,
1260
+ "grad_norm": 0.1001812294125557,
1261
+ "learning_rate": 0.00020277624726070526,
1262
+ "loss": 0.4689,
1263
+ "step": 895
1264
+ },
1265
+ {
1266
+ "epoch": 0.45,
1267
+ "grad_norm": 0.11319112777709961,
1268
+ "learning_rate": 0.00020154895418784242,
1269
+ "loss": 0.3998,
1270
+ "step": 900
1271
+ },
1272
+ {
1273
+ "epoch": 0.4525,
1274
+ "grad_norm": 0.11322236061096191,
1275
+ "learning_rate": 0.00020031773546253824,
1276
+ "loss": 0.4321,
1277
+ "step": 905
1278
+ },
1279
+ {
1280
+ "epoch": 0.455,
1281
+ "grad_norm": 0.12924689054489136,
1282
+ "learning_rate": 0.00019908268484686558,
1283
+ "loss": 0.4208,
1284
+ "step": 910
1285
+ },
1286
+ {
1287
+ "epoch": 0.4575,
1288
+ "grad_norm": 0.11435618251562119,
1289
+ "learning_rate": 0.00019784389639471048,
1290
+ "loss": 0.4682,
1291
+ "step": 915
1292
+ },
1293
+ {
1294
+ "epoch": 0.46,
1295
+ "grad_norm": 0.10801081359386444,
1296
+ "learning_rate": 0.00019660146444460975,
1297
+ "loss": 0.428,
1298
+ "step": 920
1299
+ },
1300
+ {
1301
+ "epoch": 0.4625,
1302
+ "grad_norm": 0.10906939953565598,
1303
+ "learning_rate": 0.0001953554836125667,
1304
+ "loss": 0.4455,
1305
+ "step": 925
1306
+ },
1307
+ {
1308
+ "epoch": 0.465,
1309
+ "grad_norm": 0.10790123790502548,
1310
+ "learning_rate": 0.00019410604878484556,
1311
+ "loss": 0.4544,
1312
+ "step": 930
1313
+ },
1314
+ {
1315
+ "epoch": 0.4675,
1316
+ "grad_norm": 0.10536376386880875,
1317
+ "learning_rate": 0.000192853255110746,
1318
+ "loss": 0.376,
1319
+ "step": 935
1320
+ },
1321
+ {
1322
+ "epoch": 0.47,
1323
+ "grad_norm": 0.11744682490825653,
1324
+ "learning_rate": 0.00019159719799535668,
1325
+ "loss": 0.3887,
1326
+ "step": 940
1327
+ },
1328
+ {
1329
+ "epoch": 0.4725,
1330
+ "grad_norm": 0.12954068183898926,
1331
+ "learning_rate": 0.00019033797309228983,
1332
+ "loss": 0.4075,
1333
+ "step": 945
1334
+ },
1335
+ {
1336
+ "epoch": 0.475,
1337
+ "grad_norm": 0.1401606798171997,
1338
+ "learning_rate": 0.00018907567629639725,
1339
+ "loss": 0.4454,
1340
+ "step": 950
1341
+ },
1342
+ {
1343
+ "epoch": 0.4775,
1344
+ "grad_norm": 0.12059322744607925,
1345
+ "learning_rate": 0.00018781040373646706,
1346
+ "loss": 0.4339,
1347
+ "step": 955
1348
+ },
1349
+ {
1350
+ "epoch": 0.48,
1351
+ "grad_norm": 0.11798987537622452,
1352
+ "learning_rate": 0.00018654225176790336,
1353
+ "loss": 0.4405,
1354
+ "step": 960
1355
+ },
1356
+ {
1357
+ "epoch": 0.4825,
1358
+ "grad_norm": 0.11344211548566818,
1359
+ "learning_rate": 0.00018527131696538846,
1360
+ "loss": 0.4124,
1361
+ "step": 965
1362
+ },
1363
+ {
1364
+ "epoch": 0.485,
1365
+ "grad_norm": 0.10373330116271973,
1366
+ "learning_rate": 0.00018399769611552824,
1367
+ "loss": 0.4329,
1368
+ "step": 970
1369
+ },
1370
+ {
1371
+ "epoch": 0.4875,
1372
+ "grad_norm": 0.12053704261779785,
1373
+ "learning_rate": 0.0001827214862094814,
1374
+ "loss": 0.4944,
1375
+ "step": 975
1376
+ },
1377
+ {
1378
+ "epoch": 0.49,
1379
+ "grad_norm": 0.141033336520195,
1380
+ "learning_rate": 0.00018144278443557328,
1381
+ "loss": 0.4569,
1382
+ "step": 980
1383
+ },
1384
+ {
1385
+ "epoch": 0.4925,
1386
+ "grad_norm": 0.10922867804765701,
1387
+ "learning_rate": 0.0001801616881718947,
1388
+ "loss": 0.3879,
1389
+ "step": 985
1390
+ },
1391
+ {
1392
+ "epoch": 0.495,
1393
+ "grad_norm": 0.09843657910823822,
1394
+ "learning_rate": 0.00017887829497888612,
1395
+ "loss": 0.4106,
1396
+ "step": 990
1397
+ },
1398
+ {
1399
+ "epoch": 0.4975,
1400
+ "grad_norm": 0.12131062150001526,
1401
+ "learning_rate": 0.000177592702591908,
1402
+ "loss": 0.4023,
1403
+ "step": 995
1404
+ },
1405
+ {
1406
+ "epoch": 0.5,
1407
+ "grad_norm": 0.11343283206224442,
1408
+ "learning_rate": 0.00017630500891379806,
1409
+ "loss": 0.4824,
1410
+ "step": 1000
1411
+ },
1412
+ {
1413
+ "epoch": 0.5025,
1414
+ "grad_norm": 0.11050508171319962,
1415
+ "learning_rate": 0.00017501531200741534,
1416
+ "loss": 0.4098,
1417
+ "step": 1005
1418
+ },
1419
+ {
1420
+ "epoch": 0.505,
1421
+ "grad_norm": 0.11737144738435745,
1422
+ "learning_rate": 0.00017372371008817256,
1423
+ "loss": 0.3943,
1424
+ "step": 1010
1425
+ },
1426
+ {
1427
+ "epoch": 0.5075,
1428
+ "grad_norm": 0.11473528295755386,
1429
+ "learning_rate": 0.00017243030151655643,
1430
+ "loss": 0.3796,
1431
+ "step": 1015
1432
+ },
1433
+ {
1434
+ "epoch": 0.51,
1435
+ "grad_norm": 0.13086555898189545,
1436
+ "learning_rate": 0.00017113518479063738,
1437
+ "loss": 0.4367,
1438
+ "step": 1020
1439
+ },
1440
+ {
1441
+ "epoch": 0.5125,
1442
+ "grad_norm": 0.11752833425998688,
1443
+ "learning_rate": 0.00016983845853856837,
1444
+ "loss": 0.4097,
1445
+ "step": 1025
1446
+ },
1447
+ {
1448
+ "epoch": 0.515,
1449
+ "grad_norm": 0.11596900969743729,
1450
+ "learning_rate": 0.0001685402215110739,
1451
+ "loss": 0.3812,
1452
+ "step": 1030
1453
+ },
1454
+ {
1455
+ "epoch": 0.5175,
1456
+ "grad_norm": 0.11850260943174362,
1457
+ "learning_rate": 0.00016724057257392998,
1458
+ "loss": 0.4354,
1459
+ "step": 1035
1460
+ },
1461
+ {
1462
+ "epoch": 0.52,
1463
+ "grad_norm": 0.12466365844011307,
1464
+ "learning_rate": 0.00016593961070043498,
1465
+ "loss": 0.4317,
1466
+ "step": 1040
1467
+ },
1468
+ {
1469
+ "epoch": 0.5225,
1470
+ "grad_norm": 0.11178991943597794,
1471
+ "learning_rate": 0.0001646374349638724,
1472
+ "loss": 0.3936,
1473
+ "step": 1045
1474
+ },
1475
+ {
1476
+ "epoch": 0.525,
1477
+ "grad_norm": 0.11252165585756302,
1478
+ "learning_rate": 0.00016333414452996623,
1479
+ "loss": 0.386,
1480
+ "step": 1050
1481
+ },
1482
+ {
1483
+ "epoch": 0.5275,
1484
+ "grad_norm": 0.12886975705623627,
1485
+ "learning_rate": 0.0001620298386493288,
1486
+ "loss": 0.3965,
1487
+ "step": 1055
1488
+ },
1489
+ {
1490
+ "epoch": 0.53,
1491
+ "grad_norm": 0.11716549098491669,
1492
+ "learning_rate": 0.00016072461664990288,
1493
+ "loss": 0.3924,
1494
+ "step": 1060
1495
+ },
1496
+ {
1497
+ "epoch": 0.5325,
1498
+ "grad_norm": 0.11604485660791397,
1499
+ "learning_rate": 0.000159418577929397,
1500
+ "loss": 0.3624,
1501
+ "step": 1065
1502
+ },
1503
+ {
1504
+ "epoch": 0.535,
1505
+ "grad_norm": 0.11538460850715637,
1506
+ "learning_rate": 0.00015811182194771633,
1507
+ "loss": 0.4338,
1508
+ "step": 1070
1509
+ },
1510
+ {
1511
+ "epoch": 0.5375,
1512
+ "grad_norm": 0.11618762463331223,
1513
+ "learning_rate": 0.00015680444821938804,
1514
+ "loss": 0.4058,
1515
+ "step": 1075
1516
+ },
1517
+ {
1518
+ "epoch": 0.54,
1519
+ "grad_norm": 0.11750835925340652,
1520
+ "learning_rate": 0.00015549655630598343,
1521
+ "loss": 0.4422,
1522
+ "step": 1080
1523
+ },
1524
+ {
1525
+ "epoch": 0.5425,
1526
+ "grad_norm": 0.12725204229354858,
1527
+ "learning_rate": 0.00015418824580853535,
1528
+ "loss": 0.4422,
1529
+ "step": 1085
1530
+ },
1531
+ {
1532
+ "epoch": 0.545,
1533
+ "grad_norm": 0.11274927109479904,
1534
+ "learning_rate": 0.00015287961635995347,
1535
+ "loss": 0.4229,
1536
+ "step": 1090
1537
+ },
1538
+ {
1539
+ "epoch": 0.5475,
1540
+ "grad_norm": 0.11833129078149796,
1541
+ "learning_rate": 0.00015157076761743686,
1542
+ "loss": 0.4442,
1543
+ "step": 1095
1544
+ },
1545
+ {
1546
+ "epoch": 0.55,
1547
+ "grad_norm": 0.11384794861078262,
1548
+ "learning_rate": 0.00015026179925488475,
1549
+ "loss": 0.4528,
1550
+ "step": 1100
1551
+ },
1552
+ {
1553
+ "epoch": 0.5525,
1554
+ "grad_norm": 0.11864661425352097,
1555
+ "learning_rate": 0.00014895281095530575,
1556
+ "loss": 0.3988,
1557
+ "step": 1105
1558
+ },
1559
+ {
1560
+ "epoch": 0.555,
1561
+ "grad_norm": 0.11673832684755325,
1562
+ "learning_rate": 0.00014764390240322691,
1563
+ "loss": 0.3544,
1564
+ "step": 1110
1565
+ },
1566
+ {
1567
+ "epoch": 0.5575,
1568
+ "grad_norm": 0.1174502745270729,
1569
+ "learning_rate": 0.00014633517327710202,
1570
+ "loss": 0.4034,
1571
+ "step": 1115
1572
+ },
1573
+ {
1574
+ "epoch": 0.56,
1575
+ "grad_norm": 0.12685547769069672,
1576
+ "learning_rate": 0.00014502672324172107,
1577
+ "loss": 0.3595,
1578
+ "step": 1120
1579
+ },
1580
+ {
1581
+ "epoch": 0.5625,
1582
+ "grad_norm": 0.12368053942918777,
1583
+ "learning_rate": 0.00014371865194062007,
1584
+ "loss": 0.3395,
1585
+ "step": 1125
1586
+ },
1587
+ {
1588
+ "epoch": 0.565,
1589
+ "grad_norm": 0.1077839657664299,
1590
+ "learning_rate": 0.000142411058988493,
1591
+ "loss": 0.4199,
1592
+ "step": 1130
1593
+ },
1594
+ {
1595
+ "epoch": 0.5675,
1596
+ "grad_norm": 0.11699855327606201,
1597
+ "learning_rate": 0.00014110404396360576,
1598
+ "loss": 0.3443,
1599
+ "step": 1135
1600
+ },
1601
+ {
1602
+ "epoch": 0.57,
1603
+ "grad_norm": 0.13238464295864105,
1604
+ "learning_rate": 0.0001397977064002128,
1605
+ "loss": 0.3499,
1606
+ "step": 1140
1607
+ },
1608
+ {
1609
+ "epoch": 0.5725,
1610
+ "grad_norm": 0.11482933163642883,
1611
+ "learning_rate": 0.0001384921457809772,
1612
+ "loss": 0.3619,
1613
+ "step": 1145
1614
+ },
1615
+ {
1616
+ "epoch": 0.575,
1617
+ "grad_norm": 0.13390353322029114,
1618
+ "learning_rate": 0.00013718746152939487,
1619
+ "loss": 0.3684,
1620
+ "step": 1150
1621
+ },
1622
+ {
1623
+ "epoch": 0.5775,
1624
+ "grad_norm": 0.11464900523424149,
1625
+ "learning_rate": 0.00013588375300222283,
1626
+ "loss": 0.3313,
1627
+ "step": 1155
1628
+ },
1629
+ {
1630
+ "epoch": 0.58,
1631
+ "grad_norm": 0.10367871820926666,
1632
+ "learning_rate": 0.00013458111948191296,
1633
+ "loss": 0.3323,
1634
+ "step": 1160
1635
+ },
1636
+ {
1637
+ "epoch": 0.5825,
1638
+ "grad_norm": 0.12259294092655182,
1639
+ "learning_rate": 0.0001332796601690512,
1640
+ "loss": 0.3986,
1641
+ "step": 1165
1642
+ },
1643
+ {
1644
+ "epoch": 0.585,
1645
+ "grad_norm": 0.10923358052968979,
1646
+ "learning_rate": 0.00013197947417480292,
1647
+ "loss": 0.3808,
1648
+ "step": 1170
1649
+ },
1650
+ {
1651
+ "epoch": 0.5875,
1652
+ "grad_norm": 0.12479504942893982,
1653
+ "learning_rate": 0.0001306806605133656,
1654
+ "loss": 0.4429,
1655
+ "step": 1175
1656
+ },
1657
+ {
1658
+ "epoch": 0.59,
1659
+ "grad_norm": 0.11521733552217484,
1660
+ "learning_rate": 0.000129383318094428,
1661
+ "loss": 0.4778,
1662
+ "step": 1180
1663
+ },
1664
+ {
1665
+ "epoch": 0.5925,
1666
+ "grad_norm": 0.14112086594104767,
1667
+ "learning_rate": 0.00012808754571563827,
1668
+ "loss": 0.4634,
1669
+ "step": 1185
1670
+ },
1671
+ {
1672
+ "epoch": 0.595,
1673
+ "grad_norm": 0.12947902083396912,
1674
+ "learning_rate": 0.00012679344205507981,
1675
+ "loss": 0.4439,
1676
+ "step": 1190
1677
+ },
1678
+ {
1679
+ "epoch": 0.5975,
1680
+ "grad_norm": 0.13288578391075134,
1681
+ "learning_rate": 0.0001255011056637567,
1682
+ "loss": 0.4402,
1683
+ "step": 1195
1684
+ },
1685
+ {
1686
+ "epoch": 0.6,
1687
+ "grad_norm": 0.1216069906949997,
1688
+ "learning_rate": 0.00012421063495808853,
1689
+ "loss": 0.4203,
1690
+ "step": 1200
1691
+ },
1692
+ {
1693
+ "epoch": 0.6025,
1694
+ "grad_norm": 0.11649637669324875,
1695
+ "learning_rate": 0.000122922128212416,
1696
+ "loss": 0.4512,
1697
+ "step": 1205
1698
+ },
1699
+ {
1700
+ "epoch": 0.605,
1701
+ "grad_norm": 0.1201406940817833,
1702
+ "learning_rate": 0.00012163568355151628,
1703
+ "loss": 0.3725,
1704
+ "step": 1210
1705
+ },
1706
+ {
1707
+ "epoch": 0.6075,
1708
+ "grad_norm": 0.12117727100849152,
1709
+ "learning_rate": 0.00012035139894313107,
1710
+ "loss": 0.4352,
1711
+ "step": 1215
1712
+ },
1713
+ {
1714
+ "epoch": 0.61,
1715
+ "grad_norm": 0.11709322035312653,
1716
+ "learning_rate": 0.00011906937219050556,
1717
+ "loss": 0.4189,
1718
+ "step": 1220
1719
+ },
1720
+ {
1721
+ "epoch": 0.6125,
1722
+ "grad_norm": 0.11865726858377457,
1723
+ "learning_rate": 0.0001177897009249405,
1724
+ "loss": 0.3796,
1725
+ "step": 1225
1726
+ },
1727
+ {
1728
+ "epoch": 0.615,
1729
+ "grad_norm": 0.10807759314775467,
1730
+ "learning_rate": 0.0001165124825983573,
1731
+ "loss": 0.4465,
1732
+ "step": 1230
1733
+ },
1734
+ {
1735
+ "epoch": 0.6175,
1736
+ "grad_norm": 0.13788209855556488,
1737
+ "learning_rate": 0.00011523781447587641,
1738
+ "loss": 0.4994,
1739
+ "step": 1235
1740
+ },
1741
+ {
1742
+ "epoch": 0.62,
1743
+ "grad_norm": 0.12921364605426788,
1744
+ "learning_rate": 0.00011396579362841044,
1745
+ "loss": 0.4251,
1746
+ "step": 1240
1747
+ },
1748
+ {
1749
+ "epoch": 0.6225,
1750
+ "grad_norm": 0.12162365019321442,
1751
+ "learning_rate": 0.0001126965169252718,
1752
+ "loss": 0.3864,
1753
+ "step": 1245
1754
+ },
1755
+ {
1756
+ "epoch": 0.625,
1757
+ "grad_norm": 0.12897826731204987,
1758
+ "learning_rate": 0.00011143008102679559,
1759
+ "loss": 0.3753,
1760
+ "step": 1250
1761
+ },
1762
+ {
1763
+ "epoch": 0.6275,
1764
+ "grad_norm": 0.116109699010849,
1765
+ "learning_rate": 0.00011016658237697866,
1766
+ "loss": 0.3296,
1767
+ "step": 1255
1768
+ },
1769
+ {
1770
+ "epoch": 0.63,
1771
+ "grad_norm": 0.12935414910316467,
1772
+ "learning_rate": 0.00010890611719613512,
1773
+ "loss": 0.3797,
1774
+ "step": 1260
1775
+ },
1776
+ {
1777
+ "epoch": 0.6325,
1778
+ "grad_norm": 0.13730891048908234,
1779
+ "learning_rate": 0.0001076487814735685,
1780
+ "loss": 0.3711,
1781
+ "step": 1265
1782
+ },
1783
+ {
1784
+ "epoch": 0.635,
1785
+ "grad_norm": 0.13870631158351898,
1786
+ "learning_rate": 0.00010639467096026211,
1787
+ "loss": 0.4328,
1788
+ "step": 1270
1789
+ },
1790
+ {
1791
+ "epoch": 0.6375,
1792
+ "grad_norm": 0.11644043773412704,
1793
+ "learning_rate": 0.00010514388116158701,
1794
+ "loss": 0.3283,
1795
+ "step": 1275
1796
+ },
1797
+ {
1798
+ "epoch": 0.64,
1799
+ "grad_norm": 0.12221091985702515,
1800
+ "learning_rate": 0.00010389650733002894,
1801
+ "loss": 0.3898,
1802
+ "step": 1280
1803
+ },
1804
+ {
1805
+ "epoch": 0.6425,
1806
+ "grad_norm": 0.12048634141683578,
1807
+ "learning_rate": 0.00010265264445793464,
1808
+ "loss": 0.3256,
1809
+ "step": 1285
1810
+ },
1811
+ {
1812
+ "epoch": 0.645,
1813
+ "grad_norm": 0.1250566840171814,
1814
+ "learning_rate": 0.00010141238727027761,
1815
+ "loss": 0.408,
1816
+ "step": 1290
1817
+ },
1818
+ {
1819
+ "epoch": 0.6475,
1820
+ "grad_norm": 0.13518592715263367,
1821
+ "learning_rate": 0.00010017583021744454,
1822
+ "loss": 0.3763,
1823
+ "step": 1295
1824
+ },
1825
+ {
1826
+ "epoch": 0.65,
1827
+ "grad_norm": 0.13047736883163452,
1828
+ "learning_rate": 9.89430674680425e-05,
1829
+ "loss": 0.3989,
1830
+ "step": 1300
1831
+ },
1832
+ {
1833
+ "epoch": 0.6525,
1834
+ "grad_norm": 0.11474955826997757,
1835
+ "learning_rate": 9.771419290172773e-05,
1836
+ "loss": 0.3374,
1837
+ "step": 1305
1838
+ },
1839
+ {
1840
+ "epoch": 0.655,
1841
+ "grad_norm": 0.11670063436031342,
1842
+ "learning_rate": 9.648930010205619e-05,
1843
+ "loss": 0.3343,
1844
+ "step": 1310
1845
+ },
1846
+ {
1847
+ "epoch": 0.6575,
1848
+ "grad_norm": 0.15385080873966217,
1849
+ "learning_rate": 9.526848234935704e-05,
1850
+ "loss": 0.3432,
1851
+ "step": 1315
1852
+ },
1853
+ {
1854
+ "epoch": 0.66,
1855
+ "grad_norm": 0.13441519439220428,
1856
+ "learning_rate": 9.405183261362863e-05,
1857
+ "loss": 0.3116,
1858
+ "step": 1320
1859
+ },
1860
+ {
1861
+ "epoch": 0.6625,
1862
+ "grad_norm": 0.14772167801856995,
1863
+ "learning_rate": 9.283944354745888e-05,
1864
+ "loss": 0.3613,
1865
+ "step": 1325
1866
+ },
1867
+ {
1868
+ "epoch": 0.665,
1869
+ "grad_norm": 0.12146154791116714,
1870
+ "learning_rate": 9.163140747896907e-05,
1871
+ "loss": 0.3411,
1872
+ "step": 1330
1873
+ },
1874
+ {
1875
+ "epoch": 0.6675,
1876
+ "grad_norm": 0.1333102583885193,
1877
+ "learning_rate": 9.042781640478291e-05,
1878
+ "loss": 0.396,
1879
+ "step": 1335
1880
+ },
1881
+ {
1882
+ "epoch": 0.67,
1883
+ "grad_norm": 0.12051521986722946,
1884
+ "learning_rate": 8.922876198302062e-05,
1885
+ "loss": 0.3837,
1886
+ "step": 1340
1887
+ },
1888
+ {
1889
+ "epoch": 0.6725,
1890
+ "grad_norm": 0.12071400880813599,
1891
+ "learning_rate": 8.803433552631874e-05,
1892
+ "loss": 0.354,
1893
+ "step": 1345
1894
+ },
1895
+ {
1896
+ "epoch": 0.675,
1897
+ "grad_norm": 0.11258620023727417,
1898
+ "learning_rate": 8.684462799487635e-05,
1899
+ "loss": 0.3197,
1900
+ "step": 1350
1901
+ },
1902
+ {
1903
+ "epoch": 0.6775,
1904
+ "grad_norm": 0.11908067762851715,
1905
+ "learning_rate": 8.565972998952814e-05,
1906
+ "loss": 0.377,
1907
+ "step": 1355
1908
+ },
1909
+ {
1910
+ "epoch": 0.68,
1911
+ "grad_norm": 0.1252991259098053,
1912
+ "learning_rate": 8.447973174484469e-05,
1913
+ "loss": 0.3438,
1914
+ "step": 1360
1915
+ },
1916
+ {
1917
+ "epoch": 0.6825,
1918
+ "grad_norm": 0.12832245230674744,
1919
+ "learning_rate": 8.330472312226091e-05,
1920
+ "loss": 0.346,
1921
+ "step": 1365
1922
+ },
1923
+ {
1924
+ "epoch": 0.685,
1925
+ "grad_norm": 0.1396942287683487,
1926
+ "learning_rate": 8.213479360323258e-05,
1927
+ "loss": 0.3886,
1928
+ "step": 1370
1929
+ },
1930
+ {
1931
+ "epoch": 0.6875,
1932
+ "grad_norm": 0.12938210368156433,
1933
+ "learning_rate": 8.097003228242225e-05,
1934
+ "loss": 0.3699,
1935
+ "step": 1375
1936
+ },
1937
+ {
1938
+ "epoch": 0.69,
1939
+ "grad_norm": 0.12459377944469452,
1940
+ "learning_rate": 7.9810527860914e-05,
1941
+ "loss": 0.3892,
1942
+ "step": 1380
1943
+ },
1944
+ {
1945
+ "epoch": 0.6925,
1946
+ "grad_norm": 0.1360333263874054,
1947
+ "learning_rate": 7.86563686394587e-05,
1948
+ "loss": 0.3423,
1949
+ "step": 1385
1950
+ },
1951
+ {
1952
+ "epoch": 0.695,
1953
+ "grad_norm": 0.1357765644788742,
1954
+ "learning_rate": 7.750764251174963e-05,
1955
+ "loss": 0.408,
1956
+ "step": 1390
1957
+ },
1958
+ {
1959
+ "epoch": 0.6975,
1960
+ "grad_norm": 0.14453718066215515,
1961
+ "learning_rate": 7.636443695772887e-05,
1962
+ "loss": 0.3398,
1963
+ "step": 1395
1964
+ },
1965
+ {
1966
+ "epoch": 0.7,
1967
+ "grad_norm": 0.11541519314050674,
1968
+ "learning_rate": 7.522683903692547e-05,
1969
+ "loss": 0.4203,
1970
+ "step": 1400
1971
+ },
1972
+ {
1973
+ "epoch": 0.7025,
1974
+ "grad_norm": 0.13344840705394745,
1975
+ "learning_rate": 7.409493538182545e-05,
1976
+ "loss": 0.3694,
1977
+ "step": 1405
1978
+ },
1979
+ {
1980
+ "epoch": 0.705,
1981
+ "grad_norm": 0.13069866597652435,
1982
+ "learning_rate": 7.296881219127452e-05,
1983
+ "loss": 0.3889,
1984
+ "step": 1410
1985
+ },
1986
+ {
1987
+ "epoch": 0.7075,
1988
+ "grad_norm": 0.12457838654518127,
1989
+ "learning_rate": 7.184855522391359e-05,
1990
+ "loss": 0.3342,
1991
+ "step": 1415
1992
+ },
1993
+ {
1994
+ "epoch": 0.71,
1995
+ "grad_norm": 0.11990659683942795,
1996
+ "learning_rate": 7.073424979164794e-05,
1997
+ "loss": 0.3855,
1998
+ "step": 1420
1999
+ },
2000
+ {
2001
+ "epoch": 0.7125,
2002
+ "grad_norm": 0.1389523446559906,
2003
+ "learning_rate": 6.962598075315046e-05,
2004
+ "loss": 0.3943,
2005
+ "step": 1425
2006
+ },
2007
+ {
2008
+ "epoch": 0.715,
2009
+ "grad_norm": 0.14108599722385406,
2010
+ "learning_rate": 6.852383250739938e-05,
2011
+ "loss": 0.388,
2012
+ "step": 1430
2013
+ },
2014
+ {
2015
+ "epoch": 0.7175,
2016
+ "grad_norm": 0.1342005580663681,
2017
+ "learning_rate": 6.742788898725065e-05,
2018
+ "loss": 0.3602,
2019
+ "step": 1435
2020
+ },
2021
+ {
2022
+ "epoch": 0.72,
2023
+ "grad_norm": 0.13516324758529663,
2024
+ "learning_rate": 6.633823365304648e-05,
2025
+ "loss": 0.3935,
2026
+ "step": 1440
2027
+ },
2028
+ {
2029
+ "epoch": 0.7225,
2030
+ "grad_norm": 0.1302197426557541,
2031
+ "learning_rate": 6.52549494862593e-05,
2032
+ "loss": 0.3618,
2033
+ "step": 1445
2034
+ },
2035
+ {
2036
+ "epoch": 0.725,
2037
+ "grad_norm": 0.12428996711969376,
2038
+ "learning_rate": 6.417811898317259e-05,
2039
+ "loss": 0.3338,
2040
+ "step": 1450
2041
+ },
2042
+ {
2043
+ "epoch": 0.7275,
2044
+ "grad_norm": 0.11249776184558868,
2045
+ "learning_rate": 6.31078241485982e-05,
2046
+ "loss": 0.3819,
2047
+ "step": 1455
2048
+ },
2049
+ {
2050
+ "epoch": 0.73,
2051
+ "grad_norm": 0.1359994113445282,
2052
+ "learning_rate": 6.204414648963159e-05,
2053
+ "loss": 0.3356,
2054
+ "step": 1460
2055
+ },
2056
+ {
2057
+ "epoch": 0.7325,
2058
+ "grad_norm": 0.1118568629026413,
2059
+ "learning_rate": 6.098716700944479e-05,
2060
+ "loss": 0.3223,
2061
+ "step": 1465
2062
+ },
2063
+ {
2064
+ "epoch": 0.735,
2065
+ "grad_norm": 0.12038140743970871,
2066
+ "learning_rate": 5.993696620111741e-05,
2067
+ "loss": 0.3481,
2068
+ "step": 1470
2069
+ },
2070
+ {
2071
+ "epoch": 0.7375,
2072
+ "grad_norm": 0.12787550687789917,
2073
+ "learning_rate": 5.889362404150703e-05,
2074
+ "loss": 0.3766,
2075
+ "step": 1475
2076
+ },
2077
+ {
2078
+ "epoch": 0.74,
2079
+ "grad_norm": 0.12134893983602524,
2080
+ "learning_rate": 5.7857219985158506e-05,
2081
+ "loss": 0.2916,
2082
+ "step": 1480
2083
+ },
2084
+ {
2085
+ "epoch": 0.7425,
2086
+ "grad_norm": 0.1274223029613495,
2087
+ "learning_rate": 5.682783295825345e-05,
2088
+ "loss": 0.3095,
2089
+ "step": 1485
2090
+ },
2091
+ {
2092
+ "epoch": 0.745,
2093
+ "grad_norm": 0.11817299574613571,
2094
+ "learning_rate": 5.580554135259932e-05,
2095
+ "loss": 0.3422,
2096
+ "step": 1490
2097
+ },
2098
+ {
2099
+ "epoch": 0.7475,
2100
+ "grad_norm": 0.1348387748003006,
2101
+ "learning_rate": 5.479042301965987e-05,
2102
+ "loss": 0.4044,
2103
+ "step": 1495
2104
+ },
2105
+ {
2106
+ "epoch": 0.75,
2107
+ "grad_norm": 0.14032681286334991,
2108
+ "learning_rate": 5.378255526462631e-05,
2109
+ "loss": 0.337,
2110
+ "step": 1500
2111
+ }
2112
+ ],
2113
+ "logging_steps": 5,
2114
+ "max_steps": 2000,
2115
+ "num_input_tokens_seen": 0,
2116
+ "num_train_epochs": 9223372036854775807,
2117
+ "save_steps": 500,
2118
+ "stateful_callbacks": {
2119
+ "TrainerControl": {
2120
+ "args": {
2121
+ "should_epoch_stop": false,
2122
+ "should_evaluate": false,
2123
+ "should_log": false,
2124
+ "should_save": true,
2125
+ "should_training_stop": false
2126
+ },
2127
+ "attributes": {}
2128
+ }
2129
+ },
2130
+ "total_flos": 1.972183618289664e+18,
2131
+ "train_batch_size": 4,
2132
+ "trial_name": null,
2133
+ "trial_params": null
2134
+ }
codellama-hugcoder/checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
3
+ size 5304
codellama-hugcoder/checkpoint-2000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: codellama/CodeLlama-7b-Instruct-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2.dev0
codellama-hugcoder/checkpoint-2000/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "down_proj",
28
+ "up_proj",
29
+ "k_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "gate_proj",
33
+ "o_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
codellama-hugcoder/checkpoint-2000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:456cbd6da326b2c6f27a85ab19d40e13bf3fb60689cbe5ec56653d42193963f8
3
+ size 319876032
codellama-hugcoder/checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:688ec5889a6aa6b6675276da1e991b1ffaf231ca0b9db550ca1055ee967ab484
3
+ size 640009682
codellama-hugcoder/checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d88eee16810615d69e99ef0af6ae2767f80f0c756dab6f8b6315f916e0a2772d
3
+ size 14180
codellama-hugcoder/checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0af176d761d71fce3fbce7001f4850782b022af8f40338e8e88b22363a32018f
3
+ size 1064
codellama-hugcoder/checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,2834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 100.0,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0025,
14
+ "grad_norm": 0.09379793703556061,
15
+ "learning_rate": 5.999999999999999e-06,
16
+ "loss": 0.6799,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.005,
21
+ "grad_norm": 0.1399833709001541,
22
+ "learning_rate": 1.3499999999999998e-05,
23
+ "loss": 0.6954,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.0075,
28
+ "grad_norm": 0.08632303029298782,
29
+ "learning_rate": 2.1e-05,
30
+ "loss": 0.6921,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.01,
35
+ "grad_norm": 0.10006701201200485,
36
+ "learning_rate": 2.8499999999999998e-05,
37
+ "loss": 0.69,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.0125,
42
+ "grad_norm": 0.07633858919143677,
43
+ "learning_rate": 3.5999999999999994e-05,
44
+ "loss": 0.6722,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.015,
49
+ "grad_norm": 0.09399061650037766,
50
+ "learning_rate": 4.3499999999999993e-05,
51
+ "loss": 0.6453,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.0175,
56
+ "grad_norm": 0.0843738541007042,
57
+ "learning_rate": 5.1e-05,
58
+ "loss": 0.6276,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.02,
63
+ "grad_norm": 0.08583351224660873,
64
+ "learning_rate": 5.85e-05,
65
+ "loss": 0.58,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.0225,
70
+ "grad_norm": 0.09571370482444763,
71
+ "learning_rate": 6.599999999999999e-05,
72
+ "loss": 0.6355,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.025,
77
+ "grad_norm": 0.1083935871720314,
78
+ "learning_rate": 7.35e-05,
79
+ "loss": 0.589,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.0275,
84
+ "grad_norm": 0.10387319326400757,
85
+ "learning_rate": 8.1e-05,
86
+ "loss": 0.6061,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.03,
91
+ "grad_norm": 0.11083361506462097,
92
+ "learning_rate": 8.849999999999998e-05,
93
+ "loss": 0.572,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.0325,
98
+ "grad_norm": 0.12665686011314392,
99
+ "learning_rate": 9.599999999999999e-05,
100
+ "loss": 0.5442,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.035,
105
+ "grad_norm": 0.1308053582906723,
106
+ "learning_rate": 0.00010349999999999998,
107
+ "loss": 0.6524,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.0375,
112
+ "grad_norm": 0.13535510003566742,
113
+ "learning_rate": 0.00011099999999999999,
114
+ "loss": 0.6404,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.04,
119
+ "grad_norm": 0.12833671271800995,
120
+ "learning_rate": 0.0001185,
121
+ "loss": 0.5717,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.0425,
126
+ "grad_norm": 0.11962099373340607,
127
+ "learning_rate": 0.00012599999999999997,
128
+ "loss": 0.6098,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.045,
133
+ "grad_norm": 0.13898271322250366,
134
+ "learning_rate": 0.0001335,
135
+ "loss": 0.6099,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.0475,
140
+ "grad_norm": 0.14486610889434814,
141
+ "learning_rate": 0.00014099999999999998,
142
+ "loss": 0.5744,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.05,
147
+ "grad_norm": 0.1432138830423355,
148
+ "learning_rate": 0.00014849999999999998,
149
+ "loss": 0.5659,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.0525,
154
+ "grad_norm": 0.13487878441810608,
155
+ "learning_rate": 0.000156,
156
+ "loss": 0.5622,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.055,
161
+ "grad_norm": 0.12495309859514236,
162
+ "learning_rate": 0.0001635,
163
+ "loss": 0.5951,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.0575,
168
+ "grad_norm": 0.13011734187602997,
169
+ "learning_rate": 0.00017099999999999998,
170
+ "loss": 0.6249,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.06,
175
+ "grad_norm": 0.13987745344638824,
176
+ "learning_rate": 0.00017849999999999997,
177
+ "loss": 0.559,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.0625,
182
+ "grad_norm": 0.13373605906963348,
183
+ "learning_rate": 0.000186,
184
+ "loss": 0.5475,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.065,
189
+ "grad_norm": 0.12433867901563644,
190
+ "learning_rate": 0.0001935,
191
+ "loss": 0.5274,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.0675,
196
+ "grad_norm": 0.11097615957260132,
197
+ "learning_rate": 0.000201,
198
+ "loss": 0.678,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.07,
203
+ "grad_norm": 0.1155027225613594,
204
+ "learning_rate": 0.00020849999999999997,
205
+ "loss": 0.5611,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.0725,
210
+ "grad_norm": 0.11431068181991577,
211
+ "learning_rate": 0.00021599999999999996,
212
+ "loss": 0.6054,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.075,
217
+ "grad_norm": 0.09796140342950821,
218
+ "learning_rate": 0.00022349999999999998,
219
+ "loss": 0.5472,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.0775,
224
+ "grad_norm": 0.09489257633686066,
225
+ "learning_rate": 0.00023099999999999998,
226
+ "loss": 0.4636,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.08,
231
+ "grad_norm": 0.10787788033485413,
232
+ "learning_rate": 0.0002385,
233
+ "loss": 0.6164,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.0825,
238
+ "grad_norm": 0.10261733084917068,
239
+ "learning_rate": 0.00024599999999999996,
240
+ "loss": 0.5408,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.085,
245
+ "grad_norm": 0.11870352178812027,
246
+ "learning_rate": 0.0002535,
247
+ "loss": 0.5268,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.0875,
252
+ "grad_norm": 0.11910569667816162,
253
+ "learning_rate": 0.000261,
254
+ "loss": 0.5461,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.09,
259
+ "grad_norm": 0.10083702206611633,
260
+ "learning_rate": 0.00026849999999999997,
261
+ "loss": 0.4794,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.0925,
266
+ "grad_norm": 0.10453511029481888,
267
+ "learning_rate": 0.000276,
268
+ "loss": 0.5539,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.095,
273
+ "grad_norm": 0.101403146982193,
274
+ "learning_rate": 0.00028349999999999995,
275
+ "loss": 0.5346,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.0975,
280
+ "grad_norm": 0.10724789649248123,
281
+ "learning_rate": 0.00029099999999999997,
282
+ "loss": 0.6026,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.1,
287
+ "grad_norm": 0.1140277311205864,
288
+ "learning_rate": 0.0002985,
289
+ "loss": 0.5193,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.1025,
294
+ "grad_norm": 0.09706108272075653,
295
+ "learning_rate": 0.0002999963446058092,
296
+ "loss": 0.54,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.105,
301
+ "grad_norm": 0.10003062337636948,
302
+ "learning_rate": 0.0002999814948722491,
303
+ "loss": 0.5365,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.1075,
308
+ "grad_norm": 0.1078687533736229,
309
+ "learning_rate": 0.00029995522346717746,
310
+ "loss": 0.5889,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.11,
315
+ "grad_norm": 0.10538115352392197,
316
+ "learning_rate": 0.0002999175323912636,
317
+ "loss": 0.5611,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.1125,
322
+ "grad_norm": 0.1020808294415474,
323
+ "learning_rate": 0.00029986842451482874,
324
+ "loss": 0.6103,
325
+ "step": 225
326
+ },
327
+ {
328
+ "epoch": 0.115,
329
+ "grad_norm": 0.09635835886001587,
330
+ "learning_rate": 0.0002998079035776279,
331
+ "loss": 0.5229,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 0.1175,
336
+ "grad_norm": 0.10287190228700638,
337
+ "learning_rate": 0.0002997359741885648,
338
+ "loss": 0.5312,
339
+ "step": 235
340
+ },
341
+ {
342
+ "epoch": 0.12,
343
+ "grad_norm": 0.09160075336694717,
344
+ "learning_rate": 0.0002996526418253408,
345
+ "loss": 0.5673,
346
+ "step": 240
347
+ },
348
+ {
349
+ "epoch": 0.1225,
350
+ "grad_norm": 0.08691006153821945,
351
+ "learning_rate": 0.000299557912834038,
352
+ "loss": 0.5326,
353
+ "step": 245
354
+ },
355
+ {
356
+ "epoch": 0.125,
357
+ "grad_norm": 0.10096988826990128,
358
+ "learning_rate": 0.00029945179442863594,
359
+ "loss": 0.6004,
360
+ "step": 250
361
+ },
362
+ {
363
+ "epoch": 0.1275,
364
+ "grad_norm": 0.09594204276800156,
365
+ "learning_rate": 0.000299334294690462,
366
+ "loss": 0.5516,
367
+ "step": 255
368
+ },
369
+ {
370
+ "epoch": 0.13,
371
+ "grad_norm": 0.10281919687986374,
372
+ "learning_rate": 0.00029920542256757607,
373
+ "loss": 0.5515,
374
+ "step": 260
375
+ },
376
+ {
377
+ "epoch": 0.1325,
378
+ "grad_norm": 0.08547840267419815,
379
+ "learning_rate": 0.00029906518787408944,
380
+ "loss": 0.5243,
381
+ "step": 265
382
+ },
383
+ {
384
+ "epoch": 0.135,
385
+ "grad_norm": 0.10161560773849487,
386
+ "learning_rate": 0.0002989136012894168,
387
+ "loss": 0.5096,
388
+ "step": 270
389
+ },
390
+ {
391
+ "epoch": 0.1375,
392
+ "grad_norm": 0.09101904183626175,
393
+ "learning_rate": 0.0002987506743574635,
394
+ "loss": 0.553,
395
+ "step": 275
396
+ },
397
+ {
398
+ "epoch": 0.14,
399
+ "grad_norm": 0.09769442677497864,
400
+ "learning_rate": 0.0002985764194857463,
401
+ "loss": 0.4953,
402
+ "step": 280
403
+ },
404
+ {
405
+ "epoch": 0.1425,
406
+ "grad_norm": 0.10991579294204712,
407
+ "learning_rate": 0.00029839084994444826,
408
+ "loss": 0.5152,
409
+ "step": 285
410
+ },
411
+ {
412
+ "epoch": 0.145,
413
+ "grad_norm": 0.09450916200876236,
414
+ "learning_rate": 0.00029819397986540836,
415
+ "loss": 0.5397,
416
+ "step": 290
417
+ },
418
+ {
419
+ "epoch": 0.1475,
420
+ "grad_norm": 0.10876069217920303,
421
+ "learning_rate": 0.0002979858242410454,
422
+ "loss": 0.4858,
423
+ "step": 295
424
+ },
425
+ {
426
+ "epoch": 0.15,
427
+ "grad_norm": 0.097995825111866,
428
+ "learning_rate": 0.00029776639892321606,
429
+ "loss": 0.5566,
430
+ "step": 300
431
+ },
432
+ {
433
+ "epoch": 0.1525,
434
+ "grad_norm": 0.1145048514008522,
435
+ "learning_rate": 0.0002975357206220079,
436
+ "loss": 0.4531,
437
+ "step": 305
438
+ },
439
+ {
440
+ "epoch": 0.155,
441
+ "grad_norm": 0.10271880775690079,
442
+ "learning_rate": 0.00029729380690446654,
443
+ "loss": 0.5199,
444
+ "step": 310
445
+ },
446
+ {
447
+ "epoch": 0.1575,
448
+ "grad_norm": 0.11095371842384338,
449
+ "learning_rate": 0.0002970406761932583,
450
+ "loss": 0.5416,
451
+ "step": 315
452
+ },
453
+ {
454
+ "epoch": 0.16,
455
+ "grad_norm": 0.09949438273906708,
456
+ "learning_rate": 0.00029677634776526673,
457
+ "loss": 0.4841,
458
+ "step": 320
459
+ },
460
+ {
461
+ "epoch": 0.1625,
462
+ "grad_norm": 0.1163724958896637,
463
+ "learning_rate": 0.00029650084175012517,
464
+ "loss": 0.4913,
465
+ "step": 325
466
+ },
467
+ {
468
+ "epoch": 0.165,
469
+ "grad_norm": 0.10726840049028397,
470
+ "learning_rate": 0.00029621417912868323,
471
+ "loss": 0.5203,
472
+ "step": 330
473
+ },
474
+ {
475
+ "epoch": 0.1675,
476
+ "grad_norm": 0.09609931707382202,
477
+ "learning_rate": 0.00029591638173140947,
478
+ "loss": 0.5607,
479
+ "step": 335
480
+ },
481
+ {
482
+ "epoch": 0.17,
483
+ "grad_norm": 0.10824442654848099,
484
+ "learning_rate": 0.0002956074722367286,
485
+ "loss": 0.6004,
486
+ "step": 340
487
+ },
488
+ {
489
+ "epoch": 0.1725,
490
+ "grad_norm": 0.10465679317712784,
491
+ "learning_rate": 0.00029528747416929463,
492
+ "loss": 0.5216,
493
+ "step": 345
494
+ },
495
+ {
496
+ "epoch": 0.175,
497
+ "grad_norm": 0.10518354922533035,
498
+ "learning_rate": 0.0002949564118981994,
499
+ "loss": 0.499,
500
+ "step": 350
501
+ },
502
+ {
503
+ "epoch": 0.1775,
504
+ "grad_norm": 0.0955279991030693,
505
+ "learning_rate": 0.0002946143106351165,
506
+ "loss": 0.5607,
507
+ "step": 355
508
+ },
509
+ {
510
+ "epoch": 0.18,
511
+ "grad_norm": 0.11159654706716537,
512
+ "learning_rate": 0.0002942611964323817,
513
+ "loss": 0.5204,
514
+ "step": 360
515
+ },
516
+ {
517
+ "epoch": 0.1825,
518
+ "grad_norm": 0.09571187198162079,
519
+ "learning_rate": 0.0002938970961810086,
520
+ "loss": 0.6113,
521
+ "step": 365
522
+ },
523
+ {
524
+ "epoch": 0.185,
525
+ "grad_norm": 0.11854679882526398,
526
+ "learning_rate": 0.0002935220376086411,
527
+ "loss": 0.5639,
528
+ "step": 370
529
+ },
530
+ {
531
+ "epoch": 0.1875,
532
+ "grad_norm": 0.1050512045621872,
533
+ "learning_rate": 0.0002931360492774415,
534
+ "loss": 0.548,
535
+ "step": 375
536
+ },
537
+ {
538
+ "epoch": 0.19,
539
+ "grad_norm": 0.1053968220949173,
540
+ "learning_rate": 0.0002927391605819157,
541
+ "loss": 0.5507,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 0.1925,
546
+ "grad_norm": 0.10567320138216019,
547
+ "learning_rate": 0.00029233140174667445,
548
+ "loss": 0.5312,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 0.195,
553
+ "grad_norm": 0.11914283782243729,
554
+ "learning_rate": 0.0002919128038241318,
555
+ "loss": 0.5961,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 0.1975,
560
+ "grad_norm": 0.09915795922279358,
561
+ "learning_rate": 0.0002914833986921401,
562
+ "loss": 0.5086,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 0.2,
567
+ "grad_norm": 0.10796502232551575,
568
+ "learning_rate": 0.0002910432190515628,
569
+ "loss": 0.5585,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 0.2025,
574
+ "grad_norm": 0.10748997330665588,
575
+ "learning_rate": 0.00029059229842378373,
576
+ "loss": 0.5466,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 0.205,
581
+ "grad_norm": 0.10696308314800262,
582
+ "learning_rate": 0.0002901306711481544,
583
+ "loss": 0.5513,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 0.2075,
588
+ "grad_norm": 0.10418657958507538,
589
+ "learning_rate": 0.0002896583723793792,
590
+ "loss": 0.5391,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.21,
595
+ "grad_norm": 0.16421550512313843,
596
+ "learning_rate": 0.00028917543808483796,
597
+ "loss": 0.4699,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.2125,
602
+ "grad_norm": 0.12929962575435638,
603
+ "learning_rate": 0.00028868190504184696,
604
+ "loss": 0.4984,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.215,
609
+ "grad_norm": 0.10469454526901245,
610
+ "learning_rate": 0.00028817781083485816,
611
+ "loss": 0.5119,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.2175,
616
+ "grad_norm": 0.0964970663189888,
617
+ "learning_rate": 0.00028766319385259713,
618
+ "loss": 0.5167,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.22,
623
+ "grad_norm": 0.12395574152469635,
624
+ "learning_rate": 0.00028713809328513953,
625
+ "loss": 0.5692,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.2225,
630
+ "grad_norm": 0.10189738124608994,
631
+ "learning_rate": 0.0002866025491209265,
632
+ "loss": 0.4628,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.225,
637
+ "grad_norm": 0.10433454066514969,
638
+ "learning_rate": 0.0002860566021437197,
639
+ "loss": 0.4869,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.2275,
644
+ "grad_norm": 0.13003456592559814,
645
+ "learning_rate": 0.0002855002939294951,
646
+ "loss": 0.5291,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.23,
651
+ "grad_norm": 0.11692202836275101,
652
+ "learning_rate": 0.000284933666843277,
653
+ "loss": 0.5229,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.2325,
658
+ "grad_norm": 0.10757846385240555,
659
+ "learning_rate": 0.0002843567640359119,
660
+ "loss": 0.435,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.235,
665
+ "grad_norm": 0.10775501281023026,
666
+ "learning_rate": 0.00028376962944078206,
667
+ "loss": 0.4418,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.2375,
672
+ "grad_norm": 0.11543692648410797,
673
+ "learning_rate": 0.00028317230777046015,
674
+ "loss": 0.4204,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.24,
679
+ "grad_norm": 0.10946698486804962,
680
+ "learning_rate": 0.00028256484451330403,
681
+ "loss": 0.49,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.2425,
686
+ "grad_norm": 0.11528221517801285,
687
+ "learning_rate": 0.00028194728592999247,
688
+ "loss": 0.4752,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.245,
693
+ "grad_norm": 0.10474205762147903,
694
+ "learning_rate": 0.0002813196790500027,
695
+ "loss": 0.4847,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.2475,
700
+ "grad_norm": 0.10768820345401764,
701
+ "learning_rate": 0.00028068207166802837,
702
+ "loss": 0.4664,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.25,
707
+ "grad_norm": 0.12158560007810593,
708
+ "learning_rate": 0.00028003451234034037,
709
+ "loss": 0.4741,
710
+ "step": 500
711
+ },
712
+ {
713
+ "epoch": 0.2525,
714
+ "grad_norm": 0.11635497957468033,
715
+ "learning_rate": 0.0002793770503810886,
716
+ "loss": 0.4969,
717
+ "step": 505
718
+ },
719
+ {
720
+ "epoch": 0.255,
721
+ "grad_norm": 0.12205849587917328,
722
+ "learning_rate": 0.00027870973585854665,
723
+ "loss": 0.4798,
724
+ "step": 510
725
+ },
726
+ {
727
+ "epoch": 0.2575,
728
+ "grad_norm": 0.10270871222019196,
729
+ "learning_rate": 0.00027803261959129905,
730
+ "loss": 0.3888,
731
+ "step": 515
732
+ },
733
+ {
734
+ "epoch": 0.26,
735
+ "grad_norm": 0.11313367635011673,
736
+ "learning_rate": 0.0002773457531443712,
737
+ "loss": 0.4759,
738
+ "step": 520
739
+ },
740
+ {
741
+ "epoch": 0.2625,
742
+ "grad_norm": 0.12905193865299225,
743
+ "learning_rate": 0.00027664918882530225,
744
+ "loss": 0.4442,
745
+ "step": 525
746
+ },
747
+ {
748
+ "epoch": 0.265,
749
+ "grad_norm": 0.11690939962863922,
750
+ "learning_rate": 0.00027594297968016197,
751
+ "loss": 0.5535,
752
+ "step": 530
753
+ },
754
+ {
755
+ "epoch": 0.2675,
756
+ "grad_norm": 0.10021405667066574,
757
+ "learning_rate": 0.00027522717948951094,
758
+ "loss": 0.4717,
759
+ "step": 535
760
+ },
761
+ {
762
+ "epoch": 0.27,
763
+ "grad_norm": 0.10104178637266159,
764
+ "learning_rate": 0.0002745018427643051,
765
+ "loss": 0.4906,
766
+ "step": 540
767
+ },
768
+ {
769
+ "epoch": 0.2725,
770
+ "grad_norm": 0.12113891541957855,
771
+ "learning_rate": 0.00027376702474174425,
772
+ "loss": 0.5674,
773
+ "step": 545
774
+ },
775
+ {
776
+ "epoch": 0.275,
777
+ "grad_norm": 0.11330476403236389,
778
+ "learning_rate": 0.0002730227813810658,
779
+ "loss": 0.5184,
780
+ "step": 550
781
+ },
782
+ {
783
+ "epoch": 0.2775,
784
+ "grad_norm": 0.1025850847363472,
785
+ "learning_rate": 0.0002722691693592831,
786
+ "loss": 0.4395,
787
+ "step": 555
788
+ },
789
+ {
790
+ "epoch": 0.28,
791
+ "grad_norm": 0.11591499298810959,
792
+ "learning_rate": 0.0002715062460668694,
793
+ "loss": 0.5003,
794
+ "step": 560
795
+ },
796
+ {
797
+ "epoch": 0.2825,
798
+ "grad_norm": 0.11281153559684753,
799
+ "learning_rate": 0.0002707340696033871,
800
+ "loss": 0.4672,
801
+ "step": 565
802
+ },
803
+ {
804
+ "epoch": 0.285,
805
+ "grad_norm": 0.1123538464307785,
806
+ "learning_rate": 0.00026995269877306356,
807
+ "loss": 0.513,
808
+ "step": 570
809
+ },
810
+ {
811
+ "epoch": 0.2875,
812
+ "grad_norm": 0.10776390135288239,
813
+ "learning_rate": 0.0002691621930803127,
814
+ "loss": 0.4572,
815
+ "step": 575
816
+ },
817
+ {
818
+ "epoch": 0.29,
819
+ "grad_norm": 0.10008667409420013,
820
+ "learning_rate": 0.0002683626127252036,
821
+ "loss": 0.4618,
822
+ "step": 580
823
+ },
824
+ {
825
+ "epoch": 0.2925,
826
+ "grad_norm": 0.13961340487003326,
827
+ "learning_rate": 0.00026755401859887595,
828
+ "loss": 0.4819,
829
+ "step": 585
830
+ },
831
+ {
832
+ "epoch": 0.295,
833
+ "grad_norm": 0.1476685106754303,
834
+ "learning_rate": 0.00026673647227890316,
835
+ "loss": 0.4964,
836
+ "step": 590
837
+ },
838
+ {
839
+ "epoch": 0.2975,
840
+ "grad_norm": 0.09795507788658142,
841
+ "learning_rate": 0.00026591003602460263,
842
+ "loss": 0.4796,
843
+ "step": 595
844
+ },
845
+ {
846
+ "epoch": 0.3,
847
+ "grad_norm": 0.10903532058000565,
848
+ "learning_rate": 0.00026507477277229496,
849
+ "loss": 0.4775,
850
+ "step": 600
851
+ },
852
+ {
853
+ "epoch": 0.3025,
854
+ "grad_norm": 0.10258448123931885,
855
+ "learning_rate": 0.0002642307461305105,
856
+ "loss": 0.4519,
857
+ "step": 605
858
+ },
859
+ {
860
+ "epoch": 0.305,
861
+ "grad_norm": 0.11204435676336288,
862
+ "learning_rate": 0.0002633780203751459,
863
+ "loss": 0.4451,
864
+ "step": 610
865
+ },
866
+ {
867
+ "epoch": 0.3075,
868
+ "grad_norm": 0.10147629678249359,
869
+ "learning_rate": 0.0002625166604445689,
870
+ "loss": 0.4256,
871
+ "step": 615
872
+ },
873
+ {
874
+ "epoch": 0.31,
875
+ "grad_norm": 0.10481107234954834,
876
+ "learning_rate": 0.00026164673193467306,
877
+ "loss": 0.4381,
878
+ "step": 620
879
+ },
880
+ {
881
+ "epoch": 0.3125,
882
+ "grad_norm": 0.10856641829013824,
883
+ "learning_rate": 0.00026076830109388255,
884
+ "loss": 0.4958,
885
+ "step": 625
886
+ },
887
+ {
888
+ "epoch": 0.315,
889
+ "grad_norm": 0.09918677806854248,
890
+ "learning_rate": 0.0002598814348181068,
891
+ "loss": 0.4335,
892
+ "step": 630
893
+ },
894
+ {
895
+ "epoch": 0.3175,
896
+ "grad_norm": 0.10417389869689941,
897
+ "learning_rate": 0.00025898620064564637,
898
+ "loss": 0.4603,
899
+ "step": 635
900
+ },
901
+ {
902
+ "epoch": 0.32,
903
+ "grad_norm": 0.0903329998254776,
904
+ "learning_rate": 0.00025808266675204954,
905
+ "loss": 0.3932,
906
+ "step": 640
907
+ },
908
+ {
909
+ "epoch": 0.3225,
910
+ "grad_norm": 0.11511855572462082,
911
+ "learning_rate": 0.0002571709019449205,
912
+ "loss": 0.4169,
913
+ "step": 645
914
+ },
915
+ {
916
+ "epoch": 0.325,
917
+ "grad_norm": 0.11355557292699814,
918
+ "learning_rate": 0.0002562509756586793,
919
+ "loss": 0.4455,
920
+ "step": 650
921
+ },
922
+ {
923
+ "epoch": 0.3275,
924
+ "grad_norm": 0.1271187961101532,
925
+ "learning_rate": 0.00025532295794927437,
926
+ "loss": 0.4902,
927
+ "step": 655
928
+ },
929
+ {
930
+ "epoch": 0.33,
931
+ "grad_norm": 0.11936645954847336,
932
+ "learning_rate": 0.0002543869194888471,
933
+ "loss": 0.4843,
934
+ "step": 660
935
+ },
936
+ {
937
+ "epoch": 0.3325,
938
+ "grad_norm": 0.11935465037822723,
939
+ "learning_rate": 0.00025344293156035044,
940
+ "loss": 0.4402,
941
+ "step": 665
942
+ },
943
+ {
944
+ "epoch": 0.335,
945
+ "grad_norm": 0.13073407113552094,
946
+ "learning_rate": 0.00025249106605211986,
947
+ "loss": 0.467,
948
+ "step": 670
949
+ },
950
+ {
951
+ "epoch": 0.3375,
952
+ "grad_norm": 0.10340435802936554,
953
+ "learning_rate": 0.0002515313954523991,
954
+ "loss": 0.4827,
955
+ "step": 675
956
+ },
957
+ {
958
+ "epoch": 0.34,
959
+ "grad_norm": 0.11634550243616104,
960
+ "learning_rate": 0.00025056399284381983,
961
+ "loss": 0.466,
962
+ "step": 680
963
+ },
964
+ {
965
+ "epoch": 0.3425,
966
+ "grad_norm": 0.10582319647073746,
967
+ "learning_rate": 0.0002495889318978362,
968
+ "loss": 0.4751,
969
+ "step": 685
970
+ },
971
+ {
972
+ "epoch": 0.345,
973
+ "grad_norm": 0.16781780123710632,
974
+ "learning_rate": 0.00024860628686911436,
975
+ "loss": 0.4717,
976
+ "step": 690
977
+ },
978
+ {
979
+ "epoch": 0.3475,
980
+ "grad_norm": 0.11522196233272552,
981
+ "learning_rate": 0.0002476161325898776,
982
+ "loss": 0.4687,
983
+ "step": 695
984
+ },
985
+ {
986
+ "epoch": 0.35,
987
+ "grad_norm": 0.11830449104309082,
988
+ "learning_rate": 0.000246618544464208,
989
+ "loss": 0.436,
990
+ "step": 700
991
+ },
992
+ {
993
+ "epoch": 0.3525,
994
+ "grad_norm": 0.17485427856445312,
995
+ "learning_rate": 0.0002456135984623034,
996
+ "loss": 0.4284,
997
+ "step": 705
998
+ },
999
+ {
1000
+ "epoch": 0.355,
1001
+ "grad_norm": 0.12288108468055725,
1002
+ "learning_rate": 0.00024460137111469296,
1003
+ "loss": 0.4261,
1004
+ "step": 710
1005
+ },
1006
+ {
1007
+ "epoch": 0.3575,
1008
+ "grad_norm": 0.11587081104516983,
1009
+ "learning_rate": 0.0002435819395064079,
1010
+ "loss": 0.4493,
1011
+ "step": 715
1012
+ },
1013
+ {
1014
+ "epoch": 0.36,
1015
+ "grad_norm": 0.10690271109342575,
1016
+ "learning_rate": 0.0002425553812711123,
1017
+ "loss": 0.4648,
1018
+ "step": 720
1019
+ },
1020
+ {
1021
+ "epoch": 0.3625,
1022
+ "grad_norm": 0.10404397547245026,
1023
+ "learning_rate": 0.00024152177458519014,
1024
+ "loss": 0.4634,
1025
+ "step": 725
1026
+ },
1027
+ {
1028
+ "epoch": 0.365,
1029
+ "grad_norm": 0.11986954510211945,
1030
+ "learning_rate": 0.00024048119816179236,
1031
+ "loss": 0.4525,
1032
+ "step": 730
1033
+ },
1034
+ {
1035
+ "epoch": 0.3675,
1036
+ "grad_norm": 0.10243026167154312,
1037
+ "learning_rate": 0.00023943373124484234,
1038
+ "loss": 0.4572,
1039
+ "step": 735
1040
+ },
1041
+ {
1042
+ "epoch": 0.37,
1043
+ "grad_norm": 0.10386748611927032,
1044
+ "learning_rate": 0.00023837945360300129,
1045
+ "loss": 0.3884,
1046
+ "step": 740
1047
+ },
1048
+ {
1049
+ "epoch": 0.3725,
1050
+ "grad_norm": 0.11165735125541687,
1051
+ "learning_rate": 0.0002373184455235934,
1052
+ "loss": 0.4902,
1053
+ "step": 745
1054
+ },
1055
+ {
1056
+ "epoch": 0.375,
1057
+ "grad_norm": 0.09951601922512054,
1058
+ "learning_rate": 0.00023625078780649178,
1059
+ "loss": 0.4541,
1060
+ "step": 750
1061
+ },
1062
+ {
1063
+ "epoch": 0.3775,
1064
+ "grad_norm": 0.10347504913806915,
1065
+ "learning_rate": 0.00023517656175796518,
1066
+ "loss": 0.3871,
1067
+ "step": 755
1068
+ },
1069
+ {
1070
+ "epoch": 0.38,
1071
+ "grad_norm": 0.10478132963180542,
1072
+ "learning_rate": 0.00023409584918448627,
1073
+ "loss": 0.4329,
1074
+ "step": 760
1075
+ },
1076
+ {
1077
+ "epoch": 0.3825,
1078
+ "grad_norm": 0.1198212131857872,
1079
+ "learning_rate": 0.00023300873238650159,
1080
+ "loss": 0.425,
1081
+ "step": 765
1082
+ },
1083
+ {
1084
+ "epoch": 0.385,
1085
+ "grad_norm": 0.1103711724281311,
1086
+ "learning_rate": 0.00023191529415216434,
1087
+ "loss": 0.4274,
1088
+ "step": 770
1089
+ },
1090
+ {
1091
+ "epoch": 0.3875,
1092
+ "grad_norm": 0.09940385073423386,
1093
+ "learning_rate": 0.00023081561775102944,
1094
+ "loss": 0.4368,
1095
+ "step": 775
1096
+ },
1097
+ {
1098
+ "epoch": 0.39,
1099
+ "grad_norm": 0.11599268019199371,
1100
+ "learning_rate": 0.00022970978692771242,
1101
+ "loss": 0.4386,
1102
+ "step": 780
1103
+ },
1104
+ {
1105
+ "epoch": 0.3925,
1106
+ "grad_norm": 0.10101296752691269,
1107
+ "learning_rate": 0.00022859788589551188,
1108
+ "loss": 0.4696,
1109
+ "step": 785
1110
+ },
1111
+ {
1112
+ "epoch": 0.395,
1113
+ "grad_norm": 0.10112808644771576,
1114
+ "learning_rate": 0.00022747999932999624,
1115
+ "loss": 0.4066,
1116
+ "step": 790
1117
+ },
1118
+ {
1119
+ "epoch": 0.3975,
1120
+ "grad_norm": 0.09595459699630737,
1121
+ "learning_rate": 0.00022635621236255567,
1122
+ "loss": 0.4837,
1123
+ "step": 795
1124
+ },
1125
+ {
1126
+ "epoch": 0.4,
1127
+ "grad_norm": 0.10761380940675735,
1128
+ "learning_rate": 0.00022522661057391857,
1129
+ "loss": 0.5446,
1130
+ "step": 800
1131
+ },
1132
+ {
1133
+ "epoch": 0.4025,
1134
+ "grad_norm": 0.11919954419136047,
1135
+ "learning_rate": 0.00022409127998763463,
1136
+ "loss": 0.5027,
1137
+ "step": 805
1138
+ },
1139
+ {
1140
+ "epoch": 0.405,
1141
+ "grad_norm": 0.10851597785949707,
1142
+ "learning_rate": 0.00022295030706352356,
1143
+ "loss": 0.4481,
1144
+ "step": 810
1145
+ },
1146
+ {
1147
+ "epoch": 0.4075,
1148
+ "grad_norm": 0.10030311346054077,
1149
+ "learning_rate": 0.00022180377869109104,
1150
+ "loss": 0.4709,
1151
+ "step": 815
1152
+ },
1153
+ {
1154
+ "epoch": 0.41,
1155
+ "grad_norm": 0.111280657351017,
1156
+ "learning_rate": 0.00022065178218291147,
1157
+ "loss": 0.4423,
1158
+ "step": 820
1159
+ },
1160
+ {
1161
+ "epoch": 0.4125,
1162
+ "grad_norm": 0.11253602802753448,
1163
+ "learning_rate": 0.00021949440526797926,
1164
+ "loss": 0.4136,
1165
+ "step": 825
1166
+ },
1167
+ {
1168
+ "epoch": 0.415,
1169
+ "grad_norm": 0.10805424302816391,
1170
+ "learning_rate": 0.00021833173608502732,
1171
+ "loss": 0.4656,
1172
+ "step": 830
1173
+ },
1174
+ {
1175
+ "epoch": 0.4175,
1176
+ "grad_norm": 0.10983198881149292,
1177
+ "learning_rate": 0.00021716386317581542,
1178
+ "loss": 0.3687,
1179
+ "step": 835
1180
+ },
1181
+ {
1182
+ "epoch": 0.42,
1183
+ "grad_norm": 0.10653118044137955,
1184
+ "learning_rate": 0.00021599087547838727,
1185
+ "loss": 0.4654,
1186
+ "step": 840
1187
+ },
1188
+ {
1189
+ "epoch": 0.4225,
1190
+ "grad_norm": 0.10856354981660843,
1191
+ "learning_rate": 0.00021481286232029735,
1192
+ "loss": 0.4298,
1193
+ "step": 845
1194
+ },
1195
+ {
1196
+ "epoch": 0.425,
1197
+ "grad_norm": 0.11233706772327423,
1198
+ "learning_rate": 0.0002136299134118085,
1199
+ "loss": 0.4484,
1200
+ "step": 850
1201
+ },
1202
+ {
1203
+ "epoch": 0.4275,
1204
+ "grad_norm": 0.1085442528128624,
1205
+ "learning_rate": 0.00021244211883906017,
1206
+ "loss": 0.4776,
1207
+ "step": 855
1208
+ },
1209
+ {
1210
+ "epoch": 0.43,
1211
+ "grad_norm": 0.12297824025154114,
1212
+ "learning_rate": 0.0002112495690572077,
1213
+ "loss": 0.4029,
1214
+ "step": 860
1215
+ },
1216
+ {
1217
+ "epoch": 0.4325,
1218
+ "grad_norm": 0.10838114470243454,
1219
+ "learning_rate": 0.00021005235488353428,
1220
+ "loss": 0.4848,
1221
+ "step": 865
1222
+ },
1223
+ {
1224
+ "epoch": 0.435,
1225
+ "grad_norm": 0.10273341834545135,
1226
+ "learning_rate": 0.0002088505674905342,
1227
+ "loss": 0.3989,
1228
+ "step": 870
1229
+ },
1230
+ {
1231
+ "epoch": 0.4375,
1232
+ "grad_norm": 0.11189126968383789,
1233
+ "learning_rate": 0.0002076442983989705,
1234
+ "loss": 0.438,
1235
+ "step": 875
1236
+ },
1237
+ {
1238
+ "epoch": 0.44,
1239
+ "grad_norm": 0.11592905968427658,
1240
+ "learning_rate": 0.0002064336394709048,
1241
+ "loss": 0.4786,
1242
+ "step": 880
1243
+ },
1244
+ {
1245
+ "epoch": 0.4425,
1246
+ "grad_norm": 0.11230389773845673,
1247
+ "learning_rate": 0.0002052186829027017,
1248
+ "loss": 0.3999,
1249
+ "step": 885
1250
+ },
1251
+ {
1252
+ "epoch": 0.445,
1253
+ "grad_norm": 0.12455113977193832,
1254
+ "learning_rate": 0.00020399952121800767,
1255
+ "loss": 0.4856,
1256
+ "step": 890
1257
+ },
1258
+ {
1259
+ "epoch": 0.4475,
1260
+ "grad_norm": 0.1001812294125557,
1261
+ "learning_rate": 0.00020277624726070526,
1262
+ "loss": 0.4689,
1263
+ "step": 895
1264
+ },
1265
+ {
1266
+ "epoch": 0.45,
1267
+ "grad_norm": 0.11319112777709961,
1268
+ "learning_rate": 0.00020154895418784242,
1269
+ "loss": 0.3998,
1270
+ "step": 900
1271
+ },
1272
+ {
1273
+ "epoch": 0.4525,
1274
+ "grad_norm": 0.11322236061096191,
1275
+ "learning_rate": 0.00020031773546253824,
1276
+ "loss": 0.4321,
1277
+ "step": 905
1278
+ },
1279
+ {
1280
+ "epoch": 0.455,
1281
+ "grad_norm": 0.12924689054489136,
1282
+ "learning_rate": 0.00019908268484686558,
1283
+ "loss": 0.4208,
1284
+ "step": 910
1285
+ },
1286
+ {
1287
+ "epoch": 0.4575,
1288
+ "grad_norm": 0.11435618251562119,
1289
+ "learning_rate": 0.00019784389639471048,
1290
+ "loss": 0.4682,
1291
+ "step": 915
1292
+ },
1293
+ {
1294
+ "epoch": 0.46,
1295
+ "grad_norm": 0.10801081359386444,
1296
+ "learning_rate": 0.00019660146444460975,
1297
+ "loss": 0.428,
1298
+ "step": 920
1299
+ },
1300
+ {
1301
+ "epoch": 0.4625,
1302
+ "grad_norm": 0.10906939953565598,
1303
+ "learning_rate": 0.0001953554836125667,
1304
+ "loss": 0.4455,
1305
+ "step": 925
1306
+ },
1307
+ {
1308
+ "epoch": 0.465,
1309
+ "grad_norm": 0.10790123790502548,
1310
+ "learning_rate": 0.00019410604878484556,
1311
+ "loss": 0.4544,
1312
+ "step": 930
1313
+ },
1314
+ {
1315
+ "epoch": 0.4675,
1316
+ "grad_norm": 0.10536376386880875,
1317
+ "learning_rate": 0.000192853255110746,
1318
+ "loss": 0.376,
1319
+ "step": 935
1320
+ },
1321
+ {
1322
+ "epoch": 0.47,
1323
+ "grad_norm": 0.11744682490825653,
1324
+ "learning_rate": 0.00019159719799535668,
1325
+ "loss": 0.3887,
1326
+ "step": 940
1327
+ },
1328
+ {
1329
+ "epoch": 0.4725,
1330
+ "grad_norm": 0.12954068183898926,
1331
+ "learning_rate": 0.00019033797309228983,
1332
+ "loss": 0.4075,
1333
+ "step": 945
1334
+ },
1335
+ {
1336
+ "epoch": 0.475,
1337
+ "grad_norm": 0.1401606798171997,
1338
+ "learning_rate": 0.00018907567629639725,
1339
+ "loss": 0.4454,
1340
+ "step": 950
1341
+ },
1342
+ {
1343
+ "epoch": 0.4775,
1344
+ "grad_norm": 0.12059322744607925,
1345
+ "learning_rate": 0.00018781040373646706,
1346
+ "loss": 0.4339,
1347
+ "step": 955
1348
+ },
1349
+ {
1350
+ "epoch": 0.48,
1351
+ "grad_norm": 0.11798987537622452,
1352
+ "learning_rate": 0.00018654225176790336,
1353
+ "loss": 0.4405,
1354
+ "step": 960
1355
+ },
1356
+ {
1357
+ "epoch": 0.4825,
1358
+ "grad_norm": 0.11344211548566818,
1359
+ "learning_rate": 0.00018527131696538846,
1360
+ "loss": 0.4124,
1361
+ "step": 965
1362
+ },
1363
+ {
1364
+ "epoch": 0.485,
1365
+ "grad_norm": 0.10373330116271973,
1366
+ "learning_rate": 0.00018399769611552824,
1367
+ "loss": 0.4329,
1368
+ "step": 970
1369
+ },
1370
+ {
1371
+ "epoch": 0.4875,
1372
+ "grad_norm": 0.12053704261779785,
1373
+ "learning_rate": 0.0001827214862094814,
1374
+ "loss": 0.4944,
1375
+ "step": 975
1376
+ },
1377
+ {
1378
+ "epoch": 0.49,
1379
+ "grad_norm": 0.141033336520195,
1380
+ "learning_rate": 0.00018144278443557328,
1381
+ "loss": 0.4569,
1382
+ "step": 980
1383
+ },
1384
+ {
1385
+ "epoch": 0.4925,
1386
+ "grad_norm": 0.10922867804765701,
1387
+ "learning_rate": 0.0001801616881718947,
1388
+ "loss": 0.3879,
1389
+ "step": 985
1390
+ },
1391
+ {
1392
+ "epoch": 0.495,
1393
+ "grad_norm": 0.09843657910823822,
1394
+ "learning_rate": 0.00017887829497888612,
1395
+ "loss": 0.4106,
1396
+ "step": 990
1397
+ },
1398
+ {
1399
+ "epoch": 0.4975,
1400
+ "grad_norm": 0.12131062150001526,
1401
+ "learning_rate": 0.000177592702591908,
1402
+ "loss": 0.4023,
1403
+ "step": 995
1404
+ },
1405
+ {
1406
+ "epoch": 0.5,
1407
+ "grad_norm": 0.11343283206224442,
1408
+ "learning_rate": 0.00017630500891379806,
1409
+ "loss": 0.4824,
1410
+ "step": 1000
1411
+ },
1412
+ {
1413
+ "epoch": 0.5025,
1414
+ "grad_norm": 0.11050508171319962,
1415
+ "learning_rate": 0.00017501531200741534,
1416
+ "loss": 0.4098,
1417
+ "step": 1005
1418
+ },
1419
+ {
1420
+ "epoch": 0.505,
1421
+ "grad_norm": 0.11737144738435745,
1422
+ "learning_rate": 0.00017372371008817256,
1423
+ "loss": 0.3943,
1424
+ "step": 1010
1425
+ },
1426
+ {
1427
+ "epoch": 0.5075,
1428
+ "grad_norm": 0.11473528295755386,
1429
+ "learning_rate": 0.00017243030151655643,
1430
+ "loss": 0.3796,
1431
+ "step": 1015
1432
+ },
1433
+ {
1434
+ "epoch": 0.51,
1435
+ "grad_norm": 0.13086555898189545,
1436
+ "learning_rate": 0.00017113518479063738,
1437
+ "loss": 0.4367,
1438
+ "step": 1020
1439
+ },
1440
+ {
1441
+ "epoch": 0.5125,
1442
+ "grad_norm": 0.11752833425998688,
1443
+ "learning_rate": 0.00016983845853856837,
1444
+ "loss": 0.4097,
1445
+ "step": 1025
1446
+ },
1447
+ {
1448
+ "epoch": 0.515,
1449
+ "grad_norm": 0.11596900969743729,
1450
+ "learning_rate": 0.0001685402215110739,
1451
+ "loss": 0.3812,
1452
+ "step": 1030
1453
+ },
1454
+ {
1455
+ "epoch": 0.5175,
1456
+ "grad_norm": 0.11850260943174362,
1457
+ "learning_rate": 0.00016724057257392998,
1458
+ "loss": 0.4354,
1459
+ "step": 1035
1460
+ },
1461
+ {
1462
+ "epoch": 0.52,
1463
+ "grad_norm": 0.12466365844011307,
1464
+ "learning_rate": 0.00016593961070043498,
1465
+ "loss": 0.4317,
1466
+ "step": 1040
1467
+ },
1468
+ {
1469
+ "epoch": 0.5225,
1470
+ "grad_norm": 0.11178991943597794,
1471
+ "learning_rate": 0.0001646374349638724,
1472
+ "loss": 0.3936,
1473
+ "step": 1045
1474
+ },
1475
+ {
1476
+ "epoch": 0.525,
1477
+ "grad_norm": 0.11252165585756302,
1478
+ "learning_rate": 0.00016333414452996623,
1479
+ "loss": 0.386,
1480
+ "step": 1050
1481
+ },
1482
+ {
1483
+ "epoch": 0.5275,
1484
+ "grad_norm": 0.12886975705623627,
1485
+ "learning_rate": 0.0001620298386493288,
1486
+ "loss": 0.3965,
1487
+ "step": 1055
1488
+ },
1489
+ {
1490
+ "epoch": 0.53,
1491
+ "grad_norm": 0.11716549098491669,
1492
+ "learning_rate": 0.00016072461664990288,
1493
+ "loss": 0.3924,
1494
+ "step": 1060
1495
+ },
1496
+ {
1497
+ "epoch": 0.5325,
1498
+ "grad_norm": 0.11604485660791397,
1499
+ "learning_rate": 0.000159418577929397,
1500
+ "loss": 0.3624,
1501
+ "step": 1065
1502
+ },
1503
+ {
1504
+ "epoch": 0.535,
1505
+ "grad_norm": 0.11538460850715637,
1506
+ "learning_rate": 0.00015811182194771633,
1507
+ "loss": 0.4338,
1508
+ "step": 1070
1509
+ },
1510
+ {
1511
+ "epoch": 0.5375,
1512
+ "grad_norm": 0.11618762463331223,
1513
+ "learning_rate": 0.00015680444821938804,
1514
+ "loss": 0.4058,
1515
+ "step": 1075
1516
+ },
1517
+ {
1518
+ "epoch": 0.54,
1519
+ "grad_norm": 0.11750835925340652,
1520
+ "learning_rate": 0.00015549655630598343,
1521
+ "loss": 0.4422,
1522
+ "step": 1080
1523
+ },
1524
+ {
1525
+ "epoch": 0.5425,
1526
+ "grad_norm": 0.12725204229354858,
1527
+ "learning_rate": 0.00015418824580853535,
1528
+ "loss": 0.4422,
1529
+ "step": 1085
1530
+ },
1531
+ {
1532
+ "epoch": 0.545,
1533
+ "grad_norm": 0.11274927109479904,
1534
+ "learning_rate": 0.00015287961635995347,
1535
+ "loss": 0.4229,
1536
+ "step": 1090
1537
+ },
1538
+ {
1539
+ "epoch": 0.5475,
1540
+ "grad_norm": 0.11833129078149796,
1541
+ "learning_rate": 0.00015157076761743686,
1542
+ "loss": 0.4442,
1543
+ "step": 1095
1544
+ },
1545
+ {
1546
+ "epoch": 0.55,
1547
+ "grad_norm": 0.11384794861078262,
1548
+ "learning_rate": 0.00015026179925488475,
1549
+ "loss": 0.4528,
1550
+ "step": 1100
1551
+ },
1552
+ {
1553
+ "epoch": 0.5525,
1554
+ "grad_norm": 0.11864661425352097,
1555
+ "learning_rate": 0.00014895281095530575,
1556
+ "loss": 0.3988,
1557
+ "step": 1105
1558
+ },
1559
+ {
1560
+ "epoch": 0.555,
1561
+ "grad_norm": 0.11673832684755325,
1562
+ "learning_rate": 0.00014764390240322691,
1563
+ "loss": 0.3544,
1564
+ "step": 1110
1565
+ },
1566
+ {
1567
+ "epoch": 0.5575,
1568
+ "grad_norm": 0.1174502745270729,
1569
+ "learning_rate": 0.00014633517327710202,
1570
+ "loss": 0.4034,
1571
+ "step": 1115
1572
+ },
1573
+ {
1574
+ "epoch": 0.56,
1575
+ "grad_norm": 0.12685547769069672,
1576
+ "learning_rate": 0.00014502672324172107,
1577
+ "loss": 0.3595,
1578
+ "step": 1120
1579
+ },
1580
+ {
1581
+ "epoch": 0.5625,
1582
+ "grad_norm": 0.12368053942918777,
1583
+ "learning_rate": 0.00014371865194062007,
1584
+ "loss": 0.3395,
1585
+ "step": 1125
1586
+ },
1587
+ {
1588
+ "epoch": 0.565,
1589
+ "grad_norm": 0.1077839657664299,
1590
+ "learning_rate": 0.000142411058988493,
1591
+ "loss": 0.4199,
1592
+ "step": 1130
1593
+ },
1594
+ {
1595
+ "epoch": 0.5675,
1596
+ "grad_norm": 0.11699855327606201,
1597
+ "learning_rate": 0.00014110404396360576,
1598
+ "loss": 0.3443,
1599
+ "step": 1135
1600
+ },
1601
+ {
1602
+ "epoch": 0.57,
1603
+ "grad_norm": 0.13238464295864105,
1604
+ "learning_rate": 0.0001397977064002128,
1605
+ "loss": 0.3499,
1606
+ "step": 1140
1607
+ },
1608
+ {
1609
+ "epoch": 0.5725,
1610
+ "grad_norm": 0.11482933163642883,
1611
+ "learning_rate": 0.0001384921457809772,
1612
+ "loss": 0.3619,
1613
+ "step": 1145
1614
+ },
1615
+ {
1616
+ "epoch": 0.575,
1617
+ "grad_norm": 0.13390353322029114,
1618
+ "learning_rate": 0.00013718746152939487,
1619
+ "loss": 0.3684,
1620
+ "step": 1150
1621
+ },
1622
+ {
1623
+ "epoch": 0.5775,
1624
+ "grad_norm": 0.11464900523424149,
1625
+ "learning_rate": 0.00013588375300222283,
1626
+ "loss": 0.3313,
1627
+ "step": 1155
1628
+ },
1629
+ {
1630
+ "epoch": 0.58,
1631
+ "grad_norm": 0.10367871820926666,
1632
+ "learning_rate": 0.00013458111948191296,
1633
+ "loss": 0.3323,
1634
+ "step": 1160
1635
+ },
1636
+ {
1637
+ "epoch": 0.5825,
1638
+ "grad_norm": 0.12259294092655182,
1639
+ "learning_rate": 0.0001332796601690512,
1640
+ "loss": 0.3986,
1641
+ "step": 1165
1642
+ },
1643
+ {
1644
+ "epoch": 0.585,
1645
+ "grad_norm": 0.10923358052968979,
1646
+ "learning_rate": 0.00013197947417480292,
1647
+ "loss": 0.3808,
1648
+ "step": 1170
1649
+ },
1650
+ {
1651
+ "epoch": 0.5875,
1652
+ "grad_norm": 0.12479504942893982,
1653
+ "learning_rate": 0.0001306806605133656,
1654
+ "loss": 0.4429,
1655
+ "step": 1175
1656
+ },
1657
+ {
1658
+ "epoch": 0.59,
1659
+ "grad_norm": 0.11521733552217484,
1660
+ "learning_rate": 0.000129383318094428,
1661
+ "loss": 0.4778,
1662
+ "step": 1180
1663
+ },
1664
+ {
1665
+ "epoch": 0.5925,
1666
+ "grad_norm": 0.14112086594104767,
1667
+ "learning_rate": 0.00012808754571563827,
1668
+ "loss": 0.4634,
1669
+ "step": 1185
1670
+ },
1671
+ {
1672
+ "epoch": 0.595,
1673
+ "grad_norm": 0.12947902083396912,
1674
+ "learning_rate": 0.00012679344205507981,
1675
+ "loss": 0.4439,
1676
+ "step": 1190
1677
+ },
1678
+ {
1679
+ "epoch": 0.5975,
1680
+ "grad_norm": 0.13288578391075134,
1681
+ "learning_rate": 0.0001255011056637567,
1682
+ "loss": 0.4402,
1683
+ "step": 1195
1684
+ },
1685
+ {
1686
+ "epoch": 0.6,
1687
+ "grad_norm": 0.1216069906949997,
1688
+ "learning_rate": 0.00012421063495808853,
1689
+ "loss": 0.4203,
1690
+ "step": 1200
1691
+ },
1692
+ {
1693
+ "epoch": 0.6025,
1694
+ "grad_norm": 0.11649637669324875,
1695
+ "learning_rate": 0.000122922128212416,
1696
+ "loss": 0.4512,
1697
+ "step": 1205
1698
+ },
1699
+ {
1700
+ "epoch": 0.605,
1701
+ "grad_norm": 0.1201406940817833,
1702
+ "learning_rate": 0.00012163568355151628,
1703
+ "loss": 0.3725,
1704
+ "step": 1210
1705
+ },
1706
+ {
1707
+ "epoch": 0.6075,
1708
+ "grad_norm": 0.12117727100849152,
1709
+ "learning_rate": 0.00012035139894313107,
1710
+ "loss": 0.4352,
1711
+ "step": 1215
1712
+ },
1713
+ {
1714
+ "epoch": 0.61,
1715
+ "grad_norm": 0.11709322035312653,
1716
+ "learning_rate": 0.00011906937219050556,
1717
+ "loss": 0.4189,
1718
+ "step": 1220
1719
+ },
1720
+ {
1721
+ "epoch": 0.6125,
1722
+ "grad_norm": 0.11865726858377457,
1723
+ "learning_rate": 0.0001177897009249405,
1724
+ "loss": 0.3796,
1725
+ "step": 1225
1726
+ },
1727
+ {
1728
+ "epoch": 0.615,
1729
+ "grad_norm": 0.10807759314775467,
1730
+ "learning_rate": 0.0001165124825983573,
1731
+ "loss": 0.4465,
1732
+ "step": 1230
1733
+ },
1734
+ {
1735
+ "epoch": 0.6175,
1736
+ "grad_norm": 0.13788209855556488,
1737
+ "learning_rate": 0.00011523781447587641,
1738
+ "loss": 0.4994,
1739
+ "step": 1235
1740
+ },
1741
+ {
1742
+ "epoch": 0.62,
1743
+ "grad_norm": 0.12921364605426788,
1744
+ "learning_rate": 0.00011396579362841044,
1745
+ "loss": 0.4251,
1746
+ "step": 1240
1747
+ },
1748
+ {
1749
+ "epoch": 0.6225,
1750
+ "grad_norm": 0.12162365019321442,
1751
+ "learning_rate": 0.0001126965169252718,
1752
+ "loss": 0.3864,
1753
+ "step": 1245
1754
+ },
1755
+ {
1756
+ "epoch": 0.625,
1757
+ "grad_norm": 0.12897826731204987,
1758
+ "learning_rate": 0.00011143008102679559,
1759
+ "loss": 0.3753,
1760
+ "step": 1250
1761
+ },
1762
+ {
1763
+ "epoch": 0.6275,
1764
+ "grad_norm": 0.116109699010849,
1765
+ "learning_rate": 0.00011016658237697866,
1766
+ "loss": 0.3296,
1767
+ "step": 1255
1768
+ },
1769
+ {
1770
+ "epoch": 0.63,
1771
+ "grad_norm": 0.12935414910316467,
1772
+ "learning_rate": 0.00010890611719613512,
1773
+ "loss": 0.3797,
1774
+ "step": 1260
1775
+ },
1776
+ {
1777
+ "epoch": 0.6325,
1778
+ "grad_norm": 0.13730891048908234,
1779
+ "learning_rate": 0.0001076487814735685,
1780
+ "loss": 0.3711,
1781
+ "step": 1265
1782
+ },
1783
+ {
1784
+ "epoch": 0.635,
1785
+ "grad_norm": 0.13870631158351898,
1786
+ "learning_rate": 0.00010639467096026211,
1787
+ "loss": 0.4328,
1788
+ "step": 1270
1789
+ },
1790
+ {
1791
+ "epoch": 0.6375,
1792
+ "grad_norm": 0.11644043773412704,
1793
+ "learning_rate": 0.00010514388116158701,
1794
+ "loss": 0.3283,
1795
+ "step": 1275
1796
+ },
1797
+ {
1798
+ "epoch": 0.64,
1799
+ "grad_norm": 0.12221091985702515,
1800
+ "learning_rate": 0.00010389650733002894,
1801
+ "loss": 0.3898,
1802
+ "step": 1280
1803
+ },
1804
+ {
1805
+ "epoch": 0.6425,
1806
+ "grad_norm": 0.12048634141683578,
1807
+ "learning_rate": 0.00010265264445793464,
1808
+ "loss": 0.3256,
1809
+ "step": 1285
1810
+ },
1811
+ {
1812
+ "epoch": 0.645,
1813
+ "grad_norm": 0.1250566840171814,
1814
+ "learning_rate": 0.00010141238727027761,
1815
+ "loss": 0.408,
1816
+ "step": 1290
1817
+ },
1818
+ {
1819
+ "epoch": 0.6475,
1820
+ "grad_norm": 0.13518592715263367,
1821
+ "learning_rate": 0.00010017583021744454,
1822
+ "loss": 0.3763,
1823
+ "step": 1295
1824
+ },
1825
+ {
1826
+ "epoch": 0.65,
1827
+ "grad_norm": 0.13047736883163452,
1828
+ "learning_rate": 9.89430674680425e-05,
1829
+ "loss": 0.3989,
1830
+ "step": 1300
1831
+ },
1832
+ {
1833
+ "epoch": 0.6525,
1834
+ "grad_norm": 0.11474955826997757,
1835
+ "learning_rate": 9.771419290172773e-05,
1836
+ "loss": 0.3374,
1837
+ "step": 1305
1838
+ },
1839
+ {
1840
+ "epoch": 0.655,
1841
+ "grad_norm": 0.11670063436031342,
1842
+ "learning_rate": 9.648930010205619e-05,
1843
+ "loss": 0.3343,
1844
+ "step": 1310
1845
+ },
1846
+ {
1847
+ "epoch": 0.6575,
1848
+ "grad_norm": 0.15385080873966217,
1849
+ "learning_rate": 9.526848234935704e-05,
1850
+ "loss": 0.3432,
1851
+ "step": 1315
1852
+ },
1853
+ {
1854
+ "epoch": 0.66,
1855
+ "grad_norm": 0.13441519439220428,
1856
+ "learning_rate": 9.405183261362863e-05,
1857
+ "loss": 0.3116,
1858
+ "step": 1320
1859
+ },
1860
+ {
1861
+ "epoch": 0.6625,
1862
+ "grad_norm": 0.14772167801856995,
1863
+ "learning_rate": 9.283944354745888e-05,
1864
+ "loss": 0.3613,
1865
+ "step": 1325
1866
+ },
1867
+ {
1868
+ "epoch": 0.665,
1869
+ "grad_norm": 0.12146154791116714,
1870
+ "learning_rate": 9.163140747896907e-05,
1871
+ "loss": 0.3411,
1872
+ "step": 1330
1873
+ },
1874
+ {
1875
+ "epoch": 0.6675,
1876
+ "grad_norm": 0.1333102583885193,
1877
+ "learning_rate": 9.042781640478291e-05,
1878
+ "loss": 0.396,
1879
+ "step": 1335
1880
+ },
1881
+ {
1882
+ "epoch": 0.67,
1883
+ "grad_norm": 0.12051521986722946,
1884
+ "learning_rate": 8.922876198302062e-05,
1885
+ "loss": 0.3837,
1886
+ "step": 1340
1887
+ },
1888
+ {
1889
+ "epoch": 0.6725,
1890
+ "grad_norm": 0.12071400880813599,
1891
+ "learning_rate": 8.803433552631874e-05,
1892
+ "loss": 0.354,
1893
+ "step": 1345
1894
+ },
1895
+ {
1896
+ "epoch": 0.675,
1897
+ "grad_norm": 0.11258620023727417,
1898
+ "learning_rate": 8.684462799487635e-05,
1899
+ "loss": 0.3197,
1900
+ "step": 1350
1901
+ },
1902
+ {
1903
+ "epoch": 0.6775,
1904
+ "grad_norm": 0.11908067762851715,
1905
+ "learning_rate": 8.565972998952814e-05,
1906
+ "loss": 0.377,
1907
+ "step": 1355
1908
+ },
1909
+ {
1910
+ "epoch": 0.68,
1911
+ "grad_norm": 0.1252991259098053,
1912
+ "learning_rate": 8.447973174484469e-05,
1913
+ "loss": 0.3438,
1914
+ "step": 1360
1915
+ },
1916
+ {
1917
+ "epoch": 0.6825,
1918
+ "grad_norm": 0.12832245230674744,
1919
+ "learning_rate": 8.330472312226091e-05,
1920
+ "loss": 0.346,
1921
+ "step": 1365
1922
+ },
1923
+ {
1924
+ "epoch": 0.685,
1925
+ "grad_norm": 0.1396942287683487,
1926
+ "learning_rate": 8.213479360323258e-05,
1927
+ "loss": 0.3886,
1928
+ "step": 1370
1929
+ },
1930
+ {
1931
+ "epoch": 0.6875,
1932
+ "grad_norm": 0.12938210368156433,
1933
+ "learning_rate": 8.097003228242225e-05,
1934
+ "loss": 0.3699,
1935
+ "step": 1375
1936
+ },
1937
+ {
1938
+ "epoch": 0.69,
1939
+ "grad_norm": 0.12459377944469452,
1940
+ "learning_rate": 7.9810527860914e-05,
1941
+ "loss": 0.3892,
1942
+ "step": 1380
1943
+ },
1944
+ {
1945
+ "epoch": 0.6925,
1946
+ "grad_norm": 0.1360333263874054,
1947
+ "learning_rate": 7.86563686394587e-05,
1948
+ "loss": 0.3423,
1949
+ "step": 1385
1950
+ },
1951
+ {
1952
+ "epoch": 0.695,
1953
+ "grad_norm": 0.1357765644788742,
1954
+ "learning_rate": 7.750764251174963e-05,
1955
+ "loss": 0.408,
1956
+ "step": 1390
1957
+ },
1958
+ {
1959
+ "epoch": 0.6975,
1960
+ "grad_norm": 0.14453718066215515,
1961
+ "learning_rate": 7.636443695772887e-05,
1962
+ "loss": 0.3398,
1963
+ "step": 1395
1964
+ },
1965
+ {
1966
+ "epoch": 0.7,
1967
+ "grad_norm": 0.11541519314050674,
1968
+ "learning_rate": 7.522683903692547e-05,
1969
+ "loss": 0.4203,
1970
+ "step": 1400
1971
+ },
1972
+ {
1973
+ "epoch": 0.7025,
1974
+ "grad_norm": 0.13344840705394745,
1975
+ "learning_rate": 7.409493538182545e-05,
1976
+ "loss": 0.3694,
1977
+ "step": 1405
1978
+ },
1979
+ {
1980
+ "epoch": 0.705,
1981
+ "grad_norm": 0.13069866597652435,
1982
+ "learning_rate": 7.296881219127452e-05,
1983
+ "loss": 0.3889,
1984
+ "step": 1410
1985
+ },
1986
+ {
1987
+ "epoch": 0.7075,
1988
+ "grad_norm": 0.12457838654518127,
1989
+ "learning_rate": 7.184855522391359e-05,
1990
+ "loss": 0.3342,
1991
+ "step": 1415
1992
+ },
1993
+ {
1994
+ "epoch": 0.71,
1995
+ "grad_norm": 0.11990659683942795,
1996
+ "learning_rate": 7.073424979164794e-05,
1997
+ "loss": 0.3855,
1998
+ "step": 1420
1999
+ },
2000
+ {
2001
+ "epoch": 0.7125,
2002
+ "grad_norm": 0.1389523446559906,
2003
+ "learning_rate": 6.962598075315046e-05,
2004
+ "loss": 0.3943,
2005
+ "step": 1425
2006
+ },
2007
+ {
2008
+ "epoch": 0.715,
2009
+ "grad_norm": 0.14108599722385406,
2010
+ "learning_rate": 6.852383250739938e-05,
2011
+ "loss": 0.388,
2012
+ "step": 1430
2013
+ },
2014
+ {
2015
+ "epoch": 0.7175,
2016
+ "grad_norm": 0.1342005580663681,
2017
+ "learning_rate": 6.742788898725065e-05,
2018
+ "loss": 0.3602,
2019
+ "step": 1435
2020
+ },
2021
+ {
2022
+ "epoch": 0.72,
2023
+ "grad_norm": 0.13516324758529663,
2024
+ "learning_rate": 6.633823365304648e-05,
2025
+ "loss": 0.3935,
2026
+ "step": 1440
2027
+ },
2028
+ {
2029
+ "epoch": 0.7225,
2030
+ "grad_norm": 0.1302197426557541,
2031
+ "learning_rate": 6.52549494862593e-05,
2032
+ "loss": 0.3618,
2033
+ "step": 1445
2034
+ },
2035
+ {
2036
+ "epoch": 0.725,
2037
+ "grad_norm": 0.12428996711969376,
2038
+ "learning_rate": 6.417811898317259e-05,
2039
+ "loss": 0.3338,
2040
+ "step": 1450
2041
+ },
2042
+ {
2043
+ "epoch": 0.7275,
2044
+ "grad_norm": 0.11249776184558868,
2045
+ "learning_rate": 6.31078241485982e-05,
2046
+ "loss": 0.3819,
2047
+ "step": 1455
2048
+ },
2049
+ {
2050
+ "epoch": 0.73,
2051
+ "grad_norm": 0.1359994113445282,
2052
+ "learning_rate": 6.204414648963159e-05,
2053
+ "loss": 0.3356,
2054
+ "step": 1460
2055
+ },
2056
+ {
2057
+ "epoch": 0.7325,
2058
+ "grad_norm": 0.1118568629026413,
2059
+ "learning_rate": 6.098716700944479e-05,
2060
+ "loss": 0.3223,
2061
+ "step": 1465
2062
+ },
2063
+ {
2064
+ "epoch": 0.735,
2065
+ "grad_norm": 0.12038140743970871,
2066
+ "learning_rate": 5.993696620111741e-05,
2067
+ "loss": 0.3481,
2068
+ "step": 1470
2069
+ },
2070
+ {
2071
+ "epoch": 0.7375,
2072
+ "grad_norm": 0.12787550687789917,
2073
+ "learning_rate": 5.889362404150703e-05,
2074
+ "loss": 0.3766,
2075
+ "step": 1475
2076
+ },
2077
+ {
2078
+ "epoch": 0.74,
2079
+ "grad_norm": 0.12134893983602524,
2080
+ "learning_rate": 5.7857219985158506e-05,
2081
+ "loss": 0.2916,
2082
+ "step": 1480
2083
+ },
2084
+ {
2085
+ "epoch": 0.7425,
2086
+ "grad_norm": 0.1274223029613495,
2087
+ "learning_rate": 5.682783295825345e-05,
2088
+ "loss": 0.3095,
2089
+ "step": 1485
2090
+ },
2091
+ {
2092
+ "epoch": 0.745,
2093
+ "grad_norm": 0.11817299574613571,
2094
+ "learning_rate": 5.580554135259932e-05,
2095
+ "loss": 0.3422,
2096
+ "step": 1490
2097
+ },
2098
+ {
2099
+ "epoch": 0.7475,
2100
+ "grad_norm": 0.1348387748003006,
2101
+ "learning_rate": 5.479042301965987e-05,
2102
+ "loss": 0.4044,
2103
+ "step": 1495
2104
+ },
2105
+ {
2106
+ "epoch": 0.75,
2107
+ "grad_norm": 0.14032681286334991,
2108
+ "learning_rate": 5.378255526462631e-05,
2109
+ "loss": 0.337,
2110
+ "step": 1500
2111
+ },
2112
+ {
2113
+ "epoch": 0.7525,
2114
+ "grad_norm": 0.1196574866771698,
2115
+ "learning_rate": 5.2782014840530366e-05,
2116
+ "loss": 0.3638,
2117
+ "step": 1505
2118
+ },
2119
+ {
2120
+ "epoch": 0.755,
2121
+ "grad_norm": 0.1307535171508789,
2122
+ "learning_rate": 5.178887794239904e-05,
2123
+ "loss": 0.3514,
2124
+ "step": 1510
2125
+ },
2126
+ {
2127
+ "epoch": 0.7575,
2128
+ "grad_norm": 0.12303224951028824,
2129
+ "learning_rate": 5.080322020145224e-05,
2130
+ "loss": 0.3825,
2131
+ "step": 1515
2132
+ },
2133
+ {
2134
+ "epoch": 0.76,
2135
+ "grad_norm": 0.11517804116010666,
2136
+ "learning_rate": 4.9825116679343025e-05,
2137
+ "loss": 0.3474,
2138
+ "step": 1520
2139
+ },
2140
+ {
2141
+ "epoch": 0.7625,
2142
+ "grad_norm": 0.1276445835828781,
2143
+ "learning_rate": 4.885464186244154e-05,
2144
+ "loss": 0.3084,
2145
+ "step": 1525
2146
+ },
2147
+ {
2148
+ "epoch": 0.765,
2149
+ "grad_norm": 0.12166495621204376,
2150
+ "learning_rate": 4.789186965616232e-05,
2151
+ "loss": 0.2949,
2152
+ "step": 1530
2153
+ },
2154
+ {
2155
+ "epoch": 0.7675,
2156
+ "grad_norm": 0.13007108867168427,
2157
+ "learning_rate": 4.6936873379336564e-05,
2158
+ "loss": 0.3336,
2159
+ "step": 1535
2160
+ },
2161
+ {
2162
+ "epoch": 0.77,
2163
+ "grad_norm": 0.12368687242269516,
2164
+ "learning_rate": 4.598972575862803e-05,
2165
+ "loss": 0.3443,
2166
+ "step": 1540
2167
+ },
2168
+ {
2169
+ "epoch": 0.7725,
2170
+ "grad_norm": 0.11817432940006256,
2171
+ "learning_rate": 4.5050498922995166e-05,
2172
+ "loss": 0.3198,
2173
+ "step": 1545
2174
+ },
2175
+ {
2176
+ "epoch": 0.775,
2177
+ "grad_norm": 0.13239014148712158,
2178
+ "learning_rate": 4.4119264398197843e-05,
2179
+ "loss": 0.3145,
2180
+ "step": 1550
2181
+ },
2182
+ {
2183
+ "epoch": 0.7775,
2184
+ "grad_norm": 0.12305855751037598,
2185
+ "learning_rate": 4.319609310135054e-05,
2186
+ "loss": 0.3276,
2187
+ "step": 1555
2188
+ },
2189
+ {
2190
+ "epoch": 0.78,
2191
+ "grad_norm": 0.13063360750675201,
2192
+ "learning_rate": 4.228105533552169e-05,
2193
+ "loss": 0.4115,
2194
+ "step": 1560
2195
+ },
2196
+ {
2197
+ "epoch": 0.7825,
2198
+ "grad_norm": 0.12751415371894836,
2199
+ "learning_rate": 4.137422078437991e-05,
2200
+ "loss": 0.4113,
2201
+ "step": 1565
2202
+ },
2203
+ {
2204
+ "epoch": 0.785,
2205
+ "grad_norm": 0.1429520696401596,
2206
+ "learning_rate": 4.0475658506887136e-05,
2207
+ "loss": 0.3634,
2208
+ "step": 1570
2209
+ },
2210
+ {
2211
+ "epoch": 0.7875,
2212
+ "grad_norm": 0.13072626292705536,
2213
+ "learning_rate": 3.9585436932039846e-05,
2214
+ "loss": 0.3914,
2215
+ "step": 1575
2216
+ },
2217
+ {
2218
+ "epoch": 0.79,
2219
+ "grad_norm": 0.13076546788215637,
2220
+ "learning_rate": 3.870362385365755e-05,
2221
+ "loss": 0.3153,
2222
+ "step": 1580
2223
+ },
2224
+ {
2225
+ "epoch": 0.7925,
2226
+ "grad_norm": 0.11764945089817047,
2227
+ "learning_rate": 3.7830286425220234e-05,
2228
+ "loss": 0.331,
2229
+ "step": 1585
2230
+ },
2231
+ {
2232
+ "epoch": 0.795,
2233
+ "grad_norm": 0.12469421327114105,
2234
+ "learning_rate": 3.696549115475434e-05,
2235
+ "loss": 0.3667,
2236
+ "step": 1590
2237
+ },
2238
+ {
2239
+ "epoch": 0.7975,
2240
+ "grad_norm": 0.13257570564746857,
2241
+ "learning_rate": 3.6109303899767875e-05,
2242
+ "loss": 0.3775,
2243
+ "step": 1595
2244
+ },
2245
+ {
2246
+ "epoch": 0.8,
2247
+ "grad_norm": 0.1399105191230774,
2248
+ "learning_rate": 3.5261789862235235e-05,
2249
+ "loss": 0.3786,
2250
+ "step": 1600
2251
+ },
2252
+ {
2253
+ "epoch": 0.8025,
2254
+ "grad_norm": 0.1299823671579361,
2255
+ "learning_rate": 3.442301358363163e-05,
2256
+ "loss": 0.3984,
2257
+ "step": 1605
2258
+ },
2259
+ {
2260
+ "epoch": 0.805,
2261
+ "grad_norm": 0.12068431079387665,
2262
+ "learning_rate": 3.359303894001809e-05,
2263
+ "loss": 0.3416,
2264
+ "step": 1610
2265
+ },
2266
+ {
2267
+ "epoch": 0.8075,
2268
+ "grad_norm": 0.12825050950050354,
2269
+ "learning_rate": 3.277192913717717e-05,
2270
+ "loss": 0.3973,
2271
+ "step": 1615
2272
+ },
2273
+ {
2274
+ "epoch": 0.81,
2275
+ "grad_norm": 0.12794139981269836,
2276
+ "learning_rate": 3.195974670579941e-05,
2277
+ "loss": 0.3942,
2278
+ "step": 1620
2279
+ },
2280
+ {
2281
+ "epoch": 0.8125,
2282
+ "grad_norm": 0.1178906112909317,
2283
+ "learning_rate": 3.115655349672141e-05,
2284
+ "loss": 0.3549,
2285
+ "step": 1625
2286
+ },
2287
+ {
2288
+ "epoch": 0.815,
2289
+ "grad_norm": 0.11859016120433807,
2290
+ "learning_rate": 3.036241067621575e-05,
2291
+ "loss": 0.3113,
2292
+ "step": 1630
2293
+ },
2294
+ {
2295
+ "epoch": 0.8175,
2296
+ "grad_norm": 0.12508928775787354,
2297
+ "learning_rate": 2.9577378721332843e-05,
2298
+ "loss": 0.3802,
2299
+ "step": 1635
2300
+ },
2301
+ {
2302
+ "epoch": 0.82,
2303
+ "grad_norm": 0.1293668895959854,
2304
+ "learning_rate": 2.8801517415295455e-05,
2305
+ "loss": 0.3098,
2306
+ "step": 1640
2307
+ },
2308
+ {
2309
+ "epoch": 0.8225,
2310
+ "grad_norm": 0.12039236724376678,
2311
+ "learning_rate": 2.8034885842945865e-05,
2312
+ "loss": 0.2876,
2313
+ "step": 1645
2314
+ },
2315
+ {
2316
+ "epoch": 0.825,
2317
+ "grad_norm": 0.14805036783218384,
2318
+ "learning_rate": 2.7277542386246454e-05,
2319
+ "loss": 0.3618,
2320
+ "step": 1650
2321
+ },
2322
+ {
2323
+ "epoch": 0.8275,
2324
+ "grad_norm": 0.12638579308986664,
2325
+ "learning_rate": 2.6529544719833706e-05,
2326
+ "loss": 0.3328,
2327
+ "step": 1655
2328
+ },
2329
+ {
2330
+ "epoch": 0.83,
2331
+ "grad_norm": 0.12427478283643723,
2332
+ "learning_rate": 2.5790949806625838e-05,
2333
+ "loss": 0.3394,
2334
+ "step": 1660
2335
+ },
2336
+ {
2337
+ "epoch": 0.8325,
2338
+ "grad_norm": 0.1283419132232666,
2339
+ "learning_rate": 2.5061813893485085e-05,
2340
+ "loss": 0.3392,
2341
+ "step": 1665
2342
+ },
2343
+ {
2344
+ "epoch": 0.835,
2345
+ "grad_norm": 0.12487384676933289,
2346
+ "learning_rate": 2.434219250693419e-05,
2347
+ "loss": 0.3592,
2348
+ "step": 1670
2349
+ },
2350
+ {
2351
+ "epoch": 0.8375,
2352
+ "grad_norm": 0.14032793045043945,
2353
+ "learning_rate": 2.363214044892788e-05,
2354
+ "loss": 0.4099,
2355
+ "step": 1675
2356
+ },
2357
+ {
2358
+ "epoch": 0.84,
2359
+ "grad_norm": 0.10917101800441742,
2360
+ "learning_rate": 2.293171179267946e-05,
2361
+ "loss": 0.3204,
2362
+ "step": 1680
2363
+ },
2364
+ {
2365
+ "epoch": 0.8425,
2366
+ "grad_norm": 0.1253073364496231,
2367
+ "learning_rate": 2.2240959878542848e-05,
2368
+ "loss": 0.3378,
2369
+ "step": 1685
2370
+ },
2371
+ {
2372
+ "epoch": 0.845,
2373
+ "grad_norm": 0.14096981287002563,
2374
+ "learning_rate": 2.155993730995077e-05,
2375
+ "loss": 0.378,
2376
+ "step": 1690
2377
+ },
2378
+ {
2379
+ "epoch": 0.8475,
2380
+ "grad_norm": 0.12039178609848022,
2381
+ "learning_rate": 2.0888695949408468e-05,
2382
+ "loss": 0.3197,
2383
+ "step": 1695
2384
+ },
2385
+ {
2386
+ "epoch": 0.85,
2387
+ "grad_norm": 0.12723132967948914,
2388
+ "learning_rate": 2.0227286914544353e-05,
2389
+ "loss": 0.3241,
2390
+ "step": 1700
2391
+ },
2392
+ {
2393
+ "epoch": 0.8525,
2394
+ "grad_norm": 0.1309029906988144,
2395
+ "learning_rate": 1.9575760574217147e-05,
2396
+ "loss": 0.3743,
2397
+ "step": 1705
2398
+ },
2399
+ {
2400
+ "epoch": 0.855,
2401
+ "grad_norm": 0.1324499100446701,
2402
+ "learning_rate": 1.893416654468022e-05,
2403
+ "loss": 0.345,
2404
+ "step": 1710
2405
+ },
2406
+ {
2407
+ "epoch": 0.8575,
2408
+ "grad_norm": 0.11905783414840698,
2409
+ "learning_rate": 1.8302553685802917e-05,
2410
+ "loss": 0.3514,
2411
+ "step": 1715
2412
+ },
2413
+ {
2414
+ "epoch": 0.86,
2415
+ "grad_norm": 0.12570443749427795,
2416
+ "learning_rate": 1.768097009734985e-05,
2417
+ "loss": 0.3791,
2418
+ "step": 1720
2419
+ },
2420
+ {
2421
+ "epoch": 0.8625,
2422
+ "grad_norm": 0.13414913415908813,
2423
+ "learning_rate": 1.7069463115317788e-05,
2424
+ "loss": 0.3575,
2425
+ "step": 1725
2426
+ },
2427
+ {
2428
+ "epoch": 0.865,
2429
+ "grad_norm": 0.1283785104751587,
2430
+ "learning_rate": 1.6468079308331023e-05,
2431
+ "loss": 0.3496,
2432
+ "step": 1730
2433
+ },
2434
+ {
2435
+ "epoch": 0.8675,
2436
+ "grad_norm": 0.11180217564105988,
2437
+ "learning_rate": 1.587686447409478e-05,
2438
+ "loss": 0.3245,
2439
+ "step": 1735
2440
+ },
2441
+ {
2442
+ "epoch": 0.87,
2443
+ "grad_norm": 0.13804157078266144,
2444
+ "learning_rate": 1.5295863635907667e-05,
2445
+ "loss": 0.367,
2446
+ "step": 1740
2447
+ },
2448
+ {
2449
+ "epoch": 0.8725,
2450
+ "grad_norm": 0.12629055976867676,
2451
+ "learning_rate": 1.4725121039232945e-05,
2452
+ "loss": 0.293,
2453
+ "step": 1745
2454
+ },
2455
+ {
2456
+ "epoch": 0.875,
2457
+ "grad_norm": 0.12774884700775146,
2458
+ "learning_rate": 1.4164680148329088e-05,
2459
+ "loss": 0.3798,
2460
+ "step": 1750
2461
+ },
2462
+ {
2463
+ "epoch": 0.8775,
2464
+ "grad_norm": 0.11681339889764786,
2465
+ "learning_rate": 1.3614583642939718e-05,
2466
+ "loss": 0.3474,
2467
+ "step": 1755
2468
+ },
2469
+ {
2470
+ "epoch": 0.88,
2471
+ "grad_norm": 0.14510560035705566,
2472
+ "learning_rate": 1.3074873415043591e-05,
2473
+ "loss": 0.3999,
2474
+ "step": 1760
2475
+ },
2476
+ {
2477
+ "epoch": 0.8825,
2478
+ "grad_norm": 0.1168401762843132,
2479
+ "learning_rate": 1.2545590565664054e-05,
2480
+ "loss": 0.3398,
2481
+ "step": 1765
2482
+ },
2483
+ {
2484
+ "epoch": 0.885,
2485
+ "grad_norm": 0.1411600410938263,
2486
+ "learning_rate": 1.2026775401739348e-05,
2487
+ "loss": 0.3346,
2488
+ "step": 1770
2489
+ },
2490
+ {
2491
+ "epoch": 0.8875,
2492
+ "grad_norm": 0.12797729671001434,
2493
+ "learning_rate": 1.1518467433052863e-05,
2494
+ "loss": 0.3742,
2495
+ "step": 1775
2496
+ },
2497
+ {
2498
+ "epoch": 0.89,
2499
+ "grad_norm": 0.12946921586990356,
2500
+ "learning_rate": 1.1020705369224414e-05,
2501
+ "loss": 0.3436,
2502
+ "step": 1780
2503
+ },
2504
+ {
2505
+ "epoch": 0.8925,
2506
+ "grad_norm": 0.13285613059997559,
2507
+ "learning_rate": 1.0533527116762296e-05,
2508
+ "loss": 0.3186,
2509
+ "step": 1785
2510
+ },
2511
+ {
2512
+ "epoch": 0.895,
2513
+ "grad_norm": 0.15213604271411896,
2514
+ "learning_rate": 1.005696977617666e-05,
2515
+ "loss": 0.3629,
2516
+ "step": 1790
2517
+ },
2518
+ {
2519
+ "epoch": 0.8975,
2520
+ "grad_norm": 0.12391404062509537,
2521
+ "learning_rate": 9.591069639154008e-06,
2522
+ "loss": 0.3421,
2523
+ "step": 1795
2524
+ },
2525
+ {
2526
+ "epoch": 0.9,
2527
+ "grad_norm": 0.11592845618724823,
2528
+ "learning_rate": 9.135862185793636e-06,
2529
+ "loss": 0.3107,
2530
+ "step": 1800
2531
+ },
2532
+ {
2533
+ "epoch": 0.9025,
2534
+ "grad_norm": 0.12540902197360992,
2535
+ "learning_rate": 8.691382081905496e-06,
2536
+ "loss": 0.3605,
2537
+ "step": 1805
2538
+ },
2539
+ {
2540
+ "epoch": 0.905,
2541
+ "grad_norm": 0.14459215104579926,
2542
+ "learning_rate": 8.257663176370389e-06,
2543
+ "loss": 0.3884,
2544
+ "step": 1810
2545
+ },
2546
+ {
2547
+ "epoch": 0.9075,
2548
+ "grad_norm": 0.14139464497566223,
2549
+ "learning_rate": 7.834738498562165e-06,
2550
+ "loss": 0.3728,
2551
+ "step": 1815
2552
+ },
2553
+ {
2554
+ "epoch": 0.91,
2555
+ "grad_norm": 0.12125397473573685,
2556
+ "learning_rate": 7.422640255832446e-06,
2557
+ "loss": 0.3237,
2558
+ "step": 1820
2559
+ },
2560
+ {
2561
+ "epoch": 0.9125,
2562
+ "grad_norm": 0.13039612770080566,
2563
+ "learning_rate": 7.021399831057961e-06,
2564
+ "loss": 0.3055,
2565
+ "step": 1825
2566
+ },
2567
+ {
2568
+ "epoch": 0.915,
2569
+ "grad_norm": 0.1337701678276062,
2570
+ "learning_rate": 6.631047780250481e-06,
2571
+ "loss": 0.368,
2572
+ "step": 1830
2573
+ },
2574
+ {
2575
+ "epoch": 0.9175,
2576
+ "grad_norm": 0.13020606338977814,
2577
+ "learning_rate": 6.251613830230013e-06,
2578
+ "loss": 0.3262,
2579
+ "step": 1835
2580
+ },
2581
+ {
2582
+ "epoch": 0.92,
2583
+ "grad_norm": 0.12915077805519104,
2584
+ "learning_rate": 5.883126876360872e-06,
2585
+ "loss": 0.3428,
2586
+ "step": 1840
2587
+ },
2588
+ {
2589
+ "epoch": 0.9225,
2590
+ "grad_norm": 0.12774400413036346,
2591
+ "learning_rate": 5.525614980351284e-06,
2592
+ "loss": 0.3735,
2593
+ "step": 1845
2594
+ },
2595
+ {
2596
+ "epoch": 0.925,
2597
+ "grad_norm": 0.12587039172649384,
2598
+ "learning_rate": 5.1791053681162545e-06,
2599
+ "loss": 0.3402,
2600
+ "step": 1850
2601
+ },
2602
+ {
2603
+ "epoch": 0.9275,
2604
+ "grad_norm": 0.12152459472417831,
2605
+ "learning_rate": 4.843624427704329e-06,
2606
+ "loss": 0.2968,
2607
+ "step": 1855
2608
+ },
2609
+ {
2610
+ "epoch": 0.93,
2611
+ "grad_norm": 0.11444247514009476,
2612
+ "learning_rate": 4.519197707287986e-06,
2613
+ "loss": 0.3448,
2614
+ "step": 1860
2615
+ },
2616
+ {
2617
+ "epoch": 0.9325,
2618
+ "grad_norm": 0.12532518804073334,
2619
+ "learning_rate": 4.2058499132180734e-06,
2620
+ "loss": 0.3613,
2621
+ "step": 1865
2622
+ },
2623
+ {
2624
+ "epoch": 0.935,
2625
+ "grad_norm": 0.14186476171016693,
2626
+ "learning_rate": 3.903604908142266e-06,
2627
+ "loss": 0.2887,
2628
+ "step": 1870
2629
+ },
2630
+ {
2631
+ "epoch": 0.9375,
2632
+ "grad_norm": 0.13014192879199982,
2633
+ "learning_rate": 3.6124857091878845e-06,
2634
+ "loss": 0.2679,
2635
+ "step": 1875
2636
+ },
2637
+ {
2638
+ "epoch": 0.94,
2639
+ "grad_norm": 0.1259031891822815,
2640
+ "learning_rate": 3.3325144862090648e-06,
2641
+ "loss": 0.2993,
2642
+ "step": 1880
2643
+ },
2644
+ {
2645
+ "epoch": 0.9425,
2646
+ "grad_norm": 0.12168288230895996,
2647
+ "learning_rate": 3.0637125600983916e-06,
2648
+ "loss": 0.3317,
2649
+ "step": 1885
2650
+ },
2651
+ {
2652
+ "epoch": 0.945,
2653
+ "grad_norm": 0.12291324138641357,
2654
+ "learning_rate": 2.8061004011632302e-06,
2655
+ "loss": 0.3311,
2656
+ "step": 1890
2657
+ },
2658
+ {
2659
+ "epoch": 0.9475,
2660
+ "grad_norm": 0.13629783689975739,
2661
+ "learning_rate": 2.5596976275668757e-06,
2662
+ "loss": 0.3456,
2663
+ "step": 1895
2664
+ },
2665
+ {
2666
+ "epoch": 0.95,
2667
+ "grad_norm": 0.17415851354599,
2668
+ "learning_rate": 2.324523003834511e-06,
2669
+ "loss": 0.3589,
2670
+ "step": 1900
2671
+ },
2672
+ {
2673
+ "epoch": 0.9525,
2674
+ "grad_norm": 0.1330641210079193,
2675
+ "learning_rate": 2.100594439424269e-06,
2676
+ "loss": 0.3826,
2677
+ "step": 1905
2678
+ },
2679
+ {
2680
+ "epoch": 0.955,
2681
+ "grad_norm": 0.14203837513923645,
2682
+ "learning_rate": 1.8879289873632907e-06,
2683
+ "loss": 0.3807,
2684
+ "step": 1910
2685
+ },
2686
+ {
2687
+ "epoch": 0.9575,
2688
+ "grad_norm": 0.1222100704908371,
2689
+ "learning_rate": 1.686542842949129e-06,
2690
+ "loss": 0.3084,
2691
+ "step": 1915
2692
+ },
2693
+ {
2694
+ "epoch": 0.96,
2695
+ "grad_norm": 0.1441483348608017,
2696
+ "learning_rate": 1.4964513425163694e-06,
2697
+ "loss": 0.3871,
2698
+ "step": 1920
2699
+ },
2700
+ {
2701
+ "epoch": 0.9625,
2702
+ "grad_norm": 0.1402144581079483,
2703
+ "learning_rate": 1.3176689622687474e-06,
2704
+ "loss": 0.3192,
2705
+ "step": 1925
2706
+ },
2707
+ {
2708
+ "epoch": 0.965,
2709
+ "grad_norm": 0.13284745812416077,
2710
+ "learning_rate": 1.1502093171766979e-06,
2711
+ "loss": 0.359,
2712
+ "step": 1930
2713
+ },
2714
+ {
2715
+ "epoch": 0.9675,
2716
+ "grad_norm": 0.1253402829170227,
2717
+ "learning_rate": 9.94085159940533e-07,
2718
+ "loss": 0.3214,
2719
+ "step": 1935
2720
+ },
2721
+ {
2722
+ "epoch": 0.97,
2723
+ "grad_norm": 0.13589312136173248,
2724
+ "learning_rate": 8.493083800193034e-07,
2725
+ "loss": 0.3524,
2726
+ "step": 1940
2727
+ },
2728
+ {
2729
+ "epoch": 0.9725,
2730
+ "grad_norm": 0.13623379170894623,
2731
+ "learning_rate": 7.158900027253223e-07,
2732
+ "loss": 0.3711,
2733
+ "step": 1945
2734
+ },
2735
+ {
2736
+ "epoch": 0.975,
2737
+ "grad_norm": 0.12516111135482788,
2738
+ "learning_rate": 5.9384018838457e-07,
2739
+ "loss": 0.3487,
2740
+ "step": 1950
2741
+ },
2742
+ {
2743
+ "epoch": 0.9775,
2744
+ "grad_norm": 0.1211727038025856,
2745
+ "learning_rate": 4.831682315629304e-07,
2746
+ "loss": 0.3079,
2747
+ "step": 1955
2748
+ },
2749
+ {
2750
+ "epoch": 0.98,
2751
+ "grad_norm": 0.1348896622657776,
2752
+ "learning_rate": 3.8388256035840615e-07,
2753
+ "loss": 0.322,
2754
+ "step": 1960
2755
+ },
2756
+ {
2757
+ "epoch": 0.9825,
2758
+ "grad_norm": 0.12953124940395355,
2759
+ "learning_rate": 2.959907357592661e-07,
2760
+ "loss": 0.3054,
2761
+ "step": 1965
2762
+ },
2763
+ {
2764
+ "epoch": 0.985,
2765
+ "grad_norm": 0.12745600938796997,
2766
+ "learning_rate": 2.1949945106823909e-07,
2767
+ "loss": 0.3208,
2768
+ "step": 1970
2769
+ },
2770
+ {
2771
+ "epoch": 0.9875,
2772
+ "grad_norm": 0.13108642399311066,
2773
+ "learning_rate": 1.544145313928047e-07,
2774
+ "loss": 0.3641,
2775
+ "step": 1975
2776
+ },
2777
+ {
2778
+ "epoch": 0.99,
2779
+ "grad_norm": 0.12415596097707748,
2780
+ "learning_rate": 1.0074093320156517e-07,
2781
+ "loss": 0.3141,
2782
+ "step": 1980
2783
+ },
2784
+ {
2785
+ "epoch": 0.9925,
2786
+ "grad_norm": 0.12116590887308121,
2787
+ "learning_rate": 5.8482743946847153e-08,
2788
+ "loss": 0.3085,
2789
+ "step": 1985
2790
+ },
2791
+ {
2792
+ "epoch": 0.995,
2793
+ "grad_norm": 0.12617753446102142,
2794
+ "learning_rate": 2.764318175336733e-08,
2795
+ "loss": 0.316,
2796
+ "step": 1990
2797
+ },
2798
+ {
2799
+ "epoch": 0.9975,
2800
+ "grad_norm": 0.13097520172595978,
2801
+ "learning_rate": 8.224595173178527e-09,
2802
+ "loss": 0.2772,
2803
+ "step": 1995
2804
+ },
2805
+ {
2806
+ "epoch": 1.0,
2807
+ "grad_norm": 0.1454041749238968,
2808
+ "learning_rate": 2.284630068460913e-10,
2809
+ "loss": 0.3226,
2810
+ "step": 2000
2811
+ }
2812
+ ],
2813
+ "logging_steps": 5,
2814
+ "max_steps": 2000,
2815
+ "num_input_tokens_seen": 0,
2816
+ "num_train_epochs": 9223372036854775807,
2817
+ "save_steps": 500,
2818
+ "stateful_callbacks": {
2819
+ "TrainerControl": {
2820
+ "args": {
2821
+ "should_epoch_stop": false,
2822
+ "should_evaluate": false,
2823
+ "should_log": false,
2824
+ "should_save": true,
2825
+ "should_training_stop": true
2826
+ },
2827
+ "attributes": {}
2828
+ }
2829
+ },
2830
+ "total_flos": 2.629578157719552e+18,
2831
+ "train_batch_size": 4,
2832
+ "trial_name": null,
2833
+ "trial_params": null
2834
+ }
codellama-hugcoder/checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
3
+ size 5304
codellama-hugcoder/checkpoint-500/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: codellama/CodeLlama-7b-Instruct-hf
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.15.2.dev0
codellama-hugcoder/checkpoint-500/adapter_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "r": 32,
24
+ "rank_pattern": {},
25
+ "revision": null,
26
+ "target_modules": [
27
+ "down_proj",
28
+ "up_proj",
29
+ "k_proj",
30
+ "q_proj",
31
+ "v_proj",
32
+ "gate_proj",
33
+ "o_proj"
34
+ ],
35
+ "task_type": "CAUSAL_LM",
36
+ "trainable_token_indices": null,
37
+ "use_dora": false,
38
+ "use_rslora": false
39
+ }
codellama-hugcoder/checkpoint-500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba0a03baab18f0cdae4dfc77bf7b41f7d1435807efac74517b5672e9ef8bedf1
3
+ size 319876032
codellama-hugcoder/checkpoint-500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dad4d0839af192a8e721c020748fcd5843aa02d4b867cd03a6da416f3b15a8e
3
+ size 640009682
codellama-hugcoder/checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b3fe293b4ac5ae1cf2f114644c15f2a8317440ebc1144a8065f3fe94c0e32b8
3
+ size 14244
codellama-hugcoder/checkpoint-500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12f207d7fee0843ba3ccc634c56e770b9b0bfb3e3b7ef4379b8fc405b4c45a03
3
+ size 1064
codellama-hugcoder/checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.25,
6
+ "eval_steps": 100.0,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0025,
14
+ "grad_norm": 0.09379793703556061,
15
+ "learning_rate": 5.999999999999999e-06,
16
+ "loss": 0.6799,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.005,
21
+ "grad_norm": 0.1399833709001541,
22
+ "learning_rate": 1.3499999999999998e-05,
23
+ "loss": 0.6954,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.0075,
28
+ "grad_norm": 0.08632303029298782,
29
+ "learning_rate": 2.1e-05,
30
+ "loss": 0.6921,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.01,
35
+ "grad_norm": 0.10006701201200485,
36
+ "learning_rate": 2.8499999999999998e-05,
37
+ "loss": 0.69,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.0125,
42
+ "grad_norm": 0.07633858919143677,
43
+ "learning_rate": 3.5999999999999994e-05,
44
+ "loss": 0.6722,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.015,
49
+ "grad_norm": 0.09399061650037766,
50
+ "learning_rate": 4.3499999999999993e-05,
51
+ "loss": 0.6453,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.0175,
56
+ "grad_norm": 0.0843738541007042,
57
+ "learning_rate": 5.1e-05,
58
+ "loss": 0.6276,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.02,
63
+ "grad_norm": 0.08583351224660873,
64
+ "learning_rate": 5.85e-05,
65
+ "loss": 0.58,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.0225,
70
+ "grad_norm": 0.09571370482444763,
71
+ "learning_rate": 6.599999999999999e-05,
72
+ "loss": 0.6355,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.025,
77
+ "grad_norm": 0.1083935871720314,
78
+ "learning_rate": 7.35e-05,
79
+ "loss": 0.589,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.0275,
84
+ "grad_norm": 0.10387319326400757,
85
+ "learning_rate": 8.1e-05,
86
+ "loss": 0.6061,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.03,
91
+ "grad_norm": 0.11083361506462097,
92
+ "learning_rate": 8.849999999999998e-05,
93
+ "loss": 0.572,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.0325,
98
+ "grad_norm": 0.12665686011314392,
99
+ "learning_rate": 9.599999999999999e-05,
100
+ "loss": 0.5442,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.035,
105
+ "grad_norm": 0.1308053582906723,
106
+ "learning_rate": 0.00010349999999999998,
107
+ "loss": 0.6524,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.0375,
112
+ "grad_norm": 0.13535510003566742,
113
+ "learning_rate": 0.00011099999999999999,
114
+ "loss": 0.6404,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.04,
119
+ "grad_norm": 0.12833671271800995,
120
+ "learning_rate": 0.0001185,
121
+ "loss": 0.5717,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.0425,
126
+ "grad_norm": 0.11962099373340607,
127
+ "learning_rate": 0.00012599999999999997,
128
+ "loss": 0.6098,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.045,
133
+ "grad_norm": 0.13898271322250366,
134
+ "learning_rate": 0.0001335,
135
+ "loss": 0.6099,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.0475,
140
+ "grad_norm": 0.14486610889434814,
141
+ "learning_rate": 0.00014099999999999998,
142
+ "loss": 0.5744,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.05,
147
+ "grad_norm": 0.1432138830423355,
148
+ "learning_rate": 0.00014849999999999998,
149
+ "loss": 0.5659,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.0525,
154
+ "grad_norm": 0.13487878441810608,
155
+ "learning_rate": 0.000156,
156
+ "loss": 0.5622,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.055,
161
+ "grad_norm": 0.12495309859514236,
162
+ "learning_rate": 0.0001635,
163
+ "loss": 0.5951,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.0575,
168
+ "grad_norm": 0.13011734187602997,
169
+ "learning_rate": 0.00017099999999999998,
170
+ "loss": 0.6249,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.06,
175
+ "grad_norm": 0.13987745344638824,
176
+ "learning_rate": 0.00017849999999999997,
177
+ "loss": 0.559,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.0625,
182
+ "grad_norm": 0.13373605906963348,
183
+ "learning_rate": 0.000186,
184
+ "loss": 0.5475,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.065,
189
+ "grad_norm": 0.12433867901563644,
190
+ "learning_rate": 0.0001935,
191
+ "loss": 0.5274,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.0675,
196
+ "grad_norm": 0.11097615957260132,
197
+ "learning_rate": 0.000201,
198
+ "loss": 0.678,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.07,
203
+ "grad_norm": 0.1155027225613594,
204
+ "learning_rate": 0.00020849999999999997,
205
+ "loss": 0.5611,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.0725,
210
+ "grad_norm": 0.11431068181991577,
211
+ "learning_rate": 0.00021599999999999996,
212
+ "loss": 0.6054,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.075,
217
+ "grad_norm": 0.09796140342950821,
218
+ "learning_rate": 0.00022349999999999998,
219
+ "loss": 0.5472,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.0775,
224
+ "grad_norm": 0.09489257633686066,
225
+ "learning_rate": 0.00023099999999999998,
226
+ "loss": 0.4636,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.08,
231
+ "grad_norm": 0.10787788033485413,
232
+ "learning_rate": 0.0002385,
233
+ "loss": 0.6164,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.0825,
238
+ "grad_norm": 0.10261733084917068,
239
+ "learning_rate": 0.00024599999999999996,
240
+ "loss": 0.5408,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.085,
245
+ "grad_norm": 0.11870352178812027,
246
+ "learning_rate": 0.0002535,
247
+ "loss": 0.5268,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.0875,
252
+ "grad_norm": 0.11910569667816162,
253
+ "learning_rate": 0.000261,
254
+ "loss": 0.5461,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.09,
259
+ "grad_norm": 0.10083702206611633,
260
+ "learning_rate": 0.00026849999999999997,
261
+ "loss": 0.4794,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.0925,
266
+ "grad_norm": 0.10453511029481888,
267
+ "learning_rate": 0.000276,
268
+ "loss": 0.5539,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.095,
273
+ "grad_norm": 0.101403146982193,
274
+ "learning_rate": 0.00028349999999999995,
275
+ "loss": 0.5346,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.0975,
280
+ "grad_norm": 0.10724789649248123,
281
+ "learning_rate": 0.00029099999999999997,
282
+ "loss": 0.6026,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.1,
287
+ "grad_norm": 0.1140277311205864,
288
+ "learning_rate": 0.0002985,
289
+ "loss": 0.5193,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.1025,
294
+ "grad_norm": 0.09706108272075653,
295
+ "learning_rate": 0.0002999963446058092,
296
+ "loss": 0.54,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.105,
301
+ "grad_norm": 0.10003062337636948,
302
+ "learning_rate": 0.0002999814948722491,
303
+ "loss": 0.5365,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.1075,
308
+ "grad_norm": 0.1078687533736229,
309
+ "learning_rate": 0.00029995522346717746,
310
+ "loss": 0.5889,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.11,
315
+ "grad_norm": 0.10538115352392197,
316
+ "learning_rate": 0.0002999175323912636,
317
+ "loss": 0.5611,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.1125,
322
+ "grad_norm": 0.1020808294415474,
323
+ "learning_rate": 0.00029986842451482874,
324
+ "loss": 0.6103,
325
+ "step": 225
326
+ },
327
+ {
328
+ "epoch": 0.115,
329
+ "grad_norm": 0.09635835886001587,
330
+ "learning_rate": 0.0002998079035776279,
331
+ "loss": 0.5229,
332
+ "step": 230
333
+ },
334
+ {
335
+ "epoch": 0.1175,
336
+ "grad_norm": 0.10287190228700638,
337
+ "learning_rate": 0.0002997359741885648,
338
+ "loss": 0.5312,
339
+ "step": 235
340
+ },
341
+ {
342
+ "epoch": 0.12,
343
+ "grad_norm": 0.09160075336694717,
344
+ "learning_rate": 0.0002996526418253408,
345
+ "loss": 0.5673,
346
+ "step": 240
347
+ },
348
+ {
349
+ "epoch": 0.1225,
350
+ "grad_norm": 0.08691006153821945,
351
+ "learning_rate": 0.000299557912834038,
352
+ "loss": 0.5326,
353
+ "step": 245
354
+ },
355
+ {
356
+ "epoch": 0.125,
357
+ "grad_norm": 0.10096988826990128,
358
+ "learning_rate": 0.00029945179442863594,
359
+ "loss": 0.6004,
360
+ "step": 250
361
+ },
362
+ {
363
+ "epoch": 0.1275,
364
+ "grad_norm": 0.09594204276800156,
365
+ "learning_rate": 0.000299334294690462,
366
+ "loss": 0.5516,
367
+ "step": 255
368
+ },
369
+ {
370
+ "epoch": 0.13,
371
+ "grad_norm": 0.10281919687986374,
372
+ "learning_rate": 0.00029920542256757607,
373
+ "loss": 0.5515,
374
+ "step": 260
375
+ },
376
+ {
377
+ "epoch": 0.1325,
378
+ "grad_norm": 0.08547840267419815,
379
+ "learning_rate": 0.00029906518787408944,
380
+ "loss": 0.5243,
381
+ "step": 265
382
+ },
383
+ {
384
+ "epoch": 0.135,
385
+ "grad_norm": 0.10161560773849487,
386
+ "learning_rate": 0.0002989136012894168,
387
+ "loss": 0.5096,
388
+ "step": 270
389
+ },
390
+ {
391
+ "epoch": 0.1375,
392
+ "grad_norm": 0.09101904183626175,
393
+ "learning_rate": 0.0002987506743574635,
394
+ "loss": 0.553,
395
+ "step": 275
396
+ },
397
+ {
398
+ "epoch": 0.14,
399
+ "grad_norm": 0.09769442677497864,
400
+ "learning_rate": 0.0002985764194857463,
401
+ "loss": 0.4953,
402
+ "step": 280
403
+ },
404
+ {
405
+ "epoch": 0.1425,
406
+ "grad_norm": 0.10991579294204712,
407
+ "learning_rate": 0.00029839084994444826,
408
+ "loss": 0.5152,
409
+ "step": 285
410
+ },
411
+ {
412
+ "epoch": 0.145,
413
+ "grad_norm": 0.09450916200876236,
414
+ "learning_rate": 0.00029819397986540836,
415
+ "loss": 0.5397,
416
+ "step": 290
417
+ },
418
+ {
419
+ "epoch": 0.1475,
420
+ "grad_norm": 0.10876069217920303,
421
+ "learning_rate": 0.0002979858242410454,
422
+ "loss": 0.4858,
423
+ "step": 295
424
+ },
425
+ {
426
+ "epoch": 0.15,
427
+ "grad_norm": 0.097995825111866,
428
+ "learning_rate": 0.00029776639892321606,
429
+ "loss": 0.5566,
430
+ "step": 300
431
+ },
432
+ {
433
+ "epoch": 0.1525,
434
+ "grad_norm": 0.1145048514008522,
435
+ "learning_rate": 0.0002975357206220079,
436
+ "loss": 0.4531,
437
+ "step": 305
438
+ },
439
+ {
440
+ "epoch": 0.155,
441
+ "grad_norm": 0.10271880775690079,
442
+ "learning_rate": 0.00029729380690446654,
443
+ "loss": 0.5199,
444
+ "step": 310
445
+ },
446
+ {
447
+ "epoch": 0.1575,
448
+ "grad_norm": 0.11095371842384338,
449
+ "learning_rate": 0.0002970406761932583,
450
+ "loss": 0.5416,
451
+ "step": 315
452
+ },
453
+ {
454
+ "epoch": 0.16,
455
+ "grad_norm": 0.09949438273906708,
456
+ "learning_rate": 0.00029677634776526673,
457
+ "loss": 0.4841,
458
+ "step": 320
459
+ },
460
+ {
461
+ "epoch": 0.1625,
462
+ "grad_norm": 0.1163724958896637,
463
+ "learning_rate": 0.00029650084175012517,
464
+ "loss": 0.4913,
465
+ "step": 325
466
+ },
467
+ {
468
+ "epoch": 0.165,
469
+ "grad_norm": 0.10726840049028397,
470
+ "learning_rate": 0.00029621417912868323,
471
+ "loss": 0.5203,
472
+ "step": 330
473
+ },
474
+ {
475
+ "epoch": 0.1675,
476
+ "grad_norm": 0.09609931707382202,
477
+ "learning_rate": 0.00029591638173140947,
478
+ "loss": 0.5607,
479
+ "step": 335
480
+ },
481
+ {
482
+ "epoch": 0.17,
483
+ "grad_norm": 0.10824442654848099,
484
+ "learning_rate": 0.0002956074722367286,
485
+ "loss": 0.6004,
486
+ "step": 340
487
+ },
488
+ {
489
+ "epoch": 0.1725,
490
+ "grad_norm": 0.10465679317712784,
491
+ "learning_rate": 0.00029528747416929463,
492
+ "loss": 0.5216,
493
+ "step": 345
494
+ },
495
+ {
496
+ "epoch": 0.175,
497
+ "grad_norm": 0.10518354922533035,
498
+ "learning_rate": 0.0002949564118981994,
499
+ "loss": 0.499,
500
+ "step": 350
501
+ },
502
+ {
503
+ "epoch": 0.1775,
504
+ "grad_norm": 0.0955279991030693,
505
+ "learning_rate": 0.0002946143106351165,
506
+ "loss": 0.5607,
507
+ "step": 355
508
+ },
509
+ {
510
+ "epoch": 0.18,
511
+ "grad_norm": 0.11159654706716537,
512
+ "learning_rate": 0.0002942611964323817,
513
+ "loss": 0.5204,
514
+ "step": 360
515
+ },
516
+ {
517
+ "epoch": 0.1825,
518
+ "grad_norm": 0.09571187198162079,
519
+ "learning_rate": 0.0002938970961810086,
520
+ "loss": 0.6113,
521
+ "step": 365
522
+ },
523
+ {
524
+ "epoch": 0.185,
525
+ "grad_norm": 0.11854679882526398,
526
+ "learning_rate": 0.0002935220376086411,
527
+ "loss": 0.5639,
528
+ "step": 370
529
+ },
530
+ {
531
+ "epoch": 0.1875,
532
+ "grad_norm": 0.1050512045621872,
533
+ "learning_rate": 0.0002931360492774415,
534
+ "loss": 0.548,
535
+ "step": 375
536
+ },
537
+ {
538
+ "epoch": 0.19,
539
+ "grad_norm": 0.1053968220949173,
540
+ "learning_rate": 0.0002927391605819157,
541
+ "loss": 0.5507,
542
+ "step": 380
543
+ },
544
+ {
545
+ "epoch": 0.1925,
546
+ "grad_norm": 0.10567320138216019,
547
+ "learning_rate": 0.00029233140174667445,
548
+ "loss": 0.5312,
549
+ "step": 385
550
+ },
551
+ {
552
+ "epoch": 0.195,
553
+ "grad_norm": 0.11914283782243729,
554
+ "learning_rate": 0.0002919128038241318,
555
+ "loss": 0.5961,
556
+ "step": 390
557
+ },
558
+ {
559
+ "epoch": 0.1975,
560
+ "grad_norm": 0.09915795922279358,
561
+ "learning_rate": 0.0002914833986921401,
562
+ "loss": 0.5086,
563
+ "step": 395
564
+ },
565
+ {
566
+ "epoch": 0.2,
567
+ "grad_norm": 0.10796502232551575,
568
+ "learning_rate": 0.0002910432190515628,
569
+ "loss": 0.5585,
570
+ "step": 400
571
+ },
572
+ {
573
+ "epoch": 0.2025,
574
+ "grad_norm": 0.10748997330665588,
575
+ "learning_rate": 0.00029059229842378373,
576
+ "loss": 0.5466,
577
+ "step": 405
578
+ },
579
+ {
580
+ "epoch": 0.205,
581
+ "grad_norm": 0.10696308314800262,
582
+ "learning_rate": 0.0002901306711481544,
583
+ "loss": 0.5513,
584
+ "step": 410
585
+ },
586
+ {
587
+ "epoch": 0.2075,
588
+ "grad_norm": 0.10418657958507538,
589
+ "learning_rate": 0.0002896583723793792,
590
+ "loss": 0.5391,
591
+ "step": 415
592
+ },
593
+ {
594
+ "epoch": 0.21,
595
+ "grad_norm": 0.16421550512313843,
596
+ "learning_rate": 0.00028917543808483796,
597
+ "loss": 0.4699,
598
+ "step": 420
599
+ },
600
+ {
601
+ "epoch": 0.2125,
602
+ "grad_norm": 0.12929962575435638,
603
+ "learning_rate": 0.00028868190504184696,
604
+ "loss": 0.4984,
605
+ "step": 425
606
+ },
607
+ {
608
+ "epoch": 0.215,
609
+ "grad_norm": 0.10469454526901245,
610
+ "learning_rate": 0.00028817781083485816,
611
+ "loss": 0.5119,
612
+ "step": 430
613
+ },
614
+ {
615
+ "epoch": 0.2175,
616
+ "grad_norm": 0.0964970663189888,
617
+ "learning_rate": 0.00028766319385259713,
618
+ "loss": 0.5167,
619
+ "step": 435
620
+ },
621
+ {
622
+ "epoch": 0.22,
623
+ "grad_norm": 0.12395574152469635,
624
+ "learning_rate": 0.00028713809328513953,
625
+ "loss": 0.5692,
626
+ "step": 440
627
+ },
628
+ {
629
+ "epoch": 0.2225,
630
+ "grad_norm": 0.10189738124608994,
631
+ "learning_rate": 0.0002866025491209265,
632
+ "loss": 0.4628,
633
+ "step": 445
634
+ },
635
+ {
636
+ "epoch": 0.225,
637
+ "grad_norm": 0.10433454066514969,
638
+ "learning_rate": 0.0002860566021437197,
639
+ "loss": 0.4869,
640
+ "step": 450
641
+ },
642
+ {
643
+ "epoch": 0.2275,
644
+ "grad_norm": 0.13003456592559814,
645
+ "learning_rate": 0.0002855002939294951,
646
+ "loss": 0.5291,
647
+ "step": 455
648
+ },
649
+ {
650
+ "epoch": 0.23,
651
+ "grad_norm": 0.11692202836275101,
652
+ "learning_rate": 0.000284933666843277,
653
+ "loss": 0.5229,
654
+ "step": 460
655
+ },
656
+ {
657
+ "epoch": 0.2325,
658
+ "grad_norm": 0.10757846385240555,
659
+ "learning_rate": 0.0002843567640359119,
660
+ "loss": 0.435,
661
+ "step": 465
662
+ },
663
+ {
664
+ "epoch": 0.235,
665
+ "grad_norm": 0.10775501281023026,
666
+ "learning_rate": 0.00028376962944078206,
667
+ "loss": 0.4418,
668
+ "step": 470
669
+ },
670
+ {
671
+ "epoch": 0.2375,
672
+ "grad_norm": 0.11543692648410797,
673
+ "learning_rate": 0.00028317230777046015,
674
+ "loss": 0.4204,
675
+ "step": 475
676
+ },
677
+ {
678
+ "epoch": 0.24,
679
+ "grad_norm": 0.10946698486804962,
680
+ "learning_rate": 0.00028256484451330403,
681
+ "loss": 0.49,
682
+ "step": 480
683
+ },
684
+ {
685
+ "epoch": 0.2425,
686
+ "grad_norm": 0.11528221517801285,
687
+ "learning_rate": 0.00028194728592999247,
688
+ "loss": 0.4752,
689
+ "step": 485
690
+ },
691
+ {
692
+ "epoch": 0.245,
693
+ "grad_norm": 0.10474205762147903,
694
+ "learning_rate": 0.0002813196790500027,
695
+ "loss": 0.4847,
696
+ "step": 490
697
+ },
698
+ {
699
+ "epoch": 0.2475,
700
+ "grad_norm": 0.10768820345401764,
701
+ "learning_rate": 0.00028068207166802837,
702
+ "loss": 0.4664,
703
+ "step": 495
704
+ },
705
+ {
706
+ "epoch": 0.25,
707
+ "grad_norm": 0.12158560007810593,
708
+ "learning_rate": 0.00028003451234034037,
709
+ "loss": 0.4741,
710
+ "step": 500
711
+ }
712
+ ],
713
+ "logging_steps": 5,
714
+ "max_steps": 2000,
715
+ "num_input_tokens_seen": 0,
716
+ "num_train_epochs": 9223372036854775807,
717
+ "save_steps": 500,
718
+ "stateful_callbacks": {
719
+ "TrainerControl": {
720
+ "args": {
721
+ "should_epoch_stop": false,
722
+ "should_evaluate": false,
723
+ "should_log": false,
724
+ "should_save": true,
725
+ "should_training_stop": false
726
+ },
727
+ "attributes": {}
728
+ }
729
+ },
730
+ "total_flos": 6.57394539429888e+17,
731
+ "train_batch_size": 4,
732
+ "trial_name": null,
733
+ "trial_params": null
734
+ }
codellama-hugcoder/checkpoint-500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
3
+ size 5304
codellama-hugcoder/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce19e7480e96c4d26efe137d7fe1582e71cf088cb5b49be23c8ccd4b8298bb4b
3
+ size 5304
configs/deepspeed_config.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ offload_optimizer_device: none
6
+ offload_param_device: none
7
+ zero3_init_flag: true
8
+ zero3_save_16bit_model: true
9
+ zero_stage: 3
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ machine_rank: 0
13
+ main_training_function: main
14
+ mixed_precision: bf16
15
+ num_machines: 1
16
+ num_processes: 8
17
+ rdzv_backend: static
18
+ same_network: true
19
+ tpu_env: []
20
+ tpu_use_cluster: false
21
+ tpu_use_sudo: false
22
+ use_cpu: false
configs/fsdp_config.yaml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: FSDP
4
+ downcast_bf16: 'no'
5
+ fsdp_config:
6
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
7
+ fsdp_backward_prefetch_policy: BACKWARD_PRE
8
+ fsdp_cpu_ram_efficient_loading: true
9
+ fsdp_forward_prefetch: false
10
+ fsdp_offload_params: false
11
+ fsdp_sharding_strategy: 1
12
+ fsdp_state_dict_type: SHARDED_STATE_DICT
13
+ fsdp_sync_module_states: true
14
+ fsdp_use_orig_params: true
15
+ machine_rank: 0
16
+ main_training_function: main
17
+ mixed_precision: bf16
18
+ num_machines: 1
19
+ num_processes: 8
20
+ rdzv_backend: static
21
+ same_network: true
22
+ tpu_env: []
23
+ tpu_use_cluster: false
24
+ tpu_use_sudo: false
25
+ use_cpu: false
fim.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Sourab Mangrulkar. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import functools
17
+ import numpy as np
18
+
19
+
20
# Resolving the FIM token ids may require tokenizer encode calls / vocab
# lookups, so the result is cached per tokenizer object.
@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    """Return the special-token ids needed for fill-in-the-middle (FIM).

    The lookup strategy is chosen from ``tokenizer.name_or_path``:

    * ``"codellama"``: ids are read from the tokenizer's ``bos_token_id``,
      ``suffix_id``, ``prefix_id`` and ``middle_id`` attributes; the pad id
      is the literal ``0``.
    * ``"deepseek-coder"`` / ``"stable-code"``: ids are the first id returned
      by ``tokenizer.encode`` for each model-specific FIM marker string.
    * anything else: entries 1..4 of
      ``tokenizer.special_tokens_map["additional_special_tokens"]`` are taken
      as (prefix, middle, suffix, pad) and resolved through
      ``tokenizer.vocab``. If they cannot be resolved, all four ids are
      ``None``. The BOS id is always ``None`` in this branch.

    Args:
        tokenizer: Tokenizer object. It must be hashable because the result
            is memoized with ``functools.lru_cache``.

    Returns:
        A 5-tuple ``(bos_token_id, suffix_tok_id, prefix_tok_id,
        middle_tok_id, pad_tok_id)``.
    """
    name = tokenizer.name_or_path

    if "codellama" in name:
        return (
            tokenizer.bos_token_id,
            tokenizer.suffix_id,
            tokenizer.prefix_id,
            tokenizer.middle_id,
            0,
        )

    if "deepseek-coder" in name:
        return (
            tokenizer.bos_token_id,
            tokenizer.encode("<|fim▁hole|>", add_special_tokens=False)[0],
            tokenizer.encode("<|fim▁begin|>", add_special_tokens=False)[0],
            tokenizer.encode("<|fim▁end|>", add_special_tokens=False)[0],
            tokenizer.encode("<pad>", add_special_tokens=False)[0],
        )

    if "stable-code" in name:
        return (
            tokenizer.bos_token_id,
            tokenizer.encode("<fim_suffix>")[0],
            tokenizer.encode("<fim_prefix>")[0],
            tokenizer.encode("<fim_middle>")[0],
            tokenizer.encode("<fim_pad>")[0],
        )

    # Generic fallback: no BOS handling, FIM tokens discovered from the
    # tokenizer's additional special tokens.
    bos_token_id = None
    try:
        fim_prefix, fim_middle, fim_suffix, fim_pad = tokenizer.special_tokens_map[
            "additional_special_tokens"
        ][1:5]
        vocab = tokenizer.vocab  # read once instead of once per token
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
            vocab[tok] for tok in (fim_suffix, fim_prefix, fim_middle, fim_pad)
        )
    except (KeyError, ValueError):
        # KeyError: no "additional_special_tokens" entry, or a token missing
        # from the vocab. ValueError: fewer than four tokens in the [1:5]
        # slice, so the 4-way unpack fails. Either way the tokenizer has no
        # usable FIM tokens.
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
            None,
            None,
            None,
            None,
        )
    return bos_token_id, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id
65
+
66
+
67
+ def _bos_token_processing(prefix_token_list, bos_token):
68
+ if bos_token is not None:
69
+ # add the BOS token to the beginning of the list
70
+ prefix_token_list.insert(0, bos_token)
71
+
72
+ return prefix_token_list
73
+
74
+
75
+ ## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
76
def permute(
    sample,
    np_rng,
    suffix_tok_id,
    prefix_tok_id,
    middle_tok_id,
    pad_tok_id,
    fim_rate=0.5,
    fim_spm_rate=0.5,
    truncate_or_pad=False,
    bos_token_id=None,
):
    """Apply a fill-in-the-middle (FIM) transformation to a token sequence.

    With probability ``fim_rate`` the sample is cut at two random boundaries
    into prefix / middle / suffix and re-assembled in one of two layouts:

    * SPM (probability ``fim_spm_rate``):
      ``[bos] <prefix_tok> <suffix_tok> suffix <middle_tok> prefix middle``
    * PSM (otherwise):
      ``[bos] <prefix_tok> prefix <suffix_tok> suffix <middle_tok> middle``

    Otherwise the sample is returned unchanged (as a list).

    Args:
        sample: Sequence of token ids.
        np_rng: Random generator providing ``binomial`` and ``randint``
            (the calls match the ``numpy.random.RandomState`` API).
        suffix_tok_id: Id of the FIM suffix marker token.
        prefix_tok_id: Id of the FIM prefix marker token.
        middle_tok_id: Id of the FIM middle marker token.
        pad_tok_id: Id used to pad the suffix when the result would be
            shorter than the input.
        fim_rate: Probability of applying FIM at all.
        fim_spm_rate: Probability of choosing SPM over PSM once FIM applies.
        truncate_or_pad: If true, trim (or pad) the suffix to compensate for
            the three inserted marker tokens.
        bos_token_id: Optional BOS id placed in front of the marker tokens.

    Returns:
        ``(tokens, np_rng)`` where ``tokens`` is always a ``list``.

    NOTE(review): the ``+ 3`` length budget below does not count the optional
    BOS token, so with ``truncate_or_pad=True`` and ``bos_token_id`` set the
    result is one token longer than ``sample`` -- confirm this is intended.
    """
    # The RNG draw order (fim_rate coin, boundaries, spm coin) is kept as in
    # the original so seeded runs remain reproducible.
    if not np_rng.binomial(1, fim_rate):
        # don't do FIM preproc
        return list(sample), np_rng

    boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
    boundaries.sort()

    prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
    middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
    suffix = np.array(sample[boundaries[1] :], dtype=np.int64)

    if truncate_or_pad:
        # Three marker tokens (prefix / suffix / middle) are about to be added.
        new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
        diff = new_length - len(sample)
        if diff > 0:
            if suffix.shape[0] <= diff:
                # Suffix too short to absorb the markers: give up on FIM for
                # this sample. Return a list, like every other exit path.
                return list(sample), np_rng
            suffix = suffix[: suffix.shape[0] - diff]
        elif diff < 0:
            # The three slices above always add up to len(sample), so this
            # branch is not expected to trigger; kept as a safeguard.
            suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])

    if np_rng.binomial(1, fim_spm_rate):
        prefix_special_tokens = _bos_token_processing(
            [prefix_tok_id, suffix_tok_id], bos_token_id
        )
        # SPM (variant 2 from FIM paper)
        new_sample = np.concatenate(
            [
                prefix_special_tokens,
                suffix,
                [middle_tok_id],
                prefix,
                middle,
            ]
        )
    else:
        prefix_special_tokens = _bos_token_processing([prefix_tok_id], bos_token_id)
        # PSM
        new_sample = np.concatenate(
            [
                prefix_special_tokens,
                prefix,
                [suffix_tok_id],
                suffix,
                [middle_tok_id],
                middle,
            ]
        )

    return list(new_sample), np_rng
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers
2
+ git+https://github.com/huggingface/accelerate
3
+ git+https://github.com/huggingface/peft
4
+ trl
5
+ huggingface-hub
6
+ bitsandbytes
7
+ evaluate
8
+ datasets
9
+ einops
10
+ wandb
11
+ tiktoken
12
+ deepspeed
13
+ tqdm
14
+ safetensors
run_deepspeed.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Launch train.py through `accelerate` using configs/deepspeed_config.yaml.
# Runs 2000 optimizer steps with a checkpoint every 500 steps and an
# evaluation every 100 steps; each checkpoint is pushed to a private Hub repo.
MODEL_NAME="codellama/CodeLlama-7b-Instruct-hf"
DATASET_NAME="smangrul/hug_stack"
OUTPUT_DIR="codellama-hugcoder-df"

accelerate launch --config_file "configs/deepspeed_config.yaml" train.py \
    --model_name_or_path "${MODEL_NAME}" \
    --dataset_name "${DATASET_NAME}" \
    --splits "train" \
    --max_seq_len 2048 \
    --max_steps 2000 \
    --save_steps 500 \
    --eval_steps 100 \
    --logging_steps 5 \
    --log_level "info" \
    --logging_strategy "steps" \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --push_to_hub \
    --hub_private_repo True \
    --hub_strategy "every_save" \
    --bf16 True \
    --learning_rate 2e-5 \
    --lr_scheduler_type "cosine" \
    --weight_decay 0.1 \
    --warmup_ratio 0.1 \
    --max_grad_norm 1.0 \
    --output_dir "${OUTPUT_DIR}" \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --gradient_accumulation_steps 4 \
    --gradient_checkpointing True \
    --use_reentrant False \
    --dataset_text_field "text" \
    --test_size 0.1 \
    --fim_rate 0.5 \
    --fim_spm_rate 0.5 \
    --use_flash_attn True
run_fsdp.sh ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Launch train.py through `accelerate` using configs/fsdp_config.yaml.
# Runs 1000 optimizer steps with a checkpoint every 500 steps and an
# evaluation every 100 steps; each checkpoint is pushed to a private Hub repo.
#
# The model is selected with --model_name_or_path, the flag the other launch
# scripts pass to this same train.py (this script previously passed
# --model_path, which none of them use).
accelerate launch --config_file "configs/fsdp_config.yaml" train.py \
    --model_name_or_path "codellama/CodeLlama-7b-Instruct-hf" \
    --dataset_name "smangrul/hug_stack" \
    --splits "train" \
    --max_seq_len 2048 \
    --max_steps 1000 \
    --save_steps 500 \
    --eval_steps 100 \
    --logging_steps 25 \
    --log_level "info" \
    --logging_strategy "steps" \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --push_to_hub \
    --hub_private_repo True \
    --hub_strategy "every_save" \
    --bf16 True \
    --learning_rate 1e-4 \
    --lr_scheduler_type "cosine" \
    --weight_decay 0.1 \
    --warmup_ratio 0.1 \
    --max_grad_norm 1.0 \
    --output_dir "codellama-hugcoder-fsdp" \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --gradient_accumulation_steps 4 \
    --gradient_checkpointing True \
    --use_reentrant True \
    --dataset_text_field "text" \
    --test_size 0.1 \
    --fim_rate 0.5 \
    --fim_spm_rate 0.5 \
    --use_flash_attn True
run_peft.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single-GPU (CUDA device 0) run of train.py with LoRA adapters on a 4-bit
# quantized base model; Weights & Biases project: personal-code-copilot.
# Runs 2000 optimizer steps with a checkpoint every 500 steps; each checkpoint
# is pushed to a private Hub repo.
# NOTE(review): --eval_steps is set but no --evaluation_strategy is passed,
# unlike the other launch scripts -- confirm that skipping evaluation is
# intended.
MODEL_NAME="codellama/CodeLlama-7b-Instruct-hf"
DATASET_NAME="smangrul/hug_stack"
OUTPUT_DIR="codellama-hugcoder"

CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python3 train.py \
    --model_name_or_path "${MODEL_NAME}" \
    --dataset_name "${DATASET_NAME}" \
    --splits "train" \
    --max_seq_len 2048 \
    --max_steps 2000 \
    --save_steps 500 \
    --eval_steps 100 \
    --logging_steps 5 \
    --log_level "info" \
    --logging_strategy "steps" \
    --save_strategy "steps" \
    --push_to_hub \
    --hub_private_repo True \
    --hub_strategy "every_save" \
    --bf16 True \
    --learning_rate 3e-4 \
    --lr_scheduler_type "cosine" \
    --weight_decay 0.1 \
    --warmup_ratio 0.1 \
    --max_grad_norm 1.0 \
    --output_dir "${OUTPUT_DIR}" \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --gradient_checkpointing True \
    --use_reentrant True \
    --dataset_text_field "text" \
    --test_size 0.1 \
    --fim_rate 0.5 \
    --fim_spm_rate 0.5 \
    --use_peft_lora True \
    --lora_r 32 \
    --lora_alpha 64 \
    --lora_dropout 0.1 \
    --lora_target_modules "all-linear" \
    --use_4bit_quantization True \
    --use_nested_quant True \
    --bnb_4bit_compute_dtype "bfloat16" \
    --use_flash_attn True
run_unsloth_peft.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Single-GPU (CUDA device 0) run of train.py with LoRA adapters on a 4-bit
# quantized base model and --use_unsloth enabled; Weights & Biases project:
# personal-code-copilot.
# Runs 2000 optimizer steps with a checkpoint every 500 steps and an
# evaluation every 100 steps; each checkpoint is pushed to a private Hub repo.
# --fim_spm_rate is 0.0 here, so FIM samples never take the SPM layout.
MODEL_NAME="codellama/CodeLlama-7b-Instruct-hf"
DATASET_NAME="smangrul/hug_stack"
OUTPUT_DIR="codellama-hugcoder"

CUDA_VISIBLE_DEVICES=0 WANDB_PROJECT=personal-code-copilot python train.py \
    --seed 11 \
    --model_name_or_path "${MODEL_NAME}" \
    --dataset_name "${DATASET_NAME}" \
    --splits "train" \
    --max_seq_len 2048 \
    --max_steps 2000 \
    --save_steps 500 \
    --eval_steps 100 \
    --logging_steps 5 \
    --log_level "info" \
    --logging_strategy "steps" \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --push_to_hub \
    --hub_private_repo True \
    --hub_strategy "every_save" \
    --bf16 True \
    --learning_rate 2e-4 \
    --lr_scheduler_type "cosine" \
    --weight_decay 0.1 \
    --warmup_ratio 0.1 \
    --max_grad_norm 1.0 \
    --output_dir "${OUTPUT_DIR}" \
    --per_device_train_batch_size 16 \
    --per_device_eval_batch_size 16 \
    --gradient_accumulation_steps 4 \
    --gradient_checkpointing True \
    --use_reentrant True \
    --dataset_text_field "text" \
    --test_size 0.1 \
    --fim_rate 0.5 \
    --fim_spm_rate 0.0 \
    --use_peft_lora True \
    --lora_r 16 \
    --lora_alpha 16 \
    --lora_dropout 0.1 \
    --lora_target_modules "q_proj,k_proj,v_proj,o_proj,down_proj,up_proj,gate_proj" \
    --use_4bit_quantization True \
    --use_nested_quant True \
    --bnb_4bit_compute_dtype "bfloat16" \
    --use_flash_attn True \
    --use_unsloth True