Campis committed on
Commit
3e4ce94
·
verified ·
1 Parent(s): 319ac5f

Upload 18 files

Browse files
README.md CHANGED
@@ -7,19 +7,19 @@ tags:
7
  - lora
8
  - generated_from_trainer
9
  model-index:
10
- - name: train_1B-Instruct_pippo_v6
11
  results: []
12
  ---
13
 
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
  should probably proofread and complete it, then remove this comment. -->
16
 
17
- # train_1B-Instruct_pippo_v6
18
 
19
  This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the pipo_persona dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 3.6431
22
- - Num Input Tokens Seen: 300000
23
 
24
  ## Model description
25
 
@@ -38,12 +38,12 @@ More information needed
38
  ### Training hyperparameters
39
 
40
  The following hyperparameters were used during training:
41
- - learning_rate: 2e-05
42
- - train_batch_size: 2
43
- - eval_batch_size: 2
44
  - seed: 42
45
- - gradient_accumulation_steps: 8
46
- - total_train_batch_size: 16
47
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
48
  - lr_scheduler_type: cosine
49
  - num_epochs: 4.0
 
7
  - lora
8
  - generated_from_trainer
9
  model-index:
10
+ - name: train_1B-Instruct_pippo_v10
11
  results: []
12
  ---
13
 
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
  should probably proofread and complete it, then remove this comment. -->
16
 
17
+ # train_1B-Instruct_pippo_v10
18
 
19
  This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) on the pipo_persona dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.0584
22
+ - Num Input Tokens Seen: 1576472
23
 
24
  ## Model description
25
 
 
38
  ### Training hyperparameters
39
 
40
  The following hyperparameters were used during training:
41
+ - learning_rate: 0.0003
42
+ - train_batch_size: 8
43
+ - eval_batch_size: 8
44
  - seed: 42
45
+ - gradient_accumulation_steps: 4
46
+ - total_train_batch_size: 32
47
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
48
  - lr_scheduler_type: cosine
49
  - num_epochs: 4.0
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 4.0,
3
+ "eval_loss": 0.05841538682579994,
4
+ "eval_runtime": 9.5748,
5
+ "eval_samples_per_second": 32.586,
6
+ "eval_steps_per_second": 4.073,
7
+ "num_input_tokens_seen": 1576472,
8
+ "total_flos": 9418069125660672.0,
9
+ "train_loss": 0.12584685484090677,
10
+ "train_runtime": 391.5379,
11
+ "train_samples_per_second": 12.719,
12
+ "train_steps_per_second": 0.398
13
+ }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 4.0,
3
- "eval_loss": 3.643085241317749,
4
- "eval_runtime": 12.9097,
5
- "eval_samples_per_second": 28.893,
6
- "eval_steps_per_second": 14.485,
7
- "num_input_tokens_seen": 300000
8
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "eval_loss": 0.05841538682579994,
4
+ "eval_runtime": 9.5748,
5
+ "eval_samples_per_second": 32.586,
6
+ "eval_steps_per_second": 4.073,
7
+ "num_input_tokens_seen": 1576472
8
  }
llamaboard_config.yaml CHANGED
@@ -1,6 +1,5 @@
1
  top.booster: auto
2
- top.checkpoint_path:
3
- - train_1B-Instruct_pippo_v4
4
  top.finetuning_type: lora
5
  top.model_name: Llama-3.2-1B-Instruct
6
  top.quantization_bit: none
@@ -16,7 +15,7 @@ train.badam_mode: layer
16
  train.badam_switch_interval: 50
17
  train.badam_switch_mode: ascending
18
  train.badam_update_ratio: 0.05
19
- train.batch_size: 2
20
  train.compute_type: bf16
21
  train.create_new_adapter: false
22
  train.cutoff_len: 1024
@@ -37,20 +36,20 @@ train.galore_rank: 16
37
  train.galore_scale: 2
38
  train.galore_target: all
39
  train.galore_update_interval: 200
40
- train.gradient_accumulation_steps: 8
41
  train.image_max_pixels: 768*768
42
  train.image_min_pixels: 32*32
43
- train.learning_rate: 2e-5
44
  train.logging_steps: 5
45
- train.lora_alpha: 32
46
- train.lora_dropout: 0.1
47
- train.lora_rank: 16
48
- train.lora_target: q_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
49
  train.loraplus_lr_ratio: 8
50
  train.lr_scheduler_type: cosine
51
  train.mask_history: false
52
  train.max_grad_norm: '1.0'
53
- train.max_samples: '1900'
54
  train.neat_packing: false
55
  train.neftune_alpha: 0
56
  train.num_train_epochs: '4'
@@ -74,11 +73,11 @@ train.train_on_prompt: false
74
  train.training_stage: Supervised Fine-Tuning
75
  train.use_apollo: false
76
  train.use_badam: false
77
- train.use_dora: true
78
  train.use_galore: false
79
  train.use_llama_pro: false
80
- train.use_pissa: true
81
- train.use_rslora: true
82
  train.use_swanlab: false
83
  train.val_size: 0.2
84
  train.video_max_pixels: 256*256
 
1
  top.booster: auto
2
+ top.checkpoint_path: []
 
3
  top.finetuning_type: lora
4
  top.model_name: Llama-3.2-1B-Instruct
5
  top.quantization_bit: none
 
15
  train.badam_switch_interval: 50
16
  train.badam_switch_mode: ascending
17
  train.badam_update_ratio: 0.05
18
+ train.batch_size: 8
19
  train.compute_type: bf16
20
  train.create_new_adapter: false
21
  train.cutoff_len: 1024
 
36
  train.galore_scale: 2
37
  train.galore_target: all
38
  train.galore_update_interval: 200
39
+ train.gradient_accumulation_steps: 4
40
  train.image_max_pixels: 768*768
41
  train.image_min_pixels: 32*32
42
+ train.learning_rate: 3e-4
43
  train.logging_steps: 5
44
+ train.lora_alpha: 64
45
+ train.lora_dropout: 0
46
+ train.lora_rank: 32
47
+ train.lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
48
  train.loraplus_lr_ratio: 8
49
  train.lr_scheduler_type: cosine
50
  train.mask_history: false
51
  train.max_grad_norm: '1.0'
52
+ train.max_samples: '30000'
53
  train.neat_packing: false
54
  train.neftune_alpha: 0
55
  train.num_train_epochs: '4'
 
73
  train.training_stage: Supervised Fine-Tuning
74
  train.use_apollo: false
75
  train.use_badam: false
76
+ train.use_dora: false
77
  train.use_galore: false
78
  train.use_llama_pro: false
79
+ train.use_pissa: false
80
+ train.use_rslora: false
81
  train.use_swanlab: false
82
  train.val_size: 0.2
83
  train.video_max_pixels: 256*256
running_log.txt CHANGED
@@ -1,12 +1,12 @@
1
- [INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
2
- [INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at None
3
- [INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at None
4
- [INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
5
- [INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
6
- [INFO|2025-07-24 16:02:15] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at None
7
- [INFO|2025-07-24 16:02:16] tokenization_utils_base.py:2299 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
8
- [INFO|2025-07-24 16:02:17] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
9
- [INFO|2025-07-24 16:02:17] configuration_utils.py:770 >> Model config LlamaConfig {
10
  "architectures": [
11
  "LlamaForCausalLM"
12
  ],
@@ -46,18 +46,18 @@
46
  "vocab_size": 128256
47
  }
48
 
49
- [INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
50
- [INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at None
51
- [INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at None
52
- [INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
53
- [INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
54
- [INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at None
55
- [INFO|2025-07-24 16:02:17] tokenization_utils_base.py:2299 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
56
- [INFO|2025-07-24 16:02:17] logging.py:143 >> Add pad token: <|eot_id|>
57
- [INFO|2025-07-24 16:02:17] logging.py:143 >> Add <|eom_id|> to stop words.
58
- [INFO|2025-07-24 16:02:17] logging.py:143 >> Loading dataset pippo_dataset_v02_1900_lines.json...
59
- [INFO|2025-07-24 16:02:24] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
60
- [INFO|2025-07-24 16:02:24] configuration_utils.py:770 >> Model config LlamaConfig {
61
  "architectures": [
62
  "LlamaForCausalLM"
63
  ],
@@ -97,10 +97,10 @@
97
  "vocab_size": 128256
98
  }
99
 
100
- [INFO|2025-07-24 16:02:24] logging.py:143 >> KV cache is disabled during training.
101
- [INFO|2025-07-24 16:03:23] modeling_utils.py:1151 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/model.safetensors
102
- [INFO|2025-07-24 16:03:23] modeling_utils.py:2241 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
103
- [INFO|2025-07-24 16:03:23] configuration_utils.py:1135 >> Generate config GenerationConfig {
104
  "bos_token_id": 128000,
105
  "eos_token_id": [
106
  128001,
@@ -110,12 +110,12 @@
110
  "use_cache": false
111
  }
112
 
113
- [INFO|2025-07-24 16:03:25] modeling_utils.py:5131 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
114
 
115
- [INFO|2025-07-24 16:03:25] modeling_utils.py:5139 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct.
116
  If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
117
- [INFO|2025-07-24 16:03:25] configuration_utils.py:1090 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/generation_config.json
118
- [INFO|2025-07-24 16:03:25] configuration_utils.py:1135 >> Generate config GenerationConfig {
119
  "bos_token_id": 128000,
120
  "do_sample": true,
121
  "eos_token_id": [
@@ -127,229 +127,48 @@ If your task is similar to the task the model of the checkpoint was trained on,
127
  "top_p": 0.9
128
  }
129
 
130
- [INFO|2025-07-24 16:03:25] logging.py:143 >> Gradient checkpointing enabled.
131
- [INFO|2025-07-24 16:03:25] logging.py:143 >> Using torch SDPA for faster training and inference.
132
- [INFO|2025-07-24 16:03:25] logging.py:143 >> Upcasting trainable params to float32.
133
- [INFO|2025-07-24 16:03:25] logging.py:143 >> Fine-tuning method: DoRA
134
- [INFO|2025-07-24 16:03:27] logging.py:143 >> Loaded adapter(s): saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v4
135
- [INFO|2025-07-24 16:03:27] logging.py:143 >> trainable params: 15,925,248 || all params: 1,251,739,648 || trainable%: 1.2722
136
- [INFO|2025-07-24 16:03:27] trainer.py:756 >> Using auto half precision backend
137
- [INFO|2025-07-24 16:03:27] logging.py:143 >> Using LoRA+ optimizer with loraplus lr ratio 8.00.
138
- [INFO|2025-07-24 16:03:28] trainer.py:2409 >> ***** Running training *****
139
- [INFO|2025-07-24 16:03:28] trainer.py:2410 >> Num examples = 1,490
140
- [INFO|2025-07-24 16:03:28] trainer.py:2411 >> Num Epochs = 4
141
- [INFO|2025-07-24 16:03:28] trainer.py:2412 >> Instantaneous batch size per device = 2
142
- [INFO|2025-07-24 16:03:28] trainer.py:2415 >> Total train batch size (w. parallel, distributed & accumulation) = 16
143
- [INFO|2025-07-24 16:03:28] trainer.py:2416 >> Gradient Accumulation steps = 8
144
- [INFO|2025-07-24 16:03:28] trainer.py:2417 >> Total optimization steps = 376
145
- [INFO|2025-07-24 16:03:28] trainer.py:2418 >> Number of trainable parameters = 15,925,248
146
- [INFO|2025-07-24 16:03:28] logging.py:143 >> Initial PiSSA adapter will be saved at: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/pissa_init.
147
- [INFO|2025-07-24 16:03:28] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
148
- [INFO|2025-07-24 16:03:28] configuration_utils.py:770 >> Model config LlamaConfig {
149
- "architectures": [
150
- "LlamaForCausalLM"
151
- ],
152
- "attention_bias": false,
153
- "attention_dropout": 0.0,
154
- "bos_token_id": 128000,
155
- "eos_token_id": [
156
- 128001,
157
- 128008,
158
- 128009
159
- ],
160
- "head_dim": 64,
161
- "hidden_act": "silu",
162
- "hidden_size": 2048,
163
- "initializer_range": 0.02,
164
- "intermediate_size": 8192,
165
- "max_position_embeddings": 131072,
166
- "mlp_bias": false,
167
- "model_type": "llama",
168
- "num_attention_heads": 32,
169
- "num_hidden_layers": 16,
170
- "num_key_value_heads": 8,
171
- "pretraining_tp": 1,
172
- "rms_norm_eps": 1e-05,
173
- "rope_scaling": {
174
- "factor": 32.0,
175
- "high_freq_factor": 4.0,
176
- "low_freq_factor": 1.0,
177
- "original_max_position_embeddings": 8192,
178
- "rope_type": "llama3"
179
- },
180
- "rope_theta": 500000.0,
181
- "tie_word_embeddings": true,
182
- "torch_dtype": "bfloat16",
183
- "transformers_version": "4.52.4",
184
- "use_cache": true,
185
- "vocab_size": 128256
186
- }
187
-
188
- [INFO|2025-07-24 16:03:43] logging.py:143 >> {'loss': 1.1748, 'learning_rate': 1.9994e-05, 'epoch': 0.05, 'throughput': 263.90}
189
- [INFO|2025-07-24 16:03:54] logging.py:143 >> {'loss': 1.2634, 'learning_rate': 1.9972e-05, 'epoch': 0.11, 'throughput': 304.64}
190
- [INFO|2025-07-24 16:04:06] logging.py:143 >> {'loss': 1.4367, 'learning_rate': 1.9932e-05, 'epoch': 0.16, 'throughput': 315.40}
191
- [INFO|2025-07-24 16:04:18] logging.py:143 >> {'loss': 1.3167, 'learning_rate': 1.9874e-05, 'epoch': 0.21, 'throughput': 323.64}
192
- [INFO|2025-07-24 16:04:30] logging.py:143 >> {'loss': 1.4144, 'learning_rate': 1.9800e-05, 'epoch': 0.27, 'throughput': 325.70}
193
- [INFO|2025-07-24 16:04:41] logging.py:143 >> {'loss': 1.3944, 'learning_rate': 1.9708e-05, 'epoch': 0.32, 'throughput': 331.60}
194
- [INFO|2025-07-24 16:04:52] logging.py:143 >> {'loss': 1.3946, 'learning_rate': 1.9599e-05, 'epoch': 0.38, 'throughput': 334.75}
195
- [INFO|2025-07-24 16:05:04] logging.py:143 >> {'loss': 1.3827, 'learning_rate': 1.9474e-05, 'epoch': 0.43, 'throughput': 336.48}
196
- [INFO|2025-07-24 16:05:16] logging.py:143 >> {'loss': 1.4509, 'learning_rate': 1.9332e-05, 'epoch': 0.48, 'throughput': 336.58}
197
- [INFO|2025-07-24 16:05:29] logging.py:143 >> {'loss': 1.3347, 'learning_rate': 1.9174e-05, 'epoch': 0.54, 'throughput': 335.59}
198
- [INFO|2025-07-24 16:05:41] logging.py:143 >> {'loss': 1.3834, 'learning_rate': 1.8999e-05, 'epoch': 0.59, 'throughput': 335.37}
199
- [INFO|2025-07-24 16:05:53] logging.py:143 >> {'loss': 1.4679, 'learning_rate': 1.8809e-05, 'epoch': 0.64, 'throughput': 335.60}
200
- [INFO|2025-07-24 16:06:04] logging.py:143 >> {'loss': 1.3973, 'learning_rate': 1.8604e-05, 'epoch': 0.70, 'throughput': 336.68}
201
- [INFO|2025-07-24 16:06:17] logging.py:143 >> {'loss': 1.5408, 'learning_rate': 1.8384e-05, 'epoch': 0.75, 'throughput': 335.03}
202
- [INFO|2025-07-24 16:06:28] logging.py:143 >> {'loss': 1.4967, 'learning_rate': 1.8149e-05, 'epoch': 0.81, 'throughput': 335.55}
203
- [INFO|2025-07-24 16:06:40] logging.py:143 >> {'loss': 1.4644, 'learning_rate': 1.7900e-05, 'epoch': 0.86, 'throughput': 336.30}
204
- [INFO|2025-07-24 16:06:52] logging.py:143 >> {'loss': 1.4226, 'learning_rate': 1.7637e-05, 'epoch': 0.91, 'throughput': 336.02}
205
- [INFO|2025-07-24 16:07:03] logging.py:143 >> {'loss': 1.4847, 'learning_rate': 1.7360e-05, 'epoch': 0.97, 'throughput': 336.26}
206
- [INFO|2025-07-24 16:07:13] logging.py:143 >> {'loss': 1.2927, 'learning_rate': 1.7071e-05, 'epoch': 1.01, 'throughput': 336.91}
207
- [INFO|2025-07-24 16:07:23] logging.py:143 >> {'loss': 0.8505, 'learning_rate': 1.6770e-05, 'epoch': 1.06, 'throughput': 338.48}
208
- [INFO|2025-07-24 16:07:23] trainer.py:4327 >>
209
- ***** Running Evaluation *****
210
- [INFO|2025-07-24 16:07:23] trainer.py:4329 >> Num examples = 373
211
- [INFO|2025-07-24 16:07:23] trainer.py:4332 >> Batch size = 2
212
- [INFO|2025-07-24 16:07:37] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-100
213
- [INFO|2025-07-24 16:07:37] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
214
- [INFO|2025-07-24 16:07:37] configuration_utils.py:770 >> Model config LlamaConfig {
215
- "architectures": [
216
- "LlamaForCausalLM"
217
- ],
218
- "attention_bias": false,
219
- "attention_dropout": 0.0,
220
- "bos_token_id": 128000,
221
- "eos_token_id": [
222
- 128001,
223
- 128008,
224
- 128009
225
- ],
226
- "head_dim": 64,
227
- "hidden_act": "silu",
228
- "hidden_size": 2048,
229
- "initializer_range": 0.02,
230
- "intermediate_size": 8192,
231
- "max_position_embeddings": 131072,
232
- "mlp_bias": false,
233
- "model_type": "llama",
234
- "num_attention_heads": 32,
235
- "num_hidden_layers": 16,
236
- "num_key_value_heads": 8,
237
- "pretraining_tp": 1,
238
- "rms_norm_eps": 1e-05,
239
- "rope_scaling": {
240
- "factor": 32.0,
241
- "high_freq_factor": 4.0,
242
- "low_freq_factor": 1.0,
243
- "original_max_position_embeddings": 8192,
244
- "rope_type": "llama3"
245
- },
246
- "rope_theta": 500000.0,
247
- "tie_word_embeddings": true,
248
- "torch_dtype": "bfloat16",
249
- "transformers_version": "4.52.4",
250
- "use_cache": true,
251
- "vocab_size": 128256
252
- }
253
-
254
- [INFO|2025-07-24 16:07:38] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-100/chat_template.jinja
255
- [INFO|2025-07-24 16:07:38] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-100/tokenizer_config.json
256
- [INFO|2025-07-24 16:07:38] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-100/special_tokens_map.json
257
- [INFO|2025-07-24 16:07:51] logging.py:143 >> {'loss': 0.7497, 'learning_rate': 1.6456e-05, 'epoch': 1.12, 'throughput': 318.10}
258
- [INFO|2025-07-24 16:08:02] logging.py:143 >> {'loss': 0.7761, 'learning_rate': 1.6132e-05, 'epoch': 1.17, 'throughput': 319.79}
259
- [INFO|2025-07-24 16:08:14] logging.py:143 >> {'loss': 0.7997, 'learning_rate': 1.5796e-05, 'epoch': 1.23, 'throughput': 321.09}
260
- [INFO|2025-07-24 16:08:25] logging.py:143 >> {'loss': 0.8275, 'learning_rate': 1.5451e-05, 'epoch': 1.28, 'throughput': 322.08}
261
- [INFO|2025-07-24 16:08:37] logging.py:143 >> {'loss': 0.8599, 'learning_rate': 1.5096e-05, 'epoch': 1.33, 'throughput': 322.83}
262
- [INFO|2025-07-24 16:08:49] logging.py:143 >> {'loss': 0.9179, 'learning_rate': 1.4732e-05, 'epoch': 1.39, 'throughput': 323.42}
263
- [INFO|2025-07-24 16:09:00] logging.py:143 >> {'loss': 0.8255, 'learning_rate': 1.4360e-05, 'epoch': 1.44, 'throughput': 324.25}
264
- [INFO|2025-07-24 16:09:11] logging.py:143 >> {'loss': 0.8806, 'learning_rate': 1.3981e-05, 'epoch': 1.49, 'throughput': 325.46}
265
- [INFO|2025-07-24 16:09:23] logging.py:143 >> {'loss': 0.8366, 'learning_rate': 1.3594e-05, 'epoch': 1.55, 'throughput': 326.15}
266
- [INFO|2025-07-24 16:09:34] logging.py:143 >> {'loss': 0.8567, 'learning_rate': 1.3201e-05, 'epoch': 1.60, 'throughput': 327.21}
267
- [INFO|2025-07-24 16:09:45] logging.py:143 >> {'loss': 0.8939, 'learning_rate': 1.2803e-05, 'epoch': 1.66, 'throughput': 328.25}
268
- [INFO|2025-07-24 16:09:56] logging.py:143 >> {'loss': 0.9650, 'learning_rate': 1.2399e-05, 'epoch': 1.71, 'throughput': 329.28}
269
- [INFO|2025-07-24 16:10:07] logging.py:143 >> {'loss': 0.8915, 'learning_rate': 1.1992e-05, 'epoch': 1.76, 'throughput': 330.41}
270
- [INFO|2025-07-24 16:10:18] logging.py:143 >> {'loss': 0.9085, 'learning_rate': 1.1581e-05, 'epoch': 1.82, 'throughput': 331.65}
271
- [INFO|2025-07-24 16:10:29] logging.py:143 >> {'loss': 0.8831, 'learning_rate': 1.1167e-05, 'epoch': 1.87, 'throughput': 332.58}
272
- [INFO|2025-07-24 16:10:41] logging.py:143 >> {'loss': 1.0271, 'learning_rate': 1.0751e-05, 'epoch': 1.92, 'throughput': 332.56}
273
- [INFO|2025-07-24 16:10:52] logging.py:143 >> {'loss': 0.9636, 'learning_rate': 1.0334e-05, 'epoch': 1.98, 'throughput': 333.29}
274
- [INFO|2025-07-24 16:11:02] logging.py:143 >> {'loss': 0.7085, 'learning_rate': 9.9164e-06, 'epoch': 2.02, 'throughput': 333.30}
275
- [INFO|2025-07-24 16:11:13] logging.py:143 >> {'loss': 0.5186, 'learning_rate': 9.4989e-06, 'epoch': 2.08, 'throughput': 333.84}
276
- [INFO|2025-07-24 16:11:25] logging.py:143 >> {'loss': 0.5023, 'learning_rate': 9.0822e-06, 'epoch': 2.13, 'throughput': 334.41}
277
- [INFO|2025-07-24 16:11:25] trainer.py:4327 >>
278
- ***** Running Evaluation *****
279
- [INFO|2025-07-24 16:11:25] trainer.py:4329 >> Num examples = 373
280
- [INFO|2025-07-24 16:11:25] trainer.py:4332 >> Batch size = 2
281
- [INFO|2025-07-24 16:11:38] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-200
282
- [INFO|2025-07-24 16:11:39] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
283
- [INFO|2025-07-24 16:11:39] configuration_utils.py:770 >> Model config LlamaConfig {
284
- "architectures": [
285
- "LlamaForCausalLM"
286
- ],
287
- "attention_bias": false,
288
- "attention_dropout": 0.0,
289
- "bos_token_id": 128000,
290
- "eos_token_id": [
291
- 128001,
292
- 128008,
293
- 128009
294
- ],
295
- "head_dim": 64,
296
- "hidden_act": "silu",
297
- "hidden_size": 2048,
298
- "initializer_range": 0.02,
299
- "intermediate_size": 8192,
300
- "max_position_embeddings": 131072,
301
- "mlp_bias": false,
302
- "model_type": "llama",
303
- "num_attention_heads": 32,
304
- "num_hidden_layers": 16,
305
- "num_key_value_heads": 8,
306
- "pretraining_tp": 1,
307
- "rms_norm_eps": 1e-05,
308
- "rope_scaling": {
309
- "factor": 32.0,
310
- "high_freq_factor": 4.0,
311
- "low_freq_factor": 1.0,
312
- "original_max_position_embeddings": 8192,
313
- "rope_type": "llama3"
314
- },
315
- "rope_theta": 500000.0,
316
- "tie_word_embeddings": true,
317
- "torch_dtype": "bfloat16",
318
- "transformers_version": "4.52.4",
319
- "use_cache": true,
320
- "vocab_size": 128256
321
- }
322
-
323
- [INFO|2025-07-24 16:11:39] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-200/chat_template.jinja
324
- [INFO|2025-07-24 16:11:40] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-200/tokenizer_config.json
325
- [INFO|2025-07-24 16:11:40] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-200/special_tokens_map.json
326
- [INFO|2025-07-24 16:11:53] logging.py:143 >> {'loss': 0.5145, 'learning_rate': 8.6671e-06, 'epoch': 2.18, 'throughput': 323.15}
327
- [INFO|2025-07-24 16:12:05] logging.py:143 >> {'loss': 0.4971, 'learning_rate': 8.2544e-06, 'epoch': 2.24, 'throughput': 323.77}
328
- [INFO|2025-07-24 16:12:16] logging.py:143 >> {'loss': 0.5042, 'learning_rate': 7.8447e-06, 'epoch': 2.29, 'throughput': 324.87}
329
- [INFO|2025-07-24 16:12:27] logging.py:143 >> {'loss': 0.5423, 'learning_rate': 7.4387e-06, 'epoch': 2.34, 'throughput': 325.53}
330
- [INFO|2025-07-24 16:12:39] logging.py:143 >> {'loss': 0.5913, 'learning_rate': 7.0372e-06, 'epoch': 2.40, 'throughput': 325.95}
331
- [INFO|2025-07-24 16:12:51] logging.py:143 >> {'loss': 0.5403, 'learning_rate': 6.6409e-06, 'epoch': 2.45, 'throughput': 326.13}
332
- [INFO|2025-07-24 16:13:02] logging.py:143 >> {'loss': 0.5301, 'learning_rate': 6.2505e-06, 'epoch': 2.50, 'throughput': 326.79}
333
- [INFO|2025-07-24 16:13:13] logging.py:143 >> {'loss': 0.5417, 'learning_rate': 5.8666e-06, 'epoch': 2.56, 'throughput': 327.37}
334
- [INFO|2025-07-24 16:13:25] logging.py:143 >> {'loss': 0.5358, 'learning_rate': 5.4899e-06, 'epoch': 2.61, 'throughput': 327.79}
335
- [INFO|2025-07-24 16:13:37] logging.py:143 >> {'loss': 0.5249, 'learning_rate': 5.1211e-06, 'epoch': 2.67, 'throughput': 328.11}
336
- [INFO|2025-07-24 16:13:48] logging.py:143 >> {'loss': 0.5134, 'learning_rate': 4.7608e-06, 'epoch': 2.72, 'throughput': 328.62}
337
- [INFO|2025-07-24 16:14:00] logging.py:143 >> {'loss': 0.5160, 'learning_rate': 4.4096e-06, 'epoch': 2.77, 'throughput': 328.73}
338
- [INFO|2025-07-24 16:14:11] logging.py:143 >> {'loss': 0.5611, 'learning_rate': 4.0682e-06, 'epoch': 2.83, 'throughput': 329.02}
339
- [INFO|2025-07-24 16:14:22] logging.py:143 >> {'loss': 0.5084, 'learning_rate': 3.7371e-06, 'epoch': 2.88, 'throughput': 329.85}
340
- [INFO|2025-07-24 16:14:33] logging.py:143 >> {'loss': 0.5458, 'learning_rate': 3.4170e-06, 'epoch': 2.93, 'throughput': 330.35}
341
- [INFO|2025-07-24 16:14:43] logging.py:143 >> {'loss': 0.5205, 'learning_rate': 3.1084e-06, 'epoch': 2.99, 'throughput': 331.42}
342
- [INFO|2025-07-24 16:14:52] logging.py:143 >> {'loss': 0.4058, 'learning_rate': 2.8118e-06, 'epoch': 3.03, 'throughput': 332.17}
343
- [INFO|2025-07-24 16:15:04] logging.py:143 >> {'loss': 0.3859, 'learning_rate': 2.5277e-06, 'epoch': 3.09, 'throughput': 332.42}
344
- [INFO|2025-07-24 16:15:14] logging.py:143 >> {'loss': 0.3724, 'learning_rate': 2.2567e-06, 'epoch': 3.14, 'throughput': 333.25}
345
- [INFO|2025-07-24 16:15:25] logging.py:143 >> {'loss': 0.3916, 'learning_rate': 1.9991e-06, 'epoch': 3.19, 'throughput': 333.86}
346
- [INFO|2025-07-24 16:15:25] trainer.py:4327 >>
347
  ***** Running Evaluation *****
348
- [INFO|2025-07-24 16:15:25] trainer.py:4329 >> Num examples = 373
349
- [INFO|2025-07-24 16:15:25] trainer.py:4332 >> Batch size = 2
350
- [INFO|2025-07-24 16:15:37] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-300
351
- [INFO|2025-07-24 16:15:38] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
352
- [INFO|2025-07-24 16:15:38] configuration_utils.py:770 >> Model config LlamaConfig {
353
  "architectures": [
354
  "LlamaForCausalLM"
355
  ],
@@ -389,27 +208,23 @@ If your task is similar to the task the model of the checkpoint was trained on,
389
  "vocab_size": 128256
390
  }
391
 
392
- [INFO|2025-07-24 16:15:38] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-300/chat_template.jinja
393
- [INFO|2025-07-24 16:15:39] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-300/tokenizer_config.json
394
- [INFO|2025-07-24 16:15:39] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-300/special_tokens_map.json
395
- [INFO|2025-07-24 16:15:51] logging.py:143 >> {'loss': 0.3676, 'learning_rate': 1.7556e-06, 'epoch': 3.25, 'throughput': 327.35}
396
- [INFO|2025-07-24 16:16:03] logging.py:143 >> {'loss': 0.3798, 'learning_rate': 1.5264e-06, 'epoch': 3.30, 'throughput': 327.69}
397
- [INFO|2025-07-24 16:16:13] logging.py:143 >> {'loss': 0.3606, 'learning_rate': 1.3120e-06, 'epoch': 3.35, 'throughput': 328.40}
398
- [INFO|2025-07-24 16:16:24] logging.py:143 >> {'loss': 0.3938, 'learning_rate': 1.1128e-06, 'epoch': 3.41, 'throughput': 329.00}
399
- [INFO|2025-07-24 16:16:35] logging.py:143 >> {'loss': 0.3605, 'learning_rate': 9.2909e-07, 'epoch': 3.46, 'throughput': 329.68}
400
- [INFO|2025-07-24 16:16:45] logging.py:143 >> {'loss': 0.3781, 'learning_rate': 7.6120e-07, 'epoch': 3.52, 'throughput': 330.66}
401
- [INFO|2025-07-24 16:16:55] logging.py:143 >> {'loss': 0.3794, 'learning_rate': 6.0944e-07, 'epoch': 3.57, 'throughput': 331.22}
402
- [INFO|2025-07-24 16:17:06] logging.py:143 >> {'loss': 0.3526, 'learning_rate': 4.7406e-07, 'epoch': 3.62, 'throughput': 332.12}
403
- [INFO|2025-07-24 16:17:16] logging.py:143 >> {'loss': 0.3846, 'learning_rate': 3.5531e-07, 'epoch': 3.68, 'throughput': 332.56}
404
- [INFO|2025-07-24 16:17:27] logging.py:143 >> {'loss': 0.3855, 'learning_rate': 2.5338e-07, 'epoch': 3.73, 'throughput': 333.19}
405
- [INFO|2025-07-24 16:17:37] logging.py:143 >> {'loss': 0.3955, 'learning_rate': 1.6847e-07, 'epoch': 3.78, 'throughput': 333.69}
406
- [INFO|2025-07-24 16:17:48] logging.py:143 >> {'loss': 0.3400, 'learning_rate': 1.0071e-07, 'epoch': 3.84, 'throughput': 334.24}
407
- [INFO|2025-07-24 16:17:58] logging.py:143 >> {'loss': 0.3584, 'learning_rate': 5.0222e-08, 'epoch': 3.89, 'throughput': 334.96}
408
- [INFO|2025-07-24 16:18:09] logging.py:143 >> {'loss': 0.3608, 'learning_rate': 1.7099e-08, 'epoch': 3.94, 'throughput': 335.57}
409
- [INFO|2025-07-24 16:18:20] logging.py:143 >> {'loss': 0.3659, 'learning_rate': 1.3962e-09, 'epoch': 4.00, 'throughput': 336.16}
410
- [INFO|2025-07-24 16:18:20] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-376
411
- [INFO|2025-07-24 16:18:20] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
412
- [INFO|2025-07-24 16:18:20] configuration_utils.py:770 >> Model config LlamaConfig {
413
  "architectures": [
414
  "LlamaForCausalLM"
415
  ],
@@ -449,141 +264,17 @@ If your task is similar to the task the model of the checkpoint was trained on,
449
  "vocab_size": 128256
450
  }
451
 
452
- [INFO|2025-07-24 16:18:21] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-376/chat_template.jinja
453
- [INFO|2025-07-24 16:18:21] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-376/tokenizer_config.json
454
- [INFO|2025-07-24 16:18:21] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/checkpoint-376/special_tokens_map.json
455
- [INFO|2025-07-24 16:18:23] trainer.py:2676 >>
456
 
457
  Training completed. Do not forget to share your model on huggingface.co/models =)
458
 
459
 
460
- [INFO|2025-07-24 16:18:24] logging.py:143 >> Converted PiSSA adapter will be saved at: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/pissa_converted.
461
- [INFO|2025-07-24 16:18:24] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
462
- [INFO|2025-07-24 16:18:24] configuration_utils.py:770 >> Model config LlamaConfig {
463
- "architectures": [
464
- "LlamaForCausalLM"
465
- ],
466
- "attention_bias": false,
467
- "attention_dropout": 0.0,
468
- "bos_token_id": 128000,
469
- "eos_token_id": [
470
- 128001,
471
- 128008,
472
- 128009
473
- ],
474
- "head_dim": 64,
475
- "hidden_act": "silu",
476
- "hidden_size": 2048,
477
- "initializer_range": 0.02,
478
- "intermediate_size": 8192,
479
- "max_position_embeddings": 131072,
480
- "mlp_bias": false,
481
- "model_type": "llama",
482
- "num_attention_heads": 32,
483
- "num_hidden_layers": 16,
484
- "num_key_value_heads": 8,
485
- "pretraining_tp": 1,
486
- "rms_norm_eps": 1e-05,
487
- "rope_scaling": {
488
- "factor": 32.0,
489
- "high_freq_factor": 4.0,
490
- "low_freq_factor": 1.0,
491
- "original_max_position_embeddings": 8192,
492
- "rope_type": "llama3"
493
- },
494
- "rope_theta": 500000.0,
495
- "tie_word_embeddings": true,
496
- "torch_dtype": "bfloat16",
497
- "transformers_version": "4.52.4",
498
- "use_cache": true,
499
- "vocab_size": 128256
500
- }
501
-
502
- [INFO|2025-07-24 16:18:25] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
503
- [INFO|2025-07-24 16:18:25] configuration_utils.py:770 >> Model config LlamaConfig {
504
- "architectures": [
505
- "LlamaForCausalLM"
506
- ],
507
- "attention_bias": false,
508
- "attention_dropout": 0.0,
509
- "bos_token_id": 128000,
510
- "eos_token_id": [
511
- 128001,
512
- 128008,
513
- 128009
514
- ],
515
- "head_dim": 64,
516
- "hidden_act": "silu",
517
- "hidden_size": 2048,
518
- "initializer_range": 0.02,
519
- "intermediate_size": 8192,
520
- "max_position_embeddings": 131072,
521
- "mlp_bias": false,
522
- "model_type": "llama",
523
- "num_attention_heads": 32,
524
- "num_hidden_layers": 16,
525
- "num_key_value_heads": 8,
526
- "pretraining_tp": 1,
527
- "rms_norm_eps": 1e-05,
528
- "rope_scaling": {
529
- "factor": 32.0,
530
- "high_freq_factor": 4.0,
531
- "low_freq_factor": 1.0,
532
- "original_max_position_embeddings": 8192,
533
- "rope_type": "llama3"
534
- },
535
- "rope_theta": 500000.0,
536
- "tie_word_embeddings": true,
537
- "torch_dtype": "bfloat16",
538
- "transformers_version": "4.52.4",
539
- "use_cache": true,
540
- "vocab_size": 128256
541
- }
542
-
543
- [INFO|2025-07-24 16:18:26] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
544
- [INFO|2025-07-24 16:18:26] configuration_utils.py:770 >> Model config LlamaConfig {
545
- "architectures": [
546
- "LlamaForCausalLM"
547
- ],
548
- "attention_bias": false,
549
- "attention_dropout": 0.0,
550
- "bos_token_id": 128000,
551
- "eos_token_id": [
552
- 128001,
553
- 128008,
554
- 128009
555
- ],
556
- "head_dim": 64,
557
- "hidden_act": "silu",
558
- "hidden_size": 2048,
559
- "initializer_range": 0.02,
560
- "intermediate_size": 8192,
561
- "max_position_embeddings": 131072,
562
- "mlp_bias": false,
563
- "model_type": "llama",
564
- "num_attention_heads": 32,
565
- "num_hidden_layers": 16,
566
- "num_key_value_heads": 8,
567
- "pretraining_tp": 1,
568
- "rms_norm_eps": 1e-05,
569
- "rope_scaling": {
570
- "factor": 32.0,
571
- "high_freq_factor": 4.0,
572
- "low_freq_factor": 1.0,
573
- "original_max_position_embeddings": 8192,
574
- "rope_type": "llama3"
575
- },
576
- "rope_theta": 500000.0,
577
- "tie_word_embeddings": true,
578
- "torch_dtype": "bfloat16",
579
- "transformers_version": "4.52.4",
580
- "use_cache": true,
581
- "vocab_size": 128256
582
- }
583
-
584
- [INFO|2025-07-24 16:18:28] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6
585
- [INFO|2025-07-24 16:18:28] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
586
- [INFO|2025-07-24 16:18:28] configuration_utils.py:770 >> Model config LlamaConfig {
587
  "architectures": [
588
  "LlamaForCausalLM"
589
  ],
@@ -623,13 +314,13 @@ Training completed. Do not forget to share your model on huggingface.co/models =
623
  "vocab_size": 128256
624
  }
625
 
626
- [INFO|2025-07-24 16:18:29] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/chat_template.jinja
627
- [INFO|2025-07-24 16:18:29] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/tokenizer_config.json
628
- [INFO|2025-07-24 16:18:29] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6/special_tokens_map.json
629
- [WARNING|2025-07-24 16:18:30] logging.py:148 >> No metric eval_accuracy to plot.
630
- [INFO|2025-07-24 16:18:30] trainer.py:4327 >>
631
  ***** Running Evaluation *****
632
- [INFO|2025-07-24 16:18:30] trainer.py:4329 >> Num examples = 373
633
- [INFO|2025-07-24 16:18:30] trainer.py:4332 >> Batch size = 2
634
- [INFO|2025-07-24 16:18:43] modelcard.py:450 >> Dropping the following result as it does not have all the necessary fields:
635
  {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
 
1
+ [INFO|2025-08-12 15:41:11] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
2
+ [INFO|2025-08-12 15:41:11] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at None
3
+ [INFO|2025-08-12 15:41:11] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at None
4
+ [INFO|2025-08-12 15:41:11] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
5
+ [INFO|2025-08-12 15:41:11] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
6
+ [INFO|2025-08-12 15:41:11] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at None
7
+ [INFO|2025-08-12 15:41:12] tokenization_utils_base.py:2299 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
8
+ [INFO|2025-08-12 15:41:13] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
9
+ [INFO|2025-08-12 15:41:13] configuration_utils.py:770 >> Model config LlamaConfig {
10
  "architectures": [
11
  "LlamaForCausalLM"
12
  ],
 
46
  "vocab_size": 128256
47
  }
48
 
49
+ [INFO|2025-08-12 15:41:13] tokenization_utils_base.py:2023 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json
50
+ [INFO|2025-08-12 15:41:13] tokenization_utils_base.py:2023 >> loading file tokenizer.model from cache at None
51
+ [INFO|2025-08-12 15:41:13] tokenization_utils_base.py:2023 >> loading file added_tokens.json from cache at None
52
+ [INFO|2025-08-12 15:41:13] tokenization_utils_base.py:2023 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json
53
+ [INFO|2025-08-12 15:41:13] tokenization_utils_base.py:2023 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json
54
+ [INFO|2025-08-12 15:41:13] tokenization_utils_base.py:2023 >> loading file chat_template.jinja from cache at None
55
+ [INFO|2025-08-12 15:41:14] tokenization_utils_base.py:2299 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
56
+ [INFO|2025-08-12 15:41:14] logging.py:143 >> Add pad token: <|eot_id|>
57
+ [INFO|2025-08-12 15:41:14] logging.py:143 >> Add <|eom_id|> to stop words.
58
+ [INFO|2025-08-12 15:41:14] logging.py:143 >> Loading dataset pippo_sharegpt_conversational_8-11-25.json...
59
+ [INFO|2025-08-12 15:41:21] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
60
+ [INFO|2025-08-12 15:41:21] configuration_utils.py:770 >> Model config LlamaConfig {
61
  "architectures": [
62
  "LlamaForCausalLM"
63
  ],
 
97
  "vocab_size": 128256
98
  }
99
 
100
+ [INFO|2025-08-12 15:41:21] logging.py:143 >> KV cache is disabled during training.
101
+ [INFO|2025-08-12 15:42:20] modeling_utils.py:1151 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/model.safetensors
102
+ [INFO|2025-08-12 15:42:20] modeling_utils.py:2241 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
103
+ [INFO|2025-08-12 15:42:20] configuration_utils.py:1135 >> Generate config GenerationConfig {
104
  "bos_token_id": 128000,
105
  "eos_token_id": [
106
  128001,
 
110
  "use_cache": false
111
  }
112
 
113
+ [INFO|2025-08-12 15:42:21] modeling_utils.py:5131 >> All model checkpoint weights were used when initializing LlamaForCausalLM.
114
 
115
+ [INFO|2025-08-12 15:42:21] modeling_utils.py:5139 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct.
116
  If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
117
+ [INFO|2025-08-12 15:42:22] configuration_utils.py:1090 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/generation_config.json
118
+ [INFO|2025-08-12 15:42:22] configuration_utils.py:1135 >> Generate config GenerationConfig {
119
  "bos_token_id": 128000,
120
  "do_sample": true,
121
  "eos_token_id": [
 
127
  "top_p": 0.9
128
  }
129
 
130
+ [INFO|2025-08-12 15:42:22] logging.py:143 >> Gradient checkpointing enabled.
131
+ [INFO|2025-08-12 15:42:22] logging.py:143 >> Using torch SDPA for faster training and inference.
132
+ [INFO|2025-08-12 15:42:22] logging.py:143 >> Upcasting trainable params to float32.
133
+ [INFO|2025-08-12 15:42:22] logging.py:143 >> Fine-tuning method: LoRA
134
+ [INFO|2025-08-12 15:42:22] logging.py:143 >> trainable params: 22,544,384 || all params: 1,258,358,784 || trainable%: 1.7916
135
+ [INFO|2025-08-12 15:42:22] trainer.py:756 >> Using auto half precision backend
136
+ [INFO|2025-08-12 15:42:23] logging.py:143 >> Using LoRA+ optimizer with loraplus lr ratio 8.00.
137
+ [INFO|2025-08-12 15:42:23] trainer.py:2409 >> ***** Running training *****
138
+ [INFO|2025-08-12 15:42:23] trainer.py:2410 >> Num examples = 1,245
139
+ [INFO|2025-08-12 15:42:23] trainer.py:2411 >> Num Epochs = 4
140
+ [INFO|2025-08-12 15:42:23] trainer.py:2412 >> Instantaneous batch size per device = 8
141
+ [INFO|2025-08-12 15:42:23] trainer.py:2415 >> Total train batch size (w. parallel, distributed & accumulation) = 32
142
+ [INFO|2025-08-12 15:42:23] trainer.py:2416 >> Gradient Accumulation steps = 4
143
+ [INFO|2025-08-12 15:42:23] trainer.py:2417 >> Total optimization steps = 156
144
+ [INFO|2025-08-12 15:42:23] trainer.py:2418 >> Number of trainable parameters = 22,544,384
145
+ [INFO|2025-08-12 15:42:33] logging.py:143 >> {'loss': 1.8434, 'learning_rate': 2.9951e-04, 'epoch': 0.13, 'throughput': 4897.80}
146
+ [INFO|2025-08-12 15:42:42] logging.py:143 >> {'loss': 0.3062, 'learning_rate': 2.9754e-04, 'epoch': 0.26, 'throughput': 5303.61}
147
+ [INFO|2025-08-12 15:42:53] logging.py:143 >> {'loss': 0.2601, 'learning_rate': 2.9408e-04, 'epoch': 0.38, 'throughput': 5105.76}
148
+ [INFO|2025-08-12 15:43:03] logging.py:143 >> {'loss': 0.1008, 'learning_rate': 2.8915e-04, 'epoch': 0.51, 'throughput': 5106.52}
149
+ [INFO|2025-08-12 15:43:12] logging.py:143 >> {'loss': 0.1124, 'learning_rate': 2.8282e-04, 'epoch': 0.64, 'throughput': 5122.84}
150
+ [INFO|2025-08-12 15:43:23] logging.py:143 >> {'loss': 0.1089, 'learning_rate': 2.7514e-04, 'epoch': 0.77, 'throughput': 5066.92}
151
+ [INFO|2025-08-12 15:43:33] logging.py:143 >> {'loss': 0.0657, 'learning_rate': 2.6619e-04, 'epoch': 0.90, 'throughput': 5049.20}
152
+ [INFO|2025-08-12 15:43:43] logging.py:143 >> {'loss': 0.0572, 'learning_rate': 2.5607e-04, 'epoch': 1.03, 'throughput': 5029.28}
153
+ [INFO|2025-08-12 15:43:53] logging.py:143 >> {'loss': 0.0523, 'learning_rate': 2.4487e-04, 'epoch': 1.15, 'throughput': 5027.02}
154
+ [INFO|2025-08-12 15:44:04] logging.py:143 >> {'loss': 0.0544, 'learning_rate': 2.3271e-04, 'epoch': 1.28, 'throughput': 5000.81}
155
+ [INFO|2025-08-12 15:44:14] logging.py:143 >> {'loss': 0.0593, 'learning_rate': 2.1971e-04, 'epoch': 1.41, 'throughput': 5000.91}
156
+ [INFO|2025-08-12 15:44:26] logging.py:143 >> {'loss': 0.0499, 'learning_rate': 2.0600e-04, 'epoch': 1.54, 'throughput': 4922.77}
157
+ [INFO|2025-08-12 15:44:37] logging.py:143 >> {'loss': 0.0512, 'learning_rate': 1.9173e-04, 'epoch': 1.67, 'throughput': 4915.89}
158
+ [INFO|2025-08-12 15:44:49] logging.py:143 >> {'loss': 0.0538, 'learning_rate': 1.7704e-04, 'epoch': 1.79, 'throughput': 4854.44}
159
+ [INFO|2025-08-12 15:45:01] logging.py:143 >> {'loss': 0.0450, 'learning_rate': 1.6207e-04, 'epoch': 1.92, 'throughput': 4783.53}
160
+ [INFO|2025-08-12 15:45:12] logging.py:143 >> {'loss': 0.0476, 'learning_rate': 1.4698e-04, 'epoch': 2.05, 'throughput': 4783.09}
161
+ [INFO|2025-08-12 15:45:25] logging.py:143 >> {'loss': 0.0470, 'learning_rate': 1.3192e-04, 'epoch': 2.18, 'throughput': 4708.73}
162
+ [INFO|2025-08-12 15:45:39] logging.py:143 >> {'loss': 0.0442, 'learning_rate': 1.1704e-04, 'epoch': 2.31, 'throughput': 4641.33}
163
+ [INFO|2025-08-12 15:45:52] logging.py:143 >> {'loss': 0.0419, 'learning_rate': 1.0250e-04, 'epoch': 2.44, 'throughput': 4587.28}
164
+ [INFO|2025-08-12 15:46:05] logging.py:143 >> {'loss': 0.0463, 'learning_rate': 8.8438e-05, 'epoch': 2.56, 'throughput': 4535.46}
165
+ [INFO|2025-08-12 15:46:05] trainer.py:4327 >>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  ***** Running Evaluation *****
167
+ [INFO|2025-08-12 15:46:05] trainer.py:4329 >> Num examples = 312
168
+ [INFO|2025-08-12 15:46:05] trainer.py:4332 >> Batch size = 8
169
+ [INFO|2025-08-12 15:46:15] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/checkpoint-100
170
+ [INFO|2025-08-12 15:46:15] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
171
+ [INFO|2025-08-12 15:46:15] configuration_utils.py:770 >> Model config LlamaConfig {
172
  "architectures": [
173
  "LlamaForCausalLM"
174
  ],
 
208
  "vocab_size": 128256
209
  }
210
 
211
+ [INFO|2025-08-12 15:46:16] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/checkpoint-100/chat_template.jinja
212
+ [INFO|2025-08-12 15:46:16] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/checkpoint-100/tokenizer_config.json
213
+ [INFO|2025-08-12 15:46:16] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/checkpoint-100/special_tokens_map.json
214
+ [INFO|2025-08-12 15:46:32] logging.py:143 >> {'loss': 0.0450, 'learning_rate': 7.5000e-05, 'epoch': 2.69, 'throughput': 4253.16}
215
+ [INFO|2025-08-12 15:46:46] logging.py:143 >> {'loss': 0.0433, 'learning_rate': 6.2322e-05, 'epoch': 2.82, 'throughput': 4228.51}
216
+ [INFO|2025-08-12 15:46:59] logging.py:143 >> {'loss': 0.0433, 'learning_rate': 5.0532e-05, 'epoch': 2.95, 'throughput': 4204.93}
217
+ [INFO|2025-08-12 15:47:13] logging.py:143 >> {'loss': 0.0439, 'learning_rate': 3.9749e-05, 'epoch': 3.08, 'throughput': 4183.35}
218
+ [INFO|2025-08-12 15:47:26] logging.py:143 >> {'loss': 0.0427, 'learning_rate': 3.0084e-05, 'epoch': 3.21, 'throughput': 4162.23}
219
+ [INFO|2025-08-12 15:47:40] logging.py:143 >> {'loss': 0.0424, 'learning_rate': 2.1633e-05, 'epoch': 3.33, 'throughput': 4146.32}
220
+ [INFO|2025-08-12 15:47:53] logging.py:143 >> {'loss': 0.0437, 'learning_rate': 1.4482e-05, 'epoch': 3.46, 'throughput': 4129.22}
221
+ [INFO|2025-08-12 15:48:07] logging.py:143 >> {'loss': 0.0411, 'learning_rate': 8.7049e-06, 'epoch': 3.59, 'throughput': 4113.01}
222
+ [INFO|2025-08-12 15:48:20] logging.py:143 >> {'loss': 0.0420, 'learning_rate': 4.3587e-06, 'epoch': 3.72, 'throughput': 4098.93}
223
+ [INFO|2025-08-12 15:48:34] logging.py:143 >> {'loss': 0.0426, 'learning_rate': 1.4880e-06, 'epoch': 3.85, 'throughput': 4086.58}
224
+ [INFO|2025-08-12 15:48:48] logging.py:143 >> {'loss': 0.0410, 'learning_rate': 1.2165e-07, 'epoch': 3.97, 'throughput': 4072.05}
225
+ [INFO|2025-08-12 15:48:50] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/checkpoint-156
226
+ [INFO|2025-08-12 15:48:50] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
227
+ [INFO|2025-08-12 15:48:50] configuration_utils.py:770 >> Model config LlamaConfig {
 
 
 
 
228
  "architectures": [
229
  "LlamaForCausalLM"
230
  ],
 
264
  "vocab_size": 128256
265
  }
266
 
267
+ [INFO|2025-08-12 15:48:52] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/checkpoint-156/chat_template.jinja
268
+ [INFO|2025-08-12 15:48:52] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/checkpoint-156/tokenizer_config.json
269
+ [INFO|2025-08-12 15:48:52] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/checkpoint-156/special_tokens_map.json
270
+ [INFO|2025-08-12 15:48:54] trainer.py:2676 >>
271
 
272
  Training completed. Do not forget to share your model on huggingface.co/models =)
273
 
274
 
275
+ [INFO|2025-08-12 15:48:54] trainer.py:3993 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10
276
+ [INFO|2025-08-12 15:48:55] configuration_utils.py:698 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json
277
+ [INFO|2025-08-12 15:48:55] configuration_utils.py:770 >> Model config LlamaConfig {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  "architectures": [
279
  "LlamaForCausalLM"
280
  ],
 
314
  "vocab_size": 128256
315
  }
316
 
317
+ [INFO|2025-08-12 15:48:56] tokenization_utils_base.py:2356 >> chat template saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/chat_template.jinja
318
+ [INFO|2025-08-12 15:48:56] tokenization_utils_base.py:2525 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/tokenizer_config.json
319
+ [INFO|2025-08-12 15:48:56] tokenization_utils_base.py:2534 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10/special_tokens_map.json
320
+ [WARNING|2025-08-12 15:48:56] logging.py:148 >> No metric eval_accuracy to plot.
321
+ [INFO|2025-08-12 15:48:57] trainer.py:4327 >>
322
  ***** Running Evaluation *****
323
+ [INFO|2025-08-12 15:48:57] trainer.py:4329 >> Num examples = 312
324
+ [INFO|2025-08-12 15:48:57] trainer.py:4332 >> Batch size = 8
325
+ [INFO|2025-08-12 15:49:06] modelcard.py:450 >> Dropping the following result as it does not have all the necessary fields:
326
  {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 4.0,
3
- "num_input_tokens_seen": 300000,
4
- "total_flos": 1780328448000000.0,
5
- "train_loss": 0.7932832451101314,
6
- "train_runtime": 895.9422,
7
- "train_samples_per_second": 6.652,
8
- "train_steps_per_second": 0.42
9
  }
 
1
  {
2
  "epoch": 4.0,
3
+ "num_input_tokens_seen": 1576472,
4
+ "total_flos": 9418069125660672.0,
5
+ "train_loss": 0.12584685484090677,
6
+ "train_runtime": 391.5379,
7
+ "train_samples_per_second": 12.719,
8
+ "train_steps_per_second": 0.398
9
  }
trainer_log.jsonl CHANGED
@@ -1,79 +1,33 @@
1
- {"current_steps": 5, "total_steps": 376, "loss": 1.1748, "lr": 1.9994415637302545e-05, "epoch": 0.053691275167785234, "percentage": 1.33, "elapsed_time": "0:00:15", "remaining_time": "0:19:07", "throughput": 263.9, "total_tokens": 4080}
2
- {"current_steps": 10, "total_steps": 376, "loss": 1.2634, "lr": 1.9971739852847514e-05, "epoch": 0.10738255033557047, "percentage": 2.66, "elapsed_time": "0:00:26", "remaining_time": "0:16:10", "throughput": 304.64, "total_tokens": 8080}
3
- {"current_steps": 15, "total_steps": 376, "loss": 1.4367, "lr": 1.9931663163249744e-05, "epoch": 0.1610738255033557, "percentage": 3.99, "elapsed_time": "0:00:38", "remaining_time": "0:15:27", "throughput": 315.4, "total_tokens": 12160}
4
- {"current_steps": 20, "total_steps": 376, "loss": 1.3167, "lr": 1.9874255503213154e-05, "epoch": 0.21476510067114093, "percentage": 5.32, "elapsed_time": "0:00:50", "remaining_time": "0:14:51", "throughput": 323.64, "total_tokens": 16208}
5
- {"current_steps": 25, "total_steps": 376, "loss": 1.4144, "lr": 1.979961705036587e-05, "epoch": 0.2684563758389262, "percentage": 6.65, "elapsed_time": "0:01:02", "remaining_time": "0:14:33", "throughput": 325.7, "total_tokens": 20256}
6
- {"current_steps": 30, "total_steps": 376, "loss": 1.3944, "lr": 1.9707878050448074e-05, "epoch": 0.3221476510067114, "percentage": 7.98, "elapsed_time": "0:01:13", "remaining_time": "0:14:05", "throughput": 331.6, "total_tokens": 24304}
7
- {"current_steps": 35, "total_steps": 376, "loss": 1.3946, "lr": 1.9599198590030308e-05, "epoch": 0.37583892617449666, "percentage": 9.31, "elapsed_time": "0:01:24", "remaining_time": "0:13:46", "throughput": 334.75, "total_tokens": 28384}
8
- {"current_steps": 40, "total_steps": 376, "loss": 1.3827, "lr": 1.947376831715892e-05, "epoch": 0.42953020134228187, "percentage": 10.64, "elapsed_time": "0:01:36", "remaining_time": "0:13:30", "throughput": 336.48, "total_tokens": 32480}
9
- {"current_steps": 45, "total_steps": 376, "loss": 1.4509, "lr": 1.9331806110416027e-05, "epoch": 0.48322147651006714, "percentage": 11.97, "elapsed_time": "0:01:48", "remaining_time": "0:13:20", "throughput": 336.58, "total_tokens": 36608}
10
- {"current_steps": 50, "total_steps": 376, "loss": 1.3347, "lr": 1.9173559696971594e-05, "epoch": 0.5369127516778524, "percentage": 13.3, "elapsed_time": "0:02:01", "remaining_time": "0:13:08", "throughput": 335.59, "total_tokens": 40608}
11
- {"current_steps": 55, "total_steps": 376, "loss": 1.3834, "lr": 1.899930522029408e-05, "epoch": 0.5906040268456376, "percentage": 14.63, "elapsed_time": "0:02:13", "remaining_time": "0:12:57", "throughput": 335.37, "total_tokens": 44672}
12
- {"current_steps": 60, "total_steps": 376, "loss": 1.4679, "lr": 1.8809346758274014e-05, "epoch": 0.6442953020134228, "percentage": 15.96, "elapsed_time": "0:02:25", "remaining_time": "0:12:44", "throughput": 335.6, "total_tokens": 48704}
13
- {"current_steps": 65, "total_steps": 376, "loss": 1.3973, "lr": 1.8604015792601395e-05, "epoch": 0.697986577181208, "percentage": 17.29, "elapsed_time": "0:02:36", "remaining_time": "0:12:30", "throughput": 336.68, "total_tokens": 52832}
14
- {"current_steps": 70, "total_steps": 376, "loss": 1.5408, "lr": 1.8383670630322864e-05, "epoch": 0.7516778523489933, "percentage": 18.62, "elapsed_time": "0:02:49", "remaining_time": "0:12:19", "throughput": 335.03, "total_tokens": 56672}
15
- {"current_steps": 75, "total_steps": 376, "loss": 1.4967, "lr": 1.8148695778588034e-05, "epoch": 0.8053691275167785, "percentage": 19.95, "elapsed_time": "0:03:00", "remaining_time": "0:12:04", "throughput": 335.55, "total_tokens": 60608}
16
- {"current_steps": 80, "total_steps": 376, "loss": 1.4644, "lr": 1.789950127367606e-05, "epoch": 0.8590604026845637, "percentage": 21.28, "elapsed_time": "0:03:12", "remaining_time": "0:11:51", "throughput": 336.3, "total_tokens": 64640}
17
- {"current_steps": 85, "total_steps": 376, "loss": 1.4226, "lr": 1.7636521965473324e-05, "epoch": 0.912751677852349, "percentage": 22.61, "elapsed_time": "0:03:24", "remaining_time": "0:11:38", "throughput": 336.02, "total_tokens": 68560}
18
- {"current_steps": 90, "total_steps": 376, "loss": 1.4847, "lr": 1.7360216758650826e-05, "epoch": 0.9664429530201343, "percentage": 23.94, "elapsed_time": "0:03:35", "remaining_time": "0:11:25", "throughput": 336.26, "total_tokens": 72512}
19
- {"current_steps": 95, "total_steps": 376, "loss": 1.2927, "lr": 1.7071067811865477e-05, "epoch": 1.010738255033557, "percentage": 25.27, "elapsed_time": "0:03:45", "remaining_time": "0:11:05", "throughput": 336.91, "total_tokens": 75808}
20
- {"current_steps": 100, "total_steps": 376, "loss": 0.8505, "lr": 1.67695796963826e-05, "epoch": 1.0644295302013422, "percentage": 26.6, "elapsed_time": "0:03:55", "remaining_time": "0:10:51", "throughput": 338.48, "total_tokens": 79840}
21
- {"current_steps": 100, "total_steps": 376, "eval_loss": 2.885671615600586, "epoch": 1.0644295302013422, "percentage": 26.6, "elapsed_time": "0:04:09", "remaining_time": "0:11:27", "throughput": 320.58, "total_tokens": 79840}
22
- {"current_steps": 105, "total_steps": 376, "loss": 0.7497, "lr": 1.6456278515588023e-05, "epoch": 1.1181208053691276, "percentage": 27.93, "elapsed_time": "0:04:23", "remaining_time": "0:11:20", "throughput": 318.1, "total_tokens": 83840}
23
- {"current_steps": 110, "total_steps": 376, "loss": 0.7761, "lr": 1.613171098692611e-05, "epoch": 1.1718120805369128, "percentage": 29.26, "elapsed_time": "0:04:34", "remaining_time": "0:11:04", "throughput": 319.79, "total_tokens": 87872}
24
- {"current_steps": 115, "total_steps": 376, "loss": 0.7997, "lr": 1.5796443487865774e-05, "epoch": 1.225503355704698, "percentage": 30.59, "elapsed_time": "0:04:46", "remaining_time": "0:10:49", "throughput": 321.09, "total_tokens": 91888}
25
- {"current_steps": 120, "total_steps": 376, "loss": 0.8275, "lr": 1.54510610675594e-05, "epoch": 1.279194630872483, "percentage": 31.91, "elapsed_time": "0:04:57", "remaining_time": "0:10:35", "throughput": 322.08, "total_tokens": 95888}
26
- {"current_steps": 125, "total_steps": 376, "loss": 0.8599, "lr": 1.5096166425919176e-05, "epoch": 1.3328859060402685, "percentage": 33.24, "elapsed_time": "0:05:09", "remaining_time": "0:10:21", "throughput": 322.83, "total_tokens": 99888}
27
- {"current_steps": 130, "total_steps": 376, "loss": 0.9179, "lr": 1.4732378861892524e-05, "epoch": 1.3865771812080536, "percentage": 34.57, "elapsed_time": "0:05:21", "remaining_time": "0:10:07", "throughput": 323.42, "total_tokens": 103840}
28
- {"current_steps": 135, "total_steps": 376, "loss": 0.8255, "lr": 1.436033319277183e-05, "epoch": 1.440268456375839, "percentage": 35.9, "elapsed_time": "0:05:32", "remaining_time": "0:09:53", "throughput": 324.25, "total_tokens": 107824}
29
- {"current_steps": 140, "total_steps": 376, "loss": 0.8806, "lr": 1.3980678646424308e-05, "epoch": 1.4939597315436242, "percentage": 37.23, "elapsed_time": "0:05:43", "remaining_time": "0:09:39", "throughput": 325.46, "total_tokens": 111936}
30
- {"current_steps": 145, "total_steps": 376, "loss": 0.8366, "lr": 1.3594077728375129e-05, "epoch": 1.5476510067114093, "percentage": 38.56, "elapsed_time": "0:05:55", "remaining_time": "0:09:26", "throughput": 326.15, "total_tokens": 115952}
31
- {"current_steps": 150, "total_steps": 376, "loss": 0.8567, "lr": 1.3201205065720699e-05, "epoch": 1.6013422818791945, "percentage": 39.89, "elapsed_time": "0:06:06", "remaining_time": "0:09:12", "throughput": 327.21, "total_tokens": 119984}
32
- {"current_steps": 155, "total_steps": 376, "loss": 0.8939, "lr": 1.2802746229889563e-05, "epoch": 1.6550335570469799, "percentage": 41.22, "elapsed_time": "0:06:17", "remaining_time": "0:08:58", "throughput": 328.25, "total_tokens": 124032}
33
- {"current_steps": 160, "total_steps": 376, "loss": 0.965, "lr": 1.2399396540305205e-05, "epoch": 1.7087248322147652, "percentage": 42.55, "elapsed_time": "0:06:28", "remaining_time": "0:08:44", "throughput": 329.28, "total_tokens": 128000}
34
- {"current_steps": 165, "total_steps": 376, "loss": 0.8915, "lr": 1.1991859851038362e-05, "epoch": 1.7624161073825504, "percentage": 43.88, "elapsed_time": "0:06:39", "remaining_time": "0:08:31", "throughput": 330.41, "total_tokens": 132096}
35
- {"current_steps": 170, "total_steps": 376, "loss": 0.9085, "lr": 1.1580847322566224e-05, "epoch": 1.8161073825503355, "percentage": 45.21, "elapsed_time": "0:06:50", "remaining_time": "0:08:17", "throughput": 331.65, "total_tokens": 136208}
36
- {"current_steps": 175, "total_steps": 376, "loss": 0.8831, "lr": 1.1167076180781764e-05, "epoch": 1.8697986577181207, "percentage": 46.54, "elapsed_time": "0:07:01", "remaining_time": "0:08:04", "throughput": 332.58, "total_tokens": 140320}
37
- {"current_steps": 180, "total_steps": 376, "loss": 1.0271, "lr": 1.0751268465418784e-05, "epoch": 1.923489932885906, "percentage": 47.87, "elapsed_time": "0:07:13", "remaining_time": "0:07:52", "throughput": 332.56, "total_tokens": 144176}
38
- {"current_steps": 185, "total_steps": 376, "loss": 0.9636, "lr": 1.0334149770076747e-05, "epoch": 1.9771812080536912, "percentage": 49.2, "elapsed_time": "0:07:24", "remaining_time": "0:07:39", "throughput": 333.29, "total_tokens": 148256}
39
- {"current_steps": 190, "total_steps": 376, "loss": 0.7085, "lr": 9.916447976043972e-06, "epoch": 2.021476510067114, "percentage": 50.53, "elapsed_time": "0:07:34", "remaining_time": "0:07:24", "throughput": 333.3, "total_tokens": 151488}
40
- {"current_steps": 195, "total_steps": 376, "loss": 0.5186, "lr": 9.498891982128809e-06, "epoch": 2.0751677852348993, "percentage": 51.86, "elapsed_time": "0:07:45", "remaining_time": "0:07:12", "throughput": 333.84, "total_tokens": 155472}
41
- {"current_steps": 200, "total_steps": 376, "loss": 0.5023, "lr": 9.082210432715197e-06, "epoch": 2.1288590604026845, "percentage": 53.19, "elapsed_time": "0:07:56", "remaining_time": "0:06:59", "throughput": 334.41, "total_tokens": 159504}
42
- {"current_steps": 200, "total_steps": 376, "eval_loss": 3.4170751571655273, "epoch": 2.1288590604026845, "percentage": 53.19, "elapsed_time": "0:08:10", "remaining_time": "0:07:11", "throughput": 324.96, "total_tokens": 159504}
43
- {"current_steps": 205, "total_steps": 376, "loss": 0.5145, "lr": 8.667130446262214e-06, "epoch": 2.1825503355704696, "percentage": 54.52, "elapsed_time": "0:08:25", "remaining_time": "0:07:01", "throughput": 323.15, "total_tokens": 163472}
44
- {"current_steps": 210, "total_steps": 376, "loss": 0.4971, "lr": 8.25437634646637e-06, "epoch": 2.2362416107382552, "percentage": 55.85, "elapsed_time": "0:08:37", "remaining_time": "0:06:49", "throughput": 323.77, "total_tokens": 167584}
45
- {"current_steps": 215, "total_steps": 376, "loss": 0.5042, "lr": 7.844668398300866e-06, "epoch": 2.2899328859060404, "percentage": 57.18, "elapsed_time": "0:08:48", "remaining_time": "0:06:35", "throughput": 324.87, "total_tokens": 171712}
46
- {"current_steps": 220, "total_steps": 376, "loss": 0.5423, "lr": 7.438721551137367e-06, "epoch": 2.3436241610738255, "percentage": 58.51, "elapsed_time": "0:08:59", "remaining_time": "0:06:22", "throughput": 325.53, "total_tokens": 175696}
47
- {"current_steps": 225, "total_steps": 376, "loss": 0.5913, "lr": 7.037244191143662e-06, "epoch": 2.3973154362416107, "percentage": 59.84, "elapsed_time": "0:09:11", "remaining_time": "0:06:10", "throughput": 325.95, "total_tokens": 179728}
48
- {"current_steps": 230, "total_steps": 376, "loss": 0.5403, "lr": 6.640936905134212e-06, "epoch": 2.451006711409396, "percentage": 61.17, "elapsed_time": "0:09:23", "remaining_time": "0:05:57", "throughput": 326.13, "total_tokens": 183648}
49
- {"current_steps": 235, "total_steps": 376, "loss": 0.5301, "lr": 6.2504912580307905e-06, "epoch": 2.504697986577181, "percentage": 62.5, "elapsed_time": "0:09:34", "remaining_time": "0:05:44", "throughput": 326.79, "total_tokens": 187664}
50
- {"current_steps": 240, "total_steps": 376, "loss": 0.5417, "lr": 5.866588586066481e-06, "epoch": 2.558389261744966, "percentage": 63.83, "elapsed_time": "0:09:45", "remaining_time": "0:05:31", "throughput": 327.37, "total_tokens": 191776}
51
- {"current_steps": 245, "total_steps": 376, "loss": 0.5358, "lr": 5.48989880783898e-06, "epoch": 2.6120805369127518, "percentage": 65.16, "elapsed_time": "0:09:57", "remaining_time": "0:05:19", "throughput": 327.79, "total_tokens": 195872}
52
- {"current_steps": 250, "total_steps": 376, "loss": 0.5249, "lr": 5.121079255287953e-06, "epoch": 2.665771812080537, "percentage": 66.49, "elapsed_time": "0:10:08", "remaining_time": "0:05:06", "throughput": 328.11, "total_tokens": 199808}
53
- {"current_steps": 255, "total_steps": 376, "loss": 0.5134, "lr": 4.760773526636315e-06, "epoch": 2.719463087248322, "percentage": 67.82, "elapsed_time": "0:10:20", "remaining_time": "0:04:54", "throughput": 328.62, "total_tokens": 203888}
54
- {"current_steps": 260, "total_steps": 376, "loss": 0.516, "lr": 4.409610363297211e-06, "epoch": 2.7731543624161072, "percentage": 69.15, "elapsed_time": "0:10:32", "remaining_time": "0:04:42", "throughput": 328.73, "total_tokens": 207840}
55
- {"current_steps": 265, "total_steps": 376, "loss": 0.5611, "lr": 4.0682025527064486e-06, "epoch": 2.826845637583893, "percentage": 70.48, "elapsed_time": "0:10:43", "remaining_time": "0:04:29", "throughput": 329.02, "total_tokens": 211856}
56
- {"current_steps": 270, "total_steps": 376, "loss": 0.5084, "lr": 3.7371458589949337e-06, "epoch": 2.880536912751678, "percentage": 71.81, "elapsed_time": "0:10:54", "remaining_time": "0:04:16", "throughput": 329.85, "total_tokens": 215904}
57
- {"current_steps": 275, "total_steps": 376, "loss": 0.5458, "lr": 3.4170179833671847e-06, "epoch": 2.934228187919463, "percentage": 73.14, "elapsed_time": "0:11:05", "remaining_time": "0:04:04", "throughput": 330.35, "total_tokens": 219856}
58
- {"current_steps": 280, "total_steps": 376, "loss": 0.5205, "lr": 3.1083775560000373e-06, "epoch": 2.9879194630872483, "percentage": 74.47, "elapsed_time": "0:11:15", "remaining_time": "0:03:51", "throughput": 331.42, "total_tokens": 223984}
59
- {"current_steps": 285, "total_steps": 376, "loss": 0.4058, "lr": 2.8117631612207084e-06, "epoch": 3.032214765100671, "percentage": 75.8, "elapsed_time": "0:11:24", "remaining_time": "0:03:38", "throughput": 332.17, "total_tokens": 227440}
60
- {"current_steps": 290, "total_steps": 376, "loss": 0.3859, "lr": 2.527692397665311e-06, "epoch": 3.085906040268456, "percentage": 77.13, "elapsed_time": "0:11:36", "remaining_time": "0:03:26", "throughput": 332.42, "total_tokens": 231424}
61
- {"current_steps": 295, "total_steps": 376, "loss": 0.3724, "lr": 2.256660975057867e-06, "epoch": 3.1395973154362418, "percentage": 78.46, "elapsed_time": "0:11:46", "remaining_time": "0:03:13", "throughput": 333.25, "total_tokens": 235456}
62
- {"current_steps": 300, "total_steps": 376, "loss": 0.3916, "lr": 1.9991418491859383e-06, "epoch": 3.193288590604027, "percentage": 79.79, "elapsed_time": "0:11:57", "remaining_time": "0:03:01", "throughput": 333.86, "total_tokens": 239424}
63
- {"current_steps": 300, "total_steps": 376, "eval_loss": 3.5844736099243164, "epoch": 3.193288590604027, "percentage": 79.79, "elapsed_time": "0:12:09", "remaining_time": "0:03:04", "throughput": 328.08, "total_tokens": 239424}
64
- {"current_steps": 305, "total_steps": 376, "loss": 0.3676, "lr": 1.7555843965823992e-06, "epoch": 3.246979865771812, "percentage": 81.12, "elapsed_time": "0:12:23", "remaining_time": "0:02:53", "throughput": 327.35, "total_tokens": 243488}
65
- {"current_steps": 310, "total_steps": 376, "loss": 0.3798, "lr": 1.5264136303534893e-06, "epoch": 3.3006711409395972, "percentage": 82.45, "elapsed_time": "0:12:35", "remaining_time": "0:02:40", "throughput": 327.69, "total_tokens": 247456}
66
- {"current_steps": 315, "total_steps": 376, "loss": 0.3606, "lr": 1.3120294585216353e-06, "epoch": 3.3543624161073824, "percentage": 83.78, "elapsed_time": "0:12:45", "remaining_time": "0:02:28", "throughput": 328.4, "total_tokens": 251456}
67
- {"current_steps": 320, "total_steps": 376, "loss": 0.3938, "lr": 1.11280598617714e-06, "epoch": 3.4080536912751676, "percentage": 85.11, "elapsed_time": "0:12:56", "remaining_time": "0:02:15", "throughput": 329.0, "total_tokens": 255504}
68
- {"current_steps": 325, "total_steps": 376, "loss": 0.3605, "lr": 9.290908626565931e-07, "epoch": 3.461744966442953, "percentage": 86.44, "elapsed_time": "0:13:07", "remaining_time": "0:02:03", "throughput": 329.68, "total_tokens": 259472}
69
- {"current_steps": 330, "total_steps": 376, "loss": 0.3781, "lr": 7.612046748871327e-07, "epoch": 3.5154362416107383, "percentage": 87.77, "elapsed_time": "0:13:17", "remaining_time": "0:01:51", "throughput": 330.66, "total_tokens": 263536}
70
- {"current_steps": 335, "total_steps": 376, "loss": 0.3794, "lr": 6.094403879552213e-07, "epoch": 3.5691275167785235, "percentage": 89.1, "elapsed_time": "0:13:27", "remaining_time": "0:01:38", "throughput": 331.22, "total_tokens": 267584}
71
- {"current_steps": 340, "total_steps": 376, "loss": 0.3526, "lr": 4.740628338761255e-07, "epoch": 3.6228187919463086, "percentage": 90.43, "elapsed_time": "0:13:37", "remaining_time": "0:01:26", "throughput": 332.12, "total_tokens": 271664}
72
- {"current_steps": 345, "total_steps": 376, "loss": 0.3846, "lr": 3.553082494562354e-07, "epoch": 3.6765100671140942, "percentage": 91.76, "elapsed_time": "0:13:48", "remaining_time": "0:01:14", "throughput": 332.56, "total_tokens": 275664}
73
- {"current_steps": 350, "total_steps": 376, "loss": 0.3855, "lr": 2.533838640546438e-07, "epoch": 3.7302013422818794, "percentage": 93.09, "elapsed_time": "0:13:59", "remaining_time": "0:01:02", "throughput": 333.19, "total_tokens": 279648}
74
- {"current_steps": 355, "total_steps": 376, "loss": 0.3955, "lr": 1.6846753796336491e-07, "epoch": 3.7838926174496645, "percentage": 94.41, "elapsed_time": "0:14:09", "remaining_time": "0:00:50", "throughput": 333.69, "total_tokens": 283600}
75
- {"current_steps": 360, "total_steps": 376, "loss": 0.34, "lr": 1.0070745203721532e-07, "epoch": 3.8375838926174497, "percentage": 95.74, "elapsed_time": "0:14:20", "remaining_time": "0:00:38", "throughput": 334.24, "total_tokens": 287600}
76
- {"current_steps": 365, "total_steps": 376, "loss": 0.3584, "lr": 5.022184911495864e-08, "epoch": 3.891275167785235, "percentage": 97.07, "elapsed_time": "0:14:30", "remaining_time": "0:00:26", "throughput": 334.96, "total_tokens": 291712}
77
- {"current_steps": 370, "total_steps": 376, "loss": 0.3608, "lr": 1.7098827682970885e-08, "epoch": 3.94496644295302, "percentage": 98.4, "elapsed_time": "0:14:41", "remaining_time": "0:00:14", "throughput": 335.57, "total_tokens": 295856}
78
- {"current_steps": 375, "total_steps": 376, "loss": 0.3659, "lr": 1.3961881414292776e-09, "epoch": 3.998657718120805, "percentage": 99.73, "elapsed_time": "0:14:52", "remaining_time": "0:00:02", "throughput": 336.16, "total_tokens": 299904}
79
- {"current_steps": 376, "total_steps": 376, "epoch": 4.0, "percentage": 100.0, "elapsed_time": "0:14:55", "remaining_time": "0:00:00", "throughput": 334.85, "total_tokens": 300000}
 
1
+ {"current_steps": 5, "total_steps": 156, "loss": 1.8434, "lr": 0.0002995135962201315, "epoch": 0.1282051282051282, "percentage": 3.21, "elapsed_time": "0:00:10", "remaining_time": "0:05:12", "throughput": 4897.8, "total_tokens": 50752}
2
+ {"current_steps": 10, "total_steps": 156, "loss": 0.3062, "lr": 0.00029754298604207154, "epoch": 0.2564102564102564, "percentage": 6.41, "elapsed_time": "0:00:19", "remaining_time": "0:04:39", "throughput": 5303.61, "total_tokens": 101696}
3
+ {"current_steps": 15, "total_steps": 156, "loss": 0.2601, "lr": 0.0002940777167447058, "epoch": 0.38461538461538464, "percentage": 9.62, "elapsed_time": "0:00:29", "remaining_time": "0:04:41", "throughput": 5105.76, "total_tokens": 152704}
4
+ {"current_steps": 20, "total_steps": 156, "loss": 0.1008, "lr": 0.0002891528926491214, "epoch": 0.5128205128205128, "percentage": 12.82, "elapsed_time": "0:00:39", "remaining_time": "0:04:30", "throughput": 5106.52, "total_tokens": 203328}
5
+ {"current_steps": 25, "total_steps": 156, "loss": 0.1124, "lr": 0.0002828184038479814, "epoch": 0.6410256410256411, "percentage": 16.03, "elapsed_time": "0:00:49", "remaining_time": "0:04:19", "throughput": 5122.84, "total_tokens": 253888}
6
+ {"current_steps": 30, "total_steps": 156, "loss": 0.1089, "lr": 0.00027513842080242916, "epoch": 0.7692307692307693, "percentage": 19.23, "elapsed_time": "0:01:00", "remaining_time": "0:04:12", "throughput": 5066.92, "total_tokens": 304832}
7
+ {"current_steps": 35, "total_steps": 156, "loss": 0.0657, "lr": 0.00026619074427414814, "epoch": 0.8974358974358975, "percentage": 22.44, "elapsed_time": "0:01:10", "remaining_time": "0:04:03", "throughput": 5049.2, "total_tokens": 355456}
8
+ {"current_steps": 40, "total_steps": 156, "loss": 0.0572, "lr": 0.00025606601717798207, "epoch": 1.0256410256410255, "percentage": 25.64, "elapsed_time": "0:01:20", "remaining_time": "0:03:53", "throughput": 5029.28, "total_tokens": 405080}
9
+ {"current_steps": 45, "total_steps": 156, "loss": 0.0523, "lr": 0.0002448668063393066, "epoch": 1.1538461538461537, "percentage": 28.85, "elapsed_time": "0:01:30", "remaining_time": "0:03:43", "throughput": 5027.02, "total_tokens": 455832}
10
+ {"current_steps": 50, "total_steps": 156, "loss": 0.0544, "lr": 0.00023270656345825375, "epoch": 1.282051282051282, "percentage": 32.05, "elapsed_time": "0:01:41", "remaining_time": "0:03:34", "throughput": 5000.81, "total_tokens": 505816}
11
+ {"current_steps": 55, "total_steps": 156, "loss": 0.0593, "lr": 0.00021970847580656525, "epoch": 1.4102564102564101, "percentage": 35.26, "elapsed_time": "0:01:51", "remaining_time": "0:03:24", "throughput": 5000.91, "total_tokens": 556568}
12
+ {"current_steps": 60, "total_steps": 156, "loss": 0.0499, "lr": 0.00020600421829989314, "epoch": 1.5384615384615383, "percentage": 38.46, "elapsed_time": "0:02:03", "remaining_time": "0:03:17", "throughput": 4922.77, "total_tokens": 607000}
13
+ {"current_steps": 65, "total_steps": 156, "loss": 0.0512, "lr": 0.0001917326195874679, "epoch": 1.6666666666666665, "percentage": 41.67, "elapsed_time": "0:02:13", "remaining_time": "0:03:07", "throughput": 4915.89, "total_tokens": 657816}
14
+ {"current_steps": 70, "total_steps": 156, "loss": 0.0538, "lr": 0.00017703825567208587, "epoch": 1.7948717948717947, "percentage": 44.87, "elapsed_time": "0:02:25", "remaining_time": "0:02:59", "throughput": 4854.44, "total_tokens": 708696}
15
+ {"current_steps": 75, "total_steps": 156, "loss": 0.045, "lr": 0.0001620699853075089, "epoch": 1.9230769230769231, "percentage": 48.08, "elapsed_time": "0:02:38", "remaining_time": "0:02:51", "throughput": 4783.53, "total_tokens": 759320}
16
+ {"current_steps": 80, "total_steps": 156, "loss": 0.0476, "lr": 0.00014697944201018398, "epoch": 2.051282051282051, "percentage": 51.28, "elapsed_time": "0:02:48", "remaining_time": "0:02:40", "throughput": 4783.09, "total_tokens": 808152}
17
+ {"current_steps": 85, "total_steps": 156, "loss": 0.047, "lr": 0.00013191949796170156, "epoch": 2.1794871794871793, "percentage": 54.49, "elapsed_time": "0:03:02", "remaining_time": "0:02:32", "throughput": 4708.73, "total_tokens": 858392}
18
+ {"current_steps": 90, "total_steps": 156, "loss": 0.0442, "lr": 0.00011704271536316746, "epoch": 2.3076923076923075, "percentage": 57.69, "elapsed_time": "0:03:15", "remaining_time": "0:02:23", "throughput": 4641.33, "total_tokens": 908888}
19
+ {"current_steps": 95, "total_steps": 156, "loss": 0.0419, "lr": 0.00010249980092977915, "epoch": 2.435897435897436, "percentage": 60.9, "elapsed_time": "0:03:29", "remaining_time": "0:02:14", "throughput": 4587.28, "total_tokens": 959832}
20
+ {"current_steps": 100, "total_steps": 156, "loss": 0.0463, "lr": 8.84380791820865e-05, "epoch": 2.564102564102564, "percentage": 64.1, "elapsed_time": "0:03:42", "remaining_time": "0:02:04", "throughput": 4535.46, "total_tokens": 1009816}
21
+ {"current_steps": 100, "total_steps": 156, "eval_loss": 0.06037572771310806, "epoch": 2.564102564102564, "percentage": 64.1, "elapsed_time": "0:03:52", "remaining_time": "0:02:10", "throughput": 4349.11, "total_tokens": 1009816}
22
+ {"current_steps": 105, "total_steps": 156, "loss": 0.045, "lr": 7.500000000000002e-05, "epoch": 2.6923076923076925, "percentage": 67.31, "elapsed_time": "0:04:09", "remaining_time": "0:02:01", "throughput": 4253.16, "total_tokens": 1060632}
23
+ {"current_steps": 110, "total_steps": 156, "loss": 0.0433, "lr": 6.23216955585167e-05, "epoch": 2.8205128205128203, "percentage": 70.51, "elapsed_time": "0:04:22", "remaining_time": "0:01:49", "throughput": 4228.51, "total_tokens": 1111448}
24
+ {"current_steps": 115, "total_steps": 156, "loss": 0.0433, "lr": 5.0531601263880747e-05, "epoch": 2.948717948717949, "percentage": 73.72, "elapsed_time": "0:04:36", "remaining_time": "0:01:38", "throughput": 4204.93, "total_tokens": 1162456}
25
+ {"current_steps": 120, "total_steps": 156, "loss": 0.0439, "lr": 3.974915466055074e-05, "epoch": 3.076923076923077, "percentage": 76.92, "elapsed_time": "0:04:49", "remaining_time": "0:01:26", "throughput": 4183.35, "total_tokens": 1212248}
26
+ {"current_steps": 125, "total_steps": 156, "loss": 0.0427, "lr": 3.0083585489474855e-05, "epoch": 3.2051282051282053, "percentage": 80.13, "elapsed_time": "0:05:03", "remaining_time": "0:01:15", "throughput": 4162.23, "total_tokens": 1262936}
27
+ {"current_steps": 130, "total_steps": 156, "loss": 0.0424, "lr": 2.1632809154782886e-05, "epoch": 3.3333333333333335, "percentage": 83.33, "elapsed_time": "0:05:16", "remaining_time": "0:01:03", "throughput": 4146.32, "total_tokens": 1313624}
28
+ {"current_steps": 135, "total_steps": 156, "loss": 0.0437, "lr": 1.4482434808442684e-05, "epoch": 3.4615384615384617, "percentage": 86.54, "elapsed_time": "0:05:30", "remaining_time": "0:00:51", "throughput": 4129.22, "total_tokens": 1363928}
29
+ {"current_steps": 140, "total_steps": 156, "loss": 0.0411, "lr": 8.70489810131027e-06, "epoch": 3.58974358974359, "percentage": 89.74, "elapsed_time": "0:05:43", "remaining_time": "0:00:39", "throughput": 4113.01, "total_tokens": 1414680}
30
+ {"current_steps": 145, "total_steps": 156, "loss": 0.042, "lr": 4.358727386092198e-06, "epoch": 3.717948717948718, "percentage": 92.95, "elapsed_time": "0:05:57", "remaining_time": "0:00:27", "throughput": 4098.93, "total_tokens": 1465368}
31
+ {"current_steps": 150, "total_steps": 156, "loss": 0.0426, "lr": 1.4879508058253886e-06, "epoch": 3.8461538461538463, "percentage": 96.15, "elapsed_time": "0:06:11", "remaining_time": "0:00:14", "throughput": 4086.58, "total_tokens": 1516504}
32
+ {"current_steps": 155, "total_steps": 156, "loss": 0.041, "lr": 1.2165027426456198e-07, "epoch": 3.9743589743589745, "percentage": 99.36, "elapsed_time": "0:06:24", "remaining_time": "0:00:02", "throughput": 4072.05, "total_tokens": 1567256}
33
+ {"current_steps": 156, "total_steps": 156, "epoch": 4.0, "percentage": 100.0, "elapsed_time": "0:06:31", "remaining_time": "0:00:00", "throughput": 4026.38, "total_tokens": 1576472}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
trainer_state.json CHANGED
@@ -4,802 +4,344 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 4.0,
6
  "eval_steps": 100,
7
- "global_step": 376,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.053691275167785234,
14
- "grad_norm": 8.373551368713379,
15
- "learning_rate": 1.9994415637302545e-05,
16
- "loss": 1.1748,
17
- "num_input_tokens_seen": 4080,
18
  "step": 5,
19
- "train_runtime": 15.4659,
20
- "train_tokens_per_second": 263.806
21
  },
22
  {
23
- "epoch": 0.10738255033557047,
24
- "grad_norm": 6.174284934997559,
25
- "learning_rate": 1.9971739852847514e-05,
26
- "loss": 1.2634,
27
- "num_input_tokens_seen": 8080,
28
  "step": 10,
29
- "train_runtime": 26.529,
30
- "train_tokens_per_second": 304.572
31
  },
32
  {
33
- "epoch": 0.1610738255033557,
34
- "grad_norm": 8.544853210449219,
35
- "learning_rate": 1.9931663163249744e-05,
36
- "loss": 1.4367,
37
- "num_input_tokens_seen": 12160,
38
  "step": 15,
39
- "train_runtime": 38.5597,
40
- "train_tokens_per_second": 315.355
41
  },
42
  {
43
- "epoch": 0.21476510067114093,
44
- "grad_norm": 7.589258193969727,
45
- "learning_rate": 1.9874255503213154e-05,
46
- "loss": 1.3167,
47
- "num_input_tokens_seen": 16208,
48
  "step": 20,
49
- "train_runtime": 50.0852,
50
- "train_tokens_per_second": 323.609
51
  },
52
  {
53
- "epoch": 0.2684563758389262,
54
- "grad_norm": 8.169995307922363,
55
- "learning_rate": 1.979961705036587e-05,
56
- "loss": 1.4144,
57
- "num_input_tokens_seen": 20256,
58
  "step": 25,
59
- "train_runtime": 62.1983,
60
- "train_tokens_per_second": 325.668
61
  },
62
  {
63
- "epoch": 0.3221476510067114,
64
- "grad_norm": 8.032195091247559,
65
- "learning_rate": 1.9707878050448074e-05,
66
- "loss": 1.3944,
67
- "num_input_tokens_seen": 24304,
68
  "step": 30,
69
- "train_runtime": 73.2989,
70
- "train_tokens_per_second": 331.574
71
  },
72
  {
73
- "epoch": 0.37583892617449666,
74
- "grad_norm": 7.009624004364014,
75
- "learning_rate": 1.9599198590030308e-05,
76
- "loss": 1.3946,
77
- "num_input_tokens_seen": 28384,
78
  "step": 35,
79
- "train_runtime": 84.7962,
80
- "train_tokens_per_second": 334.732
81
  },
82
  {
83
- "epoch": 0.42953020134228187,
84
- "grad_norm": 7.077579498291016,
85
- "learning_rate": 1.947376831715892e-05,
86
- "loss": 1.3827,
87
- "num_input_tokens_seen": 32480,
88
  "step": 40,
89
- "train_runtime": 96.5334,
90
- "train_tokens_per_second": 336.464
91
  },
92
  {
93
- "epoch": 0.48322147651006714,
94
- "grad_norm": 6.77874755859375,
95
- "learning_rate": 1.9331806110416027e-05,
96
- "loss": 1.4509,
97
- "num_input_tokens_seen": 36608,
98
  "step": 45,
99
- "train_runtime": 108.7688,
100
- "train_tokens_per_second": 336.567
101
  },
102
  {
103
- "epoch": 0.5369127516778524,
104
- "grad_norm": 6.446975231170654,
105
- "learning_rate": 1.9173559696971594e-05,
106
- "loss": 1.3347,
107
- "num_input_tokens_seen": 40608,
108
  "step": 50,
109
- "train_runtime": 121.0098,
110
- "train_tokens_per_second": 335.576
111
  },
112
  {
113
- "epoch": 0.5906040268456376,
114
- "grad_norm": 7.034291744232178,
115
- "learning_rate": 1.899930522029408e-05,
116
- "loss": 1.3834,
117
- "num_input_tokens_seen": 44672,
118
  "step": 55,
119
- "train_runtime": 133.2088,
120
- "train_tokens_per_second": 335.353
121
  },
122
  {
123
- "epoch": 0.6442953020134228,
124
- "grad_norm": 6.512075424194336,
125
- "learning_rate": 1.8809346758274014e-05,
126
- "loss": 1.4679,
127
- "num_input_tokens_seen": 48704,
128
  "step": 60,
129
- "train_runtime": 145.1301,
130
- "train_tokens_per_second": 335.589
131
  },
132
  {
133
- "epoch": 0.697986577181208,
134
- "grad_norm": 5.968503475189209,
135
- "learning_rate": 1.8604015792601395e-05,
136
- "loss": 1.3973,
137
- "num_input_tokens_seen": 52832,
138
  "step": 65,
139
- "train_runtime": 156.9248,
140
- "train_tokens_per_second": 336.671
141
  },
142
  {
143
- "epoch": 0.7516778523489933,
144
- "grad_norm": 6.824077129364014,
145
- "learning_rate": 1.8383670630322864e-05,
146
- "loss": 1.5408,
147
- "num_input_tokens_seen": 56672,
148
  "step": 70,
149
- "train_runtime": 169.159,
150
- "train_tokens_per_second": 335.022
151
  },
152
  {
153
- "epoch": 0.8053691275167785,
154
- "grad_norm": 6.580747127532959,
155
- "learning_rate": 1.8148695778588034e-05,
156
- "loss": 1.4967,
157
- "num_input_tokens_seen": 60608,
158
  "step": 75,
159
- "train_runtime": 180.627,
160
- "train_tokens_per_second": 335.542
161
  },
162
  {
163
- "epoch": 0.8590604026845637,
164
- "grad_norm": 6.616082191467285,
165
- "learning_rate": 1.789950127367606e-05,
166
- "loss": 1.4644,
167
- "num_input_tokens_seen": 64640,
168
  "step": 80,
169
- "train_runtime": 192.2131,
170
- "train_tokens_per_second": 336.293
171
  },
172
  {
173
- "epoch": 0.912751677852349,
174
- "grad_norm": 6.736894607543945,
175
- "learning_rate": 1.7636521965473324e-05,
176
- "loss": 1.4226,
177
- "num_input_tokens_seen": 68560,
178
  "step": 85,
179
- "train_runtime": 204.0434,
180
- "train_tokens_per_second": 336.007
181
  },
182
  {
183
- "epoch": 0.9664429530201343,
184
- "grad_norm": 6.567346572875977,
185
- "learning_rate": 1.7360216758650826e-05,
186
- "loss": 1.4847,
187
- "num_input_tokens_seen": 72512,
188
  "step": 90,
189
- "train_runtime": 215.6473,
190
- "train_tokens_per_second": 336.253
191
  },
192
  {
193
- "epoch": 1.010738255033557,
194
- "grad_norm": 4.928441047668457,
195
- "learning_rate": 1.7071067811865477e-05,
196
- "loss": 1.2927,
197
- "num_input_tokens_seen": 75808,
198
  "step": 95,
199
- "train_runtime": 225.0164,
200
- "train_tokens_per_second": 336.9
201
  },
202
  {
203
- "epoch": 1.0644295302013422,
204
- "grad_norm": 5.473133087158203,
205
- "learning_rate": 1.67695796963826e-05,
206
- "loss": 0.8505,
207
- "num_input_tokens_seen": 79840,
208
  "step": 100,
209
- "train_runtime": 235.8823,
210
- "train_tokens_per_second": 338.474
211
  },
212
  {
213
- "epoch": 1.0644295302013422,
214
- "eval_loss": 2.885671615600586,
215
- "eval_runtime": 13.1654,
216
- "eval_samples_per_second": 28.332,
217
- "eval_steps_per_second": 14.204,
218
- "num_input_tokens_seen": 79840,
219
  "step": 100
220
  },
221
  {
222
- "epoch": 1.1181208053691276,
223
- "grad_norm": 7.370046138763428,
224
- "learning_rate": 1.6456278515588023e-05,
225
- "loss": 0.7497,
226
- "num_input_tokens_seen": 83840,
227
  "step": 105,
228
- "train_runtime": 263.568,
229
- "train_tokens_per_second": 318.096
230
  },
231
  {
232
- "epoch": 1.1718120805369128,
233
- "grad_norm": 6.678887844085693,
234
- "learning_rate": 1.613171098692611e-05,
235
- "loss": 0.7761,
236
- "num_input_tokens_seen": 87872,
237
  "step": 110,
238
- "train_runtime": 274.7842,
239
- "train_tokens_per_second": 319.786
240
  },
241
  {
242
- "epoch": 1.225503355704698,
243
- "grad_norm": 5.9977707862854,
244
- "learning_rate": 1.5796443487865774e-05,
245
- "loss": 0.7997,
246
- "num_input_tokens_seen": 91888,
247
  "step": 115,
248
- "train_runtime": 286.1784,
249
- "train_tokens_per_second": 321.086
250
  },
251
  {
252
- "epoch": 1.279194630872483,
253
- "grad_norm": 6.782074451446533,
254
- "learning_rate": 1.54510610675594e-05,
255
- "loss": 0.8275,
256
- "num_input_tokens_seen": 95888,
257
  "step": 120,
258
- "train_runtime": 297.7215,
259
- "train_tokens_per_second": 322.073
260
  },
261
  {
262
- "epoch": 1.3328859060402685,
263
- "grad_norm": 6.483254909515381,
264
- "learning_rate": 1.5096166425919176e-05,
265
- "loss": 0.8599,
266
- "num_input_tokens_seen": 99888,
267
  "step": 125,
268
- "train_runtime": 309.418,
269
- "train_tokens_per_second": 322.825
270
  },
271
  {
272
- "epoch": 1.3865771812080536,
273
- "grad_norm": 6.478480815887451,
274
- "learning_rate": 1.4732378861892524e-05,
275
- "loss": 0.9179,
276
- "num_input_tokens_seen": 103840,
277
  "step": 130,
278
- "train_runtime": 321.0761,
279
- "train_tokens_per_second": 323.412
280
  },
281
  {
282
- "epoch": 1.440268456375839,
283
- "grad_norm": 5.326768398284912,
284
- "learning_rate": 1.436033319277183e-05,
285
- "loss": 0.8255,
286
- "num_input_tokens_seen": 107824,
287
  "step": 135,
288
- "train_runtime": 332.5409,
289
- "train_tokens_per_second": 324.243
290
  },
291
  {
292
- "epoch": 1.4939597315436242,
293
- "grad_norm": 5.588732719421387,
294
- "learning_rate": 1.3980678646424308e-05,
295
- "loss": 0.8806,
296
- "num_input_tokens_seen": 111936,
297
  "step": 140,
298
- "train_runtime": 343.9421,
299
- "train_tokens_per_second": 325.45
300
  },
301
  {
302
- "epoch": 1.5476510067114093,
303
- "grad_norm": 4.919476509094238,
304
- "learning_rate": 1.3594077728375129e-05,
305
- "loss": 0.8366,
306
- "num_input_tokens_seen": 115952,
307
  "step": 145,
308
- "train_runtime": 355.5203,
309
- "train_tokens_per_second": 326.147
310
  },
311
  {
312
- "epoch": 1.6013422818791945,
313
- "grad_norm": 5.641396522521973,
314
- "learning_rate": 1.3201205065720699e-05,
315
- "loss": 0.8567,
316
- "num_input_tokens_seen": 119984,
317
  "step": 150,
318
- "train_runtime": 366.6895,
319
- "train_tokens_per_second": 327.209
320
  },
321
  {
322
- "epoch": 1.6550335570469799,
323
- "grad_norm": 6.353975296020508,
324
- "learning_rate": 1.2802746229889563e-05,
325
- "loss": 0.8939,
326
- "num_input_tokens_seen": 124032,
327
  "step": 155,
328
- "train_runtime": 377.8594,
329
- "train_tokens_per_second": 328.249
330
- },
331
- {
332
- "epoch": 1.7087248322147652,
333
- "grad_norm": 6.592543125152588,
334
- "learning_rate": 1.2399396540305205e-05,
335
- "loss": 0.965,
336
- "num_input_tokens_seen": 128000,
337
- "step": 160,
338
- "train_runtime": 388.7376,
339
- "train_tokens_per_second": 329.271
340
- },
341
- {
342
- "epoch": 1.7624161073825504,
343
- "grad_norm": 6.394205570220947,
344
- "learning_rate": 1.1991859851038362e-05,
345
- "loss": 0.8915,
346
- "num_input_tokens_seen": 132096,
347
- "step": 165,
348
- "train_runtime": 399.8008,
349
- "train_tokens_per_second": 330.405
350
- },
351
- {
352
- "epoch": 1.8161073825503355,
353
- "grad_norm": 6.431347846984863,
354
- "learning_rate": 1.1580847322566224e-05,
355
- "loss": 0.9085,
356
- "num_input_tokens_seen": 136208,
357
- "step": 170,
358
- "train_runtime": 410.7015,
359
- "train_tokens_per_second": 331.647
360
- },
361
- {
362
- "epoch": 1.8697986577181207,
363
- "grad_norm": 6.055410385131836,
364
- "learning_rate": 1.1167076180781764e-05,
365
- "loss": 0.8831,
366
- "num_input_tokens_seen": 140320,
367
- "step": 175,
368
- "train_runtime": 421.922,
369
- "train_tokens_per_second": 332.573
370
- },
371
- {
372
- "epoch": 1.923489932885906,
373
- "grad_norm": 6.936720848083496,
374
- "learning_rate": 1.0751268465418784e-05,
375
- "loss": 1.0271,
376
- "num_input_tokens_seen": 144176,
377
- "step": 180,
378
- "train_runtime": 433.5402,
379
- "train_tokens_per_second": 332.555
380
- },
381
- {
382
- "epoch": 1.9771812080536912,
383
- "grad_norm": 6.763705253601074,
384
- "learning_rate": 1.0334149770076747e-05,
385
- "loss": 0.9636,
386
- "num_input_tokens_seen": 148256,
387
- "step": 185,
388
- "train_runtime": 444.8309,
389
- "train_tokens_per_second": 333.286
390
- },
391
- {
392
- "epoch": 2.021476510067114,
393
- "grad_norm": 3.6498420238494873,
394
- "learning_rate": 9.916447976043972e-06,
395
- "loss": 0.7085,
396
- "num_input_tokens_seen": 151488,
397
- "step": 190,
398
- "train_runtime": 454.5144,
399
- "train_tokens_per_second": 333.296
400
- },
401
- {
402
- "epoch": 2.0751677852348993,
403
- "grad_norm": 4.591700553894043,
404
- "learning_rate": 9.498891982128809e-06,
405
- "loss": 0.5186,
406
- "num_input_tokens_seen": 155472,
407
- "step": 195,
408
- "train_runtime": 465.7186,
409
- "train_tokens_per_second": 333.832
410
- },
411
- {
412
- "epoch": 2.1288590604026845,
413
- "grad_norm": 5.094573497772217,
414
- "learning_rate": 9.082210432715197e-06,
415
- "loss": 0.5023,
416
- "num_input_tokens_seen": 159504,
417
- "step": 200,
418
- "train_runtime": 476.9786,
419
- "train_tokens_per_second": 334.405
420
- },
421
- {
422
- "epoch": 2.1288590604026845,
423
- "eval_loss": 3.4170751571655273,
424
- "eval_runtime": 13.8611,
425
- "eval_samples_per_second": 26.91,
426
- "eval_steps_per_second": 13.491,
427
- "num_input_tokens_seen": 159504,
428
- "step": 200
429
- },
430
- {
431
- "epoch": 2.1825503355704696,
432
- "grad_norm": 5.766697406768799,
433
- "learning_rate": 8.667130446262214e-06,
434
- "loss": 0.5145,
435
- "num_input_tokens_seen": 163472,
436
- "step": 205,
437
- "train_runtime": 505.879,
438
- "train_tokens_per_second": 323.144
439
- },
440
- {
441
- "epoch": 2.2362416107382552,
442
- "grad_norm": 4.821939945220947,
443
- "learning_rate": 8.25437634646637e-06,
444
- "loss": 0.4971,
445
- "num_input_tokens_seen": 167584,
446
- "step": 210,
447
- "train_runtime": 517.6135,
448
- "train_tokens_per_second": 323.763
449
- },
450
- {
451
- "epoch": 2.2899328859060404,
452
- "grad_norm": 5.04258394241333,
453
- "learning_rate": 7.844668398300866e-06,
454
- "loss": 0.5042,
455
- "num_input_tokens_seen": 171712,
456
- "step": 215,
457
- "train_runtime": 528.5686,
458
- "train_tokens_per_second": 324.862
459
- },
460
- {
461
- "epoch": 2.3436241610738255,
462
- "grad_norm": 3.9442596435546875,
463
- "learning_rate": 7.438721551137367e-06,
464
- "loss": 0.5423,
465
- "num_input_tokens_seen": 175696,
466
- "step": 220,
467
- "train_runtime": 539.7243,
468
- "train_tokens_per_second": 325.529
469
- },
470
- {
471
- "epoch": 2.3973154362416107,
472
- "grad_norm": 4.744536399841309,
473
- "learning_rate": 7.037244191143662e-06,
474
- "loss": 0.5913,
475
- "num_input_tokens_seen": 179728,
476
- "step": 225,
477
- "train_runtime": 551.4106,
478
- "train_tokens_per_second": 325.942
479
- },
480
- {
481
- "epoch": 2.451006711409396,
482
- "grad_norm": 3.8527259826660156,
483
- "learning_rate": 6.640936905134212e-06,
484
- "loss": 0.5403,
485
- "num_input_tokens_seen": 183648,
486
- "step": 230,
487
- "train_runtime": 563.1128,
488
- "train_tokens_per_second": 326.13
489
- },
490
- {
491
- "epoch": 2.504697986577181,
492
- "grad_norm": 5.313571453094482,
493
- "learning_rate": 6.2504912580307905e-06,
494
- "loss": 0.5301,
495
- "num_input_tokens_seen": 187664,
496
- "step": 235,
497
- "train_runtime": 574.2745,
498
- "train_tokens_per_second": 326.784
499
- },
500
- {
501
- "epoch": 2.558389261744966,
502
- "grad_norm": 4.798688888549805,
503
- "learning_rate": 5.866588586066481e-06,
504
- "loss": 0.5417,
505
- "num_input_tokens_seen": 191776,
506
- "step": 240,
507
- "train_runtime": 585.8109,
508
- "train_tokens_per_second": 327.368
509
- },
510
- {
511
- "epoch": 2.6120805369127518,
512
- "grad_norm": 4.535137176513672,
513
- "learning_rate": 5.48989880783898e-06,
514
- "loss": 0.5358,
515
- "num_input_tokens_seen": 195872,
516
- "step": 245,
517
- "train_runtime": 597.5628,
518
- "train_tokens_per_second": 327.785
519
- },
520
- {
521
- "epoch": 2.665771812080537,
522
- "grad_norm": 4.101404666900635,
523
- "learning_rate": 5.121079255287953e-06,
524
- "loss": 0.5249,
525
- "num_input_tokens_seen": 199808,
526
- "step": 250,
527
- "train_runtime": 608.9706,
528
- "train_tokens_per_second": 328.108
529
- },
530
- {
531
- "epoch": 2.719463087248322,
532
- "grad_norm": 4.927453994750977,
533
- "learning_rate": 4.760773526636315e-06,
534
- "loss": 0.5134,
535
- "num_input_tokens_seen": 203888,
536
- "step": 255,
537
- "train_runtime": 620.4402,
538
- "train_tokens_per_second": 328.618
539
- },
540
- {
541
- "epoch": 2.7731543624161072,
542
- "grad_norm": 4.203514575958252,
543
- "learning_rate": 4.409610363297211e-06,
544
- "loss": 0.516,
545
- "num_input_tokens_seen": 207840,
546
- "step": 260,
547
- "train_runtime": 632.2498,
548
- "train_tokens_per_second": 328.731
549
- },
550
- {
551
- "epoch": 2.826845637583893,
552
- "grad_norm": 3.7994656562805176,
553
- "learning_rate": 4.0682025527064486e-06,
554
- "loss": 0.5611,
555
- "num_input_tokens_seen": 211856,
556
- "step": 265,
557
- "train_runtime": 643.9121,
558
- "train_tokens_per_second": 329.014
559
- },
560
- {
561
- "epoch": 2.880536912751678,
562
- "grad_norm": 4.413597583770752,
563
- "learning_rate": 3.7371458589949337e-06,
564
- "loss": 0.5084,
565
- "num_input_tokens_seen": 215904,
566
- "step": 270,
567
- "train_runtime": 654.5637,
568
- "train_tokens_per_second": 329.844
569
- },
570
- {
571
- "epoch": 2.934228187919463,
572
- "grad_norm": 4.6744384765625,
573
- "learning_rate": 3.4170179833671847e-06,
574
- "loss": 0.5458,
575
- "num_input_tokens_seen": 219856,
576
- "step": 275,
577
- "train_runtime": 665.522,
578
- "train_tokens_per_second": 330.351
579
- },
580
- {
581
- "epoch": 2.9879194630872483,
582
- "grad_norm": 4.479729652404785,
583
- "learning_rate": 3.1083775560000373e-06,
584
- "loss": 0.5205,
585
- "num_input_tokens_seen": 223984,
586
- "step": 280,
587
- "train_runtime": 675.8425,
588
- "train_tokens_per_second": 331.414
589
- },
590
- {
591
- "epoch": 3.032214765100671,
592
- "grad_norm": 2.398268938064575,
593
- "learning_rate": 2.8117631612207084e-06,
594
- "loss": 0.4058,
595
- "num_input_tokens_seen": 227440,
596
- "step": 285,
597
- "train_runtime": 684.7066,
598
- "train_tokens_per_second": 332.171
599
- },
600
- {
601
- "epoch": 3.085906040268456,
602
- "grad_norm": 2.7196109294891357,
603
- "learning_rate": 2.527692397665311e-06,
604
- "loss": 0.3859,
605
- "num_input_tokens_seen": 231424,
606
- "step": 290,
607
- "train_runtime": 696.1897,
608
- "train_tokens_per_second": 332.415
609
- },
610
- {
611
- "epoch": 3.1395973154362418,
612
- "grad_norm": 2.9645609855651855,
613
- "learning_rate": 2.256660975057867e-06,
614
- "loss": 0.3724,
615
- "num_input_tokens_seen": 235456,
616
- "step": 295,
617
- "train_runtime": 706.5398,
618
- "train_tokens_per_second": 333.252
619
- },
620
- {
621
- "epoch": 3.193288590604027,
622
- "grad_norm": 3.25144624710083,
623
- "learning_rate": 1.9991418491859383e-06,
624
- "loss": 0.3916,
625
- "num_input_tokens_seen": 239424,
626
- "step": 300,
627
- "train_runtime": 717.153,
628
- "train_tokens_per_second": 333.853
629
- },
630
- {
631
- "epoch": 3.193288590604027,
632
- "eval_loss": 3.5844736099243164,
633
- "eval_runtime": 12.6192,
634
- "eval_samples_per_second": 29.558,
635
- "eval_steps_per_second": 14.819,
636
- "num_input_tokens_seen": 239424,
637
- "step": 300
638
- },
639
- {
640
- "epoch": 3.246979865771812,
641
- "grad_norm": 3.184697151184082,
642
- "learning_rate": 1.7555843965823992e-06,
643
- "loss": 0.3676,
644
- "num_input_tokens_seen": 243488,
645
- "step": 305,
646
- "train_runtime": 743.816,
647
- "train_tokens_per_second": 327.35
648
- },
649
- {
650
- "epoch": 3.3006711409395972,
651
- "grad_norm": 3.306453227996826,
652
- "learning_rate": 1.5264136303534893e-06,
653
- "loss": 0.3798,
654
- "num_input_tokens_seen": 247456,
655
- "step": 310,
656
- "train_runtime": 755.1587,
657
- "train_tokens_per_second": 327.687
658
- },
659
- {
660
- "epoch": 3.3543624161073824,
661
- "grad_norm": 3.3960649967193604,
662
- "learning_rate": 1.3120294585216353e-06,
663
- "loss": 0.3606,
664
- "num_input_tokens_seen": 251456,
665
- "step": 315,
666
- "train_runtime": 765.7156,
667
- "train_tokens_per_second": 328.393
668
- },
669
- {
670
- "epoch": 3.4080536912751676,
671
- "grad_norm": 3.560805320739746,
672
- "learning_rate": 1.11280598617714e-06,
673
- "loss": 0.3938,
674
- "num_input_tokens_seen": 255504,
675
- "step": 320,
676
- "train_runtime": 776.6023,
677
- "train_tokens_per_second": 329.002
678
- },
679
- {
680
- "epoch": 3.461744966442953,
681
- "grad_norm": 3.646918773651123,
682
- "learning_rate": 9.290908626565931e-07,
683
- "loss": 0.3605,
684
- "num_input_tokens_seen": 259472,
685
- "step": 325,
686
- "train_runtime": 787.0463,
687
- "train_tokens_per_second": 329.678
688
- },
689
- {
690
- "epoch": 3.5154362416107383,
691
- "grad_norm": 2.9772727489471436,
692
- "learning_rate": 7.612046748871327e-07,
693
- "loss": 0.3781,
694
- "num_input_tokens_seen": 263536,
695
- "step": 330,
696
- "train_runtime": 797.017,
697
- "train_tokens_per_second": 330.653
698
- },
699
- {
700
- "epoch": 3.5691275167785235,
701
- "grad_norm": 2.8137423992156982,
702
- "learning_rate": 6.094403879552213e-07,
703
- "loss": 0.3794,
704
- "num_input_tokens_seen": 267584,
705
- "step": 335,
706
- "train_runtime": 807.8723,
707
- "train_tokens_per_second": 331.221
708
- },
709
- {
710
- "epoch": 3.6228187919463086,
711
- "grad_norm": 2.8694584369659424,
712
- "learning_rate": 4.740628338761255e-07,
713
- "loss": 0.3526,
714
- "num_input_tokens_seen": 271664,
715
- "step": 340,
716
- "train_runtime": 817.9687,
717
- "train_tokens_per_second": 332.12
718
- },
719
- {
720
- "epoch": 3.6765100671140942,
721
- "grad_norm": 3.0649948120117188,
722
- "learning_rate": 3.553082494562354e-07,
723
- "loss": 0.3846,
724
- "num_input_tokens_seen": 275664,
725
- "step": 345,
726
- "train_runtime": 828.9328,
727
- "train_tokens_per_second": 332.553
728
- },
729
- {
730
- "epoch": 3.7302013422818794,
731
- "grad_norm": 4.392162322998047,
732
- "learning_rate": 2.533838640546438e-07,
733
- "loss": 0.3855,
734
- "num_input_tokens_seen": 279648,
735
- "step": 350,
736
- "train_runtime": 839.3162,
737
- "train_tokens_per_second": 333.186
738
- },
739
- {
740
- "epoch": 3.7838926174496645,
741
- "grad_norm": 3.539325475692749,
742
- "learning_rate": 1.6846753796336491e-07,
743
- "loss": 0.3955,
744
- "num_input_tokens_seen": 283600,
745
- "step": 355,
746
- "train_runtime": 849.891,
747
- "train_tokens_per_second": 333.69
748
- },
749
- {
750
- "epoch": 3.8375838926174497,
751
- "grad_norm": 3.1544559001922607,
752
- "learning_rate": 1.0070745203721532e-07,
753
- "loss": 0.34,
754
- "num_input_tokens_seen": 287600,
755
- "step": 360,
756
- "train_runtime": 860.4738,
757
- "train_tokens_per_second": 334.234
758
- },
759
- {
760
- "epoch": 3.891275167785235,
761
- "grad_norm": 3.2322452068328857,
762
- "learning_rate": 5.022184911495864e-08,
763
- "loss": 0.3584,
764
- "num_input_tokens_seen": 291712,
765
- "step": 365,
766
- "train_runtime": 870.8827,
767
- "train_tokens_per_second": 334.961
768
- },
769
- {
770
- "epoch": 3.94496644295302,
771
- "grad_norm": 3.279101610183716,
772
- "learning_rate": 1.7098827682970885e-08,
773
- "loss": 0.3608,
774
- "num_input_tokens_seen": 295856,
775
- "step": 370,
776
- "train_runtime": 881.666,
777
- "train_tokens_per_second": 335.565
778
- },
779
- {
780
- "epoch": 3.998657718120805,
781
- "grad_norm": 3.604062080383301,
782
- "learning_rate": 1.3961881414292776e-09,
783
- "loss": 0.3659,
784
- "num_input_tokens_seen": 299904,
785
- "step": 375,
786
- "train_runtime": 892.1532,
787
- "train_tokens_per_second": 336.158
788
  },
789
  {
790
  "epoch": 4.0,
791
- "num_input_tokens_seen": 300000,
792
- "step": 376,
793
- "total_flos": 1780328448000000.0,
794
- "train_loss": 0.7932832451101314,
795
- "train_runtime": 895.9422,
796
- "train_samples_per_second": 6.652,
797
- "train_steps_per_second": 0.42
798
  }
799
  ],
800
  "logging_steps": 5,
801
- "max_steps": 376,
802
- "num_input_tokens_seen": 300000,
803
  "num_train_epochs": 4,
804
  "save_steps": 100,
805
  "stateful_callbacks": {
@@ -814,8 +356,8 @@
814
  "attributes": {}
815
  }
816
  },
817
- "total_flos": 1780328448000000.0,
818
- "train_batch_size": 2,
819
  "trial_name": null,
820
  "trial_params": null
821
  }
 
4
  "best_model_checkpoint": null,
5
  "epoch": 4.0,
6
  "eval_steps": 100,
7
+ "global_step": 156,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.1282051282051282,
14
+ "grad_norm": 5.522233963012695,
15
+ "learning_rate": 0.0002995135962201315,
16
+ "loss": 1.8434,
17
+ "num_input_tokens_seen": 50752,
18
  "step": 5,
19
+ "train_runtime": 10.3647,
20
+ "train_tokens_per_second": 4896.615
21
  },
22
  {
23
+ "epoch": 0.2564102564102564,
24
+ "grad_norm": 0.8573942184448242,
25
+ "learning_rate": 0.00029754298604207154,
26
+ "loss": 0.3062,
27
+ "num_input_tokens_seen": 101696,
28
  "step": 10,
29
+ "train_runtime": 19.1774,
30
+ "train_tokens_per_second": 5302.914
31
  },
32
  {
33
+ "epoch": 0.38461538461538464,
34
+ "grad_norm": 3.1560184955596924,
35
+ "learning_rate": 0.0002940777167447058,
36
+ "loss": 0.2601,
37
+ "num_input_tokens_seen": 152704,
38
  "step": 15,
39
+ "train_runtime": 29.9107,
40
+ "train_tokens_per_second": 5105.336
41
  },
42
  {
43
+ "epoch": 0.5128205128205128,
44
+ "grad_norm": 0.6165825128555298,
45
+ "learning_rate": 0.0002891528926491214,
46
+ "loss": 0.1008,
47
+ "num_input_tokens_seen": 203328,
48
  "step": 20,
49
+ "train_runtime": 39.8198,
50
+ "train_tokens_per_second": 5106.2
51
  },
52
  {
53
+ "epoch": 0.6410256410256411,
54
+ "grad_norm": 3.741000175476074,
55
+ "learning_rate": 0.0002828184038479814,
56
+ "loss": 0.1124,
57
+ "num_input_tokens_seen": 253888,
58
  "step": 25,
59
+ "train_runtime": 49.5625,
60
+ "train_tokens_per_second": 5122.579
61
  },
62
  {
63
+ "epoch": 0.7692307692307693,
64
+ "grad_norm": 1.1849110126495361,
65
+ "learning_rate": 0.00027513842080242916,
66
+ "loss": 0.1089,
67
+ "num_input_tokens_seen": 304832,
68
  "step": 30,
69
+ "train_runtime": 60.1637,
70
+ "train_tokens_per_second": 5066.713
71
  },
72
  {
73
+ "epoch": 0.8974358974358975,
74
+ "grad_norm": 0.8759293556213379,
75
+ "learning_rate": 0.00026619074427414814,
76
+ "loss": 0.0657,
77
+ "num_input_tokens_seen": 355456,
78
  "step": 35,
79
+ "train_runtime": 70.401,
80
+ "train_tokens_per_second": 5049.019
81
  },
82
  {
83
+ "epoch": 1.0256410256410255,
84
+ "grad_norm": 0.20007072389125824,
85
+ "learning_rate": 0.00025606601717798207,
86
+ "loss": 0.0572,
87
+ "num_input_tokens_seen": 405080,
88
  "step": 40,
89
+ "train_runtime": 80.5468,
90
+ "train_tokens_per_second": 5029.128
91
  },
92
  {
93
+ "epoch": 1.1538461538461537,
94
+ "grad_norm": 0.25018996000289917,
95
+ "learning_rate": 0.0002448668063393066,
96
+ "loss": 0.0523,
97
+ "num_input_tokens_seen": 455832,
98
  "step": 45,
99
+ "train_runtime": 90.6788,
100
+ "train_tokens_per_second": 5026.885
101
  },
102
  {
103
+ "epoch": 1.282051282051282,
104
+ "grad_norm": 0.13906894624233246,
105
+ "learning_rate": 0.00023270656345825375,
106
+ "loss": 0.0544,
107
+ "num_input_tokens_seen": 505816,
108
  "step": 50,
109
+ "train_runtime": 101.1492,
110
+ "train_tokens_per_second": 5000.69
111
  },
112
  {
113
+ "epoch": 1.4102564102564101,
114
+ "grad_norm": 0.2015986293554306,
115
+ "learning_rate": 0.00021970847580656525,
116
+ "loss": 0.0593,
117
+ "num_input_tokens_seen": 556568,
118
  "step": 55,
119
+ "train_runtime": 111.2958,
120
+ "train_tokens_per_second": 5000.798
121
  },
122
  {
123
+ "epoch": 1.5384615384615383,
124
+ "grad_norm": 0.14813509583473206,
125
+ "learning_rate": 0.00020600421829989314,
126
+ "loss": 0.0499,
127
+ "num_input_tokens_seen": 607000,
128
  "step": 60,
129
+ "train_runtime": 123.307,
130
+ "train_tokens_per_second": 4922.672
131
  },
132
  {
133
+ "epoch": 1.6666666666666665,
134
+ "grad_norm": 0.2038683444261551,
135
+ "learning_rate": 0.0001917326195874679,
136
+ "loss": 0.0512,
137
+ "num_input_tokens_seen": 657816,
138
  "step": 65,
139
+ "train_runtime": 133.8167,
140
+ "train_tokens_per_second": 4915.797
141
  },
142
  {
143
+ "epoch": 1.7948717948717947,
144
+ "grad_norm": 0.13923539221286774,
145
+ "learning_rate": 0.00017703825567208587,
146
+ "loss": 0.0538,
147
+ "num_input_tokens_seen": 708696,
148
  "step": 70,
149
+ "train_runtime": 145.9917,
150
+ "train_tokens_per_second": 4854.357
151
  },
152
  {
153
+ "epoch": 1.9230769230769231,
154
+ "grad_norm": 0.1887049674987793,
155
+ "learning_rate": 0.0001620699853075089,
156
+ "loss": 0.045,
157
+ "num_input_tokens_seen": 759320,
158
  "step": 75,
159
+ "train_runtime": 158.739,
160
+ "train_tokens_per_second": 4783.45
161
  },
162
  {
163
+ "epoch": 2.051282051282051,
164
+ "grad_norm": 0.12282092869281769,
165
+ "learning_rate": 0.00014697944201018398,
166
+ "loss": 0.0476,
167
+ "num_input_tokens_seen": 808152,
168
  "step": 80,
169
+ "train_runtime": 168.9625,
170
+ "train_tokens_per_second": 4783.025
171
  },
172
  {
173
+ "epoch": 2.1794871794871793,
174
+ "grad_norm": 0.12478133291006088,
175
+ "learning_rate": 0.00013191949796170156,
176
+ "loss": 0.047,
177
+ "num_input_tokens_seen": 858392,
178
  "step": 85,
179
+ "train_runtime": 182.3006,
180
+ "train_tokens_per_second": 4708.662
181
  },
182
  {
183
+ "epoch": 2.3076923076923075,
184
+ "grad_norm": 0.07435181736946106,
185
+ "learning_rate": 0.00011704271536316746,
186
+ "loss": 0.0442,
187
+ "num_input_tokens_seen": 908888,
188
  "step": 90,
189
+ "train_runtime": 195.8275,
190
+ "train_tokens_per_second": 4641.269
191
  },
192
  {
193
+ "epoch": 2.435897435897436,
194
+ "grad_norm": 0.1333557814359665,
195
+ "learning_rate": 0.00010249980092977915,
196
+ "loss": 0.0419,
197
+ "num_input_tokens_seen": 959832,
198
  "step": 95,
199
+ "train_runtime": 209.2403,
200
+ "train_tokens_per_second": 4587.224
201
  },
202
  {
203
+ "epoch": 2.564102564102564,
204
+ "grad_norm": 0.08782525360584259,
205
+ "learning_rate": 8.84380791820865e-05,
206
+ "loss": 0.0463,
207
+ "num_input_tokens_seen": 1009816,
208
  "step": 100,
209
+ "train_runtime": 222.6514,
210
+ "train_tokens_per_second": 4535.413
211
  },
212
  {
213
+ "epoch": 2.564102564102564,
214
+ "eval_loss": 0.06037572771310806,
215
+ "eval_runtime": 9.5358,
216
+ "eval_samples_per_second": 32.719,
217
+ "eval_steps_per_second": 4.09,
218
+ "num_input_tokens_seen": 1009816,
219
  "step": 100
220
  },
221
  {
222
+ "epoch": 2.6923076923076925,
223
+ "grad_norm": 0.1221509501338005,
224
+ "learning_rate": 7.500000000000002e-05,
225
+ "loss": 0.045,
226
+ "num_input_tokens_seen": 1060632,
227
  "step": 105,
228
+ "train_runtime": 249.3777,
229
+ "train_tokens_per_second": 4253.114
230
  },
231
  {
232
+ "epoch": 2.8205128205128203,
233
+ "grad_norm": 0.09098366647958755,
234
+ "learning_rate": 6.23216955585167e-05,
235
+ "loss": 0.0433,
236
+ "num_input_tokens_seen": 1111448,
237
  "step": 110,
238
+ "train_runtime": 262.8486,
239
+ "train_tokens_per_second": 4228.472
240
  },
241
  {
242
+ "epoch": 2.948717948717949,
243
+ "grad_norm": 0.07901735603809357,
244
+ "learning_rate": 5.0531601263880747e-05,
245
+ "loss": 0.0433,
246
+ "num_input_tokens_seen": 1162456,
247
  "step": 115,
248
+ "train_runtime": 276.4533,
249
+ "train_tokens_per_second": 4204.891
250
  },
251
  {
252
+ "epoch": 3.076923076923077,
253
+ "grad_norm": 0.09963134676218033,
254
+ "learning_rate": 3.974915466055074e-05,
255
+ "loss": 0.0439,
256
+ "num_input_tokens_seen": 1212248,
257
  "step": 120,
258
+ "train_runtime": 289.7817,
259
+ "train_tokens_per_second": 4183.315
260
  },
261
  {
262
+ "epoch": 3.2051282051282053,
263
+ "grad_norm": 0.08260652422904968,
264
+ "learning_rate": 3.0083585489474855e-05,
265
+ "loss": 0.0427,
266
+ "num_input_tokens_seen": 1262936,
267
  "step": 125,
268
+ "train_runtime": 303.4302,
269
+ "train_tokens_per_second": 4162.197
270
  },
271
  {
272
+ "epoch": 3.3333333333333335,
273
+ "grad_norm": 0.09652029722929001,
274
+ "learning_rate": 2.1632809154782886e-05,
275
+ "loss": 0.0424,
276
+ "num_input_tokens_seen": 1313624,
277
  "step": 130,
278
+ "train_runtime": 316.8192,
279
+ "train_tokens_per_second": 4146.289
280
  },
281
  {
282
+ "epoch": 3.4615384615384617,
283
+ "grad_norm": 0.1191246286034584,
284
+ "learning_rate": 1.4482434808442684e-05,
285
+ "loss": 0.0437,
286
+ "num_input_tokens_seen": 1363928,
287
  "step": 135,
288
+ "train_runtime": 330.3136,
289
+ "train_tokens_per_second": 4129.191
290
  },
291
  {
292
+ "epoch": 3.58974358974359,
293
+ "grad_norm": 0.10475795716047287,
294
+ "learning_rate": 8.70489810131027e-06,
295
+ "loss": 0.0411,
296
+ "num_input_tokens_seen": 1414680,
297
  "step": 140,
298
+ "train_runtime": 343.955,
299
+ "train_tokens_per_second": 4112.98
300
  },
301
  {
302
+ "epoch": 3.717948717948718,
303
+ "grad_norm": 0.09064557403326035,
304
+ "learning_rate": 4.358727386092198e-06,
305
+ "loss": 0.042,
306
+ "num_input_tokens_seen": 1465368,
307
  "step": 145,
308
+ "train_runtime": 357.5028,
309
+ "train_tokens_per_second": 4098.899
310
  },
311
  {
312
+ "epoch": 3.8461538461538463,
313
+ "grad_norm": 0.07417233288288116,
314
+ "learning_rate": 1.4879508058253886e-06,
315
+ "loss": 0.0426,
316
+ "num_input_tokens_seen": 1516504,
317
  "step": 150,
318
+ "train_runtime": 371.0961,
319
+ "train_tokens_per_second": 4086.554
320
  },
321
  {
322
+ "epoch": 3.9743589743589745,
323
+ "grad_norm": 0.07646717131137848,
324
+ "learning_rate": 1.2165027426456198e-07,
325
+ "loss": 0.041,
326
+ "num_input_tokens_seen": 1567256,
327
  "step": 155,
328
+ "train_runtime": 384.8842,
329
+ "train_tokens_per_second": 4072.019
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
330
  },
331
  {
332
  "epoch": 4.0,
333
+ "num_input_tokens_seen": 1576472,
334
+ "step": 156,
335
+ "total_flos": 9418069125660672.0,
336
+ "train_loss": 0.12584685484090677,
337
+ "train_runtime": 391.5379,
338
+ "train_samples_per_second": 12.719,
339
+ "train_steps_per_second": 0.398
340
  }
341
  ],
342
  "logging_steps": 5,
343
+ "max_steps": 156,
344
+ "num_input_tokens_seen": 1576472,
345
  "num_train_epochs": 4,
346
  "save_steps": 100,
347
  "stateful_callbacks": {
 
356
  "attributes": {}
357
  }
358
  },
359
+ "total_flos": 9418069125660672.0,
360
+ "train_batch_size": 8,
361
  "trial_name": null,
362
  "trial_params": null
363
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33993b60fd6ce357901ae84544b5451fdd8d4d3dc51c04aa377491c86c3089cd
3
  size 5752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:171015cab50cf10ee5f3b71c1a82d2e67eda7e879d7b17feb2eb6581d0ad7a3a
3
  size 5752
training_args.yaml CHANGED
@@ -1,4 +1,3 @@
1
- adapter_name_or_path: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v4
2
  bf16: true
3
  cutoff_len: 1024
4
  dataset: pipo_persona
@@ -10,27 +9,25 @@ eval_steps: 100
10
  eval_strategy: steps
11
  finetuning_type: lora
12
  flash_attn: auto
13
- gradient_accumulation_steps: 8
14
  include_num_input_tokens_seen: true
15
- learning_rate: 2.0e-05
16
  logging_steps: 5
17
- lora_alpha: 32
18
- lora_dropout: 0.1
19
- lora_rank: 16
20
- lora_target: q_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
21
  loraplus_lr_ratio: 8
22
  lr_scheduler_type: cosine
23
  max_grad_norm: 1.0
24
- max_samples: 1900
25
  model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
26
  num_train_epochs: 4.0
27
  optim: adamw_torch
28
- output_dir: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v6
29
  packing: false
30
- per_device_eval_batch_size: 2
31
- per_device_train_batch_size: 2
32
- pissa_convert: true
33
- pissa_init: true
34
  plot_loss: true
35
  preprocessing_num_workers: 16
36
  report_to: none
@@ -38,7 +35,5 @@ save_steps: 100
38
  stage: sft
39
  template: llama3
40
  trust_remote_code: true
41
- use_dora: true
42
- use_rslora: true
43
  val_size: 0.2
44
  warmup_steps: 0
 
 
1
  bf16: true
2
  cutoff_len: 1024
3
  dataset: pipo_persona
 
9
  eval_strategy: steps
10
  finetuning_type: lora
11
  flash_attn: auto
12
+ gradient_accumulation_steps: 4
13
  include_num_input_tokens_seen: true
14
+ learning_rate: 0.0003
15
  logging_steps: 5
16
+ lora_alpha: 64
17
+ lora_dropout: 0
18
+ lora_rank: 32
19
+ lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
20
  loraplus_lr_ratio: 8
21
  lr_scheduler_type: cosine
22
  max_grad_norm: 1.0
23
+ max_samples: 30000
24
  model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
25
  num_train_epochs: 4.0
26
  optim: adamw_torch
27
+ output_dir: saves/Llama-3.2-1B-Instruct/lora/train_1B-Instruct_pippo_v10
28
  packing: false
29
+ per_device_eval_batch_size: 8
30
+ per_device_train_batch_size: 8
 
 
31
  plot_loss: true
32
  preprocessing_num_workers: 16
33
  report_to: none
 
35
  stage: sft
36
  template: llama3
37
  trust_remote_code: true
 
 
38
  val_size: 0.2
39
  warmup_steps: 0
training_eval_loss.png CHANGED
training_loss.png ADDED