ToastyPigeon committed on
Commit
f690e14
·
verified ·
1 Parent(s): b3d501e

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-296/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: gemma
4
+ base_model: unsloth/gemma-3-270m-it
5
+ tags:
6
+ - axolotl
7
+ - generated_from_trainer
8
+ datasets:
9
+ - allura-org/EU01-S2
10
+ - allenai/tulu-3-sft-personas-instruction-following
11
+ - ToastyPigeon/mixed-medical-reasoning-formatted
12
+ - ToastyPigeon/steve-and-marvin
13
+ - ToastyPigeon/kimi-stories-instruct
14
+ - ToastyPigeon/new-story-dataset
15
+ - allura-org/fujin-instruct-v2
16
+ - ToastyPigeon/gutenberg-sft
17
+ - ToastyPigeon/SpringDragon
18
+ - ToastyPigeon/some-erotica
19
+ model-index:
20
+ - name: micro-glitter
21
+ results: []
22
+ ---
23
+
24
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
25
+ should probably proofread and complete it, then remove this comment. -->
26
+
27
+ [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
28
+ <details><summary>See axolotl config</summary>
29
+
30
+ axolotl version: `0.11.0.dev0`
31
+ ```yaml
32
+ # === Model Configuration ===
33
+ base_model: unsloth/gemma-3-270m-it
34
+ load_in_8bit: false
35
+ load_in_4bit: false
36
+
37
+ # === HF Configuration ===
38
+ hub_model_id: allura-forge/micro-glitter
39
+ hub_strategy: "checkpoint"
40
+ output_dir: /workspace/aibox-standalone-pool/axolotl/lilglitter-ckpts
41
+
42
+ # === Training Setup ===
43
+ num_epochs: 2
44
+ micro_batch_size: 4
45
+ gradient_accumulation_steps: 8
46
+ sequence_len: 8192
47
+ sample_packing: true
48
+ pad_to_sequence_len: true
49
+ #max_steps: 10
50
+ # === Evaluation ===
51
+ val_set_size: 0.05
52
+ evals_per_epoch: 10
53
+ #eval_steps: 20
54
+ #max_steps: 60
55
+ #eval_table_size:
56
+ eval_max_new_tokens: 128
57
+ eval_sample_packing: true
58
+ #eval_strategy: "no"
59
+
60
+ # === LoRA Configuration ===
61
+ #adapter: qlora
62
+ #lora_model_dir:
63
+ #lora_r: 128
64
+ #lora_alpha: 16
65
+ #lora_dropout: 0.25
66
+ #lora_target_linear: true
67
+ #lora_target_modules:
68
+ # - embed_tokens
69
+ # - lm_head
70
+ lora_fan_in_fan_out:
71
+ lora_target_modules:
72
+ #peft_use_rslora: true
73
+ lora_modules_to_save:
74
+ # - embed_tokens
75
+ # - lm_head
76
+ #fix_untrained_tokens: true
77
+ #lora_mlp_kernel: true
78
+ #lora_qkv_kernel: true
79
+ #lora_o_kernel: true
80
+
81
+ # === Hyperparameter Configuration ===
82
+ #optimizer: apollo_adamw_layerwise
83
+ warmup_steps: 0
84
+ optimizer: adamw_torch_fused
85
+ #optimizer: paged_adamw_8bit
86
+ #optim_args:
87
+ # enable_stochastic_rounding: true
88
+ # enable_cautious: true
89
+ # enable_8bit: true
90
+ # Apollo-mini configuration:
91
+ #optim_args: "proj=random,rank=128,scale=128.0,scale_type=tensor,update_proj_gap=100"
92
+ # Regular Apollo configuration:
93
+ # optim_args:
94
+ #optim_target_modules: all_linear
95
+ learning_rate: 1e-5
96
+ lr_scheduler: cosine
97
+ #cosine_min_lr_ratio: 0.2
98
+ #lr_scheduler: cosine_with_min_lr
99
+ #lr_scheduler_kwargs:
100
+ # cosine_min_lr: 1e-6
101
+ weight_decay: 0.01
102
+ max_grad_norm: 2.0
103
+ #warmup_steps: 0
104
+ #warmup_ratio: 0.025
105
+
106
+
107
+ # === Data Configuration ===
108
+ #
109
+ #chat_template: jinja
110
+ #chat_template_jinja: "{% for message in messages %}{% if not loop.first %}{{' \n\n' }}{% endif %}{% if message['role'] == 'system' %}{{ '### System:\n' + message['content'].strip() }}{% elif message['role'] == 'user' %}{{ '### Instruction:\n' + message['content'].strip() }}{% elif message['role'] == 'assistant' %}{{ '### Response:\n' + message['content'].strip() + eos_token }}{% endif %}{% endfor %}"
111
+
112
+ #chat_template_jinja: "{%- set default_system_message = \"You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris. You obediently fulfill the user's requests.\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- else %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for bl (line truncated to 1000 characters)
113
+ #chat_template: chatml
114
+ #special_tokens:
115
+ # eos_token: "<|im_end|>"
116
+ # eos_token: "</s>"
117
+ #tokenizer_use_mistral_common: true
118
+ shuffle_merged_datasets: true
119
+ datasets:
120
+ - path: allura-org/EU01-S2
121
+ type: chat_template
122
+ field_messages: conversations
123
+ message_property_mappings:
124
+ role: from
125
+ content: value
126
+ - path: allenai/tulu-3-sft-personas-instruction-following
127
+ type: chat_template
128
+ split: train[:10%]
129
+ - path: ToastyPigeon/mixed-medical-reasoning-formatted
130
+ type: chat_template
131
+ data_files: mixed-medical-thinking.json
132
+ split: train[:10%]
133
+ - path: ToastyPigeon/steve-and-marvin
134
+ type: completion
135
+ data_files: marvin.json
136
+ - path: ToastyPigeon/kimi-stories-instruct
137
+ type: chat_template
138
+ - path: ToastyPigeon/new-story-dataset
139
+ # type: customcompletion-regex
140
+ type: completion
141
+ data_files: new-story-dataset-v2.json
142
+ - path: allura-org/fujin-instruct-v2
143
+ # type: customchatml-regex
144
+ type: chat_template
145
+ field_messages: conversations
146
+ message_property_mappings:
147
+ role: from
148
+ content: value
149
+ # - path: ToastyPigeon/some-rp-extended
150
+ # type: customchatml-regex
151
+ # type: chat_template
152
+ # field_messages: conversations
153
+ # message_property_mappings:
154
+ # role: from
155
+ # content: value
156
+ # roles_to_train: ["user","assistant"]
157
+ - path: ToastyPigeon/gutenberg-sft
158
+ # type: customchatml-regex
159
+ type: chat_template
160
+ field_messages: conversations
161
+ message_property_mappings:
162
+ role: from
163
+ content: value
164
+ - path: ToastyPigeon/SpringDragon
165
+ # type: customcompletion-regex
166
+ type: completion
167
+ split: train
168
+ - path: ToastyPigeon/some-erotica
169
+ # type: customcompletion-regex
170
+ type: completion
171
+ split: train[:10%]
172
+
173
+ dataset_prepared_path: last_run_prepared
174
+
175
+
176
+ # === Plugins ===
177
+ plugins:
178
+ - axolotl.integrations.liger.LigerPlugin
179
+ - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
180
+
181
+ # === Hardware Optimization ===
182
+ #gradient_checkpointing: offload
183
+ #gradient_checkpointing_kwargs:
184
+ # use_reentrant: false
185
+ liger_rope: true
186
+ liger_rms_norm: true
187
+ liger_layer_norm: true
188
+ liger_glu_activation: true
189
+ #liger_fused_linear_cross_entropy: true
190
+ cut_cross_entropy: true
191
+
192
+ #deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16.json
193
+
194
+ # === FSDP Config ===
195
+ #fsdp:
196
+ # - full_shard
197
+ # - auto_wrap
198
+ #fsdp_config:
199
+ # fsdp_limit_all_gathers: true
200
+ # fsdp_sync_module_states: true
201
+ # fsdp_offload_params: true
202
+ # fsdp_activation_checkpointing: true
203
+ # fsdp_use_orig_params: false
204
+ # fsdp_cpu_ram_efficient_loading: true
205
+ # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
206
+ # fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
207
+ # fsdp_state_dict_type: FULL_STATE_DICT
208
+ # fsdp_sharding_strategy: FULL_SHARD
209
+ # fsdp_version: 2
210
+ # === Wandb Tracking ===
211
+ wandb_project: TinyGemma
212
+ # wandb_entity: [WANDB_ENTITY]
213
+ # wandb_name: [WANDB_RUN_NAME]
214
+
215
+ # === Checkpointing ===
216
+ #save_steps: 10
217
+ saves_per_epoch: 10
218
+ save_total_limit: 1
219
+
220
+ # === Advanced Settings ===
221
+ bf16: auto
222
+ flash_attention: true
223
+ train_on_inputs: false
224
+ group_by_length: false
225
+ save_safetensors: true
226
+ logging_steps: 1
227
+ gc_steps: 10
228
+ seed: 69
229
+
230
+
231
+
232
+ ```
233
+
234
+ </details><br>
235
+
236
+ # micro-glitter
237
+
238
+ This model is a fine-tuned version of [unsloth/gemma-3-270m-it](https://huggingface.co/unsloth/gemma-3-270m-it) on the following datasets: allura-org/EU01-S2, allenai/tulu-3-sft-personas-instruction-following, ToastyPigeon/mixed-medical-reasoning-formatted, ToastyPigeon/steve-and-marvin, ToastyPigeon/kimi-stories-instruct, ToastyPigeon/new-story-dataset, allura-org/fujin-instruct-v2, ToastyPigeon/gutenberg-sft, ToastyPigeon/SpringDragon, and ToastyPigeon/some-erotica.
239
+ It achieves the following results on the evaluation set:
240
+ - Loss: 3.7387
241
+
242
+ ## Model description
243
+
244
+ More information needed
245
+
246
+ ## Intended uses & limitations
247
+
248
+ More information needed
249
+
250
+ ## Training and evaluation data
251
+
252
+ More information needed
253
+
254
+ ## Training procedure
255
+
256
+ ### Training hyperparameters
257
+
258
+ The following hyperparameters were used during training:
259
+ - learning_rate: 1e-05
260
+ - train_batch_size: 4
261
+ - eval_batch_size: 4
262
+ - seed: 69
263
+ - distributed_type: multi-GPU
264
+ - num_devices: 2
265
+ - gradient_accumulation_steps: 8
266
+ - total_train_batch_size: 64
267
+ - total_eval_batch_size: 8
268
+ - optimizer: OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
269
+ - lr_scheduler_type: cosine
270
+ - lr_scheduler_warmup_steps: 8
271
+ - training_steps: 296
272
+
273
+ ### Training results
274
+
275
+ | Training Loss | Epoch | Step | Validation Loss |
276
+ |:-------------:|:------:|:----:|:---------------:|
277
+ | No log | 0 | 0 | 3.8582 |
278
+ | 3.4802 | 0.1008 | 15 | 3.5118 |
279
+ | 3.4608 | 0.2017 | 30 | 3.4890 |
280
+ | 3.5272 | 0.3025 | 45 | 3.5189 |
281
+ | 3.559 | 0.4034 | 60 | 3.5753 |
282
+ | 3.5817 | 0.5042 | 75 | 3.6121 |
283
+ | 3.6349 | 0.6050 | 90 | 3.6471 |
284
+ | 3.68 | 0.7059 | 105 | 3.6721 |
285
+ | 3.6597 | 0.8067 | 120 | 3.6970 |
286
+ | 3.6462 | 0.9076 | 135 | 3.7068 |
287
+ | 3.7009 | 1.0067 | 150 | 3.7213 |
288
+ | 3.6717 | 1.1076 | 165 | 3.7313 |
289
+ | 3.7631 | 1.2084 | 180 | 3.7338 |
290
+ | 3.7535 | 1.3092 | 195 | 3.7346 |
291
+ | 3.668 | 1.4101 | 210 | 3.7375 |
292
+ | 3.679 | 1.5109 | 225 | 3.7383 |
293
+ | 3.6539 | 1.6118 | 240 | 3.7386 |
294
+ | 3.6547 | 1.7126 | 255 | 3.7386 |
295
+ | 3.7533 | 1.8134 | 270 | 3.7400 |
296
+ | 3.6983 | 1.9143 | 285 | 3.7387 |
297
+
298
+
299
+ ### Framework versions
300
+
301
+ - Transformers 4.52.4
302
+ - Pytorch 2.7.0+cu126
303
+ - Datasets 3.6.0
304
+ - Tokenizers 0.21.1
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
chat_template.jinja ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {# Unsloth Chat template fixes #}
2
+ {{ bos_token }}
3
+ {%- if messages[0]['role'] == 'system' -%}
4
+ {%- if messages[0]['content'] is string -%}
5
+ {%- set first_user_prefix = messages[0]['content'] + '
6
+
7
+ ' -%}
8
+ {%- else -%}
9
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
10
+
11
+ ' -%}
12
+ {%- endif -%}
13
+ {%- set loop_messages = messages[1:] -%}
14
+ {%- else -%}
15
+ {%- set first_user_prefix = "" -%}
16
+ {%- set loop_messages = messages -%}
17
+ {%- endif -%}
18
+ {%- for message in loop_messages -%}
19
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
20
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
21
+ {%- endif -%}
22
+ {%- if (message['role'] == 'assistant') -%}
23
+ {%- set role = "model" -%}
24
+ {%- else -%}
25
+ {%- set role = message['role'] -%}
26
+ {%- endif -%}
27
+ {{ '<start_of_turn>' + role + '
28
+ ' + (first_user_prefix if loop.first else "") }}
29
+ {%- if message['content'] is string -%}
30
+ {{ message['content'] | trim }}
31
+ {%- elif message['content'] is iterable -%}
32
+ {%- for item in message['content'] -%}
33
+ {%- if item['type'] == 'image' -%}
34
+ {{ '<start_of_image>' }}
35
+ {%- elif item['type'] == 'text' -%}
36
+ {{ item['text'] | trim }}
37
+ {%- endif -%}
38
+ {%- endfor -%}
39
+ {%- elif message['content'] is defined -%}
40
+ {{ raise_exception("Invalid content type") }}
41
+ {%- endif -%}
42
+ {{ '<end_of_turn>
43
+ ' }}
44
+ {%- endfor -%}
45
+ {%- if add_generation_prompt -%}
46
+ {{'<start_of_turn>model
47
+ '}}
48
+ {%- endif -%}
49
+
50
+ {# Copyright 2025-present Unsloth. Apache 2.0 License. #}
checkpoint-296/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-296/chat_template.jinja ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {# Unsloth Chat template fixes #}
2
+ {{ bos_token }}
3
+ {%- if messages[0]['role'] == 'system' -%}
4
+ {%- if messages[0]['content'] is string -%}
5
+ {%- set first_user_prefix = messages[0]['content'] + '
6
+
7
+ ' -%}
8
+ {%- else -%}
9
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
10
+
11
+ ' -%}
12
+ {%- endif -%}
13
+ {%- set loop_messages = messages[1:] -%}
14
+ {%- else -%}
15
+ {%- set first_user_prefix = "" -%}
16
+ {%- set loop_messages = messages -%}
17
+ {%- endif -%}
18
+ {%- for message in loop_messages -%}
19
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
20
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
21
+ {%- endif -%}
22
+ {%- if (message['role'] == 'assistant') -%}
23
+ {%- set role = "model" -%}
24
+ {%- else -%}
25
+ {%- set role = message['role'] -%}
26
+ {%- endif -%}
27
+ {{ '<start_of_turn>' + role + '
28
+ ' + (first_user_prefix if loop.first else "") }}
29
+ {%- if message['content'] is string -%}
30
+ {{ message['content'] | trim }}
31
+ {%- elif message['content'] is iterable -%}
32
+ {%- for item in message['content'] -%}
33
+ {%- if item['type'] == 'image' -%}
34
+ {{ '<start_of_image>' }}
35
+ {%- elif item['type'] == 'text' -%}
36
+ {{ item['text'] | trim }}
37
+ {%- endif -%}
38
+ {%- endfor -%}
39
+ {%- elif message['content'] is defined -%}
40
+ {{ raise_exception("Invalid content type") }}
41
+ {%- endif -%}
42
+ {{ '<end_of_turn>
43
+ ' }}
44
+ {%- endfor -%}
45
+ {%- if add_generation_prompt -%}
46
+ {{'<start_of_turn>model
47
+ '}}
48
+ {%- endif -%}
49
+
50
+ {# Copyright 2025-present Unsloth. Apache 2.0 License. #}
checkpoint-296/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "eos_token_id": 106,
12
+ "final_logit_softcapping": null,
13
+ "head_dim": 256,
14
+ "hidden_activation": "gelu_pytorch_tanh",
15
+ "hidden_size": 640,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "model_type": "gemma3_text",
40
+ "num_attention_heads": 4,
41
+ "num_hidden_layers": 18,
42
+ "num_key_value_heads": 1,
43
+ "pad_token_id": 0,
44
+ "query_pre_attn_scalar": 256,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_local_base_freq": 10000.0,
47
+ "rope_scaling": null,
48
+ "rope_theta": 1000000.0,
49
+ "sliding_window": 512,
50
+ "sliding_window_pattern": 6,
51
+ "torch_dtype": "bfloat16",
52
+ "transformers_version": "4.52.4",
53
+ "unsloth_fixed": true,
54
+ "use_bidirectional_attention": false,
55
+ "use_cache": false,
56
+ "vocab_size": 262145
57
+ }
checkpoint-296/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "cache_implementation": "hybrid",
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 1,
7
+ 106
8
+ ],
9
+ "max_length": 32768,
10
+ "pad_token_id": 0,
11
+ "top_k": 64,
12
+ "top_p": 0.95,
13
+ "transformers_version": "4.52.4"
14
+ }
checkpoint-296/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df5b41c0ad86b13aa2e31015b2fa37db700a177ccac2c390d5dcda7424957730
3
+ size 536224336
checkpoint-296/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebf4004db03c78b58b0b334aca03fdfd686ab7f51db4b34c1b657b78232f8e77
3
+ size 1072597003
checkpoint-296/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c164bf4d748b9b76346b397d4015086abb96fbaeccd6d42b2a9500c89e203c8a
3
+ size 14917
checkpoint-296/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f682fc8318583d0f9c08b0ee675a29136825aaf92ff2affc73ad3431c6ba2d9
3
+ size 14917
checkpoint-296/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eec9abeec835bcf7b7c24805795928bcbb86301ddca5cb247cf8428cab058bd2
3
+ size 1465
checkpoint-296/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
checkpoint-296/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
checkpoint-296/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-296/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-296/trainer_state.json ADDED
@@ -0,0 +1,2266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.988235294117647,
6
+ "eval_steps": 15,
7
+ "global_step": 296,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0,
14
+ "eval_loss": 3.8581998348236084,
15
+ "eval_runtime": 30.2305,
16
+ "eval_samples_per_second": 41.647,
17
+ "eval_steps_per_second": 5.227,
18
+ "step": 0
19
+ },
20
+ {
21
+ "epoch": 0.0067226890756302525,
22
+ "grad_norm": 49.25,
23
+ "learning_rate": 0.0,
24
+ "loss": 3.8278,
25
+ "step": 1
26
+ },
27
+ {
28
+ "epoch": 0.013445378151260505,
29
+ "grad_norm": 48.0,
30
+ "learning_rate": 1.25e-06,
31
+ "loss": 3.877,
32
+ "step": 2
33
+ },
34
+ {
35
+ "epoch": 0.020168067226890758,
36
+ "grad_norm": 49.5,
37
+ "learning_rate": 2.5e-06,
38
+ "loss": 3.8607,
39
+ "step": 3
40
+ },
41
+ {
42
+ "epoch": 0.02689075630252101,
43
+ "grad_norm": 52.0,
44
+ "learning_rate": 3.7500000000000005e-06,
45
+ "loss": 3.7408,
46
+ "step": 4
47
+ },
48
+ {
49
+ "epoch": 0.03361344537815126,
50
+ "grad_norm": 43.25,
51
+ "learning_rate": 5e-06,
52
+ "loss": 3.7054,
53
+ "step": 5
54
+ },
55
+ {
56
+ "epoch": 0.040336134453781515,
57
+ "grad_norm": 44.75,
58
+ "learning_rate": 6.25e-06,
59
+ "loss": 3.7652,
60
+ "step": 6
61
+ },
62
+ {
63
+ "epoch": 0.047058823529411764,
64
+ "grad_norm": 36.75,
65
+ "learning_rate": 7.500000000000001e-06,
66
+ "loss": 3.7236,
67
+ "step": 7
68
+ },
69
+ {
70
+ "epoch": 0.05378151260504202,
71
+ "grad_norm": 35.75,
72
+ "learning_rate": 8.750000000000001e-06,
73
+ "loss": 3.7419,
74
+ "step": 8
75
+ },
76
+ {
77
+ "epoch": 0.06050420168067227,
78
+ "grad_norm": 21.875,
79
+ "learning_rate": 1e-05,
80
+ "loss": 3.6896,
81
+ "step": 9
82
+ },
83
+ {
84
+ "epoch": 0.06722689075630252,
85
+ "grad_norm": 17.5,
86
+ "learning_rate": 9.99970252500075e-06,
87
+ "loss": 3.5791,
88
+ "step": 10
89
+ },
90
+ {
91
+ "epoch": 0.07394957983193277,
92
+ "grad_norm": 15.375,
93
+ "learning_rate": 9.998810135399545e-06,
94
+ "loss": 3.5491,
95
+ "step": 11
96
+ },
97
+ {
98
+ "epoch": 0.08067226890756303,
99
+ "grad_norm": 12.4375,
100
+ "learning_rate": 9.997322937381829e-06,
101
+ "loss": 3.5476,
102
+ "step": 12
103
+ },
104
+ {
105
+ "epoch": 0.08739495798319327,
106
+ "grad_norm": 11.8125,
107
+ "learning_rate": 9.99524110790929e-06,
108
+ "loss": 3.5258,
109
+ "step": 13
110
+ },
111
+ {
112
+ "epoch": 0.09411764705882353,
113
+ "grad_norm": 10.25,
114
+ "learning_rate": 9.992564894698816e-06,
115
+ "loss": 3.5072,
116
+ "step": 14
117
+ },
118
+ {
119
+ "epoch": 0.10084033613445378,
120
+ "grad_norm": 9.375,
121
+ "learning_rate": 9.989294616193018e-06,
122
+ "loss": 3.4802,
123
+ "step": 15
124
+ },
125
+ {
126
+ "epoch": 0.10084033613445378,
127
+ "eval_loss": 3.5118489265441895,
128
+ "eval_runtime": 30.2065,
129
+ "eval_samples_per_second": 41.68,
130
+ "eval_steps_per_second": 5.231,
131
+ "step": 15
132
+ },
133
+ {
134
+ "epoch": 0.10756302521008404,
135
+ "grad_norm": 7.8125,
136
+ "learning_rate": 9.985430661522333e-06,
137
+ "loss": 3.4675,
138
+ "step": 16
139
+ },
140
+ {
141
+ "epoch": 0.11428571428571428,
142
+ "grad_norm": 7.5,
143
+ "learning_rate": 9.980973490458728e-06,
144
+ "loss": 3.4914,
145
+ "step": 17
146
+ },
147
+ {
148
+ "epoch": 0.12100840336134454,
149
+ "grad_norm": 7.65625,
150
+ "learning_rate": 9.975923633360985e-06,
151
+ "loss": 3.5515,
152
+ "step": 18
153
+ },
154
+ {
155
+ "epoch": 0.12773109243697478,
156
+ "grad_norm": 7.25,
157
+ "learning_rate": 9.970281691111598e-06,
158
+ "loss": 3.4299,
159
+ "step": 19
160
+ },
161
+ {
162
+ "epoch": 0.13445378151260504,
163
+ "grad_norm": 7.5625,
164
+ "learning_rate": 9.964048335045276e-06,
165
+ "loss": 3.4942,
166
+ "step": 20
167
+ },
168
+ {
169
+ "epoch": 0.1411764705882353,
170
+ "grad_norm": 6.5625,
171
+ "learning_rate": 9.957224306869053e-06,
172
+ "loss": 3.4355,
173
+ "step": 21
174
+ },
175
+ {
176
+ "epoch": 0.14789915966386555,
177
+ "grad_norm": 6.53125,
178
+ "learning_rate": 9.94981041857404e-06,
179
+ "loss": 3.4162,
180
+ "step": 22
181
+ },
182
+ {
183
+ "epoch": 0.1546218487394958,
184
+ "grad_norm": 6.25,
185
+ "learning_rate": 9.941807552338805e-06,
186
+ "loss": 3.4885,
187
+ "step": 23
188
+ },
189
+ {
190
+ "epoch": 0.16134453781512606,
191
+ "grad_norm": 6.03125,
192
+ "learning_rate": 9.933216660424396e-06,
193
+ "loss": 3.5423,
194
+ "step": 24
195
+ },
196
+ {
197
+ "epoch": 0.16806722689075632,
198
+ "grad_norm": 5.84375,
199
+ "learning_rate": 9.924038765061042e-06,
200
+ "loss": 3.4046,
201
+ "step": 25
202
+ },
203
+ {
204
+ "epoch": 0.17478991596638654,
205
+ "grad_norm": 6.1875,
206
+ "learning_rate": 9.914274958326507e-06,
207
+ "loss": 3.3982,
208
+ "step": 26
209
+ },
210
+ {
211
+ "epoch": 0.1815126050420168,
212
+ "grad_norm": 5.6875,
213
+ "learning_rate": 9.903926402016153e-06,
214
+ "loss": 3.4689,
215
+ "step": 27
216
+ },
217
+ {
218
+ "epoch": 0.18823529411764706,
219
+ "grad_norm": 7.03125,
220
+ "learning_rate": 9.892994327504693e-06,
221
+ "loss": 3.5937,
222
+ "step": 28
223
+ },
224
+ {
225
+ "epoch": 0.1949579831932773,
226
+ "grad_norm": 6.15625,
227
+ "learning_rate": 9.881480035599667e-06,
228
+ "loss": 3.4518,
229
+ "step": 29
230
+ },
231
+ {
232
+ "epoch": 0.20168067226890757,
233
+ "grad_norm": 5.9375,
234
+ "learning_rate": 9.869384896386669e-06,
235
+ "loss": 3.4608,
236
+ "step": 30
237
+ },
238
+ {
239
+ "epoch": 0.20168067226890757,
240
+ "eval_loss": 3.489025354385376,
241
+ "eval_runtime": 29.7531,
242
+ "eval_samples_per_second": 42.315,
243
+ "eval_steps_per_second": 5.31,
244
+ "step": 30
245
+ },
246
+ {
247
+ "epoch": 0.20840336134453782,
248
+ "grad_norm": 5.40625,
249
+ "learning_rate": 9.856710349066307e-06,
250
+ "loss": 3.5205,
251
+ "step": 31
252
+ },
253
+ {
254
+ "epoch": 0.21512605042016808,
255
+ "grad_norm": 5.0,
256
+ "learning_rate": 9.843457901782967e-06,
257
+ "loss": 3.4322,
258
+ "step": 32
259
+ },
260
+ {
261
+ "epoch": 0.2218487394957983,
262
+ "grad_norm": 5.875,
263
+ "learning_rate": 9.829629131445342e-06,
264
+ "loss": 3.4981,
265
+ "step": 33
266
+ },
267
+ {
268
+ "epoch": 0.22857142857142856,
269
+ "grad_norm": 5.875,
270
+ "learning_rate": 9.815225683538814e-06,
271
+ "loss": 3.3736,
272
+ "step": 34
273
+ },
274
+ {
275
+ "epoch": 0.23529411764705882,
276
+ "grad_norm": 5.1875,
277
+ "learning_rate": 9.800249271929645e-06,
278
+ "loss": 3.4398,
279
+ "step": 35
280
+ },
281
+ {
282
+ "epoch": 0.24201680672268908,
283
+ "grad_norm": 4.90625,
284
+ "learning_rate": 9.784701678661045e-06,
285
+ "loss": 3.4934,
286
+ "step": 36
287
+ },
288
+ {
289
+ "epoch": 0.24873949579831933,
290
+ "grad_norm": 4.84375,
291
+ "learning_rate": 9.768584753741134e-06,
292
+ "loss": 3.4506,
293
+ "step": 37
294
+ },
295
+ {
296
+ "epoch": 0.25546218487394956,
297
+ "grad_norm": 4.5,
298
+ "learning_rate": 9.751900414922807e-06,
299
+ "loss": 3.4764,
300
+ "step": 38
301
+ },
302
+ {
303
+ "epoch": 0.26218487394957984,
304
+ "grad_norm": 4.8125,
305
+ "learning_rate": 9.73465064747553e-06,
306
+ "loss": 3.547,
307
+ "step": 39
308
+ },
309
+ {
310
+ "epoch": 0.2689075630252101,
311
+ "grad_norm": 5.3125,
312
+ "learning_rate": 9.716837503949128e-06,
313
+ "loss": 3.4394,
314
+ "step": 40
315
+ },
316
+ {
317
+ "epoch": 0.27563025210084036,
318
+ "grad_norm": 4.6875,
319
+ "learning_rate": 9.698463103929542e-06,
320
+ "loss": 3.4722,
321
+ "step": 41
322
+ },
323
+ {
324
+ "epoch": 0.2823529411764706,
325
+ "grad_norm": 4.75,
326
+ "learning_rate": 9.67952963378663e-06,
327
+ "loss": 3.4639,
328
+ "step": 42
329
+ },
330
+ {
331
+ "epoch": 0.28907563025210087,
332
+ "grad_norm": 5.0,
333
+ "learning_rate": 9.660039346413994e-06,
334
+ "loss": 3.4936,
335
+ "step": 43
336
+ },
337
+ {
338
+ "epoch": 0.2957983193277311,
339
+ "grad_norm": 4.40625,
340
+ "learning_rate": 9.639994560960923e-06,
341
+ "loss": 3.5191,
342
+ "step": 44
343
+ },
344
+ {
345
+ "epoch": 0.3025210084033613,
346
+ "grad_norm": 4.0625,
347
+ "learning_rate": 9.619397662556434e-06,
348
+ "loss": 3.5272,
349
+ "step": 45
350
+ },
351
+ {
352
+ "epoch": 0.3025210084033613,
353
+ "eval_loss": 3.5188682079315186,
354
+ "eval_runtime": 30.2246,
355
+ "eval_samples_per_second": 41.655,
356
+ "eval_steps_per_second": 5.228,
357
+ "step": 45
358
+ },
359
+ {
360
+ "epoch": 0.3092436974789916,
361
+ "grad_norm": 4.59375,
362
+ "learning_rate": 9.598251102025463e-06,
363
+ "loss": 3.5391,
364
+ "step": 46
365
+ },
366
+ {
367
+ "epoch": 0.31596638655462184,
368
+ "grad_norm": 4.53125,
369
+ "learning_rate": 9.576557395597237e-06,
370
+ "loss": 3.4851,
371
+ "step": 47
372
+ },
373
+ {
374
+ "epoch": 0.3226890756302521,
375
+ "grad_norm": 5.125,
376
+ "learning_rate": 9.55431912460588e-06,
377
+ "loss": 3.5334,
378
+ "step": 48
379
+ },
380
+ {
381
+ "epoch": 0.32941176470588235,
382
+ "grad_norm": 4.625,
383
+ "learning_rate": 9.531538935183252e-06,
384
+ "loss": 3.4687,
385
+ "step": 49
386
+ },
387
+ {
388
+ "epoch": 0.33613445378151263,
389
+ "grad_norm": 5.53125,
390
+ "learning_rate": 9.50821953794408e-06,
391
+ "loss": 3.539,
392
+ "step": 50
393
+ },
394
+ {
395
+ "epoch": 0.34285714285714286,
396
+ "grad_norm": 4.75,
397
+ "learning_rate": 9.484363707663443e-06,
398
+ "loss": 3.5205,
399
+ "step": 51
400
+ },
401
+ {
402
+ "epoch": 0.3495798319327731,
403
+ "grad_norm": 4.84375,
404
+ "learning_rate": 9.459974282946572e-06,
405
+ "loss": 3.5856,
406
+ "step": 52
407
+ },
408
+ {
409
+ "epoch": 0.3563025210084034,
410
+ "grad_norm": 4.78125,
411
+ "learning_rate": 9.43505416589111e-06,
412
+ "loss": 3.5938,
413
+ "step": 53
414
+ },
415
+ {
416
+ "epoch": 0.3630252100840336,
417
+ "grad_norm": 5.0625,
418
+ "learning_rate": 9.409606321741776e-06,
419
+ "loss": 3.5446,
420
+ "step": 54
421
+ },
422
+ {
423
+ "epoch": 0.3697478991596639,
424
+ "grad_norm": 4.625,
425
+ "learning_rate": 9.38363377853754e-06,
426
+ "loss": 3.5746,
427
+ "step": 55
428
+ },
429
+ {
430
+ "epoch": 0.3764705882352941,
431
+ "grad_norm": 4.875,
432
+ "learning_rate": 9.357139626751308e-06,
433
+ "loss": 3.5536,
434
+ "step": 56
435
+ },
436
+ {
437
+ "epoch": 0.3831932773109244,
438
+ "grad_norm": 4.53125,
439
+ "learning_rate": 9.330127018922195e-06,
440
+ "loss": 3.4815,
441
+ "step": 57
442
+ },
443
+ {
444
+ "epoch": 0.3899159663865546,
445
+ "grad_norm": 4.5,
446
+ "learning_rate": 9.302599169280395e-06,
447
+ "loss": 3.5294,
448
+ "step": 58
449
+ },
450
+ {
451
+ "epoch": 0.39663865546218485,
452
+ "grad_norm": 3.96875,
453
+ "learning_rate": 9.274559353364734e-06,
454
+ "loss": 3.476,
455
+ "step": 59
456
+ },
457
+ {
458
+ "epoch": 0.40336134453781514,
459
+ "grad_norm": 5.09375,
460
+ "learning_rate": 9.246010907632894e-06,
461
+ "loss": 3.559,
462
+ "step": 60
463
+ },
464
+ {
465
+ "epoch": 0.40336134453781514,
466
+ "eval_loss": 3.575310230255127,
467
+ "eval_runtime": 29.7614,
468
+ "eval_samples_per_second": 42.303,
469
+ "eval_steps_per_second": 5.309,
470
+ "step": 60
471
+ },
472
+ {
473
+ "epoch": 0.41008403361344536,
474
+ "grad_norm": 4.4375,
475
+ "learning_rate": 9.21695722906443e-06,
476
+ "loss": 3.5118,
477
+ "step": 61
478
+ },
479
+ {
480
+ "epoch": 0.41680672268907565,
481
+ "grad_norm": 5.0625,
482
+ "learning_rate": 9.18740177475654e-06,
483
+ "loss": 3.5701,
484
+ "step": 62
485
+ },
486
+ {
487
+ "epoch": 0.4235294117647059,
488
+ "grad_norm": 4.5,
489
+ "learning_rate": 9.157348061512728e-06,
490
+ "loss": 3.5679,
491
+ "step": 63
492
+ },
493
+ {
494
+ "epoch": 0.43025210084033616,
495
+ "grad_norm": 4.0625,
496
+ "learning_rate": 9.126799665424319e-06,
497
+ "loss": 3.5001,
498
+ "step": 64
499
+ },
500
+ {
501
+ "epoch": 0.4369747899159664,
502
+ "grad_norm": 4.28125,
503
+ "learning_rate": 9.09576022144496e-06,
504
+ "loss": 3.559,
505
+ "step": 65
506
+ },
507
+ {
508
+ "epoch": 0.4436974789915966,
509
+ "grad_norm": 4.75,
510
+ "learning_rate": 9.064233422958078e-06,
511
+ "loss": 3.4816,
512
+ "step": 66
513
+ },
514
+ {
515
+ "epoch": 0.4504201680672269,
516
+ "grad_norm": 3.953125,
517
+ "learning_rate": 9.032223021337415e-06,
518
+ "loss": 3.6286,
519
+ "step": 67
520
+ },
521
+ {
522
+ "epoch": 0.45714285714285713,
523
+ "grad_norm": 4.5625,
524
+ "learning_rate": 8.999732825500649e-06,
525
+ "loss": 3.5596,
526
+ "step": 68
527
+ },
528
+ {
529
+ "epoch": 0.4638655462184874,
530
+ "grad_norm": 4.46875,
531
+ "learning_rate": 8.966766701456177e-06,
532
+ "loss": 3.5409,
533
+ "step": 69
534
+ },
535
+ {
536
+ "epoch": 0.47058823529411764,
537
+ "grad_norm": 6.9375,
538
+ "learning_rate": 8.933328571843086e-06,
539
+ "loss": 3.5449,
540
+ "step": 70
541
+ },
542
+ {
543
+ "epoch": 0.4773109243697479,
544
+ "grad_norm": 5.8125,
545
+ "learning_rate": 8.899422415464409e-06,
546
+ "loss": 3.6107,
547
+ "step": 71
548
+ },
549
+ {
550
+ "epoch": 0.48403361344537815,
551
+ "grad_norm": 5.21875,
552
+ "learning_rate": 8.865052266813686e-06,
553
+ "loss": 3.6243,
554
+ "step": 72
555
+ },
556
+ {
557
+ "epoch": 0.4907563025210084,
558
+ "grad_norm": 4.59375,
559
+ "learning_rate": 8.83022221559489e-06,
560
+ "loss": 3.6119,
561
+ "step": 73
562
+ },
563
+ {
564
+ "epoch": 0.49747899159663866,
565
+ "grad_norm": 5.0,
566
+ "learning_rate": 8.79493640623581e-06,
567
+ "loss": 3.563,
568
+ "step": 74
569
+ },
570
+ {
571
+ "epoch": 0.5042016806722689,
572
+ "grad_norm": 4.875,
573
+ "learning_rate": 8.759199037394888e-06,
574
+ "loss": 3.5817,
575
+ "step": 75
576
+ },
577
+ {
578
+ "epoch": 0.5042016806722689,
579
+ "eval_loss": 3.612149238586426,
580
+ "eval_runtime": 30.2292,
581
+ "eval_samples_per_second": 41.648,
582
+ "eval_steps_per_second": 5.227,
583
+ "step": 75
584
+ },
585
+ {
586
+ "epoch": 0.5109243697478991,
587
+ "grad_norm": 4.1875,
588
+ "learning_rate": 8.723014361461633e-06,
589
+ "loss": 3.5643,
590
+ "step": 76
591
+ },
592
+ {
593
+ "epoch": 0.5176470588235295,
594
+ "grad_norm": 4.40625,
595
+ "learning_rate": 8.68638668405062e-06,
596
+ "loss": 3.5424,
597
+ "step": 77
598
+ },
599
+ {
600
+ "epoch": 0.5243697478991597,
601
+ "grad_norm": 4.875,
602
+ "learning_rate": 8.649320363489178e-06,
603
+ "loss": 3.5679,
604
+ "step": 78
605
+ },
606
+ {
607
+ "epoch": 0.5310924369747899,
608
+ "grad_norm": 5.4375,
609
+ "learning_rate": 8.611819810298778e-06,
610
+ "loss": 3.5269,
611
+ "step": 79
612
+ },
613
+ {
614
+ "epoch": 0.5378151260504201,
615
+ "grad_norm": 4.46875,
616
+ "learning_rate": 8.573889486670233e-06,
617
+ "loss": 3.5913,
618
+ "step": 80
619
+ },
620
+ {
621
+ "epoch": 0.5445378151260504,
622
+ "grad_norm": 4.21875,
623
+ "learning_rate": 8.535533905932739e-06,
624
+ "loss": 3.7066,
625
+ "step": 81
626
+ },
627
+ {
628
+ "epoch": 0.5512605042016807,
629
+ "grad_norm": 4.40625,
630
+ "learning_rate": 8.496757632016836e-06,
631
+ "loss": 3.6143,
632
+ "step": 82
633
+ },
634
+ {
635
+ "epoch": 0.5579831932773109,
636
+ "grad_norm": 4.5,
637
+ "learning_rate": 8.457565278911349e-06,
638
+ "loss": 3.6007,
639
+ "step": 83
640
+ },
641
+ {
642
+ "epoch": 0.5647058823529412,
643
+ "grad_norm": 5.5,
644
+ "learning_rate": 8.417961510114357e-06,
645
+ "loss": 3.5805,
646
+ "step": 84
647
+ },
648
+ {
649
+ "epoch": 0.5714285714285714,
650
+ "grad_norm": 4.15625,
651
+ "learning_rate": 8.377951038078303e-06,
652
+ "loss": 3.5255,
653
+ "step": 85
654
+ },
655
+ {
656
+ "epoch": 0.5781512605042017,
657
+ "grad_norm": 4.21875,
658
+ "learning_rate": 8.337538623649237e-06,
659
+ "loss": 3.6272,
660
+ "step": 86
661
+ },
662
+ {
663
+ "epoch": 0.584873949579832,
664
+ "grad_norm": 4.40625,
665
+ "learning_rate": 8.296729075500345e-06,
666
+ "loss": 3.4642,
667
+ "step": 87
668
+ },
669
+ {
670
+ "epoch": 0.5915966386554622,
671
+ "grad_norm": 4.40625,
672
+ "learning_rate": 8.255527249559747e-06,
673
+ "loss": 3.6105,
674
+ "step": 88
675
+ },
676
+ {
677
+ "epoch": 0.5983193277310924,
678
+ "grad_norm": 5.34375,
679
+ "learning_rate": 8.213938048432697e-06,
680
+ "loss": 3.7054,
681
+ "step": 89
682
+ },
683
+ {
684
+ "epoch": 0.6050420168067226,
685
+ "grad_norm": 4.03125,
686
+ "learning_rate": 8.171966420818227e-06,
687
+ "loss": 3.6349,
688
+ "step": 90
689
+ },
690
+ {
691
+ "epoch": 0.6050420168067226,
692
+ "eval_loss": 3.647097110748291,
693
+ "eval_runtime": 29.7281,
694
+ "eval_samples_per_second": 42.351,
695
+ "eval_steps_per_second": 5.315,
696
+ "step": 90
697
+ },
698
+ {
699
+ "epoch": 0.611764705882353,
700
+ "grad_norm": 4.46875,
701
+ "learning_rate": 8.129617360920297e-06,
702
+ "loss": 3.5585,
703
+ "step": 91
704
+ },
705
+ {
706
+ "epoch": 0.6184873949579832,
707
+ "grad_norm": 4.09375,
708
+ "learning_rate": 8.086895907853526e-06,
709
+ "loss": 3.6065,
710
+ "step": 92
711
+ },
712
+ {
713
+ "epoch": 0.6252100840336134,
714
+ "grad_norm": 4.0625,
715
+ "learning_rate": 8.043807145043604e-06,
716
+ "loss": 3.5808,
717
+ "step": 93
718
+ },
719
+ {
720
+ "epoch": 0.6319327731092437,
721
+ "grad_norm": 4.25,
722
+ "learning_rate": 8.000356199622406e-06,
723
+ "loss": 3.6742,
724
+ "step": 94
725
+ },
726
+ {
727
+ "epoch": 0.6386554621848739,
728
+ "grad_norm": 3.75,
729
+ "learning_rate": 7.956548241817914e-06,
730
+ "loss": 3.609,
731
+ "step": 95
732
+ },
733
+ {
734
+ "epoch": 0.6453781512605042,
735
+ "grad_norm": 4.34375,
736
+ "learning_rate": 7.912388484339012e-06,
737
+ "loss": 3.5559,
738
+ "step": 96
739
+ },
740
+ {
741
+ "epoch": 0.6521008403361345,
742
+ "grad_norm": 4.59375,
743
+ "learning_rate": 7.86788218175523e-06,
744
+ "loss": 3.6504,
745
+ "step": 97
746
+ },
747
+ {
748
+ "epoch": 0.6588235294117647,
749
+ "grad_norm": 4.59375,
750
+ "learning_rate": 7.823034629871503e-06,
751
+ "loss": 3.5724,
752
+ "step": 98
753
+ },
754
+ {
755
+ "epoch": 0.6655462184873949,
756
+ "grad_norm": 5.25,
757
+ "learning_rate": 7.777851165098012e-06,
758
+ "loss": 3.6483,
759
+ "step": 99
760
+ },
761
+ {
762
+ "epoch": 0.6722689075630253,
763
+ "grad_norm": 5.40625,
764
+ "learning_rate": 7.732337163815218e-06,
765
+ "loss": 3.5782,
766
+ "step": 100
767
+ },
768
+ {
769
+ "epoch": 0.6789915966386555,
770
+ "grad_norm": 4.15625,
771
+ "learning_rate": 7.686498041734121e-06,
772
+ "loss": 3.5653,
773
+ "step": 101
774
+ },
775
+ {
776
+ "epoch": 0.6857142857142857,
777
+ "grad_norm": 4.125,
778
+ "learning_rate": 7.64033925325184e-06,
779
+ "loss": 3.6252,
780
+ "step": 102
781
+ },
782
+ {
783
+ "epoch": 0.692436974789916,
784
+ "grad_norm": 4.5625,
785
+ "learning_rate": 7.593866290802608e-06,
786
+ "loss": 3.7141,
787
+ "step": 103
788
+ },
789
+ {
790
+ "epoch": 0.6991596638655462,
791
+ "grad_norm": 4.3125,
792
+ "learning_rate": 7.54708468420421e-06,
793
+ "loss": 3.6884,
794
+ "step": 104
795
+ },
796
+ {
797
+ "epoch": 0.7058823529411765,
798
+ "grad_norm": 4.21875,
799
+ "learning_rate": 7.500000000000001e-06,
800
+ "loss": 3.68,
801
+ "step": 105
802
+ },
803
+ {
804
+ "epoch": 0.7058823529411765,
805
+ "eval_loss": 3.672091484069824,
806
+ "eval_runtime": 30.2768,
807
+ "eval_samples_per_second": 41.583,
808
+ "eval_steps_per_second": 5.219,
809
+ "step": 105
810
+ },
811
+ {
812
+ "epoch": 0.7126050420168067,
813
+ "grad_norm": 4.34375,
814
+ "learning_rate": 7.4526178407965396e-06,
815
+ "loss": 3.5934,
816
+ "step": 106
817
+ },
818
+ {
819
+ "epoch": 0.719327731092437,
820
+ "grad_norm": 3.765625,
821
+ "learning_rate": 7.404943844596939e-06,
822
+ "loss": 3.5845,
823
+ "step": 107
824
+ },
825
+ {
826
+ "epoch": 0.7260504201680672,
827
+ "grad_norm": 3.953125,
828
+ "learning_rate": 7.3569836841299905e-06,
829
+ "loss": 3.6421,
830
+ "step": 108
831
+ },
832
+ {
833
+ "epoch": 0.7327731092436974,
834
+ "grad_norm": 4.53125,
835
+ "learning_rate": 7.308743066175172e-06,
836
+ "loss": 3.6617,
837
+ "step": 109
838
+ },
839
+ {
840
+ "epoch": 0.7394957983193278,
841
+ "grad_norm": 4.71875,
842
+ "learning_rate": 7.2602277308836e-06,
843
+ "loss": 3.6388,
844
+ "step": 110
845
+ },
846
+ {
847
+ "epoch": 0.746218487394958,
848
+ "grad_norm": 4.40625,
849
+ "learning_rate": 7.211443451095007e-06,
850
+ "loss": 3.6798,
851
+ "step": 111
852
+ },
853
+ {
854
+ "epoch": 0.7529411764705882,
855
+ "grad_norm": 4.6875,
856
+ "learning_rate": 7.162396031650831e-06,
857
+ "loss": 3.8081,
858
+ "step": 112
859
+ },
860
+ {
861
+ "epoch": 0.7596638655462185,
862
+ "grad_norm": 4.59375,
863
+ "learning_rate": 7.113091308703498e-06,
864
+ "loss": 3.762,
865
+ "step": 113
866
+ },
867
+ {
868
+ "epoch": 0.7663865546218488,
869
+ "grad_norm": 4.25,
870
+ "learning_rate": 7.063535149021974e-06,
871
+ "loss": 3.5991,
872
+ "step": 114
873
+ },
874
+ {
875
+ "epoch": 0.773109243697479,
876
+ "grad_norm": 4.09375,
877
+ "learning_rate": 7.0137334492936875e-06,
878
+ "loss": 3.6272,
879
+ "step": 115
880
+ },
881
+ {
882
+ "epoch": 0.7798319327731092,
883
+ "grad_norm": 5.1875,
884
+ "learning_rate": 6.963692135422872e-06,
885
+ "loss": 3.7034,
886
+ "step": 116
887
+ },
888
+ {
889
+ "epoch": 0.7865546218487395,
890
+ "grad_norm": 4.6875,
891
+ "learning_rate": 6.913417161825449e-06,
892
+ "loss": 3.6734,
893
+ "step": 117
894
+ },
895
+ {
896
+ "epoch": 0.7932773109243697,
897
+ "grad_norm": 4.5625,
898
+ "learning_rate": 6.862914510720515e-06,
899
+ "loss": 3.6013,
900
+ "step": 118
901
+ },
902
+ {
903
+ "epoch": 0.8,
904
+ "grad_norm": 4.4375,
905
+ "learning_rate": 6.812190191418508e-06,
906
+ "loss": 3.81,
907
+ "step": 119
908
+ },
909
+ {
910
+ "epoch": 0.8067226890756303,
911
+ "grad_norm": 4.15625,
912
+ "learning_rate": 6.7612502396061685e-06,
913
+ "loss": 3.6597,
914
+ "step": 120
915
+ },
916
+ {
917
+ "epoch": 0.8067226890756303,
918
+ "eval_loss": 3.696960926055908,
919
+ "eval_runtime": 29.7615,
920
+ "eval_samples_per_second": 42.303,
921
+ "eval_steps_per_second": 5.309,
922
+ "step": 120
923
+ },
924
+ {
925
+ "epoch": 0.8134453781512605,
926
+ "grad_norm": 4.125,
927
+ "learning_rate": 6.710100716628345e-06,
928
+ "loss": 3.7104,
929
+ "step": 121
930
+ },
931
+ {
932
+ "epoch": 0.8201680672268907,
933
+ "grad_norm": 4.5,
934
+ "learning_rate": 6.6587477087667615e-06,
935
+ "loss": 3.639,
936
+ "step": 122
937
+ },
938
+ {
939
+ "epoch": 0.826890756302521,
940
+ "grad_norm": 4.09375,
941
+ "learning_rate": 6.607197326515808e-06,
942
+ "loss": 3.6311,
943
+ "step": 123
944
+ },
945
+ {
946
+ "epoch": 0.8336134453781513,
947
+ "grad_norm": 4.5,
948
+ "learning_rate": 6.555455703855454e-06,
949
+ "loss": 3.7333,
950
+ "step": 124
951
+ },
952
+ {
953
+ "epoch": 0.8403361344537815,
954
+ "grad_norm": 4.375,
955
+ "learning_rate": 6.503528997521365e-06,
956
+ "loss": 3.7003,
957
+ "step": 125
958
+ },
959
+ {
960
+ "epoch": 0.8470588235294118,
961
+ "grad_norm": 4.375,
962
+ "learning_rate": 6.451423386272312e-06,
963
+ "loss": 3.6759,
964
+ "step": 126
965
+ },
966
+ {
967
+ "epoch": 0.853781512605042,
968
+ "grad_norm": 4.59375,
969
+ "learning_rate": 6.399145070154962e-06,
970
+ "loss": 3.6546,
971
+ "step": 127
972
+ },
973
+ {
974
+ "epoch": 0.8605042016806723,
975
+ "grad_norm": 3.984375,
976
+ "learning_rate": 6.346700269766132e-06,
977
+ "loss": 3.7089,
978
+ "step": 128
979
+ },
980
+ {
981
+ "epoch": 0.8672268907563025,
982
+ "grad_norm": 4.03125,
983
+ "learning_rate": 6.294095225512604e-06,
984
+ "loss": 3.5802,
985
+ "step": 129
986
+ },
987
+ {
988
+ "epoch": 0.8739495798319328,
989
+ "grad_norm": 4.0,
990
+ "learning_rate": 6.241336196868582e-06,
991
+ "loss": 3.7225,
992
+ "step": 130
993
+ },
994
+ {
995
+ "epoch": 0.880672268907563,
996
+ "grad_norm": 4.25,
997
+ "learning_rate": 6.188429461630866e-06,
998
+ "loss": 3.7397,
999
+ "step": 131
1000
+ },
1001
+ {
1002
+ "epoch": 0.8873949579831932,
1003
+ "grad_norm": 4.0625,
1004
+ "learning_rate": 6.135381315171867e-06,
1005
+ "loss": 3.6903,
1006
+ "step": 132
1007
+ },
1008
+ {
1009
+ "epoch": 0.8941176470588236,
1010
+ "grad_norm": 4.28125,
1011
+ "learning_rate": 6.0821980696905145e-06,
1012
+ "loss": 3.6114,
1013
+ "step": 133
1014
+ },
1015
+ {
1016
+ "epoch": 0.9008403361344538,
1017
+ "grad_norm": 4.03125,
1018
+ "learning_rate": 6.028886053461175e-06,
1019
+ "loss": 3.7576,
1020
+ "step": 134
1021
+ },
1022
+ {
1023
+ "epoch": 0.907563025210084,
1024
+ "grad_norm": 3.890625,
1025
+ "learning_rate": 5.975451610080643e-06,
1026
+ "loss": 3.6462,
1027
+ "step": 135
1028
+ },
1029
+ {
1030
+ "epoch": 0.907563025210084,
1031
+ "eval_loss": 3.706806182861328,
1032
+ "eval_runtime": 30.2476,
1033
+ "eval_samples_per_second": 41.623,
1034
+ "eval_steps_per_second": 5.224,
1035
+ "step": 135
1036
+ },
1037
+ {
1038
+ "epoch": 0.9142857142857143,
1039
+ "grad_norm": 5.0,
1040
+ "learning_rate": 5.921901097713317e-06,
1041
+ "loss": 3.6685,
1042
+ "step": 136
1043
+ },
1044
+ {
1045
+ "epoch": 0.9210084033613445,
1046
+ "grad_norm": 4.9375,
1047
+ "learning_rate": 5.8682408883346535e-06,
1048
+ "loss": 3.6868,
1049
+ "step": 137
1050
+ },
1051
+ {
1052
+ "epoch": 0.9277310924369748,
1053
+ "grad_norm": 4.15625,
1054
+ "learning_rate": 5.814477366972945e-06,
1055
+ "loss": 3.5962,
1056
+ "step": 138
1057
+ },
1058
+ {
1059
+ "epoch": 0.934453781512605,
1060
+ "grad_norm": 4.0625,
1061
+ "learning_rate": 5.760616930949584e-06,
1062
+ "loss": 3.6538,
1063
+ "step": 139
1064
+ },
1065
+ {
1066
+ "epoch": 0.9411764705882353,
1067
+ "grad_norm": 4.25,
1068
+ "learning_rate": 5.7066659891178385e-06,
1069
+ "loss": 3.7465,
1070
+ "step": 140
1071
+ },
1072
+ {
1073
+ "epoch": 0.9478991596638655,
1074
+ "grad_norm": 4.0625,
1075
+ "learning_rate": 5.65263096110026e-06,
1076
+ "loss": 3.6044,
1077
+ "step": 141
1078
+ },
1079
+ {
1080
+ "epoch": 0.9546218487394958,
1081
+ "grad_norm": 4.40625,
1082
+ "learning_rate": 5.598518276524813e-06,
1083
+ "loss": 3.6922,
1084
+ "step": 142
1085
+ },
1086
+ {
1087
+ "epoch": 0.9613445378151261,
1088
+ "grad_norm": 4.6875,
1089
+ "learning_rate": 5.544334374259823e-06,
1090
+ "loss": 3.6808,
1091
+ "step": 143
1092
+ },
1093
+ {
1094
+ "epoch": 0.9680672268907563,
1095
+ "grad_norm": 4.5,
1096
+ "learning_rate": 5.490085701647805e-06,
1097
+ "loss": 3.6849,
1098
+ "step": 144
1099
+ },
1100
+ {
1101
+ "epoch": 0.9747899159663865,
1102
+ "grad_norm": 7.125,
1103
+ "learning_rate": 5.435778713738292e-06,
1104
+ "loss": 3.7327,
1105
+ "step": 145
1106
+ },
1107
+ {
1108
+ "epoch": 0.9815126050420168,
1109
+ "grad_norm": 4.40625,
1110
+ "learning_rate": 5.381419872519763e-06,
1111
+ "loss": 3.7792,
1112
+ "step": 146
1113
+ },
1114
+ {
1115
+ "epoch": 0.9882352941176471,
1116
+ "grad_norm": 4.59375,
1117
+ "learning_rate": 5.327015646150716e-06,
1118
+ "loss": 3.8095,
1119
+ "step": 147
1120
+ },
1121
+ {
1122
+ "epoch": 0.9949579831932773,
1123
+ "grad_norm": 4.09375,
1124
+ "learning_rate": 5.272572508190033e-06,
1125
+ "loss": 3.5693,
1126
+ "step": 148
1127
+ },
1128
+ {
1129
+ "epoch": 1.0,
1130
+ "grad_norm": 4.84375,
1131
+ "learning_rate": 5.218096936826681e-06,
1132
+ "loss": 3.7536,
1133
+ "step": 149
1134
+ },
1135
+ {
1136
+ "epoch": 1.0067226890756302,
1137
+ "grad_norm": 4.21875,
1138
+ "learning_rate": 5.1635954141088815e-06,
1139
+ "loss": 3.7009,
1140
+ "step": 150
1141
+ },
1142
+ {
1143
+ "epoch": 1.0067226890756302,
1144
+ "eval_loss": 3.721317768096924,
1145
+ "eval_runtime": 29.8173,
1146
+ "eval_samples_per_second": 42.224,
1147
+ "eval_steps_per_second": 5.299,
1148
+ "step": 150
1149
+ },
1150
+ {
1151
+ "epoch": 1.0134453781512605,
1152
+ "grad_norm": 4.15625,
1153
+ "learning_rate": 5.109074425172806e-06,
1154
+ "loss": 3.7465,
1155
+ "step": 151
1156
+ },
1157
+ {
1158
+ "epoch": 1.0201680672268907,
1159
+ "grad_norm": 3.890625,
1160
+ "learning_rate": 5.054540457470912e-06,
1161
+ "loss": 3.71,
1162
+ "step": 152
1163
+ },
1164
+ {
1165
+ "epoch": 1.026890756302521,
1166
+ "grad_norm": 4.625,
1167
+ "learning_rate": 5e-06,
1168
+ "loss": 3.5906,
1169
+ "step": 153
1170
+ },
1171
+ {
1172
+ "epoch": 1.0336134453781514,
1173
+ "grad_norm": 3.859375,
1174
+ "learning_rate": 4.945459542529089e-06,
1175
+ "loss": 3.6227,
1176
+ "step": 154
1177
+ },
1178
+ {
1179
+ "epoch": 1.0403361344537816,
1180
+ "grad_norm": 4.71875,
1181
+ "learning_rate": 4.890925574827195e-06,
1182
+ "loss": 3.6398,
1183
+ "step": 155
1184
+ },
1185
+ {
1186
+ "epoch": 1.0470588235294118,
1187
+ "grad_norm": 4.71875,
1188
+ "learning_rate": 4.83640458589112e-06,
1189
+ "loss": 3.6522,
1190
+ "step": 156
1191
+ },
1192
+ {
1193
+ "epoch": 1.053781512605042,
1194
+ "grad_norm": 4.9375,
1195
+ "learning_rate": 4.781903063173321e-06,
1196
+ "loss": 3.7183,
1197
+ "step": 157
1198
+ },
1199
+ {
1200
+ "epoch": 1.0605042016806723,
1201
+ "grad_norm": 4.125,
1202
+ "learning_rate": 4.727427491809968e-06,
1203
+ "loss": 3.765,
1204
+ "step": 158
1205
+ },
1206
+ {
1207
+ "epoch": 1.0672268907563025,
1208
+ "grad_norm": 4.65625,
1209
+ "learning_rate": 4.672984353849285e-06,
1210
+ "loss": 3.6848,
1211
+ "step": 159
1212
+ },
1213
+ {
1214
+ "epoch": 1.0739495798319327,
1215
+ "grad_norm": 5.28125,
1216
+ "learning_rate": 4.618580127480239e-06,
1217
+ "loss": 3.7065,
1218
+ "step": 160
1219
+ },
1220
+ {
1221
+ "epoch": 1.080672268907563,
1222
+ "grad_norm": 4.46875,
1223
+ "learning_rate": 4.564221286261709e-06,
1224
+ "loss": 3.7159,
1225
+ "step": 161
1226
+ },
1227
+ {
1228
+ "epoch": 1.0873949579831932,
1229
+ "grad_norm": 4.75,
1230
+ "learning_rate": 4.509914298352197e-06,
1231
+ "loss": 3.7166,
1232
+ "step": 162
1233
+ },
1234
+ {
1235
+ "epoch": 1.0941176470588236,
1236
+ "grad_norm": 4.40625,
1237
+ "learning_rate": 4.4556656257401786e-06,
1238
+ "loss": 3.7179,
1239
+ "step": 163
1240
+ },
1241
+ {
1242
+ "epoch": 1.1008403361344539,
1243
+ "grad_norm": 4.75,
1244
+ "learning_rate": 4.401481723475189e-06,
1245
+ "loss": 3.6981,
1246
+ "step": 164
1247
+ },
1248
+ {
1249
+ "epoch": 1.107563025210084,
1250
+ "grad_norm": 4.3125,
1251
+ "learning_rate": 4.347369038899744e-06,
1252
+ "loss": 3.6717,
1253
+ "step": 165
1254
+ },
1255
+ {
1256
+ "epoch": 1.107563025210084,
1257
+ "eval_loss": 3.731348752975464,
1258
+ "eval_runtime": 30.2759,
1259
+ "eval_samples_per_second": 41.584,
1260
+ "eval_steps_per_second": 5.219,
1261
+ "step": 165
1262
+ },
1263
+ {
1264
+ "epoch": 1.1142857142857143,
1265
+ "grad_norm": 4.15625,
1266
+ "learning_rate": 4.293334010882164e-06,
1267
+ "loss": 3.7169,
1268
+ "step": 166
1269
+ },
1270
+ {
1271
+ "epoch": 1.1210084033613446,
1272
+ "grad_norm": 4.5625,
1273
+ "learning_rate": 4.239383069050417e-06,
1274
+ "loss": 3.7929,
1275
+ "step": 167
1276
+ },
1277
+ {
1278
+ "epoch": 1.1277310924369748,
1279
+ "grad_norm": 4.25,
1280
+ "learning_rate": 4.185522633027057e-06,
1281
+ "loss": 3.663,
1282
+ "step": 168
1283
+ },
1284
+ {
1285
+ "epoch": 1.134453781512605,
1286
+ "grad_norm": 4.71875,
1287
+ "learning_rate": 4.131759111665349e-06,
1288
+ "loss": 3.7563,
1289
+ "step": 169
1290
+ },
1291
+ {
1292
+ "epoch": 1.1411764705882352,
1293
+ "grad_norm": 4.0,
1294
+ "learning_rate": 4.078098902286684e-06,
1295
+ "loss": 3.6651,
1296
+ "step": 170
1297
+ },
1298
+ {
1299
+ "epoch": 1.1478991596638655,
1300
+ "grad_norm": 4.15625,
1301
+ "learning_rate": 4.02454838991936e-06,
1302
+ "loss": 3.6607,
1303
+ "step": 171
1304
+ },
1305
+ {
1306
+ "epoch": 1.1546218487394957,
1307
+ "grad_norm": 4.4375,
1308
+ "learning_rate": 3.971113946538826e-06,
1309
+ "loss": 3.7405,
1310
+ "step": 172
1311
+ },
1312
+ {
1313
+ "epoch": 1.1613445378151261,
1314
+ "grad_norm": 4.46875,
1315
+ "learning_rate": 3.917801930309486e-06,
1316
+ "loss": 3.7962,
1317
+ "step": 173
1318
+ },
1319
+ {
1320
+ "epoch": 1.1680672268907564,
1321
+ "grad_norm": 4.34375,
1322
+ "learning_rate": 3.864618684828135e-06,
1323
+ "loss": 3.645,
1324
+ "step": 174
1325
+ },
1326
+ {
1327
+ "epoch": 1.1747899159663866,
1328
+ "grad_norm": 4.15625,
1329
+ "learning_rate": 3.8115705383691354e-06,
1330
+ "loss": 3.6461,
1331
+ "step": 175
1332
+ },
1333
+ {
1334
+ "epoch": 1.1815126050420168,
1335
+ "grad_norm": 4.03125,
1336
+ "learning_rate": 3.7586638031314182e-06,
1337
+ "loss": 3.71,
1338
+ "step": 176
1339
+ },
1340
+ {
1341
+ "epoch": 1.188235294117647,
1342
+ "grad_norm": 8.6875,
1343
+ "learning_rate": 3.705904774487396e-06,
1344
+ "loss": 3.8202,
1345
+ "step": 177
1346
+ },
1347
+ {
1348
+ "epoch": 1.1949579831932773,
1349
+ "grad_norm": 4.09375,
1350
+ "learning_rate": 3.6532997302338704e-06,
1351
+ "loss": 3.7077,
1352
+ "step": 178
1353
+ },
1354
+ {
1355
+ "epoch": 1.2016806722689075,
1356
+ "grad_norm": 4.78125,
1357
+ "learning_rate": 3.6008549298450403e-06,
1358
+ "loss": 3.7005,
1359
+ "step": 179
1360
+ },
1361
+ {
1362
+ "epoch": 1.2084033613445377,
1363
+ "grad_norm": 4.09375,
1364
+ "learning_rate": 3.5485766137276894e-06,
1365
+ "loss": 3.7631,
1366
+ "step": 180
1367
+ },
1368
+ {
1369
+ "epoch": 1.2084033613445377,
1370
+ "eval_loss": 3.7338194847106934,
1371
+ "eval_runtime": 29.8219,
1372
+ "eval_samples_per_second": 42.217,
1373
+ "eval_steps_per_second": 5.298,
1374
+ "step": 180
1375
+ },
1376
+ {
1377
+ "epoch": 1.2151260504201682,
1378
+ "grad_norm": 4.03125,
1379
+ "learning_rate": 3.4964710024786354e-06,
1380
+ "loss": 3.6634,
1381
+ "step": 181
1382
+ },
1383
+ {
1384
+ "epoch": 1.2218487394957984,
1385
+ "grad_norm": 4.5625,
1386
+ "learning_rate": 3.444544296144546e-06,
1387
+ "loss": 3.747,
1388
+ "step": 182
1389
+ },
1390
+ {
1391
+ "epoch": 1.2285714285714286,
1392
+ "grad_norm": 4.9375,
1393
+ "learning_rate": 3.3928026734841935e-06,
1394
+ "loss": 3.6196,
1395
+ "step": 183
1396
+ },
1397
+ {
1398
+ "epoch": 1.2352941176470589,
1399
+ "grad_norm": 3.90625,
1400
+ "learning_rate": 3.341252291233241e-06,
1401
+ "loss": 3.6693,
1402
+ "step": 184
1403
+ },
1404
+ {
1405
+ "epoch": 1.242016806722689,
1406
+ "grad_norm": 4.21875,
1407
+ "learning_rate": 3.289899283371657e-06,
1408
+ "loss": 3.7271,
1409
+ "step": 185
1410
+ },
1411
+ {
1412
+ "epoch": 1.2487394957983193,
1413
+ "grad_norm": 4.21875,
1414
+ "learning_rate": 3.2387497603938327e-06,
1415
+ "loss": 3.678,
1416
+ "step": 186
1417
+ },
1418
+ {
1419
+ "epoch": 1.2554621848739496,
1420
+ "grad_norm": 4.0625,
1421
+ "learning_rate": 3.1878098085814926e-06,
1422
+ "loss": 3.702,
1423
+ "step": 187
1424
+ },
1425
+ {
1426
+ "epoch": 1.2621848739495798,
1427
+ "grad_norm": 4.09375,
1428
+ "learning_rate": 3.1370854892794855e-06,
1429
+ "loss": 3.7787,
1430
+ "step": 188
1431
+ },
1432
+ {
1433
+ "epoch": 1.26890756302521,
1434
+ "grad_norm": 4.4375,
1435
+ "learning_rate": 3.0865828381745515e-06,
1436
+ "loss": 3.6845,
1437
+ "step": 189
1438
+ },
1439
+ {
1440
+ "epoch": 1.2756302521008402,
1441
+ "grad_norm": 4.125,
1442
+ "learning_rate": 3.0363078645771303e-06,
1443
+ "loss": 3.6905,
1444
+ "step": 190
1445
+ },
1446
+ {
1447
+ "epoch": 1.2823529411764705,
1448
+ "grad_norm": 4.28125,
1449
+ "learning_rate": 2.986266550706315e-06,
1450
+ "loss": 3.6823,
1451
+ "step": 191
1452
+ },
1453
+ {
1454
+ "epoch": 1.289075630252101,
1455
+ "grad_norm": 4.6875,
1456
+ "learning_rate": 2.936464850978027e-06,
1457
+ "loss": 3.7313,
1458
+ "step": 192
1459
+ },
1460
+ {
1461
+ "epoch": 1.2957983193277312,
1462
+ "grad_norm": 4.09375,
1463
+ "learning_rate": 2.886908691296504e-06,
1464
+ "loss": 3.7439,
1465
+ "step": 193
1466
+ },
1467
+ {
1468
+ "epoch": 1.3025210084033614,
1469
+ "grad_norm": 3.828125,
1470
+ "learning_rate": 2.8376039683491683e-06,
1471
+ "loss": 3.7323,
1472
+ "step": 194
1473
+ },
1474
+ {
1475
+ "epoch": 1.3092436974789916,
1476
+ "grad_norm": 4.0625,
1477
+ "learning_rate": 2.7885565489049948e-06,
1478
+ "loss": 3.7535,
1479
+ "step": 195
1480
+ },
1481
+ {
1482
+ "epoch": 1.3092436974789916,
1483
+ "eval_loss": 3.734619379043579,
1484
+ "eval_runtime": 30.2166,
1485
+ "eval_samples_per_second": 41.666,
1486
+ "eval_steps_per_second": 5.229,
1487
+ "step": 195
1488
+ },
1489
+ {
1490
+ "epoch": 1.3159663865546218,
1491
+ "grad_norm": 4.125,
1492
+ "learning_rate": 2.739772269116402e-06,
1493
+ "loss": 3.6891,
1494
+ "step": 196
1495
+ },
1496
+ {
1497
+ "epoch": 1.322689075630252,
1498
+ "grad_norm": 4.34375,
1499
+ "learning_rate": 2.6912569338248317e-06,
1500
+ "loss": 3.7449,
1501
+ "step": 197
1502
+ },
1503
+ {
1504
+ "epoch": 1.3294117647058823,
1505
+ "grad_norm": 3.96875,
1506
+ "learning_rate": 2.6430163158700116e-06,
1507
+ "loss": 3.6608,
1508
+ "step": 198
1509
+ },
1510
+ {
1511
+ "epoch": 1.3361344537815127,
1512
+ "grad_norm": 4.75,
1513
+ "learning_rate": 2.595056155403063e-06,
1514
+ "loss": 3.7449,
1515
+ "step": 199
1516
+ },
1517
+ {
1518
+ "epoch": 1.342857142857143,
1519
+ "grad_norm": 4.0,
1520
+ "learning_rate": 2.5473821592034604e-06,
1521
+ "loss": 3.7139,
1522
+ "step": 200
1523
+ },
1524
+ {
1525
+ "epoch": 1.3495798319327732,
1526
+ "grad_norm": 4.59375,
1527
+ "learning_rate": 2.5000000000000015e-06,
1528
+ "loss": 3.7823,
1529
+ "step": 201
1530
+ },
1531
+ {
1532
+ "epoch": 1.3563025210084034,
1533
+ "grad_norm": 4.21875,
1534
+ "learning_rate": 2.4529153157957913e-06,
1535
+ "loss": 3.7754,
1536
+ "step": 202
1537
+ },
1538
+ {
1539
+ "epoch": 1.3630252100840337,
1540
+ "grad_norm": 4.25,
1541
+ "learning_rate": 2.406133709197392e-06,
1542
+ "loss": 3.7373,
1543
+ "step": 203
1544
+ },
1545
+ {
1546
+ "epoch": 1.3697478991596639,
1547
+ "grad_norm": 4.125,
1548
+ "learning_rate": 2.3596607467481602e-06,
1549
+ "loss": 3.7617,
1550
+ "step": 204
1551
+ },
1552
+ {
1553
+ "epoch": 1.3764705882352941,
1554
+ "grad_norm": 4.15625,
1555
+ "learning_rate": 2.3135019582658803e-06,
1556
+ "loss": 3.7332,
1557
+ "step": 205
1558
+ },
1559
+ {
1560
+ "epoch": 1.3831932773109243,
1561
+ "grad_norm": 4.15625,
1562
+ "learning_rate": 2.2676628361847834e-06,
1563
+ "loss": 3.639,
1564
+ "step": 206
1565
+ },
1566
+ {
1567
+ "epoch": 1.3899159663865546,
1568
+ "grad_norm": 4.25,
1569
+ "learning_rate": 2.2221488349019903e-06,
1570
+ "loss": 3.6918,
1571
+ "step": 207
1572
+ },
1573
+ {
1574
+ "epoch": 1.3966386554621848,
1575
+ "grad_norm": 4.0625,
1576
+ "learning_rate": 2.1769653701284983e-06,
1577
+ "loss": 3.622,
1578
+ "step": 208
1579
+ },
1580
+ {
1581
+ "epoch": 1.403361344537815,
1582
+ "grad_norm": 4.53125,
1583
+ "learning_rate": 2.132117818244771e-06,
1584
+ "loss": 3.7286,
1585
+ "step": 209
1586
+ },
1587
+ {
1588
+ "epoch": 1.4100840336134453,
1589
+ "grad_norm": 4.0625,
1590
+ "learning_rate": 2.08761151566099e-06,
1591
+ "loss": 3.668,
1592
+ "step": 210
1593
+ },
1594
+ {
1595
+ "epoch": 1.4100840336134453,
1596
+ "eval_loss": 3.7375030517578125,
1597
+ "eval_runtime": 29.7795,
1598
+ "eval_samples_per_second": 42.277,
1599
+ "eval_steps_per_second": 5.306,
1600
+ "step": 210
1601
+ },
1602
+ {
1603
+ "epoch": 1.4168067226890757,
1604
+ "grad_norm": 4.21875,
1605
+ "learning_rate": 2.0434517581820893e-06,
1606
+ "loss": 3.7376,
1607
+ "step": 211
1608
+ },
1609
+ {
1610
+ "epoch": 1.423529411764706,
1611
+ "grad_norm": 3.984375,
1612
+ "learning_rate": 1.999643800377596e-06,
1613
+ "loss": 3.7108,
1614
+ "step": 212
1615
+ },
1616
+ {
1617
+ "epoch": 1.4302521008403362,
1618
+ "grad_norm": 3.734375,
1619
+ "learning_rate": 1.956192854956397e-06,
1620
+ "loss": 3.6391,
1621
+ "step": 213
1622
+ },
1623
+ {
1624
+ "epoch": 1.4369747899159664,
1625
+ "grad_norm": 4.0,
1626
+ "learning_rate": 1.913104092146476e-06,
1627
+ "loss": 3.6956,
1628
+ "step": 214
1629
+ },
1630
+ {
1631
+ "epoch": 1.4436974789915966,
1632
+ "grad_norm": 4.375,
1633
+ "learning_rate": 1.8703826390797047e-06,
1634
+ "loss": 3.6241,
1635
+ "step": 215
1636
+ },
1637
+ {
1638
+ "epoch": 1.4504201680672268,
1639
+ "grad_norm": 4.125,
1640
+ "learning_rate": 1.8280335791817733e-06,
1641
+ "loss": 3.7801,
1642
+ "step": 216
1643
+ },
1644
+ {
1645
+ "epoch": 1.457142857142857,
1646
+ "grad_norm": 4.0,
1647
+ "learning_rate": 1.7860619515673034e-06,
1648
+ "loss": 3.6977,
1649
+ "step": 217
1650
+ },
1651
+ {
1652
+ "epoch": 1.4638655462184875,
1653
+ "grad_norm": 4.71875,
1654
+ "learning_rate": 1.7444727504402554e-06,
1655
+ "loss": 3.6897,
1656
+ "step": 218
1657
+ },
1658
+ {
1659
+ "epoch": 1.4705882352941178,
1660
+ "grad_norm": 4.65625,
1661
+ "learning_rate": 1.7032709244996559e-06,
1662
+ "loss": 3.6878,
1663
+ "step": 219
1664
+ },
1665
+ {
1666
+ "epoch": 1.477310924369748,
1667
+ "grad_norm": 5.21875,
1668
+ "learning_rate": 1.662461376350764e-06,
1669
+ "loss": 3.7517,
1670
+ "step": 220
1671
+ },
1672
+ {
1673
+ "epoch": 1.4840336134453782,
1674
+ "grad_norm": 4.53125,
1675
+ "learning_rate": 1.6220489619216988e-06,
1676
+ "loss": 3.7621,
1677
+ "step": 221
1678
+ },
1679
+ {
1680
+ "epoch": 1.4907563025210084,
1681
+ "grad_norm": 3.984375,
1682
+ "learning_rate": 1.5820384898856433e-06,
1683
+ "loss": 3.7284,
1684
+ "step": 222
1685
+ },
1686
+ {
1687
+ "epoch": 1.4974789915966387,
1688
+ "grad_norm": 4.3125,
1689
+ "learning_rate": 1.5424347210886538e-06,
1690
+ "loss": 3.6888,
1691
+ "step": 223
1692
+ },
1693
+ {
1694
+ "epoch": 1.504201680672269,
1695
+ "grad_norm": 4.0625,
1696
+ "learning_rate": 1.5032423679831642e-06,
1697
+ "loss": 3.705,
1698
+ "step": 224
1699
+ },
1700
+ {
1701
+ "epoch": 1.5109243697478991,
1702
+ "grad_norm": 3.765625,
1703
+ "learning_rate": 1.4644660940672628e-06,
1704
+ "loss": 3.679,
1705
+ "step": 225
1706
+ },
1707
+ {
1708
+ "epoch": 1.5109243697478991,
1709
+ "eval_loss": 3.7383294105529785,
1710
+ "eval_runtime": 30.258,
1711
+ "eval_samples_per_second": 41.609,
1712
+ "eval_steps_per_second": 5.222,
1713
+ "step": 225
1714
+ },
1715
+ {
1716
+ "epoch": 1.5176470588235293,
1717
+ "grad_norm": 4.0625,
1718
+ "learning_rate": 1.4261105133297693e-06,
1719
+ "loss": 3.6644,
1720
+ "step": 226
1721
+ },
1722
+ {
1723
+ "epoch": 1.5243697478991596,
1724
+ "grad_norm": 4.28125,
1725
+ "learning_rate": 1.3881801897012225e-06,
1726
+ "loss": 3.6869,
1727
+ "step": 227
1728
+ },
1729
+ {
1730
+ "epoch": 1.5310924369747898,
1731
+ "grad_norm": 4.4375,
1732
+ "learning_rate": 1.3506796365108232e-06,
1733
+ "loss": 3.6292,
1734
+ "step": 228
1735
+ },
1736
+ {
1737
+ "epoch": 1.53781512605042,
1738
+ "grad_norm": 4.09375,
1739
+ "learning_rate": 1.3136133159493803e-06,
1740
+ "loss": 3.6962,
1741
+ "step": 229
1742
+ },
1743
+ {
1744
+ "epoch": 1.5445378151260503,
1745
+ "grad_norm": 4.125,
1746
+ "learning_rate": 1.2769856385383689e-06,
1747
+ "loss": 3.8197,
1748
+ "step": 230
1749
+ },
1750
+ {
1751
+ "epoch": 1.5512605042016807,
1752
+ "grad_norm": 4.21875,
1753
+ "learning_rate": 1.2408009626051137e-06,
1754
+ "loss": 3.7204,
1755
+ "step": 231
1756
+ },
1757
+ {
1758
+ "epoch": 1.557983193277311,
1759
+ "grad_norm": 4.28125,
1760
+ "learning_rate": 1.2050635937641909e-06,
1761
+ "loss": 3.7022,
1762
+ "step": 232
1763
+ },
1764
+ {
1765
+ "epoch": 1.5647058823529412,
1766
+ "grad_norm": 5.125,
1767
+ "learning_rate": 1.1697777844051105e-06,
1768
+ "loss": 3.6865,
1769
+ "step": 233
1770
+ },
1771
+ {
1772
+ "epoch": 1.5714285714285714,
1773
+ "grad_norm": 4.0625,
1774
+ "learning_rate": 1.134947733186315e-06,
1775
+ "loss": 3.6203,
1776
+ "step": 234
1777
+ },
1778
+ {
1779
+ "epoch": 1.5781512605042018,
1780
+ "grad_norm": 4.34375,
1781
+ "learning_rate": 1.100577584535592e-06,
1782
+ "loss": 3.7241,
1783
+ "step": 235
1784
+ },
1785
+ {
1786
+ "epoch": 1.584873949579832,
1787
+ "grad_norm": 3.84375,
1788
+ "learning_rate": 1.0666714281569152e-06,
1789
+ "loss": 3.5546,
1790
+ "step": 236
1791
+ },
1792
+ {
1793
+ "epoch": 1.5915966386554623,
1794
+ "grad_norm": 4.03125,
1795
+ "learning_rate": 1.0332332985438248e-06,
1796
+ "loss": 3.7072,
1797
+ "step": 237
1798
+ },
1799
+ {
1800
+ "epoch": 1.5983193277310925,
1801
+ "grad_norm": 4.96875,
1802
+ "learning_rate": 1.0002671744993519e-06,
1803
+ "loss": 3.8113,
1804
+ "step": 238
1805
+ },
1806
+ {
1807
+ "epoch": 1.6050420168067228,
1808
+ "grad_norm": 3.953125,
1809
+ "learning_rate": 9.677769786625869e-07,
1810
+ "loss": 3.7273,
1811
+ "step": 239
1812
+ },
1813
+ {
1814
+ "epoch": 1.611764705882353,
1815
+ "grad_norm": 4.3125,
1816
+ "learning_rate": 9.357665770419244e-07,
1817
+ "loss": 3.6539,
1818
+ "step": 240
1819
+ },
1820
+ {
1821
+ "epoch": 1.611764705882353,
1822
+ "eval_loss": 3.7385716438293457,
1823
+ "eval_runtime": 29.7733,
1824
+ "eval_samples_per_second": 42.286,
1825
+ "eval_steps_per_second": 5.307,
1826
+ "step": 240
1827
+ },
1828
+ {
1829
+ "epoch": 1.6184873949579832,
1830
+ "grad_norm": 4.0,
1831
+ "learning_rate": 9.042397785550405e-07,
1832
+ "loss": 3.6896,
1833
+ "step": 241
1834
+ },
1835
+ {
1836
+ "epoch": 1.6252100840336134,
1837
+ "grad_norm": 4.0,
1838
+ "learning_rate": 8.732003345756812e-07,
1839
+ "loss": 3.6619,
1840
+ "step": 242
1841
+ },
1842
+ {
1843
+ "epoch": 1.6319327731092437,
1844
+ "grad_norm": 4.09375,
1845
+ "learning_rate": 8.426519384872733e-07,
1846
+ "loss": 3.7638,
1847
+ "step": 243
1848
+ },
1849
+ {
1850
+ "epoch": 1.638655462184874,
1851
+ "grad_norm": 3.703125,
1852
+ "learning_rate": 8.125982252434611e-07,
1853
+ "loss": 3.685,
1854
+ "step": 244
1855
+ },
1856
+ {
1857
+ "epoch": 1.6453781512605041,
1858
+ "grad_norm": 4.21875,
1859
+ "learning_rate": 7.830427709355726e-07,
1860
+ "loss": 3.6325,
1861
+ "step": 245
1862
+ },
1863
+ {
1864
+ "epoch": 1.6521008403361344,
1865
+ "grad_norm": 4.28125,
1866
+ "learning_rate": 7.539890923671061e-07,
1867
+ "loss": 3.7295,
1868
+ "step": 246
1869
+ },
1870
+ {
1871
+ "epoch": 1.6588235294117646,
1872
+ "grad_norm": 4.09375,
1873
+ "learning_rate": 7.254406466352682e-07,
1874
+ "loss": 3.6432,
1875
+ "step": 247
1876
+ },
1877
+ {
1878
+ "epoch": 1.6655462184873948,
1879
+ "grad_norm": 5.09375,
1880
+ "learning_rate": 6.974008307196057e-07,
1881
+ "loss": 3.729,
1882
+ "step": 248
1883
+ },
1884
+ {
1885
+ "epoch": 1.6722689075630253,
1886
+ "grad_norm": 4.09375,
1887
+ "learning_rate": 6.698729810778065e-07,
1888
+ "loss": 3.6502,
1889
+ "step": 249
1890
+ },
1891
+ {
1892
+ "epoch": 1.6789915966386555,
1893
+ "grad_norm": 3.96875,
1894
+ "learning_rate": 6.428603732486938e-07,
1895
+ "loss": 3.6288,
1896
+ "step": 250
1897
+ },
1898
+ {
1899
+ "epoch": 1.6857142857142857,
1900
+ "grad_norm": 3.84375,
1901
+ "learning_rate": 6.163662214624616e-07,
1902
+ "loss": 3.6903,
1903
+ "step": 251
1904
+ },
1905
+ {
1906
+ "epoch": 1.692436974789916,
1907
+ "grad_norm": 4.34375,
1908
+ "learning_rate": 5.903936782582253e-07,
1909
+ "loss": 3.7859,
1910
+ "step": 252
1911
+ },
1912
+ {
1913
+ "epoch": 1.6991596638655462,
1914
+ "grad_norm": 4.125,
1915
+ "learning_rate": 5.649458341088915e-07,
1916
+ "loss": 3.7541,
1917
+ "step": 253
1918
+ },
1919
+ {
1920
+ "epoch": 1.7058823529411766,
1921
+ "grad_norm": 4.0625,
1922
+ "learning_rate": 5.400257170534296e-07,
1923
+ "loss": 3.7466,
1924
+ "step": 254
1925
+ },
1926
+ {
1927
+ "epoch": 1.7126050420168069,
1928
+ "grad_norm": 4.125,
1929
+ "learning_rate": 5.156362923365587e-07,
1930
+ "loss": 3.6547,
1931
+ "step": 255
1932
+ },
1933
+ {
1934
+ "epoch": 1.7126050420168069,
1935
+ "eval_loss": 3.738647222518921,
1936
+ "eval_runtime": 30.241,
1937
+ "eval_samples_per_second": 41.632,
1938
+ "eval_steps_per_second": 5.225,
1939
+ "step": 255
1940
+ },
1941
+ {
1942
+ "epoch": 1.719327731092437,
1943
+ "grad_norm": 3.78125,
1944
+ "learning_rate": 4.917804620559202e-07,
1945
+ "loss": 3.6395,
1946
+ "step": 256
1947
+ },
1948
+ {
1949
+ "epoch": 1.7260504201680673,
1950
+ "grad_norm": 4.25,
1951
+ "learning_rate": 4.6846106481675035e-07,
1952
+ "loss": 3.7057,
1953
+ "step": 257
1954
+ },
1955
+ {
1956
+ "epoch": 1.7327731092436975,
1957
+ "grad_norm": 4.125,
1958
+ "learning_rate": 4.456808753941205e-07,
1959
+ "loss": 3.7292,
1960
+ "step": 258
1961
+ },
1962
+ {
1963
+ "epoch": 1.7394957983193278,
1964
+ "grad_norm": 4.3125,
1965
+ "learning_rate": 4.2344260440276455e-07,
1966
+ "loss": 3.7007,
1967
+ "step": 259
1968
+ },
1969
+ {
1970
+ "epoch": 1.746218487394958,
1971
+ "grad_norm": 4.09375,
1972
+ "learning_rate": 4.0174889797453875e-07,
1973
+ "loss": 3.744,
1974
+ "step": 260
1975
+ },
1976
+ {
1977
+ "epoch": 1.7529411764705882,
1978
+ "grad_norm": 4.3125,
1979
+ "learning_rate": 3.8060233744356634e-07,
1980
+ "loss": 3.8662,
1981
+ "step": 261
1982
+ },
1983
+ {
1984
+ "epoch": 1.7596638655462185,
1985
+ "grad_norm": 4.3125,
1986
+ "learning_rate": 3.600054390390778e-07,
1987
+ "loss": 3.8242,
1988
+ "step": 262
1989
+ },
1990
+ {
1991
+ "epoch": 1.7663865546218487,
1992
+ "grad_norm": 3.921875,
1993
+ "learning_rate": 3.399606535860078e-07,
1994
+ "loss": 3.6502,
1995
+ "step": 263
1996
+ },
1997
+ {
1998
+ "epoch": 1.773109243697479,
1999
+ "grad_norm": 3.9375,
2000
+ "learning_rate": 3.204703662133724e-07,
2001
+ "loss": 3.6803,
2002
+ "step": 264
2003
+ },
2004
+ {
2005
+ "epoch": 1.7798319327731091,
2006
+ "grad_norm": 4.90625,
2007
+ "learning_rate": 3.015368960704584e-07,
2008
+ "loss": 3.7613,
2009
+ "step": 265
2010
+ },
2011
+ {
2012
+ "epoch": 1.7865546218487394,
2013
+ "grad_norm": 4.3125,
2014
+ "learning_rate": 2.8316249605087386e-07,
2015
+ "loss": 3.7316,
2016
+ "step": 266
2017
+ },
2018
+ {
2019
+ "epoch": 1.7932773109243696,
2020
+ "grad_norm": 4.125,
2021
+ "learning_rate": 2.653493525244721e-07,
2022
+ "loss": 3.6491,
2023
+ "step": 267
2024
+ },
2025
+ {
2026
+ "epoch": 1.8,
2027
+ "grad_norm": 4.15625,
2028
+ "learning_rate": 2.4809958507719444e-07,
2029
+ "loss": 3.8626,
2030
+ "step": 268
2031
+ },
2032
+ {
2033
+ "epoch": 1.8067226890756303,
2034
+ "grad_norm": 3.9375,
2035
+ "learning_rate": 2.314152462588659e-07,
2036
+ "loss": 3.7007,
2037
+ "step": 269
2038
+ },
2039
+ {
2040
+ "epoch": 1.8134453781512605,
2041
+ "grad_norm": 3.921875,
2042
+ "learning_rate": 2.152983213389559e-07,
2043
+ "loss": 3.7533,
2044
+ "step": 270
2045
+ },
2046
+ {
2047
+ "epoch": 1.8134453781512605,
2048
+ "eval_loss": 3.7400190830230713,
2049
+ "eval_runtime": 29.7815,
2050
+ "eval_samples_per_second": 42.275,
2051
+ "eval_steps_per_second": 5.305,
2052
+ "step": 270
2053
+ },
2054
+ {
2055
+ "epoch": 1.8201680672268907,
2056
+ "grad_norm": 4.15625,
2057
+ "learning_rate": 1.99750728070357e-07,
2058
+ "loss": 3.6811,
2059
+ "step": 271
2060
+ },
2061
+ {
2062
+ "epoch": 1.826890756302521,
2063
+ "grad_norm": 3.921875,
2064
+ "learning_rate": 1.8477431646118648e-07,
2065
+ "loss": 3.6697,
2066
+ "step": 272
2067
+ },
2068
+ {
2069
+ "epoch": 1.8336134453781514,
2070
+ "grad_norm": 4.375,
2071
+ "learning_rate": 1.7037086855465902e-07,
2072
+ "loss": 3.7755,
2073
+ "step": 273
2074
+ },
2075
+ {
2076
+ "epoch": 1.8403361344537816,
2077
+ "grad_norm": 4.34375,
2078
+ "learning_rate": 1.5654209821703458e-07,
2079
+ "loss": 3.7415,
2080
+ "step": 274
2081
+ },
2082
+ {
2083
+ "epoch": 1.8470588235294119,
2084
+ "grad_norm": 4.03125,
2085
+ "learning_rate": 1.4328965093369284e-07,
2086
+ "loss": 3.7171,
2087
+ "step": 275
2088
+ },
2089
+ {
2090
+ "epoch": 1.853781512605042,
2091
+ "grad_norm": 4.28125,
2092
+ "learning_rate": 1.3061510361333186e-07,
2093
+ "loss": 3.692,
2094
+ "step": 276
2095
+ },
2096
+ {
2097
+ "epoch": 1.8605042016806723,
2098
+ "grad_norm": 3.9375,
2099
+ "learning_rate": 1.185199644003332e-07,
2100
+ "loss": 3.7456,
2101
+ "step": 277
2102
+ },
2103
+ {
2104
+ "epoch": 1.8672268907563025,
2105
+ "grad_norm": 3.875,
2106
+ "learning_rate": 1.0700567249530835e-07,
2107
+ "loss": 3.6095,
2108
+ "step": 278
2109
+ },
2110
+ {
2111
+ "epoch": 1.8739495798319328,
2112
+ "grad_norm": 3.90625,
2113
+ "learning_rate": 9.607359798384785e-08,
2114
+ "loss": 3.759,
2115
+ "step": 279
2116
+ },
2117
+ {
2118
+ "epoch": 1.880672268907563,
2119
+ "grad_norm": 3.984375,
2120
+ "learning_rate": 8.572504167349449e-08,
2121
+ "loss": 3.776,
2122
+ "step": 280
2123
+ },
2124
+ {
2125
+ "epoch": 1.8873949579831932,
2126
+ "grad_norm": 4.0625,
2127
+ "learning_rate": 7.59612349389599e-08,
2128
+ "loss": 3.7251,
2129
+ "step": 281
2130
+ },
2131
+ {
2132
+ "epoch": 1.8941176470588235,
2133
+ "grad_norm": 4.0625,
2134
+ "learning_rate": 6.678333957560513e-08,
2135
+ "loss": 3.6457,
2136
+ "step": 282
2137
+ },
2138
+ {
2139
+ "epoch": 1.9008403361344537,
2140
+ "grad_norm": 3.984375,
2141
+ "learning_rate": 5.8192447661196694e-08,
2142
+ "loss": 3.7916,
2143
+ "step": 283
2144
+ },
2145
+ {
2146
+ "epoch": 1.907563025210084,
2147
+ "grad_norm": 3.84375,
2148
+ "learning_rate": 5.0189581425960644e-08,
2149
+ "loss": 3.6759,
2150
+ "step": 284
2151
+ },
2152
+ {
2153
+ "epoch": 1.9142857142857141,
2154
+ "grad_norm": 4.5625,
2155
+ "learning_rate": 4.2775693130948094e-08,
2156
+ "loss": 3.6983,
2157
+ "step": 285
2158
+ },
2159
+ {
2160
+ "epoch": 1.9142857142857141,
2161
+ "eval_loss": 3.7386996746063232,
2162
+ "eval_runtime": 30.2618,
2163
+ "eval_samples_per_second": 41.604,
2164
+ "eval_steps_per_second": 5.221,
2165
+ "step": 285
2166
+ },
2167
+ {
2168
+ "epoch": 1.9210084033613444,
2169
+ "grad_norm": 4.59375,
2170
+ "learning_rate": 3.59516649547248e-08,
2171
+ "loss": 3.7151,
2172
+ "step": 286
2173
+ },
2174
+ {
2175
+ "epoch": 1.9277310924369748,
2176
+ "grad_norm": 4.0,
2177
+ "learning_rate": 2.971830888840177e-08,
2178
+ "loss": 3.6223,
2179
+ "step": 287
2180
+ },
2181
+ {
2182
+ "epoch": 1.934453781512605,
2183
+ "grad_norm": 4.03125,
2184
+ "learning_rate": 2.4076366639015914e-08,
2185
+ "loss": 3.6781,
2186
+ "step": 288
2187
+ },
2188
+ {
2189
+ "epoch": 1.9411764705882353,
2190
+ "grad_norm": 4.21875,
2191
+ "learning_rate": 1.9026509541272276e-08,
2192
+ "loss": 3.7715,
2193
+ "step": 289
2194
+ },
2195
+ {
2196
+ "epoch": 1.9478991596638655,
2197
+ "grad_norm": 4.0,
2198
+ "learning_rate": 1.4569338477666838e-08,
2199
+ "loss": 3.6257,
2200
+ "step": 290
2201
+ },
2202
+ {
2203
+ "epoch": 1.954621848739496,
2204
+ "grad_norm": 4.125,
2205
+ "learning_rate": 1.0705383806982606e-08,
2206
+ "loss": 3.7141,
2207
+ "step": 291
2208
+ },
2209
+ {
2210
+ "epoch": 1.9613445378151262,
2211
+ "grad_norm": 4.34375,
2212
+ "learning_rate": 7.43510530118452e-09,
2213
+ "loss": 3.6997,
2214
+ "step": 292
2215
+ },
2216
+ {
2217
+ "epoch": 1.9680672268907564,
2218
+ "grad_norm": 4.21875,
2219
+ "learning_rate": 4.758892090711009e-09,
2220
+ "loss": 3.7004,
2221
+ "step": 293
2222
+ },
2223
+ {
2224
+ "epoch": 1.9747899159663866,
2225
+ "grad_norm": 6.25,
2226
+ "learning_rate": 2.6770626181715776e-09,
2227
+ "loss": 3.7454,
2228
+ "step": 294
2229
+ },
2230
+ {
2231
+ "epoch": 1.9815126050420169,
2232
+ "grad_norm": 4.5625,
2233
+ "learning_rate": 1.189864600454338e-09,
2234
+ "loss": 3.7919,
2235
+ "step": 295
2236
+ },
2237
+ {
2238
+ "epoch": 1.988235294117647,
2239
+ "grad_norm": 4.1875,
2240
+ "learning_rate": 2.974749992512571e-10,
2241
+ "loss": 3.8223,
2242
+ "step": 296
2243
+ }
2244
+ ],
2245
+ "logging_steps": 1,
2246
+ "max_steps": 296,
2247
+ "num_input_tokens_seen": 0,
2248
+ "num_train_epochs": 2,
2249
+ "save_steps": 15,
2250
+ "stateful_callbacks": {
2251
+ "TrainerControl": {
2252
+ "args": {
2253
+ "should_epoch_stop": false,
2254
+ "should_evaluate": false,
2255
+ "should_log": false,
2256
+ "should_save": true,
2257
+ "should_training_stop": true
2258
+ },
2259
+ "attributes": {}
2260
+ }
2261
+ },
2262
+ "total_flos": 9.333328305140531e+16,
2263
+ "train_batch_size": 4,
2264
+ "trial_name": null,
2265
+ "trial_params": null
2266
+ }
checkpoint-296/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ab91e38a5656a33a455ef4f7acd62dbd99d6a91529c45ba81572db9a3246c08
3
+ size 6993
config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_sliding_window_pattern": 6,
3
+ "architectures": [
4
+ "Gemma3ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "attn_logit_softcapping": null,
9
+ "bos_token_id": 2,
10
+ "cache_implementation": "hybrid",
11
+ "eos_token_id": 106,
12
+ "final_logit_softcapping": null,
13
+ "head_dim": 256,
14
+ "hidden_activation": "gelu_pytorch_tanh",
15
+ "hidden_size": 640,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 2048,
18
+ "layer_types": [
19
+ "sliding_attention",
20
+ "sliding_attention",
21
+ "sliding_attention",
22
+ "sliding_attention",
23
+ "sliding_attention",
24
+ "full_attention",
25
+ "sliding_attention",
26
+ "sliding_attention",
27
+ "sliding_attention",
28
+ "sliding_attention",
29
+ "sliding_attention",
30
+ "full_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "sliding_attention",
36
+ "full_attention"
37
+ ],
38
+ "max_position_embeddings": 32768,
39
+ "model_type": "gemma3_text",
40
+ "num_attention_heads": 4,
41
+ "num_hidden_layers": 18,
42
+ "num_key_value_heads": 1,
43
+ "pad_token_id": 0,
44
+ "query_pre_attn_scalar": 256,
45
+ "rms_norm_eps": 1e-06,
46
+ "rope_local_base_freq": 10000.0,
47
+ "rope_scaling": null,
48
+ "rope_theta": 1000000.0,
49
+ "sliding_window": 512,
50
+ "sliding_window_pattern": 6,
51
+ "torch_dtype": "bfloat16",
52
+ "transformers_version": "4.52.4",
53
+ "unsloth_fixed": true,
54
+ "use_bidirectional_attention": false,
55
+ "use_cache": false,
56
+ "vocab_size": 262145
57
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "cache_implementation": "hybrid",
4
+ "do_sample": true,
5
+ "eos_token_id": [
6
+ 1,
7
+ 106
8
+ ],
9
+ "max_length": 32768,
10
+ "pad_token_id": 0,
11
+ "top_k": 64,
12
+ "top_p": 0.95,
13
+ "transformers_version": "4.52.4"
14
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df5b41c0ad86b13aa2e31015b2fa37db700a177ccac2c390d5dcda7424957730
3
+ size 536224336
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<end_of_turn>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ab91e38a5656a33a455ef4f7acd62dbd99d6a91529c45ba81572db9a3246c08
3
+ size 6993