CarnageOP10 commited on
Commit
5206f12
·
verified ·
1 Parent(s): 20fe38d

Upload 8 files

Browse files
config.json ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Idefics3ForConditionalGeneration"
4
+ ],
5
+ "dtype": "float16",
6
+ "image_seq_len": 81,
7
+ "image_token_id": 49153,
8
+ "model_type": "idefics3",
9
+ "pad_token_id": 128002,
10
+ "scale_factor": 3,
11
+ "text_config": {
12
+ "_attn_implementation_autoset": false,
13
+ "_flash_attn_2_enabled": true,
14
+ "_name_or_path": "/fsx/m4/experiments/local_experiment_dir/s3_async_temporary_checkpoint_folder/tr_324_opt_400/unwrapped_model",
15
+ "architectures": [
16
+ "VLlama3ForCausalLM"
17
+ ],
18
+ "attention_bias": false,
19
+ "attention_dropout": 0.0,
20
+ "bos_token_id": 0,
21
+ "dtype": "float16",
22
+ "eos_token_id": 0,
23
+ "head_dim": 64,
24
+ "hidden_act": "silu",
25
+ "hidden_size": 2048,
26
+ "initializer_range": 0.02,
27
+ "intermediate_size": 8192,
28
+ "max_position_embeddings": 16384,
29
+ "mlp_bias": false,
30
+ "model_type": "llama",
31
+ "neftune_noise_alpha": 0.0,
32
+ "num_attention_heads": 32,
33
+ "num_hidden_layers": 24,
34
+ "num_key_value_heads": 32,
35
+ "pad_token_id": 2,
36
+ "perceiver_config": {
37
+ "_attn_implementation_autoset": false,
38
+ "_name_or_path": "",
39
+ "add_cross_attention": false,
40
+ "architectures": null,
41
+ "attention_dropout": 0.0,
42
+ "bad_words_ids": null,
43
+ "begin_suppress_tokens": null,
44
+ "bos_token_id": null,
45
+ "chunk_size_feed_forward": 0,
46
+ "cross_attention_hidden_size": null,
47
+ "decoder_start_token_id": null,
48
+ "diversity_penalty": 0.0,
49
+ "do_sample": false,
50
+ "early_stopping": false,
51
+ "encoder_no_repeat_ngram_size": 0,
52
+ "eos_token_id": null,
53
+ "exponential_decay_length_penalty": null,
54
+ "finetuning_task": null,
55
+ "forced_bos_token_id": null,
56
+ "forced_eos_token_id": null,
57
+ "hidden_act": "silu",
58
+ "id2label": {
59
+ "0": "LABEL_0",
60
+ "1": "LABEL_1"
61
+ },
62
+ "is_decoder": false,
63
+ "is_encoder_decoder": false,
64
+ "label2id": {
65
+ "LABEL_0": 0,
66
+ "LABEL_1": 1
67
+ },
68
+ "length_penalty": 1.0,
69
+ "max_length": 20,
70
+ "min_length": 0,
71
+ "model_type": "vllama3",
72
+ "no_repeat_ngram_size": 0,
73
+ "num_beam_groups": 1,
74
+ "num_beams": 1,
75
+ "num_key_value_heads": 1,
76
+ "num_return_sequences": 1,
77
+ "output_attentions": false,
78
+ "output_hidden_states": false,
79
+ "output_scores": false,
80
+ "pad_token_id": null,
81
+ "prefix": null,
82
+ "problem_type": null,
83
+ "pruned_heads": {},
84
+ "qk_layer_norms_perceiver": false,
85
+ "remove_invalid_values": false,
86
+ "repetition_penalty": 1.0,
87
+ "resampler_depth": 6,
88
+ "resampler_head_dim": 96,
89
+ "resampler_n_heads": 16,
90
+ "resampler_n_latents": 64,
91
+ "return_dict": true,
92
+ "return_dict_in_generate": false,
93
+ "sep_token_id": null,
94
+ "suppress_tokens": null,
95
+ "task_specific_params": null,
96
+ "temperature": 1.0,
97
+ "tf_legacy_loss": false,
98
+ "tie_encoder_decoder": false,
99
+ "tie_word_embeddings": true,
100
+ "tokenizer_class": null,
101
+ "top_k": 50,
102
+ "top_p": 1.0,
103
+ "torch_dtype": null,
104
+ "torchscript": false,
105
+ "transformers_version": "4.46.0",
106
+ "typical_p": 1.0,
107
+ "use_bfloat16": false
108
+ },
109
+ "pretraining_tp": 1,
110
+ "qk_layer_norms": false,
111
+ "rms_norm_eps": 1e-05,
112
+ "rope_scaling": null,
113
+ "rope_theta": 273768.0,
114
+ "use_cache": true,
115
+ "use_resampler": false,
116
+ "vocab_size": 49155
117
+ },
118
+ "tie_word_embeddings": false,
119
+ "transformers.js_config": {
120
+ "dtype": {
121
+ "decoder_model_merged": "q4",
122
+ "embed_tokens": "auto",
123
+ "vision_encoder": "auto"
124
+ },
125
+ "kv_cache_dtype": {
126
+ "fp16": "float16",
127
+ "q4f16": "float16"
128
+ },
129
+ "use_external_data_format": {
130
+ "decoder_model_merged.onnx": true,
131
+ "decoder_model_merged_fp16.onnx": true
132
+ }
133
+ },
134
+ "transformers_version": "4.57.0",
135
+ "use_cache": true,
136
+ "vision_config": {
137
+ "_attn_implementation_autoset": false,
138
+ "attention_dropout": 0.0,
139
+ "dtype": "float16",
140
+ "hidden_act": "gelu_pytorch_tanh",
141
+ "hidden_size": 1152,
142
+ "image_size": 384,
143
+ "initializer_range": 0.02,
144
+ "intermediate_size": 4304,
145
+ "layer_norm_eps": 1e-06,
146
+ "max_image_size": {
147
+ "longest_edge": 384
148
+ },
149
+ "model_type": "idefics3_vision",
150
+ "num_attention_heads": 16,
151
+ "num_channels": 3,
152
+ "num_hidden_layers": 27,
153
+ "patch_size": 14,
154
+ "size": {
155
+ "longest_edge": 1920
156
+ },
157
+ "tie_word_embeddings": false
158
+ },
159
+ "vocab_size": 49155
160
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 49154,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.0"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d787c8fc0b458a298166e4fe51db445a31962cf563613551f672d473c7150b0
3
+ size 4492630256
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d145c72bdffa85cea0e1d3a503d4388c6b85c83ac15a849977b766b7371e311c
3
+ size 5171520935
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95d744506ed8242dbe82c0f3357716f73248e5153ff68604326958faa28d9296
3
+ size 14645
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb53595bb041d4f6bfcf8bdb1611ba64ac44f2528fecf6fbcf3d23aba6e3e8dc
3
+ size 1465
trainer_state.json ADDED
@@ -0,0 +1,643 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.7777777777777778,
6
+ "eval_steps": 50,
7
+ "global_step": 700,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0011111111111111111,
14
+ "grad_norm": 234.0,
15
+ "learning_rate": 0.0,
16
+ "loss": 34.6327,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.011111111111111112,
21
+ "grad_norm": 196.0,
22
+ "learning_rate": 9e-06,
23
+ "loss": 32.4195,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.022222222222222223,
28
+ "grad_norm": 66.875,
29
+ "learning_rate": 1.9e-05,
30
+ "loss": 14.1658,
31
+ "step": 20
32
+ },
33
+ {
34
+ "epoch": 0.03333333333333333,
35
+ "grad_norm": 0.9267578125,
36
+ "learning_rate": 2.9e-05,
37
+ "loss": 0.4885,
38
+ "step": 30
39
+ },
40
+ {
41
+ "epoch": 0.044444444444444446,
42
+ "grad_norm": 0.4169921875,
43
+ "learning_rate": 3.9000000000000006e-05,
44
+ "loss": 0.3108,
45
+ "step": 40
46
+ },
47
+ {
48
+ "epoch": 0.05555555555555555,
49
+ "grad_norm": 0.41357421875,
50
+ "learning_rate": 4.9e-05,
51
+ "loss": 0.2811,
52
+ "step": 50
53
+ },
54
+ {
55
+ "epoch": 0.05555555555555555,
56
+ "eval_loss": 0.14292466640472412,
57
+ "eval_runtime": 449.904,
58
+ "eval_samples_per_second": 4.001,
59
+ "eval_steps_per_second": 2.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "epoch": 0.06666666666666667,
64
+ "grad_norm": 0.359619140625,
65
+ "learning_rate": 4.9742857142857145e-05,
66
+ "loss": 0.2584,
67
+ "step": 60
68
+ },
69
+ {
70
+ "epoch": 0.07777777777777778,
71
+ "grad_norm": 0.431640625,
72
+ "learning_rate": 4.9457142857142854e-05,
73
+ "loss": 0.2643,
74
+ "step": 70
75
+ },
76
+ {
77
+ "epoch": 0.08888888888888889,
78
+ "grad_norm": 0.46923828125,
79
+ "learning_rate": 4.917142857142858e-05,
80
+ "loss": 0.2923,
81
+ "step": 80
82
+ },
83
+ {
84
+ "epoch": 0.1,
85
+ "grad_norm": 0.449951171875,
86
+ "learning_rate": 4.888571428571429e-05,
87
+ "loss": 0.29,
88
+ "step": 90
89
+ },
90
+ {
91
+ "epoch": 0.1111111111111111,
92
+ "grad_norm": 0.3896484375,
93
+ "learning_rate": 4.86e-05,
94
+ "loss": 0.2609,
95
+ "step": 100
96
+ },
97
+ {
98
+ "epoch": 0.1111111111111111,
99
+ "eval_loss": 0.13491107523441315,
100
+ "eval_runtime": 431.8863,
101
+ "eval_samples_per_second": 4.168,
102
+ "eval_steps_per_second": 2.084,
103
+ "step": 100
104
+ },
105
+ {
106
+ "epoch": 0.12222222222222222,
107
+ "grad_norm": 0.3828125,
108
+ "learning_rate": 4.831428571428572e-05,
109
+ "loss": 0.2863,
110
+ "step": 110
111
+ },
112
+ {
113
+ "epoch": 0.13333333333333333,
114
+ "grad_norm": 0.603515625,
115
+ "learning_rate": 4.802857142857143e-05,
116
+ "loss": 0.2647,
117
+ "step": 120
118
+ },
119
+ {
120
+ "epoch": 0.14444444444444443,
121
+ "grad_norm": 0.439208984375,
122
+ "learning_rate": 4.7742857142857144e-05,
123
+ "loss": 0.2706,
124
+ "step": 130
125
+ },
126
+ {
127
+ "epoch": 0.15555555555555556,
128
+ "grad_norm": 0.50634765625,
129
+ "learning_rate": 4.745714285714286e-05,
130
+ "loss": 0.2356,
131
+ "step": 140
132
+ },
133
+ {
134
+ "epoch": 0.16666666666666666,
135
+ "grad_norm": 0.513671875,
136
+ "learning_rate": 4.717142857142857e-05,
137
+ "loss": 0.2469,
138
+ "step": 150
139
+ },
140
+ {
141
+ "epoch": 0.16666666666666666,
142
+ "eval_loss": 0.12212829291820526,
143
+ "eval_runtime": 424.7761,
144
+ "eval_samples_per_second": 4.238,
145
+ "eval_steps_per_second": 2.119,
146
+ "step": 150
147
+ },
148
+ {
149
+ "epoch": 0.17777777777777778,
150
+ "grad_norm": 0.48095703125,
151
+ "learning_rate": 4.6885714285714285e-05,
152
+ "loss": 0.2469,
153
+ "step": 160
154
+ },
155
+ {
156
+ "epoch": 0.18888888888888888,
157
+ "grad_norm": 0.4775390625,
158
+ "learning_rate": 4.660000000000001e-05,
159
+ "loss": 0.2566,
160
+ "step": 170
161
+ },
162
+ {
163
+ "epoch": 0.2,
164
+ "grad_norm": 0.416259765625,
165
+ "learning_rate": 4.631428571428572e-05,
166
+ "loss": 0.2263,
167
+ "step": 180
168
+ },
169
+ {
170
+ "epoch": 0.2111111111111111,
171
+ "grad_norm": 0.47900390625,
172
+ "learning_rate": 4.602857142857143e-05,
173
+ "loss": 0.2505,
174
+ "step": 190
175
+ },
176
+ {
177
+ "epoch": 0.2222222222222222,
178
+ "grad_norm": 0.3916015625,
179
+ "learning_rate": 4.574285714285714e-05,
180
+ "loss": 0.2232,
181
+ "step": 200
182
+ },
183
+ {
184
+ "epoch": 0.2222222222222222,
185
+ "eval_loss": 0.11928859353065491,
186
+ "eval_runtime": 432.6807,
187
+ "eval_samples_per_second": 4.16,
188
+ "eval_steps_per_second": 2.08,
189
+ "step": 200
190
+ },
191
+ {
192
+ "epoch": 0.23333333333333334,
193
+ "grad_norm": 0.48046875,
194
+ "learning_rate": 4.545714285714286e-05,
195
+ "loss": 0.2394,
196
+ "step": 210
197
+ },
198
+ {
199
+ "epoch": 0.24444444444444444,
200
+ "grad_norm": 0.40771484375,
201
+ "learning_rate": 4.5171428571428575e-05,
202
+ "loss": 0.226,
203
+ "step": 220
204
+ },
205
+ {
206
+ "epoch": 0.25555555555555554,
207
+ "grad_norm": 0.476318359375,
208
+ "learning_rate": 4.4885714285714284e-05,
209
+ "loss": 0.2628,
210
+ "step": 230
211
+ },
212
+ {
213
+ "epoch": 0.26666666666666666,
214
+ "grad_norm": 0.443603515625,
215
+ "learning_rate": 4.46e-05,
216
+ "loss": 0.25,
217
+ "step": 240
218
+ },
219
+ {
220
+ "epoch": 0.2777777777777778,
221
+ "grad_norm": 0.468994140625,
222
+ "learning_rate": 4.4314285714285716e-05,
223
+ "loss": 0.2663,
224
+ "step": 250
225
+ },
226
+ {
227
+ "epoch": 0.2777777777777778,
228
+ "eval_loss": 0.11743941903114319,
229
+ "eval_runtime": 463.571,
230
+ "eval_samples_per_second": 3.883,
231
+ "eval_steps_per_second": 1.941,
232
+ "step": 250
233
+ },
234
+ {
235
+ "epoch": 0.28888888888888886,
236
+ "grad_norm": 0.5185546875,
237
+ "learning_rate": 4.402857142857143e-05,
238
+ "loss": 0.2446,
239
+ "step": 260
240
+ },
241
+ {
242
+ "epoch": 0.3,
243
+ "grad_norm": 0.43994140625,
244
+ "learning_rate": 4.374285714285715e-05,
245
+ "loss": 0.257,
246
+ "step": 270
247
+ },
248
+ {
249
+ "epoch": 0.3111111111111111,
250
+ "grad_norm": 0.447021484375,
251
+ "learning_rate": 4.345714285714286e-05,
252
+ "loss": 0.2292,
253
+ "step": 280
254
+ },
255
+ {
256
+ "epoch": 0.32222222222222224,
257
+ "grad_norm": 0.372314453125,
258
+ "learning_rate": 4.317142857142857e-05,
259
+ "loss": 0.2314,
260
+ "step": 290
261
+ },
262
+ {
263
+ "epoch": 0.3333333333333333,
264
+ "grad_norm": 0.4365234375,
265
+ "learning_rate": 4.288571428571429e-05,
266
+ "loss": 0.2288,
267
+ "step": 300
268
+ },
269
+ {
270
+ "epoch": 0.3333333333333333,
271
+ "eval_loss": 0.11491398513317108,
272
+ "eval_runtime": 454.5263,
273
+ "eval_samples_per_second": 3.96,
274
+ "eval_steps_per_second": 1.98,
275
+ "step": 300
276
+ },
277
+ {
278
+ "epoch": 0.34444444444444444,
279
+ "grad_norm": 0.3798828125,
280
+ "learning_rate": 4.26e-05,
281
+ "loss": 0.23,
282
+ "step": 310
283
+ },
284
+ {
285
+ "epoch": 0.35555555555555557,
286
+ "grad_norm": 0.431396484375,
287
+ "learning_rate": 4.2314285714285715e-05,
288
+ "loss": 0.2208,
289
+ "step": 320
290
+ },
291
+ {
292
+ "epoch": 0.36666666666666664,
293
+ "grad_norm": 0.369384765625,
294
+ "learning_rate": 4.202857142857143e-05,
295
+ "loss": 0.213,
296
+ "step": 330
297
+ },
298
+ {
299
+ "epoch": 0.37777777777777777,
300
+ "grad_norm": 0.46826171875,
301
+ "learning_rate": 4.174285714285715e-05,
302
+ "loss": 0.2453,
303
+ "step": 340
304
+ },
305
+ {
306
+ "epoch": 0.3888888888888889,
307
+ "grad_norm": 0.485107421875,
308
+ "learning_rate": 4.145714285714286e-05,
309
+ "loss": 0.2454,
310
+ "step": 350
311
+ },
312
+ {
313
+ "epoch": 0.3888888888888889,
314
+ "eval_loss": 0.11370400339365005,
315
+ "eval_runtime": 429.4907,
316
+ "eval_samples_per_second": 4.191,
317
+ "eval_steps_per_second": 2.096,
318
+ "step": 350
319
+ },
320
+ {
321
+ "epoch": 0.4,
322
+ "grad_norm": 0.42919921875,
323
+ "learning_rate": 4.117142857142857e-05,
324
+ "loss": 0.245,
325
+ "step": 360
326
+ },
327
+ {
328
+ "epoch": 0.4111111111111111,
329
+ "grad_norm": 0.38427734375,
330
+ "learning_rate": 4.088571428571429e-05,
331
+ "loss": 0.2255,
332
+ "step": 370
333
+ },
334
+ {
335
+ "epoch": 0.4222222222222222,
336
+ "grad_norm": 0.295654296875,
337
+ "learning_rate": 4.0600000000000004e-05,
338
+ "loss": 0.2266,
339
+ "step": 380
340
+ },
341
+ {
342
+ "epoch": 0.43333333333333335,
343
+ "grad_norm": 0.437744140625,
344
+ "learning_rate": 4.0314285714285714e-05,
345
+ "loss": 0.2438,
346
+ "step": 390
347
+ },
348
+ {
349
+ "epoch": 0.4444444444444444,
350
+ "grad_norm": 0.4521484375,
351
+ "learning_rate": 4.002857142857143e-05,
352
+ "loss": 0.2135,
353
+ "step": 400
354
+ },
355
+ {
356
+ "epoch": 0.4444444444444444,
357
+ "eval_loss": 0.11205437779426575,
358
+ "eval_runtime": 431.8391,
359
+ "eval_samples_per_second": 4.168,
360
+ "eval_steps_per_second": 2.084,
361
+ "step": 400
362
+ },
363
+ {
364
+ "epoch": 0.45555555555555555,
365
+ "grad_norm": 0.33154296875,
366
+ "learning_rate": 3.9742857142857146e-05,
367
+ "loss": 0.2081,
368
+ "step": 410
369
+ },
370
+ {
371
+ "epoch": 0.4666666666666667,
372
+ "grad_norm": 0.35595703125,
373
+ "learning_rate": 3.945714285714286e-05,
374
+ "loss": 0.2369,
375
+ "step": 420
376
+ },
377
+ {
378
+ "epoch": 0.4777777777777778,
379
+ "grad_norm": 0.5166015625,
380
+ "learning_rate": 3.917142857142858e-05,
381
+ "loss": 0.2116,
382
+ "step": 430
383
+ },
384
+ {
385
+ "epoch": 0.4888888888888889,
386
+ "grad_norm": 0.403076171875,
387
+ "learning_rate": 3.888571428571429e-05,
388
+ "loss": 0.2159,
389
+ "step": 440
390
+ },
391
+ {
392
+ "epoch": 0.5,
393
+ "grad_norm": 0.349609375,
394
+ "learning_rate": 3.86e-05,
395
+ "loss": 0.208,
396
+ "step": 450
397
+ },
398
+ {
399
+ "epoch": 0.5,
400
+ "eval_loss": 0.11105828732252121,
401
+ "eval_runtime": 468.2112,
402
+ "eval_samples_per_second": 3.844,
403
+ "eval_steps_per_second": 1.922,
404
+ "step": 450
405
+ },
406
+ {
407
+ "epoch": 0.5111111111111111,
408
+ "grad_norm": 0.4482421875,
409
+ "learning_rate": 3.831428571428571e-05,
410
+ "loss": 0.2459,
411
+ "step": 460
412
+ },
413
+ {
414
+ "epoch": 0.5222222222222223,
415
+ "grad_norm": 0.365478515625,
416
+ "learning_rate": 3.802857142857143e-05,
417
+ "loss": 0.2239,
418
+ "step": 470
419
+ },
420
+ {
421
+ "epoch": 0.5333333333333333,
422
+ "grad_norm": 0.397216796875,
423
+ "learning_rate": 3.7742857142857145e-05,
424
+ "loss": 0.2396,
425
+ "step": 480
426
+ },
427
+ {
428
+ "epoch": 0.5444444444444444,
429
+ "grad_norm": 0.384765625,
430
+ "learning_rate": 3.745714285714286e-05,
431
+ "loss": 0.2263,
432
+ "step": 490
433
+ },
434
+ {
435
+ "epoch": 0.5555555555555556,
436
+ "grad_norm": 0.403564453125,
437
+ "learning_rate": 3.717142857142858e-05,
438
+ "loss": 0.2225,
439
+ "step": 500
440
+ },
441
+ {
442
+ "epoch": 0.5555555555555556,
443
+ "eval_loss": 0.11006490141153336,
444
+ "eval_runtime": 450.548,
445
+ "eval_samples_per_second": 3.995,
446
+ "eval_steps_per_second": 1.998,
447
+ "step": 500
448
+ },
449
+ {
450
+ "epoch": 0.5666666666666667,
451
+ "grad_norm": 0.4404296875,
452
+ "learning_rate": 3.688571428571429e-05,
453
+ "loss": 0.2238,
454
+ "step": 510
455
+ },
456
+ {
457
+ "epoch": 0.5777777777777777,
458
+ "grad_norm": 0.354248046875,
459
+ "learning_rate": 3.66e-05,
460
+ "loss": 0.2032,
461
+ "step": 520
462
+ },
463
+ {
464
+ "epoch": 0.5888888888888889,
465
+ "grad_norm": 0.344970703125,
466
+ "learning_rate": 3.631428571428572e-05,
467
+ "loss": 0.222,
468
+ "step": 530
469
+ },
470
+ {
471
+ "epoch": 0.6,
472
+ "grad_norm": 0.3740234375,
473
+ "learning_rate": 3.602857142857143e-05,
474
+ "loss": 0.2329,
475
+ "step": 540
476
+ },
477
+ {
478
+ "epoch": 0.6111111111111112,
479
+ "grad_norm": 0.3486328125,
480
+ "learning_rate": 3.574285714285714e-05,
481
+ "loss": 0.2188,
482
+ "step": 550
483
+ },
484
+ {
485
+ "epoch": 0.6111111111111112,
486
+ "eval_loss": 0.10909327119588852,
487
+ "eval_runtime": 435.9811,
488
+ "eval_samples_per_second": 4.129,
489
+ "eval_steps_per_second": 2.064,
490
+ "step": 550
491
+ },
492
+ {
493
+ "epoch": 0.6222222222222222,
494
+ "grad_norm": 0.425537109375,
495
+ "learning_rate": 3.545714285714286e-05,
496
+ "loss": 0.215,
497
+ "step": 560
498
+ },
499
+ {
500
+ "epoch": 0.6333333333333333,
501
+ "grad_norm": 0.41357421875,
502
+ "learning_rate": 3.517142857142857e-05,
503
+ "loss": 0.242,
504
+ "step": 570
505
+ },
506
+ {
507
+ "epoch": 0.6444444444444445,
508
+ "grad_norm": 0.482666015625,
509
+ "learning_rate": 3.488571428571429e-05,
510
+ "loss": 0.2034,
511
+ "step": 580
512
+ },
513
+ {
514
+ "epoch": 0.6555555555555556,
515
+ "grad_norm": 0.49658203125,
516
+ "learning_rate": 3.46e-05,
517
+ "loss": 0.2241,
518
+ "step": 590
519
+ },
520
+ {
521
+ "epoch": 0.6666666666666666,
522
+ "grad_norm": 0.42041015625,
523
+ "learning_rate": 3.431428571428572e-05,
524
+ "loss": 0.2477,
525
+ "step": 600
526
+ },
527
+ {
528
+ "epoch": 0.6666666666666666,
529
+ "eval_loss": 0.10835430771112442,
530
+ "eval_runtime": 451.0734,
531
+ "eval_samples_per_second": 3.99,
532
+ "eval_steps_per_second": 1.995,
533
+ "step": 600
534
+ },
535
+ {
536
+ "epoch": 0.6777777777777778,
537
+ "grad_norm": 0.372314453125,
538
+ "learning_rate": 3.402857142857143e-05,
539
+ "loss": 0.2293,
540
+ "step": 610
541
+ },
542
+ {
543
+ "epoch": 0.6888888888888889,
544
+ "grad_norm": 0.387939453125,
545
+ "learning_rate": 3.374285714285714e-05,
546
+ "loss": 0.246,
547
+ "step": 620
548
+ },
549
+ {
550
+ "epoch": 0.7,
551
+ "grad_norm": 0.426025390625,
552
+ "learning_rate": 3.345714285714286e-05,
553
+ "loss": 0.2,
554
+ "step": 630
555
+ },
556
+ {
557
+ "epoch": 0.7111111111111111,
558
+ "grad_norm": 0.380126953125,
559
+ "learning_rate": 3.3171428571428574e-05,
560
+ "loss": 0.2184,
561
+ "step": 640
562
+ },
563
+ {
564
+ "epoch": 0.7222222222222222,
565
+ "grad_norm": 0.393310546875,
566
+ "learning_rate": 3.2885714285714284e-05,
567
+ "loss": 0.2092,
568
+ "step": 650
569
+ },
570
+ {
571
+ "epoch": 0.7222222222222222,
572
+ "eval_loss": 0.10704999417066574,
573
+ "eval_runtime": 434.4458,
574
+ "eval_samples_per_second": 4.143,
575
+ "eval_steps_per_second": 2.072,
576
+ "step": 650
577
+ },
578
+ {
579
+ "epoch": 0.7333333333333333,
580
+ "grad_norm": 0.37841796875,
581
+ "learning_rate": 3.26e-05,
582
+ "loss": 0.2078,
583
+ "step": 660
584
+ },
585
+ {
586
+ "epoch": 0.7444444444444445,
587
+ "grad_norm": 0.400634765625,
588
+ "learning_rate": 3.2314285714285716e-05,
589
+ "loss": 0.2358,
590
+ "step": 670
591
+ },
592
+ {
593
+ "epoch": 0.7555555555555555,
594
+ "grad_norm": 0.38037109375,
595
+ "learning_rate": 3.202857142857143e-05,
596
+ "loss": 0.2226,
597
+ "step": 680
598
+ },
599
+ {
600
+ "epoch": 0.7666666666666667,
601
+ "grad_norm": 0.4248046875,
602
+ "learning_rate": 3.174285714285715e-05,
603
+ "loss": 0.2351,
604
+ "step": 690
605
+ },
606
+ {
607
+ "epoch": 0.7777777777777778,
608
+ "grad_norm": 0.51953125,
609
+ "learning_rate": 3.145714285714286e-05,
610
+ "loss": 0.2525,
611
+ "step": 700
612
+ },
613
+ {
614
+ "epoch": 0.7777777777777778,
615
+ "eval_loss": 0.10690909624099731,
616
+ "eval_runtime": 432.8965,
617
+ "eval_samples_per_second": 4.158,
618
+ "eval_steps_per_second": 2.079,
619
+ "step": 700
620
+ }
621
+ ],
622
+ "logging_steps": 10,
623
+ "max_steps": 1800,
624
+ "num_input_tokens_seen": 0,
625
+ "num_train_epochs": 2,
626
+ "save_steps": 50,
627
+ "stateful_callbacks": {
628
+ "TrainerControl": {
629
+ "args": {
630
+ "should_epoch_stop": false,
631
+ "should_evaluate": false,
632
+ "should_log": false,
633
+ "should_save": true,
634
+ "should_training_stop": false
635
+ },
636
+ "attributes": {}
637
+ }
638
+ },
639
+ "total_flos": 1.2387832165981747e+17,
640
+ "train_batch_size": 4,
641
+ "trial_name": null,
642
+ "trial_params": null
643
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d596ce9cf12538cc209f87a3bcb25d6de593d17eb0a779831b1f716a21e281f9
3
+ size 5777