void-818 commited on
Commit
e848b1d
·
verified ·
1 Parent(s): 5742a10

Add 16 files

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd0a5d1fb55d9e9c23364942c6b2393506b1c5e8464bf56bcdfd7c853e91fc2c
3
  size 4967215360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b86f5f45d6144d0fa35c2b8d730ce6f3cf67244843f31bae3f5f69924edf468c
3
  size 4967215360
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed2dc0d1dcf4087574ffaf17728197a85fffb69dfb523ba2612be7e9f9aea85f
3
  size 3077766632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88459e7f33fdabff763552eb4e14c7418f497a8132c7b14597e1c795f95ca8d2
3
  size 3077766632
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:718a0f3db00824213036a2c0441849791319b7d9cf189065873bb26a7020738e
3
  size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7242403044cccf39933e2344635dc740c47d5b24649c5690c6b12d08ca549e87
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67cd9271a188fb648597e505255f5e1ea37f0260a2bbe5ba6a99f0ce616fc6b3
3
  size 1465
trainer_state.json CHANGED
@@ -1,371 +1,1087 @@
1
  {
2
- "best_global_step": 500,
3
- "best_metric": 0.02287970297038555,
4
- "best_model_checkpoint": "./trained_model_20251223_131030/checkpoint-500",
5
- "epoch": 0.36133694670280037,
6
  "eval_steps": 500,
7
- "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.007226738934056007,
14
- "grad_norm": 1.3359375,
15
  "learning_rate": 1.44e-06,
16
- "loss": 0.0529,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.014453477868112014,
21
- "grad_norm": 1.40625,
22
  "learning_rate": 3.04e-06,
23
- "loss": 0.0463,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.02168021680216802,
28
- "grad_norm": 0.80078125,
29
  "learning_rate": 4.6400000000000005e-06,
30
- "loss": 0.0295,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.028906955736224028,
35
- "grad_norm": 0.4140625,
36
  "learning_rate": 6.24e-06,
37
- "loss": 0.0233,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.036133694670280034,
42
- "grad_norm": 0.275390625,
43
  "learning_rate": 7.840000000000001e-06,
44
- "loss": 0.0192,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.04336043360433604,
49
- "grad_norm": 0.353515625,
50
  "learning_rate": 9.440000000000001e-06,
51
- "loss": 0.0184,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.05058717253839205,
56
- "grad_norm": 0.318359375,
57
  "learning_rate": 1.1040000000000001e-05,
58
- "loss": 0.0278,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.057813911472448055,
63
- "grad_norm": 0.494140625,
64
  "learning_rate": 1.2640000000000001e-05,
65
- "loss": 0.0219,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.06504065040650407,
70
- "grad_norm": 0.30859375,
71
  "learning_rate": 1.4240000000000001e-05,
72
- "loss": 0.0198,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.07226738934056007,
77
- "grad_norm": 0.408203125,
78
  "learning_rate": 1.584e-05,
79
- "loss": 0.0181,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.07949412827461608,
84
- "grad_norm": 0.31640625,
85
  "learning_rate": 1.7440000000000002e-05,
86
- "loss": 0.0177,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.08672086720867209,
91
- "grad_norm": 0.2734375,
92
  "learning_rate": 1.904e-05,
93
- "loss": 0.0184,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.0939476061427281,
98
- "grad_norm": 0.314453125,
99
  "learning_rate": 1.99801340948597e-05,
100
- "loss": 0.0191,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.1011743450767841,
105
- "grad_norm": 0.55078125,
106
  "learning_rate": 1.9930469332008943e-05,
107
- "loss": 0.0178,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.10840108401084012,
112
- "grad_norm": 0.2197265625,
113
  "learning_rate": 1.9880804569158184e-05,
114
- "loss": 0.018,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.11562782294489611,
119
- "grad_norm": 0.259765625,
120
  "learning_rate": 1.9831139806307428e-05,
121
- "loss": 0.0188,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.12285456187895212,
126
- "grad_norm": 0.88671875,
127
  "learning_rate": 1.9781475043456668e-05,
128
  "loss": 0.0217,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.13008130081300814,
133
- "grad_norm": 0.404296875,
134
  "learning_rate": 1.9731810280605912e-05,
135
- "loss": 0.0185,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.13730803974706413,
140
- "grad_norm": 0.228515625,
141
  "learning_rate": 1.9682145517755153e-05,
142
  "loss": 0.029,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 0.14453477868112014,
147
- "grad_norm": 0.384765625,
148
  "learning_rate": 1.9632480754904396e-05,
149
- "loss": 0.0262,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 0.15176151761517614,
154
- "grad_norm": 0.375,
155
  "learning_rate": 1.9582815992053637e-05,
156
- "loss": 0.0193,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.15898825654923215,
161
- "grad_norm": 0.6328125,
162
  "learning_rate": 1.953315122920288e-05,
163
- "loss": 0.03,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.16621499548328816,
168
- "grad_norm": 0.53515625,
169
  "learning_rate": 1.9483486466352125e-05,
170
- "loss": 0.0226,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.17344173441734417,
175
- "grad_norm": 0.35546875,
176
  "learning_rate": 1.943382170350137e-05,
177
- "loss": 0.0191,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.18066847335140018,
182
- "grad_norm": 0.4296875,
183
  "learning_rate": 1.938415694065061e-05,
184
- "loss": 0.0185,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.1878952122854562,
189
- "grad_norm": 0.37109375,
190
  "learning_rate": 1.9334492177799853e-05,
191
- "loss": 0.0203,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.1951219512195122,
196
- "grad_norm": 0.466796875,
197
  "learning_rate": 1.9284827414949094e-05,
198
- "loss": 0.0179,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.2023486901535682,
203
- "grad_norm": 0.32421875,
204
  "learning_rate": 1.9235162652098338e-05,
205
- "loss": 0.0193,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.20957542908762422,
210
- "grad_norm": 0.330078125,
211
  "learning_rate": 1.9185497889247578e-05,
212
- "loss": 0.0169,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.21680216802168023,
217
- "grad_norm": 0.35546875,
218
  "learning_rate": 1.9135833126396822e-05,
219
- "loss": 0.0184,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.2240289069557362,
224
- "grad_norm": 0.27734375,
225
  "learning_rate": 1.9086168363546066e-05,
226
  "loss": 0.0239,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.23125564588979222,
231
- "grad_norm": 0.275390625,
232
  "learning_rate": 1.903650360069531e-05,
233
  "loss": 0.0234,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.23848238482384823,
238
- "grad_norm": 0.408203125,
239
  "learning_rate": 1.898683883784455e-05,
240
- "loss": 0.0199,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.24570912375790424,
245
- "grad_norm": 0.365234375,
246
  "learning_rate": 1.8937174074993794e-05,
247
  "loss": 0.0191,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.2529358626919603,
252
- "grad_norm": 0.298828125,
253
  "learning_rate": 1.8887509312143035e-05,
254
- "loss": 0.0179,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.2601626016260163,
259
- "grad_norm": 0.58203125,
260
  "learning_rate": 1.883784454929228e-05,
261
  "loss": 0.018,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.26738934056007224,
266
- "grad_norm": 0.298828125,
267
  "learning_rate": 1.878817978644152e-05,
268
- "loss": 0.0213,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.27461607949412825,
273
- "grad_norm": 0.453125,
274
  "learning_rate": 1.8738515023590763e-05,
275
  "loss": 0.0182,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.28184281842818426,
280
- "grad_norm": 0.33203125,
281
  "learning_rate": 1.8688850260740007e-05,
282
- "loss": 0.0306,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.28906955736224027,
287
- "grad_norm": 0.31640625,
288
  "learning_rate": 1.8639185497889248e-05,
289
- "loss": 0.0438,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 0.2962962962962963,
294
- "grad_norm": 0.4375,
295
  "learning_rate": 1.8589520735038492e-05,
296
- "loss": 0.0594,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.3035230352303523,
301
- "grad_norm": 0.2734375,
302
  "learning_rate": 1.8539855972187736e-05,
303
- "loss": 0.0182,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 0.3107497741644083,
308
- "grad_norm": 0.291015625,
309
  "learning_rate": 1.8490191209336976e-05,
310
- "loss": 0.0193,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 0.3179765130984643,
315
- "grad_norm": 0.431640625,
316
  "learning_rate": 1.844052644648622e-05,
317
- "loss": 0.0178,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 0.3252032520325203,
322
- "grad_norm": 0.283203125,
323
  "learning_rate": 1.839086168363546e-05,
324
- "loss": 0.0197,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 0.3324299909665763,
329
- "grad_norm": 0.427734375,
330
  "learning_rate": 1.8341196920784705e-05,
331
- "loss": 0.0219,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 0.33965672990063234,
336
- "grad_norm": 0.28125,
337
  "learning_rate": 1.8291532157933945e-05,
338
- "loss": 0.0185,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 0.34688346883468835,
343
- "grad_norm": 1.15625,
344
  "learning_rate": 1.824186739508319e-05,
345
- "loss": 0.0396,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 0.35411020776874436,
350
- "grad_norm": 0.3984375,
351
  "learning_rate": 1.8192202632232433e-05,
352
- "loss": 0.0287,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 0.36133694670280037,
357
- "grad_norm": 0.318359375,
358
  "learning_rate": 1.8142537869381677e-05,
359
- "loss": 0.0189,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.36133694670280037,
364
- "eval_loss": 0.02287970297038555,
365
- "eval_runtime": 43.4357,
366
- "eval_samples_per_second": 26.844,
367
- "eval_steps_per_second": 6.723,
368
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  }
370
  ],
371
  "logging_steps": 10,
@@ -385,7 +1101,7 @@
385
  "attributes": {}
386
  }
387
  },
388
- "total_flos": 2.0917465462974874e+17,
389
  "train_batch_size": 4,
390
  "trial_name": null,
391
  "trial_params": null
 
1
  {
2
+ "best_global_step": 1500,
3
+ "best_metric": 0.022791102528572083,
4
+ "best_model_checkpoint": "./trained_model_20251223_144504/checkpoint-1500",
5
+ "epoch": 1.0838301716350496,
6
  "eval_steps": 500,
7
+ "global_step": 1500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.007226738934056007,
14
+ "grad_norm": 1.7734375,
15
  "learning_rate": 1.44e-06,
16
+ "loss": 0.0577,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.014453477868112014,
21
+ "grad_norm": 1.3828125,
22
  "learning_rate": 3.04e-06,
23
+ "loss": 0.0498,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.02168021680216802,
28
+ "grad_norm": 0.890625,
29
  "learning_rate": 4.6400000000000005e-06,
30
+ "loss": 0.0334,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.028906955736224028,
35
+ "grad_norm": 0.458984375,
36
  "learning_rate": 6.24e-06,
37
+ "loss": 0.0249,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.036133694670280034,
42
+ "grad_norm": 0.2578125,
43
  "learning_rate": 7.840000000000001e-06,
44
+ "loss": 0.0194,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.04336043360433604,
49
+ "grad_norm": 0.333984375,
50
  "learning_rate": 9.440000000000001e-06,
51
+ "loss": 0.0193,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.05058717253839205,
56
+ "grad_norm": 0.380859375,
57
  "learning_rate": 1.1040000000000001e-05,
58
+ "loss": 0.0274,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.057813911472448055,
63
+ "grad_norm": 0.69921875,
64
  "learning_rate": 1.2640000000000001e-05,
65
+ "loss": 0.0188,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.06504065040650407,
70
+ "grad_norm": 0.271484375,
71
  "learning_rate": 1.4240000000000001e-05,
72
+ "loss": 0.0194,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.07226738934056007,
77
+ "grad_norm": 0.4375,
78
  "learning_rate": 1.584e-05,
79
+ "loss": 0.0188,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.07949412827461608,
84
+ "grad_norm": 0.265625,
85
  "learning_rate": 1.7440000000000002e-05,
86
+ "loss": 0.018,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 0.08672086720867209,
91
+ "grad_norm": 0.294921875,
92
  "learning_rate": 1.904e-05,
93
+ "loss": 0.019,
94
  "step": 120
95
  },
96
  {
97
  "epoch": 0.0939476061427281,
98
+ "grad_norm": 0.34375,
99
  "learning_rate": 1.99801340948597e-05,
100
+ "loss": 0.0193,
101
  "step": 130
102
  },
103
  {
104
  "epoch": 0.1011743450767841,
105
+ "grad_norm": 0.5546875,
106
  "learning_rate": 1.9930469332008943e-05,
107
+ "loss": 0.0184,
108
  "step": 140
109
  },
110
  {
111
  "epoch": 0.10840108401084012,
112
+ "grad_norm": 0.3046875,
113
  "learning_rate": 1.9880804569158184e-05,
114
+ "loss": 0.0184,
115
  "step": 150
116
  },
117
  {
118
  "epoch": 0.11562782294489611,
119
+ "grad_norm": 0.2216796875,
120
  "learning_rate": 1.9831139806307428e-05,
121
+ "loss": 0.019,
122
  "step": 160
123
  },
124
  {
125
  "epoch": 0.12285456187895212,
126
+ "grad_norm": 0.8671875,
127
  "learning_rate": 1.9781475043456668e-05,
128
  "loss": 0.0217,
129
  "step": 170
130
  },
131
  {
132
  "epoch": 0.13008130081300814,
133
+ "grad_norm": 0.369140625,
134
  "learning_rate": 1.9731810280605912e-05,
135
+ "loss": 0.0187,
136
  "step": 180
137
  },
138
  {
139
  "epoch": 0.13730803974706413,
140
+ "grad_norm": 0.271484375,
141
  "learning_rate": 1.9682145517755153e-05,
142
  "loss": 0.029,
143
  "step": 190
144
  },
145
  {
146
  "epoch": 0.14453477868112014,
147
+ "grad_norm": 0.40234375,
148
  "learning_rate": 1.9632480754904396e-05,
149
+ "loss": 0.0268,
150
  "step": 200
151
  },
152
  {
153
  "epoch": 0.15176151761517614,
154
+ "grad_norm": 0.330078125,
155
  "learning_rate": 1.9582815992053637e-05,
156
+ "loss": 0.019,
157
  "step": 210
158
  },
159
  {
160
  "epoch": 0.15898825654923215,
161
+ "grad_norm": 0.73828125,
162
  "learning_rate": 1.953315122920288e-05,
163
+ "loss": 0.0295,
164
  "step": 220
165
  },
166
  {
167
  "epoch": 0.16621499548328816,
168
+ "grad_norm": 0.5390625,
169
  "learning_rate": 1.9483486466352125e-05,
170
+ "loss": 0.0224,
171
  "step": 230
172
  },
173
  {
174
  "epoch": 0.17344173441734417,
175
+ "grad_norm": 0.419921875,
176
  "learning_rate": 1.943382170350137e-05,
177
+ "loss": 0.0193,
178
  "step": 240
179
  },
180
  {
181
  "epoch": 0.18066847335140018,
182
+ "grad_norm": 0.396484375,
183
  "learning_rate": 1.938415694065061e-05,
184
+ "loss": 0.0188,
185
  "step": 250
186
  },
187
  {
188
  "epoch": 0.1878952122854562,
189
+ "grad_norm": 0.494140625,
190
  "learning_rate": 1.9334492177799853e-05,
191
+ "loss": 0.0211,
192
  "step": 260
193
  },
194
  {
195
  "epoch": 0.1951219512195122,
196
+ "grad_norm": 0.46875,
197
  "learning_rate": 1.9284827414949094e-05,
198
+ "loss": 0.0183,
199
  "step": 270
200
  },
201
  {
202
  "epoch": 0.2023486901535682,
203
+ "grad_norm": 0.404296875,
204
  "learning_rate": 1.9235162652098338e-05,
205
+ "loss": 0.0197,
206
  "step": 280
207
  },
208
  {
209
  "epoch": 0.20957542908762422,
210
+ "grad_norm": 0.384765625,
211
  "learning_rate": 1.9185497889247578e-05,
212
+ "loss": 0.0174,
213
  "step": 290
214
  },
215
  {
216
  "epoch": 0.21680216802168023,
217
+ "grad_norm": 0.333984375,
218
  "learning_rate": 1.9135833126396822e-05,
219
+ "loss": 0.019,
220
  "step": 300
221
  },
222
  {
223
  "epoch": 0.2240289069557362,
224
+ "grad_norm": 0.28125,
225
  "learning_rate": 1.9086168363546066e-05,
226
  "loss": 0.0239,
227
  "step": 310
228
  },
229
  {
230
  "epoch": 0.23125564588979222,
231
+ "grad_norm": 0.328125,
232
  "learning_rate": 1.903650360069531e-05,
233
  "loss": 0.0234,
234
  "step": 320
235
  },
236
  {
237
  "epoch": 0.23848238482384823,
238
+ "grad_norm": 0.419921875,
239
  "learning_rate": 1.898683883784455e-05,
240
+ "loss": 0.0195,
241
  "step": 330
242
  },
243
  {
244
  "epoch": 0.24570912375790424,
245
+ "grad_norm": 0.33203125,
246
  "learning_rate": 1.8937174074993794e-05,
247
  "loss": 0.0191,
248
  "step": 340
249
  },
250
  {
251
  "epoch": 0.2529358626919603,
252
+ "grad_norm": 0.2734375,
253
  "learning_rate": 1.8887509312143035e-05,
254
+ "loss": 0.0186,
255
  "step": 350
256
  },
257
  {
258
  "epoch": 0.2601626016260163,
259
+ "grad_norm": 0.64453125,
260
  "learning_rate": 1.883784454929228e-05,
261
  "loss": 0.018,
262
  "step": 360
263
  },
264
  {
265
  "epoch": 0.26738934056007224,
266
+ "grad_norm": 0.25390625,
267
  "learning_rate": 1.878817978644152e-05,
268
+ "loss": 0.0217,
269
  "step": 370
270
  },
271
  {
272
  "epoch": 0.27461607949412825,
273
+ "grad_norm": 0.384765625,
274
  "learning_rate": 1.8738515023590763e-05,
275
  "loss": 0.0182,
276
  "step": 380
277
  },
278
  {
279
  "epoch": 0.28184281842818426,
280
+ "grad_norm": 1.1015625,
281
  "learning_rate": 1.8688850260740007e-05,
282
+ "loss": 0.0257,
283
  "step": 390
284
  },
285
  {
286
  "epoch": 0.28906955736224027,
287
+ "grad_norm": 0.30078125,
288
  "learning_rate": 1.8639185497889248e-05,
289
+ "loss": 0.0437,
290
  "step": 400
291
  },
292
  {
293
  "epoch": 0.2962962962962963,
294
+ "grad_norm": 0.42578125,
295
  "learning_rate": 1.8589520735038492e-05,
296
+ "loss": 0.0607,
297
  "step": 410
298
  },
299
  {
300
  "epoch": 0.3035230352303523,
301
+ "grad_norm": 0.267578125,
302
  "learning_rate": 1.8539855972187736e-05,
303
+ "loss": 0.0187,
304
  "step": 420
305
  },
306
  {
307
  "epoch": 0.3107497741644083,
308
+ "grad_norm": 0.20703125,
309
  "learning_rate": 1.8490191209336976e-05,
310
+ "loss": 0.0199,
311
  "step": 430
312
  },
313
  {
314
  "epoch": 0.3179765130984643,
315
+ "grad_norm": 0.5234375,
316
  "learning_rate": 1.844052644648622e-05,
317
+ "loss": 0.0181,
318
  "step": 440
319
  },
320
  {
321
  "epoch": 0.3252032520325203,
322
+ "grad_norm": 0.296875,
323
  "learning_rate": 1.839086168363546e-05,
324
+ "loss": 0.0199,
325
  "step": 450
326
  },
327
  {
328
  "epoch": 0.3324299909665763,
329
+ "grad_norm": 0.3203125,
330
  "learning_rate": 1.8341196920784705e-05,
331
+ "loss": 0.0223,
332
  "step": 460
333
  },
334
  {
335
  "epoch": 0.33965672990063234,
336
+ "grad_norm": 0.3125,
337
  "learning_rate": 1.8291532157933945e-05,
338
+ "loss": 0.018,
339
  "step": 470
340
  },
341
  {
342
  "epoch": 0.34688346883468835,
343
+ "grad_norm": 1.0703125,
344
  "learning_rate": 1.824186739508319e-05,
345
+ "loss": 0.0398,
346
  "step": 480
347
  },
348
  {
349
  "epoch": 0.35411020776874436,
350
+ "grad_norm": 0.357421875,
351
  "learning_rate": 1.8192202632232433e-05,
352
+ "loss": 0.0286,
353
  "step": 490
354
  },
355
  {
356
  "epoch": 0.36133694670280037,
357
+ "grad_norm": 0.3125,
358
  "learning_rate": 1.8142537869381677e-05,
359
+ "loss": 0.0193,
360
  "step": 500
361
  },
362
  {
363
  "epoch": 0.36133694670280037,
364
+ "eval_loss": 0.023125287145376205,
365
+ "eval_runtime": 43.3724,
366
+ "eval_samples_per_second": 26.883,
367
+ "eval_steps_per_second": 6.732,
368
  "step": 500
369
+ },
370
+ {
371
+ "epoch": 0.3685636856368564,
372
+ "grad_norm": 0.470703125,
373
+ "learning_rate": 1.8092873106530917e-05,
374
+ "loss": 0.0202,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 0.3757904245709124,
379
+ "grad_norm": 0.328125,
380
+ "learning_rate": 1.804320834368016e-05,
381
+ "loss": 0.0179,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 0.3830171635049684,
386
+ "grad_norm": 0.337890625,
387
+ "learning_rate": 1.7993543580829402e-05,
388
+ "loss": 0.0178,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 0.3902439024390244,
393
+ "grad_norm": 0.2490234375,
394
+ "learning_rate": 1.7943878817978646e-05,
395
+ "loss": 0.0187,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 0.3974706413730804,
400
+ "grad_norm": 0.39453125,
401
+ "learning_rate": 1.7894214055127886e-05,
402
+ "loss": 0.0194,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 0.4046973803071364,
407
+ "grad_norm": 0.38671875,
408
+ "learning_rate": 1.784454929227713e-05,
409
+ "loss": 0.0288,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 0.41192411924119243,
414
+ "grad_norm": 1.0,
415
+ "learning_rate": 1.7794884529426374e-05,
416
+ "loss": 0.0215,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 0.41915085817524844,
421
+ "grad_norm": 0.6484375,
422
+ "learning_rate": 1.7745219766575618e-05,
423
+ "loss": 0.0185,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 0.42637759710930445,
428
+ "grad_norm": 0.34375,
429
+ "learning_rate": 1.769555500372486e-05,
430
+ "loss": 0.0197,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 0.43360433604336046,
435
+ "grad_norm": 0.458984375,
436
+ "learning_rate": 1.7645890240874102e-05,
437
+ "loss": 0.0191,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 0.4408310749774164,
442
+ "grad_norm": 1.015625,
443
+ "learning_rate": 1.7596225478023343e-05,
444
+ "loss": 0.0218,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 0.4480578139114724,
449
+ "grad_norm": 0.416015625,
450
+ "learning_rate": 1.7546560715172587e-05,
451
+ "loss": 0.0199,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 0.45528455284552843,
456
+ "grad_norm": 0.80078125,
457
+ "learning_rate": 1.7496895952321827e-05,
458
+ "loss": 0.0256,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 0.46251129177958444,
463
+ "grad_norm": 0.46484375,
464
+ "learning_rate": 1.744723118947107e-05,
465
+ "loss": 0.0195,
466
+ "step": 640
467
+ },
468
+ {
469
+ "epoch": 0.46973803071364045,
470
+ "grad_norm": 0.54296875,
471
+ "learning_rate": 1.7397566426620315e-05,
472
+ "loss": 0.0191,
473
+ "step": 650
474
+ },
475
+ {
476
+ "epoch": 0.47696476964769646,
477
+ "grad_norm": 0.294921875,
478
+ "learning_rate": 1.7347901663769556e-05,
479
+ "loss": 0.0194,
480
+ "step": 660
481
+ },
482
+ {
483
+ "epoch": 0.48419150858175247,
484
+ "grad_norm": 0.41796875,
485
+ "learning_rate": 1.72982369009188e-05,
486
+ "loss": 0.0182,
487
+ "step": 670
488
+ },
489
+ {
490
+ "epoch": 0.4914182475158085,
491
+ "grad_norm": 0.484375,
492
+ "learning_rate": 1.7248572138068044e-05,
493
+ "loss": 0.0188,
494
+ "step": 680
495
+ },
496
+ {
497
+ "epoch": 0.4986449864498645,
498
+ "grad_norm": 0.4765625,
499
+ "learning_rate": 1.7198907375217284e-05,
500
+ "loss": 0.02,
501
+ "step": 690
502
+ },
503
+ {
504
+ "epoch": 0.5058717253839206,
505
+ "grad_norm": 0.2578125,
506
+ "learning_rate": 1.7149242612366528e-05,
507
+ "loss": 0.0191,
508
+ "step": 700
509
+ },
510
+ {
511
+ "epoch": 0.5130984643179766,
512
+ "grad_norm": 0.416015625,
513
+ "learning_rate": 1.709957784951577e-05,
514
+ "loss": 0.0197,
515
+ "step": 710
516
+ },
517
+ {
518
+ "epoch": 0.5203252032520326,
519
+ "grad_norm": 0.388671875,
520
+ "learning_rate": 1.7049913086665013e-05,
521
+ "loss": 0.02,
522
+ "step": 720
523
+ },
524
+ {
525
+ "epoch": 0.5275519421860885,
526
+ "grad_norm": 1.2265625,
527
+ "learning_rate": 1.7000248323814253e-05,
528
+ "loss": 0.0233,
529
+ "step": 730
530
+ },
531
+ {
532
+ "epoch": 0.5347786811201445,
533
+ "grad_norm": 0.400390625,
534
+ "learning_rate": 1.6950583560963497e-05,
535
+ "loss": 0.0175,
536
+ "step": 740
537
+ },
538
+ {
539
+ "epoch": 0.5420054200542005,
540
+ "grad_norm": 0.416015625,
541
+ "learning_rate": 1.690091879811274e-05,
542
+ "loss": 0.019,
543
+ "step": 750
544
+ },
545
+ {
546
+ "epoch": 0.5492321589882565,
547
+ "grad_norm": 0.306640625,
548
+ "learning_rate": 1.6851254035261985e-05,
549
+ "loss": 0.0306,
550
+ "step": 760
551
+ },
552
+ {
553
+ "epoch": 0.5564588979223125,
554
+ "grad_norm": 0.2333984375,
555
+ "learning_rate": 1.6801589272411225e-05,
556
+ "loss": 0.0187,
557
+ "step": 770
558
+ },
559
+ {
560
+ "epoch": 0.5636856368563685,
561
+ "grad_norm": 0.2060546875,
562
+ "learning_rate": 1.675192450956047e-05,
563
+ "loss": 0.0184,
564
+ "step": 780
565
+ },
566
+ {
567
+ "epoch": 0.5709123757904245,
568
+ "grad_norm": 1.0390625,
569
+ "learning_rate": 1.670225974670971e-05,
570
+ "loss": 0.0242,
571
+ "step": 790
572
+ },
573
+ {
574
+ "epoch": 0.5781391147244805,
575
+ "grad_norm": 0.49609375,
576
+ "learning_rate": 1.6652594983858954e-05,
577
+ "loss": 0.0228,
578
+ "step": 800
579
+ },
580
+ {
581
+ "epoch": 0.5853658536585366,
582
+ "grad_norm": 0.3828125,
583
+ "learning_rate": 1.6602930221008194e-05,
584
+ "loss": 0.0197,
585
+ "step": 810
586
+ },
587
+ {
588
+ "epoch": 0.5925925925925926,
589
+ "grad_norm": 0.4453125,
590
+ "learning_rate": 1.6553265458157438e-05,
591
+ "loss": 0.0192,
592
+ "step": 820
593
+ },
594
+ {
595
+ "epoch": 0.5998193315266486,
596
+ "grad_norm": 0.310546875,
597
+ "learning_rate": 1.6503600695306682e-05,
598
+ "loss": 0.0176,
599
+ "step": 830
600
+ },
601
+ {
602
+ "epoch": 0.6070460704607046,
603
+ "grad_norm": 0.337890625,
604
+ "learning_rate": 1.6453935932455926e-05,
605
+ "loss": 0.0228,
606
+ "step": 840
607
+ },
608
+ {
609
+ "epoch": 0.6142728093947606,
610
+ "grad_norm": 0.3984375,
611
+ "learning_rate": 1.6404271169605167e-05,
612
+ "loss": 0.0187,
613
+ "step": 850
614
+ },
615
+ {
616
+ "epoch": 0.6214995483288166,
617
+ "grad_norm": 0.34765625,
618
+ "learning_rate": 1.635460640675441e-05,
619
+ "loss": 0.0191,
620
+ "step": 860
621
+ },
622
+ {
623
+ "epoch": 0.6287262872628726,
624
+ "grad_norm": 0.421875,
625
+ "learning_rate": 1.630494164390365e-05,
626
+ "loss": 0.0182,
627
+ "step": 870
628
+ },
629
+ {
630
+ "epoch": 0.6359530261969286,
631
+ "grad_norm": 0.3359375,
632
+ "learning_rate": 1.6255276881052895e-05,
633
+ "loss": 0.019,
634
+ "step": 880
635
+ },
636
+ {
637
+ "epoch": 0.6431797651309846,
638
+ "grad_norm": 0.353515625,
639
+ "learning_rate": 1.6205612118202136e-05,
640
+ "loss": 0.0192,
641
+ "step": 890
642
+ },
643
+ {
644
+ "epoch": 0.6504065040650406,
645
+ "grad_norm": 0.357421875,
646
+ "learning_rate": 1.615594735535138e-05,
647
+ "loss": 0.0199,
648
+ "step": 900
649
+ },
650
+ {
651
+ "epoch": 0.6576332429990966,
652
+ "grad_norm": 0.357421875,
653
+ "learning_rate": 1.610628259250062e-05,
654
+ "loss": 0.0182,
655
+ "step": 910
656
+ },
657
+ {
658
+ "epoch": 0.6648599819331527,
659
+ "grad_norm": 0.36328125,
660
+ "learning_rate": 1.6056617829649864e-05,
661
+ "loss": 0.0176,
662
+ "step": 920
663
+ },
664
+ {
665
+ "epoch": 0.6720867208672087,
666
+ "grad_norm": 0.328125,
667
+ "learning_rate": 1.6006953066799108e-05,
668
+ "loss": 0.0197,
669
+ "step": 930
670
+ },
671
+ {
672
+ "epoch": 0.6793134598012647,
673
+ "grad_norm": 0.4296875,
674
+ "learning_rate": 1.5957288303948352e-05,
675
+ "loss": 0.0188,
676
+ "step": 940
677
+ },
678
+ {
679
+ "epoch": 0.6865401987353207,
680
+ "grad_norm": 0.345703125,
681
+ "learning_rate": 1.5907623541097592e-05,
682
+ "loss": 0.0196,
683
+ "step": 950
684
+ },
685
+ {
686
+ "epoch": 0.6937669376693767,
687
+ "grad_norm": 0.44921875,
688
+ "learning_rate": 1.5857958778246836e-05,
689
+ "loss": 0.019,
690
+ "step": 960
691
+ },
692
+ {
693
+ "epoch": 0.7009936766034327,
694
+ "grad_norm": 0.37109375,
695
+ "learning_rate": 1.5808294015396077e-05,
696
+ "loss": 0.0287,
697
+ "step": 970
698
+ },
699
+ {
700
+ "epoch": 0.7082204155374887,
701
+ "grad_norm": 0.44921875,
702
+ "learning_rate": 1.575862925254532e-05,
703
+ "loss": 0.0193,
704
+ "step": 980
705
+ },
706
+ {
707
+ "epoch": 0.7154471544715447,
708
+ "grad_norm": 0.28515625,
709
+ "learning_rate": 1.570896448969456e-05,
710
+ "loss": 0.0185,
711
+ "step": 990
712
+ },
713
+ {
714
+ "epoch": 0.7226738934056007,
715
+ "grad_norm": 0.5625,
716
+ "learning_rate": 1.5659299726843805e-05,
717
+ "loss": 0.0181,
718
+ "step": 1000
719
+ },
720
+ {
721
+ "epoch": 0.7226738934056007,
722
+ "eval_loss": 0.02289458177983761,
723
+ "eval_runtime": 43.4716,
724
+ "eval_samples_per_second": 26.822,
725
+ "eval_steps_per_second": 6.717,
726
+ "step": 1000
727
+ },
728
+ {
729
+ "epoch": 0.7299006323396567,
730
+ "grad_norm": 0.314453125,
731
+ "learning_rate": 1.560963496399305e-05,
732
+ "loss": 0.0181,
733
+ "step": 1010
734
+ },
735
+ {
736
+ "epoch": 0.7371273712737128,
737
+ "grad_norm": 0.333984375,
738
+ "learning_rate": 1.5559970201142293e-05,
739
+ "loss": 0.0517,
740
+ "step": 1020
741
+ },
742
+ {
743
+ "epoch": 0.7443541102077688,
744
+ "grad_norm": 0.43359375,
745
+ "learning_rate": 1.5510305438291533e-05,
746
+ "loss": 0.0314,
747
+ "step": 1030
748
+ },
749
+ {
750
+ "epoch": 0.7515808491418248,
751
+ "grad_norm": 0.60546875,
752
+ "learning_rate": 1.5460640675440777e-05,
753
+ "loss": 0.0185,
754
+ "step": 1040
755
+ },
756
+ {
757
+ "epoch": 0.7588075880758808,
758
+ "grad_norm": 0.54296875,
759
+ "learning_rate": 1.5410975912590018e-05,
760
+ "loss": 0.0183,
761
+ "step": 1050
762
+ },
763
+ {
764
+ "epoch": 0.7660343270099368,
765
+ "grad_norm": 0.30078125,
766
+ "learning_rate": 1.5361311149739262e-05,
767
+ "loss": 0.0193,
768
+ "step": 1060
769
+ },
770
+ {
771
+ "epoch": 0.7732610659439928,
772
+ "grad_norm": 0.322265625,
773
+ "learning_rate": 1.5311646386888502e-05,
774
+ "loss": 0.0185,
775
+ "step": 1070
776
+ },
777
+ {
778
+ "epoch": 0.7804878048780488,
779
+ "grad_norm": 0.375,
780
+ "learning_rate": 1.5261981624037746e-05,
781
+ "loss": 0.0187,
782
+ "step": 1080
783
+ },
784
+ {
785
+ "epoch": 0.7877145438121048,
786
+ "grad_norm": 0.373046875,
787
+ "learning_rate": 1.5212316861186989e-05,
788
+ "loss": 0.0176,
789
+ "step": 1090
790
+ },
791
+ {
792
+ "epoch": 0.7949412827461608,
793
+ "grad_norm": 0.328125,
794
+ "learning_rate": 1.5162652098336232e-05,
795
+ "loss": 0.0189,
796
+ "step": 1100
797
+ },
798
+ {
799
+ "epoch": 0.8021680216802168,
800
+ "grad_norm": 0.333984375,
801
+ "learning_rate": 1.5112987335485475e-05,
802
+ "loss": 0.0189,
803
+ "step": 1110
804
+ },
805
+ {
806
+ "epoch": 0.8093947606142728,
807
+ "grad_norm": 0.248046875,
808
+ "learning_rate": 1.5063322572634717e-05,
809
+ "loss": 0.0174,
810
+ "step": 1120
811
+ },
812
+ {
813
+ "epoch": 0.8166214995483289,
814
+ "grad_norm": 0.2216796875,
815
+ "learning_rate": 1.5013657809783959e-05,
816
+ "loss": 0.018,
817
+ "step": 1130
818
+ },
819
+ {
820
+ "epoch": 0.8238482384823849,
821
+ "grad_norm": 0.37109375,
822
+ "learning_rate": 1.4963993046933203e-05,
823
+ "loss": 0.0178,
824
+ "step": 1140
825
+ },
826
+ {
827
+ "epoch": 0.8310749774164409,
828
+ "grad_norm": 0.28125,
829
+ "learning_rate": 1.4914328284082444e-05,
830
+ "loss": 0.0165,
831
+ "step": 1150
832
+ },
833
+ {
834
+ "epoch": 0.8383017163504969,
835
+ "grad_norm": 0.35546875,
836
+ "learning_rate": 1.4864663521231688e-05,
837
+ "loss": 0.0238,
838
+ "step": 1160
839
+ },
840
+ {
841
+ "epoch": 0.8455284552845529,
842
+ "grad_norm": 0.265625,
843
+ "learning_rate": 1.481499875838093e-05,
844
+ "loss": 0.0195,
845
+ "step": 1170
846
+ },
847
+ {
848
+ "epoch": 0.8527551942186089,
849
+ "grad_norm": 2.46875,
850
+ "learning_rate": 1.4765333995530174e-05,
851
+ "loss": 0.0424,
852
+ "step": 1180
853
+ },
854
+ {
855
+ "epoch": 0.8599819331526649,
856
+ "grad_norm": 0.51953125,
857
+ "learning_rate": 1.4715669232679414e-05,
858
+ "loss": 0.0203,
859
+ "step": 1190
860
+ },
861
+ {
862
+ "epoch": 0.8672086720867209,
863
+ "grad_norm": 0.47265625,
864
+ "learning_rate": 1.4666004469828658e-05,
865
+ "loss": 0.0181,
866
+ "step": 1200
867
+ },
868
+ {
869
+ "epoch": 0.8744354110207768,
870
+ "grad_norm": 0.451171875,
871
+ "learning_rate": 1.46163397069779e-05,
872
+ "loss": 0.0189,
873
+ "step": 1210
874
+ },
875
+ {
876
+ "epoch": 0.8816621499548328,
877
+ "grad_norm": 0.380859375,
878
+ "learning_rate": 1.4566674944127144e-05,
879
+ "loss": 0.0322,
880
+ "step": 1220
881
+ },
882
+ {
883
+ "epoch": 0.8888888888888888,
884
+ "grad_norm": 0.298828125,
885
+ "learning_rate": 1.4517010181276385e-05,
886
+ "loss": 0.019,
887
+ "step": 1230
888
+ },
889
+ {
890
+ "epoch": 0.8961156278229448,
891
+ "grad_norm": 0.248046875,
892
+ "learning_rate": 1.4467345418425629e-05,
893
+ "loss": 0.0253,
894
+ "step": 1240
895
+ },
896
+ {
897
+ "epoch": 0.9033423667570009,
898
+ "grad_norm": 0.39453125,
899
+ "learning_rate": 1.4417680655574871e-05,
900
+ "loss": 0.018,
901
+ "step": 1250
902
+ },
903
+ {
904
+ "epoch": 0.9105691056910569,
905
+ "grad_norm": 0.345703125,
906
+ "learning_rate": 1.4368015892724115e-05,
907
+ "loss": 0.0217,
908
+ "step": 1260
909
+ },
910
+ {
911
+ "epoch": 0.9177958446251129,
912
+ "grad_norm": 0.330078125,
913
+ "learning_rate": 1.4318351129873355e-05,
914
+ "loss": 0.0336,
915
+ "step": 1270
916
+ },
917
+ {
918
+ "epoch": 0.9250225835591689,
919
+ "grad_norm": 0.267578125,
920
+ "learning_rate": 1.42686863670226e-05,
921
+ "loss": 0.0219,
922
+ "step": 1280
923
+ },
924
+ {
925
+ "epoch": 0.9322493224932249,
926
+ "grad_norm": 0.65234375,
927
+ "learning_rate": 1.4219021604171842e-05,
928
+ "loss": 0.0212,
929
+ "step": 1290
930
+ },
931
+ {
932
+ "epoch": 0.9394760614272809,
933
+ "grad_norm": 0.259765625,
934
+ "learning_rate": 1.4169356841321085e-05,
935
+ "loss": 0.0172,
936
+ "step": 1300
937
+ },
938
+ {
939
+ "epoch": 0.9467028003613369,
940
+ "grad_norm": 0.61328125,
941
+ "learning_rate": 1.4119692078470326e-05,
942
+ "loss": 0.0187,
943
+ "step": 1310
944
+ },
945
+ {
946
+ "epoch": 0.9539295392953929,
947
+ "grad_norm": 0.3671875,
948
+ "learning_rate": 1.407002731561957e-05,
949
+ "loss": 0.0179,
950
+ "step": 1320
951
+ },
952
+ {
953
+ "epoch": 0.9611562782294489,
954
+ "grad_norm": 0.3046875,
955
+ "learning_rate": 1.4020362552768812e-05,
956
+ "loss": 0.0179,
957
+ "step": 1330
958
+ },
959
+ {
960
+ "epoch": 0.9683830171635049,
961
+ "grad_norm": 0.56640625,
962
+ "learning_rate": 1.3970697789918054e-05,
963
+ "loss": 0.0187,
964
+ "step": 1340
965
+ },
966
+ {
967
+ "epoch": 0.975609756097561,
968
+ "grad_norm": 0.380859375,
969
+ "learning_rate": 1.3921033027067297e-05,
970
+ "loss": 0.0193,
971
+ "step": 1350
972
+ },
973
+ {
974
+ "epoch": 0.982836495031617,
975
+ "grad_norm": 0.38671875,
976
+ "learning_rate": 1.387136826421654e-05,
977
+ "loss": 0.02,
978
+ "step": 1360
979
+ },
980
+ {
981
+ "epoch": 0.990063233965673,
982
+ "grad_norm": 0.34375,
983
+ "learning_rate": 1.3821703501365781e-05,
984
+ "loss": 0.0171,
985
+ "step": 1370
986
+ },
987
+ {
988
+ "epoch": 0.997289972899729,
989
+ "grad_norm": 0.3203125,
990
+ "learning_rate": 1.3772038738515025e-05,
991
+ "loss": 0.0197,
992
+ "step": 1380
993
+ },
994
+ {
995
+ "epoch": 1.0043360433604336,
996
+ "grad_norm": 0.2333984375,
997
+ "learning_rate": 1.3722373975664267e-05,
998
+ "loss": 0.0167,
999
+ "step": 1390
1000
+ },
1001
+ {
1002
+ "epoch": 1.0115627822944897,
1003
+ "grad_norm": 0.1865234375,
1004
+ "learning_rate": 1.3672709212813511e-05,
1005
+ "loss": 0.0184,
1006
+ "step": 1400
1007
+ },
1008
+ {
1009
+ "epoch": 1.0187895212285456,
1010
+ "grad_norm": 0.2099609375,
1011
+ "learning_rate": 1.3623044449962752e-05,
1012
+ "loss": 0.015,
1013
+ "step": 1410
1014
+ },
1015
+ {
1016
+ "epoch": 1.0260162601626017,
1017
+ "grad_norm": 0.169921875,
1018
+ "learning_rate": 1.3573379687111996e-05,
1019
+ "loss": 0.0145,
1020
+ "step": 1420
1021
+ },
1022
+ {
1023
+ "epoch": 1.0332429990966576,
1024
+ "grad_norm": 0.19921875,
1025
+ "learning_rate": 1.3523714924261238e-05,
1026
+ "loss": 0.0151,
1027
+ "step": 1430
1028
+ },
1029
+ {
1030
+ "epoch": 1.0404697380307137,
1031
+ "grad_norm": 0.1337890625,
1032
+ "learning_rate": 1.3474050161410482e-05,
1033
+ "loss": 0.016,
1034
+ "step": 1440
1035
+ },
1036
+ {
1037
+ "epoch": 1.0476964769647696,
1038
+ "grad_norm": 0.173828125,
1039
+ "learning_rate": 1.3424385398559722e-05,
1040
+ "loss": 0.0153,
1041
+ "step": 1450
1042
+ },
1043
+ {
1044
+ "epoch": 1.0549232158988258,
1045
+ "grad_norm": 0.2275390625,
1046
+ "learning_rate": 1.3374720635708966e-05,
1047
+ "loss": 0.02,
1048
+ "step": 1460
1049
+ },
1050
+ {
1051
+ "epoch": 1.0621499548328817,
1052
+ "grad_norm": 0.2451171875,
1053
+ "learning_rate": 1.3325055872858208e-05,
1054
+ "loss": 0.0176,
1055
+ "step": 1470
1056
+ },
1057
+ {
1058
+ "epoch": 1.0693766937669378,
1059
+ "grad_norm": 0.287109375,
1060
+ "learning_rate": 1.3275391110007452e-05,
1061
+ "loss": 0.0175,
1062
+ "step": 1480
1063
+ },
1064
+ {
1065
+ "epoch": 1.0766034327009937,
1066
+ "grad_norm": 0.15234375,
1067
+ "learning_rate": 1.3225726347156693e-05,
1068
+ "loss": 0.015,
1069
+ "step": 1490
1070
+ },
1071
+ {
1072
+ "epoch": 1.0838301716350496,
1073
+ "grad_norm": 0.15234375,
1074
+ "learning_rate": 1.3176061584305937e-05,
1075
+ "loss": 0.0226,
1076
+ "step": 1500
1077
+ },
1078
+ {
1079
+ "epoch": 1.0838301716350496,
1080
+ "eval_loss": 0.022791102528572083,
1081
+ "eval_runtime": 43.397,
1082
+ "eval_samples_per_second": 26.868,
1083
+ "eval_steps_per_second": 6.729,
1084
+ "step": 1500
1085
  }
1086
  ],
1087
  "logging_steps": 10,
 
1101
  "attributes": {}
1102
  }
1103
  },
1104
+ "total_flos": 6.261646670739886e+17,
1105
  "train_batch_size": 4,
1106
  "trial_name": null,
1107
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2934171f5cebc30444d6719c8cd810adec3f2252b2c17b1c00f83891aab2b503
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b079f0fe6e2318616cd3018ea7ee3f0504303f3997e29be89e5fcb15bb84cb2c
3
  size 5841