Sabbir772 commited on
Commit
0096fdc
·
verified ·
1 Parent(s): 3dd0567

Training in progress, step 4800

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad15d12fa229f1794a8dd10223f571b0c4044b188c2edb0e066b7a18688f6132
3
  size 990185320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:470ceab2ffbdc9be7e9ba55dac8f66ad2d2ee6d83bf68d588bdf8c2e363afa0a
3
  size 990185320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52a3dcb31a2a26c374576cfddafb3476db84b7103622bbfe66f386c692915e00
3
  size 1980545291
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a8f1de75a1e7f597eeeec866f696144c9ded9443ee76a77efdde665cb83edb3
3
  size 1980545291
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:435141f0266913129d6ffd1dc7b62f464fe5698014d7627dbd3fef5684f9d38b
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce3825e60923efd0732224de480af38290fa16b941f44ff5e3049ad2c6cac56
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0258928230b848c7fc84e13b45f33e34530c19f3dde5f9081a5d4f027a569dd2
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3185a876acce70348de92f6615e3c6174f04c918e17668da9430678af0491872
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,436 +2,1006 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6236162361623616,
6
  "eval_steps": 400,
7
- "global_step": 4400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03690036900369004,
14
- "grad_norm": 4.41992712020874,
15
- "learning_rate": 3.926937269372694e-05,
16
- "loss": 0.9291,
17
  "step": 100
18
  },
19
  {
20
- "epoch": 0.07380073800738007,
21
- "grad_norm": 5.523387908935547,
22
- "learning_rate": 3.853136531365314e-05,
23
- "loss": 0.9121,
24
  "step": 200
25
  },
26
  {
27
- "epoch": 0.11070110701107011,
28
- "grad_norm": 2.894198417663574,
29
- "learning_rate": 3.779335793357934e-05,
30
- "loss": 0.9451,
31
  "step": 300
32
  },
33
  {
34
- "epoch": 0.14760147601476015,
35
- "grad_norm": 4.9876556396484375,
36
- "learning_rate": 3.705535055350554e-05,
37
- "loss": 0.92,
38
  "step": 400
39
  },
40
  {
41
- "epoch": 0.14760147601476015,
42
- "eval_bleu": 46.2124538637683,
43
- "eval_chrf": 70.20566598384819,
44
- "eval_loss": 0.9659060835838318,
45
- "eval_runtime": 96.6966,
46
- "eval_samples_per_second": 8.439,
47
- "eval_steps_per_second": 1.055,
48
  "step": 400
49
  },
50
  {
51
- "epoch": 0.18450184501845018,
52
- "grad_norm": 3.176483631134033,
53
- "learning_rate": 3.6317343173431734e-05,
54
- "loss": 0.9031,
55
  "step": 500
56
  },
57
  {
58
- "epoch": 0.22140221402214022,
59
- "grad_norm": 3.038796901702881,
60
- "learning_rate": 3.5579335793357936e-05,
61
- "loss": 0.9035,
62
  "step": 600
63
  },
64
  {
65
- "epoch": 0.25830258302583026,
66
- "grad_norm": 10.56854248046875,
67
- "learning_rate": 3.484132841328414e-05,
68
- "loss": 0.8688,
69
  "step": 700
70
  },
71
  {
72
- "epoch": 0.2952029520295203,
73
- "grad_norm": 4.877047538757324,
74
- "learning_rate": 3.410332103321034e-05,
75
- "loss": 1.0048,
76
  "step": 800
77
  },
78
  {
79
- "epoch": 0.2952029520295203,
80
- "eval_bleu": 45.750715379330366,
81
- "eval_chrf": 70.00797971045762,
82
- "eval_loss": 0.962220311164856,
83
- "eval_runtime": 96.7314,
84
- "eval_samples_per_second": 8.436,
85
- "eval_steps_per_second": 1.054,
86
  "step": 800
87
  },
88
  {
89
- "epoch": 0.33210332103321033,
90
- "grad_norm": 5.688119888305664,
91
- "learning_rate": 3.3365313653136534e-05,
92
- "loss": 0.9146,
93
  "step": 900
94
  },
95
  {
96
- "epoch": 0.36900369003690037,
97
- "grad_norm": 4.046717643737793,
98
- "learning_rate": 3.2627306273062736e-05,
99
- "loss": 0.906,
100
  "step": 1000
101
  },
102
  {
103
- "epoch": 0.4059040590405904,
104
- "grad_norm": 4.171918869018555,
105
- "learning_rate": 3.188929889298893e-05,
106
- "loss": 0.9026,
107
  "step": 1100
108
  },
109
  {
110
- "epoch": 0.44280442804428044,
111
- "grad_norm": 2.658325672149658,
112
- "learning_rate": 3.115129151291513e-05,
113
- "loss": 0.9634,
114
  "step": 1200
115
  },
116
  {
117
- "epoch": 0.44280442804428044,
118
- "eval_bleu": 46.31749782940418,
119
- "eval_chrf": 70.29647401157018,
120
- "eval_loss": 0.9587263464927673,
121
- "eval_runtime": 97.1058,
122
- "eval_samples_per_second": 8.403,
123
- "eval_steps_per_second": 1.05,
124
  "step": 1200
125
  },
126
  {
127
- "epoch": 0.4797047970479705,
128
- "grad_norm": 2.7676937580108643,
129
- "learning_rate": 3.041328413284133e-05,
130
- "loss": 0.889,
131
  "step": 1300
132
  },
133
  {
134
- "epoch": 0.5166051660516605,
135
- "grad_norm": 3.7263267040252686,
136
- "learning_rate": 2.967527675276753e-05,
137
- "loss": 0.9156,
138
  "step": 1400
139
  },
140
  {
141
- "epoch": 0.5535055350553506,
142
- "grad_norm": 5.799243450164795,
143
- "learning_rate": 2.893726937269373e-05,
144
- "loss": 0.9631,
145
  "step": 1500
146
  },
147
  {
148
- "epoch": 0.5904059040590406,
149
- "grad_norm": 3.508241653442383,
150
- "learning_rate": 2.819926199261993e-05,
151
- "loss": 0.9116,
152
  "step": 1600
153
  },
154
  {
155
- "epoch": 0.5904059040590406,
156
- "eval_bleu": 46.58006412895955,
157
- "eval_chrf": 70.4943624210968,
158
- "eval_loss": 0.9491108655929565,
159
- "eval_runtime": 96.8012,
160
- "eval_samples_per_second": 8.43,
161
- "eval_steps_per_second": 1.054,
162
  "step": 1600
163
  },
164
  {
165
- "epoch": 0.6273062730627307,
166
- "grad_norm": 4.0949225425720215,
167
- "learning_rate": 2.7461254612546128e-05,
168
- "loss": 0.9217,
169
  "step": 1700
170
  },
171
  {
172
- "epoch": 0.6642066420664207,
173
- "grad_norm": 4.698362827301025,
174
- "learning_rate": 2.6723247232472326e-05,
175
- "loss": 0.9627,
176
  "step": 1800
177
  },
178
  {
179
- "epoch": 0.7011070110701108,
180
- "grad_norm": 3.4152214527130127,
181
- "learning_rate": 2.5985239852398528e-05,
182
- "loss": 0.9621,
183
  "step": 1900
184
  },
185
  {
186
- "epoch": 0.7380073800738007,
187
- "grad_norm": 5.174529075622559,
188
- "learning_rate": 2.5247232472324727e-05,
189
- "loss": 0.9127,
190
  "step": 2000
191
  },
192
  {
193
- "epoch": 0.7380073800738007,
194
- "eval_bleu": 46.23328341662156,
195
- "eval_chrf": 70.4334392951765,
196
- "eval_loss": 0.9401571154594421,
197
- "eval_runtime": 96.3896,
198
- "eval_samples_per_second": 8.466,
199
- "eval_steps_per_second": 1.058,
200
  "step": 2000
201
  },
202
  {
203
- "epoch": 0.7749077490774908,
204
- "grad_norm": 4.145532608032227,
205
- "learning_rate": 2.4509225092250925e-05,
206
- "loss": 0.8737,
207
  "step": 2100
208
  },
209
  {
210
- "epoch": 0.8118081180811808,
211
- "grad_norm": 3.2128381729125977,
212
- "learning_rate": 2.3771217712177123e-05,
213
- "loss": 0.8972,
214
  "step": 2200
215
  },
216
  {
217
- "epoch": 0.8487084870848709,
218
- "grad_norm": 5.43286657333374,
219
- "learning_rate": 2.3033210332103325e-05,
220
- "loss": 0.9537,
221
  "step": 2300
222
  },
223
  {
224
- "epoch": 0.8856088560885609,
225
- "grad_norm": 4.181277751922607,
226
- "learning_rate": 2.2295202952029523e-05,
227
- "loss": 0.9513,
228
  "step": 2400
229
  },
230
  {
231
- "epoch": 0.8856088560885609,
232
- "eval_bleu": 46.62494660518371,
233
- "eval_chrf": 70.75585572160985,
234
- "eval_loss": 0.931623101234436,
235
- "eval_runtime": 96.3421,
236
- "eval_samples_per_second": 8.47,
237
- "eval_steps_per_second": 1.059,
238
  "step": 2400
239
  },
240
  {
241
- "epoch": 0.922509225092251,
242
- "grad_norm": 4.236485004425049,
243
- "learning_rate": 2.1557195571955722e-05,
244
- "loss": 0.912,
245
  "step": 2500
246
  },
247
  {
248
- "epoch": 0.959409594095941,
249
- "grad_norm": 4.3563337326049805,
250
- "learning_rate": 2.081918819188192e-05,
251
- "loss": 0.9503,
252
  "step": 2600
253
  },
254
  {
255
- "epoch": 0.996309963099631,
256
- "grad_norm": 3.8650689125061035,
257
- "learning_rate": 2.0081180811808122e-05,
258
- "loss": 0.8835,
259
  "step": 2700
260
  },
261
  {
262
- "epoch": 1.033210332103321,
263
- "grad_norm": 4.865896224975586,
264
- "learning_rate": 1.934317343173432e-05,
265
- "loss": 0.9395,
266
  "step": 2800
267
  },
268
  {
269
- "epoch": 1.033210332103321,
270
- "eval_bleu": 46.61705928386175,
271
- "eval_chrf": 70.76885341204647,
272
- "eval_loss": 0.9273455739021301,
273
- "eval_runtime": 96.2367,
274
- "eval_samples_per_second": 8.479,
275
- "eval_steps_per_second": 1.06,
276
  "step": 2800
277
  },
278
  {
279
- "epoch": 1.070110701107011,
280
- "grad_norm": 4.255625247955322,
281
- "learning_rate": 1.8605166051660515e-05,
282
- "loss": 0.925,
283
  "step": 2900
284
  },
285
  {
286
- "epoch": 1.1070110701107012,
287
- "grad_norm": 3.4618682861328125,
288
- "learning_rate": 1.7867158671586717e-05,
289
- "loss": 0.8997,
290
  "step": 3000
291
  },
292
  {
293
- "epoch": 1.1439114391143912,
294
- "grad_norm": 4.462490558624268,
295
- "learning_rate": 1.7129151291512916e-05,
296
- "loss": 0.9109,
297
  "step": 3100
298
  },
299
  {
300
- "epoch": 1.1808118081180812,
301
- "grad_norm": 3.9228367805480957,
302
- "learning_rate": 1.6391143911439117e-05,
303
- "loss": 0.8899,
304
  "step": 3200
305
  },
306
  {
307
- "epoch": 1.1808118081180812,
308
- "eval_bleu": 46.706846283749094,
309
- "eval_chrf": 70.66405951179729,
310
- "eval_loss": 0.9309074282646179,
311
- "eval_runtime": 96.5614,
312
- "eval_samples_per_second": 8.451,
313
- "eval_steps_per_second": 1.056,
314
  "step": 3200
315
  },
316
  {
317
- "epoch": 1.2177121771217712,
318
- "grad_norm": 4.034820556640625,
319
- "learning_rate": 1.5653136531365312e-05,
320
- "loss": 0.8508,
321
  "step": 3300
322
  },
323
  {
324
- "epoch": 1.2546125461254611,
325
- "grad_norm": 3.396322011947632,
326
- "learning_rate": 1.4915129151291514e-05,
327
- "loss": 0.8776,
328
  "step": 3400
329
  },
330
  {
331
- "epoch": 1.2915129151291513,
332
- "grad_norm": 7.144755840301514,
333
- "learning_rate": 1.4177121771217713e-05,
334
- "loss": 0.8599,
335
  "step": 3500
336
  },
337
  {
338
- "epoch": 1.3284132841328413,
339
- "grad_norm": 3.339700937271118,
340
- "learning_rate": 1.3439114391143913e-05,
341
- "loss": 0.8638,
342
  "step": 3600
343
  },
344
  {
345
- "epoch": 1.3284132841328413,
346
- "eval_bleu": 46.624919425579975,
347
- "eval_chrf": 70.85748628309527,
348
- "eval_loss": 0.9251583218574524,
349
- "eval_runtime": 96.1672,
350
- "eval_samples_per_second": 8.485,
351
- "eval_steps_per_second": 1.061,
352
  "step": 3600
353
  },
354
  {
355
- "epoch": 1.3653136531365313,
356
- "grad_norm": 3.8256001472473145,
357
- "learning_rate": 1.2701107011070111e-05,
358
- "loss": 0.8555,
359
  "step": 3700
360
  },
361
  {
362
- "epoch": 1.4022140221402215,
363
- "grad_norm": 3.9691555500030518,
364
- "learning_rate": 1.1963099630996311e-05,
365
- "loss": 0.9664,
366
  "step": 3800
367
  },
368
  {
369
- "epoch": 1.4391143911439115,
370
- "grad_norm": 3.3013808727264404,
371
- "learning_rate": 1.122509225092251e-05,
372
- "loss": 0.8878,
373
  "step": 3900
374
  },
375
  {
376
- "epoch": 1.4760147601476015,
377
- "grad_norm": 2.8146023750305176,
378
- "learning_rate": 1.048708487084871e-05,
379
- "loss": 0.8244,
380
  "step": 4000
381
  },
382
  {
383
- "epoch": 1.4760147601476015,
384
- "eval_bleu": 46.934667995761984,
385
- "eval_chrf": 70.98538034968328,
386
- "eval_loss": 0.9257907271385193,
387
- "eval_runtime": 96.3505,
388
- "eval_samples_per_second": 8.469,
389
- "eval_steps_per_second": 1.059,
390
  "step": 4000
391
  },
392
  {
393
- "epoch": 1.5129151291512914,
394
- "grad_norm": 3.6188437938690186,
395
- "learning_rate": 9.749077490774908e-06,
396
- "loss": 0.9188,
397
  "step": 4100
398
  },
399
  {
400
- "epoch": 1.5498154981549814,
401
- "grad_norm": 3.578477621078491,
402
- "learning_rate": 9.011070110701108e-06,
403
- "loss": 0.9018,
404
  "step": 4200
405
  },
406
  {
407
- "epoch": 1.5867158671586716,
408
- "grad_norm": 5.138524532318115,
409
- "learning_rate": 8.273062730627306e-06,
410
- "loss": 0.826,
411
  "step": 4300
412
  },
413
  {
414
- "epoch": 1.6236162361623616,
415
- "grad_norm": 4.470804214477539,
416
- "learning_rate": 7.5350553505535065e-06,
417
- "loss": 0.8736,
418
  "step": 4400
419
  },
420
  {
421
- "epoch": 1.6236162361623616,
422
- "eval_bleu": 46.87049580619245,
423
- "eval_chrf": 70.96854490395519,
424
- "eval_loss": 0.9282872676849365,
425
- "eval_runtime": 96.576,
426
- "eval_samples_per_second": 8.449,
427
- "eval_steps_per_second": 1.056,
428
  "step": 4400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  }
430
  ],
431
  "logging_steps": 100,
432
- "max_steps": 5420,
433
  "num_input_tokens_seen": 0,
434
- "num_train_epochs": 2,
435
  "save_steps": 400,
436
  "stateful_callbacks": {
437
  "TrainerControl": {
@@ -445,7 +1015,7 @@
445
  "attributes": {}
446
  }
447
  },
448
- "total_flos": 1.20506497892352e+16,
449
  "train_batch_size": 8,
450
  "trial_name": null,
451
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.987647227808101,
6
  "eval_steps": 400,
7
+ "global_step": 10400,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.028727377190462512,
14
+ "grad_norm": 6.10549259185791,
15
+ "learning_rate": 3.450229819017523e-05,
16
+ "loss": 1.2588,
17
  "step": 100
18
  },
19
  {
20
+ "epoch": 0.057454754380925024,
21
+ "grad_norm": 4.3359055519104,
22
+ "learning_rate": 3.399956908934214e-05,
23
+ "loss": 1.1224,
24
  "step": 200
25
  },
26
  {
27
+ "epoch": 0.08618213157138753,
28
+ "grad_norm": 5.931371212005615,
29
+ "learning_rate": 3.3496839988509046e-05,
30
+ "loss": 1.1446,
31
  "step": 300
32
  },
33
  {
34
+ "epoch": 0.11490950876185005,
35
+ "grad_norm": 3.002002716064453,
36
+ "learning_rate": 3.299411088767595e-05,
37
+ "loss": 1.1758,
38
  "step": 400
39
  },
40
  {
41
+ "epoch": 0.11490950876185005,
42
+ "eval_bleu": 41.28035874039895,
43
+ "eval_chrf": 67.18168785062267,
44
+ "eval_loss": 1.1012325286865234,
45
+ "eval_runtime": 92.7093,
46
+ "eval_samples_per_second": 8.802,
47
+ "eval_steps_per_second": 1.1,
48
  "step": 400
49
  },
50
  {
51
+ "epoch": 0.14363688595231255,
52
+ "grad_norm": 8.971595764160156,
53
+ "learning_rate": 3.249138178684286e-05,
54
+ "loss": 1.1277,
55
  "step": 500
56
  },
57
  {
58
+ "epoch": 0.17236426314277506,
59
+ "grad_norm": 4.5978193283081055,
60
+ "learning_rate": 3.1988652686009764e-05,
61
+ "loss": 1.1686,
62
  "step": 600
63
  },
64
  {
65
+ "epoch": 0.20109164033323756,
66
+ "grad_norm": 4.403153419494629,
67
+ "learning_rate": 3.148592358517667e-05,
68
+ "loss": 1.1407,
69
  "step": 700
70
  },
71
  {
72
+ "epoch": 0.2298190175237001,
73
+ "grad_norm": 6.78354024887085,
74
+ "learning_rate": 3.098319448434358e-05,
75
+ "loss": 1.1484,
76
  "step": 800
77
  },
78
  {
79
+ "epoch": 0.2298190175237001,
80
+ "eval_bleu": 41.581675716567375,
81
+ "eval_chrf": 67.43982029189218,
82
+ "eval_loss": 1.082136631011963,
83
+ "eval_runtime": 93.3596,
84
+ "eval_samples_per_second": 8.74,
85
+ "eval_steps_per_second": 1.093,
86
  "step": 800
87
  },
88
  {
89
+ "epoch": 0.2585463947141626,
90
+ "grad_norm": 5.0640668869018555,
91
+ "learning_rate": 3.0480465383510482e-05,
92
+ "loss": 1.1026,
93
  "step": 900
94
  },
95
  {
96
+ "epoch": 0.2872737719046251,
97
+ "grad_norm": 6.618113994598389,
98
+ "learning_rate": 2.9977736282677386e-05,
99
+ "loss": 1.1249,
100
  "step": 1000
101
  },
102
  {
103
+ "epoch": 0.31600114909508764,
104
+ "grad_norm": 4.835494041442871,
105
+ "learning_rate": 2.9475007181844297e-05,
106
+ "loss": 1.0801,
107
  "step": 1100
108
  },
109
  {
110
+ "epoch": 0.3447285262855501,
111
+ "grad_norm": 5.188547611236572,
112
+ "learning_rate": 2.89722780810112e-05,
113
+ "loss": 1.084,
114
  "step": 1200
115
  },
116
  {
117
+ "epoch": 0.3447285262855501,
118
+ "eval_bleu": 42.00764102408484,
119
+ "eval_chrf": 67.64762972701448,
120
+ "eval_loss": 1.089510202407837,
121
+ "eval_runtime": 93.2029,
122
+ "eval_samples_per_second": 8.755,
123
+ "eval_steps_per_second": 1.094,
124
  "step": 1200
125
  },
126
  {
127
+ "epoch": 0.37345590347601265,
128
+ "grad_norm": 4.901341438293457,
129
+ "learning_rate": 2.8469548980178107e-05,
130
+ "loss": 1.077,
131
  "step": 1300
132
  },
133
  {
134
+ "epoch": 0.40218328066647513,
135
+ "grad_norm": 3.2502522468566895,
136
+ "learning_rate": 2.7966819879345015e-05,
137
+ "loss": 1.1337,
138
  "step": 1400
139
  },
140
  {
141
+ "epoch": 0.43091065785693766,
142
+ "grad_norm": 4.620765209197998,
143
+ "learning_rate": 2.746409077851192e-05,
144
+ "loss": 1.0423,
145
  "step": 1500
146
  },
147
  {
148
+ "epoch": 0.4596380350474002,
149
+ "grad_norm": 5.650172233581543,
150
+ "learning_rate": 2.696136167767883e-05,
151
+ "loss": 1.0917,
152
  "step": 1600
153
  },
154
  {
155
+ "epoch": 0.4596380350474002,
156
+ "eval_bleu": 42.83637366392379,
157
+ "eval_chrf": 68.16318386751354,
158
+ "eval_loss": 1.0691643953323364,
159
+ "eval_runtime": 93.178,
160
+ "eval_samples_per_second": 8.757,
161
+ "eval_steps_per_second": 1.095,
162
  "step": 1600
163
  },
164
  {
165
+ "epoch": 0.4883654122378627,
166
+ "grad_norm": 4.365080833435059,
167
+ "learning_rate": 2.6458632576845733e-05,
168
+ "loss": 1.0874,
169
  "step": 1700
170
  },
171
  {
172
+ "epoch": 0.5170927894283252,
173
+ "grad_norm": 6.768467903137207,
174
+ "learning_rate": 2.5955903476012636e-05,
175
+ "loss": 1.0862,
176
  "step": 1800
177
  },
178
  {
179
+ "epoch": 0.5458201666187877,
180
+ "grad_norm": 6.844293594360352,
181
+ "learning_rate": 2.5453174375179547e-05,
182
+ "loss": 1.0999,
183
  "step": 1900
184
  },
185
  {
186
+ "epoch": 0.5745475438092502,
187
+ "grad_norm": 6.295813083648682,
188
+ "learning_rate": 2.495044527434645e-05,
189
+ "loss": 1.1827,
190
  "step": 2000
191
  },
192
  {
193
+ "epoch": 0.5745475438092502,
194
+ "eval_bleu": 42.50176680079929,
195
+ "eval_chrf": 67.98178905115233,
196
+ "eval_loss": 1.072556734085083,
197
+ "eval_runtime": 93.3417,
198
+ "eval_samples_per_second": 8.742,
199
+ "eval_steps_per_second": 1.093,
200
  "step": 2000
201
  },
202
  {
203
+ "epoch": 0.6032749209997127,
204
+ "grad_norm": 4.727890491485596,
205
+ "learning_rate": 2.4447716173513354e-05,
206
+ "loss": 1.1242,
207
  "step": 2100
208
  },
209
  {
210
+ "epoch": 0.6320022981901753,
211
+ "grad_norm": 6.65009880065918,
212
+ "learning_rate": 2.3944987072680265e-05,
213
+ "loss": 1.0538,
214
  "step": 2200
215
  },
216
  {
217
+ "epoch": 0.6607296753806378,
218
+ "grad_norm": 7.740184307098389,
219
+ "learning_rate": 2.344225797184717e-05,
220
+ "loss": 1.0841,
221
  "step": 2300
222
  },
223
  {
224
+ "epoch": 0.6894570525711002,
225
+ "grad_norm": 2.9125332832336426,
226
+ "learning_rate": 2.2939528871014072e-05,
227
+ "loss": 1.072,
228
  "step": 2400
229
  },
230
  {
231
+ "epoch": 0.6894570525711002,
232
+ "eval_bleu": 42.81380660025228,
233
+ "eval_chrf": 68.15974530185456,
234
+ "eval_loss": 1.0647461414337158,
235
+ "eval_runtime": 93.2564,
236
+ "eval_samples_per_second": 8.75,
237
+ "eval_steps_per_second": 1.094,
238
  "step": 2400
239
  },
240
  {
241
+ "epoch": 0.7181844297615627,
242
+ "grad_norm": 4.305944442749023,
243
+ "learning_rate": 2.2436799770180983e-05,
244
+ "loss": 1.0874,
245
  "step": 2500
246
  },
247
  {
248
+ "epoch": 0.7469118069520253,
249
+ "grad_norm": 4.778198719024658,
250
+ "learning_rate": 2.1934070669347887e-05,
251
+ "loss": 1.0971,
252
  "step": 2600
253
  },
254
  {
255
+ "epoch": 0.7756391841424878,
256
+ "grad_norm": 3.9012796878814697,
257
+ "learning_rate": 2.143134156851479e-05,
258
+ "loss": 1.0818,
259
  "step": 2700
260
  },
261
  {
262
+ "epoch": 0.8043665613329503,
263
+ "grad_norm": 6.876541614532471,
264
+ "learning_rate": 2.09286124676817e-05,
265
+ "loss": 1.0424,
266
  "step": 2800
267
  },
268
  {
269
+ "epoch": 0.8043665613329503,
270
+ "eval_bleu": 42.600987811990336,
271
+ "eval_chrf": 68.08456018392769,
272
+ "eval_loss": 1.0626726150512695,
273
+ "eval_runtime": 93.8915,
274
+ "eval_samples_per_second": 8.691,
275
+ "eval_steps_per_second": 1.086,
276
  "step": 2800
277
  },
278
  {
279
+ "epoch": 0.8330939385234128,
280
+ "grad_norm": 3.5570015907287598,
281
+ "learning_rate": 2.0425883366848605e-05,
282
+ "loss": 1.0989,
283
  "step": 2900
284
  },
285
  {
286
+ "epoch": 0.8618213157138753,
287
+ "grad_norm": 4.700556755065918,
288
+ "learning_rate": 1.992315426601551e-05,
289
+ "loss": 1.0897,
290
  "step": 3000
291
  },
292
  {
293
+ "epoch": 0.8905486929043378,
294
+ "grad_norm": 3.259363889694214,
295
+ "learning_rate": 1.942042516518242e-05,
296
+ "loss": 1.0951,
297
  "step": 3100
298
  },
299
  {
300
+ "epoch": 0.9192760700948004,
301
+ "grad_norm": 5.479709148406982,
302
+ "learning_rate": 1.8917696064349323e-05,
303
+ "loss": 1.0604,
304
  "step": 3200
305
  },
306
  {
307
+ "epoch": 0.9192760700948004,
308
+ "eval_bleu": 42.6775731878505,
309
+ "eval_chrf": 68.08652942159591,
310
+ "eval_loss": 1.0623129606246948,
311
+ "eval_runtime": 94.0003,
312
+ "eval_samples_per_second": 8.681,
313
+ "eval_steps_per_second": 1.085,
314
  "step": 3200
315
  },
316
  {
317
+ "epoch": 0.9480034472852629,
318
+ "grad_norm": 4.478285312652588,
319
+ "learning_rate": 1.841496696351623e-05,
320
+ "loss": 1.1103,
321
  "step": 3300
322
  },
323
  {
324
+ "epoch": 0.9767308244757253,
325
+ "grad_norm": 6.426375389099121,
326
+ "learning_rate": 1.7912237862683137e-05,
327
+ "loss": 1.0529,
328
  "step": 3400
329
  },
330
  {
331
+ "epoch": 1.005458201666188,
332
+ "grad_norm": 5.8713884353637695,
333
+ "learning_rate": 1.740950876185004e-05,
334
+ "loss": 1.0224,
335
  "step": 3500
336
  },
337
  {
338
+ "epoch": 1.0341855788566503,
339
+ "grad_norm": 3.4513754844665527,
340
+ "learning_rate": 1.6906779661016948e-05,
341
+ "loss": 1.0399,
342
  "step": 3600
343
  },
344
  {
345
+ "epoch": 1.0341855788566503,
346
+ "eval_bleu": 43.44094597011667,
347
+ "eval_chrf": 68.32209136987879,
348
+ "eval_loss": 1.0515447854995728,
349
+ "eval_runtime": 93.6321,
350
+ "eval_samples_per_second": 8.715,
351
+ "eval_steps_per_second": 1.089,
352
  "step": 3600
353
  },
354
  {
355
+ "epoch": 1.062912956047113,
356
+ "grad_norm": 4.6053595542907715,
357
+ "learning_rate": 1.6404050560183855e-05,
358
+ "loss": 1.0074,
359
  "step": 3700
360
  },
361
  {
362
+ "epoch": 1.0916403332375755,
363
+ "grad_norm": 3.8163719177246094,
364
+ "learning_rate": 1.5901321459350762e-05,
365
+ "loss": 0.977,
366
  "step": 3800
367
  },
368
  {
369
+ "epoch": 1.1203677104280378,
370
+ "grad_norm": 5.681026458740234,
371
+ "learning_rate": 1.5398592358517666e-05,
372
+ "loss": 1.0561,
373
  "step": 3900
374
  },
375
  {
376
+ "epoch": 1.1490950876185004,
377
+ "grad_norm": 3.1531848907470703,
378
+ "learning_rate": 1.4895863257684573e-05,
379
+ "loss": 1.0264,
380
  "step": 4000
381
  },
382
  {
383
+ "epoch": 1.1490950876185004,
384
+ "eval_bleu": 43.34741810707491,
385
+ "eval_chrf": 68.32312785885578,
386
+ "eval_loss": 1.0600742101669312,
387
+ "eval_runtime": 93.0806,
388
+ "eval_samples_per_second": 8.767,
389
+ "eval_steps_per_second": 1.096,
390
  "step": 4000
391
  },
392
  {
393
+ "epoch": 1.177822464808963,
394
+ "grad_norm": 4.803258419036865,
395
+ "learning_rate": 1.4393134156851478e-05,
396
+ "loss": 1.0506,
397
  "step": 4100
398
  },
399
  {
400
+ "epoch": 1.2065498419994254,
401
+ "grad_norm": 3.834118366241455,
402
+ "learning_rate": 1.3890405056018384e-05,
403
+ "loss": 1.0144,
404
  "step": 4200
405
  },
406
  {
407
+ "epoch": 1.235277219189888,
408
+ "grad_norm": 4.096823215484619,
409
+ "learning_rate": 1.3387675955185291e-05,
410
+ "loss": 1.0276,
411
  "step": 4300
412
  },
413
  {
414
+ "epoch": 1.2640045963803503,
415
+ "grad_norm": 4.902688980102539,
416
+ "learning_rate": 1.2884946854352196e-05,
417
+ "loss": 1.0298,
418
  "step": 4400
419
  },
420
  {
421
+ "epoch": 1.2640045963803503,
422
+ "eval_bleu": 43.13712526544163,
423
+ "eval_chrf": 68.36972995474055,
424
+ "eval_loss": 1.0530284643173218,
425
+ "eval_runtime": 92.6858,
426
+ "eval_samples_per_second": 8.804,
427
+ "eval_steps_per_second": 1.1,
428
  "step": 4400
429
+ },
430
+ {
431
+ "epoch": 1.292731973570813,
432
+ "grad_norm": 5.135220050811768,
433
+ "learning_rate": 1.2382217753519102e-05,
434
+ "loss": 1.0511,
435
+ "step": 4500
436
+ },
437
+ {
438
+ "epoch": 1.3214593507612755,
439
+ "grad_norm": 5.3379669189453125,
440
+ "learning_rate": 1.1879488652686009e-05,
441
+ "loss": 0.9576,
442
+ "step": 4600
443
+ },
444
+ {
445
+ "epoch": 1.3501867279517379,
446
+ "grad_norm": 3.882709264755249,
447
+ "learning_rate": 1.1376759551852914e-05,
448
+ "loss": 1.0353,
449
+ "step": 4700
450
+ },
451
+ {
452
+ "epoch": 1.3789141051422005,
453
+ "grad_norm": 5.007932662963867,
454
+ "learning_rate": 1.0874030451019821e-05,
455
+ "loss": 1.1185,
456
+ "step": 4800
457
+ },
458
+ {
459
+ "epoch": 1.3789141051422005,
460
+ "eval_bleu": 43.42350950007614,
461
+ "eval_chrf": 68.41671354789146,
462
+ "eval_loss": 1.0510783195495605,
463
+ "eval_runtime": 92.8632,
464
+ "eval_samples_per_second": 8.787,
465
+ "eval_steps_per_second": 1.098,
466
+ "step": 4800
467
+ },
468
+ {
469
+ "epoch": 1.407641482332663,
470
+ "grad_norm": 4.357149600982666,
471
+ "learning_rate": 1.0371301350186727e-05,
472
+ "loss": 1.0127,
473
+ "step": 4900
474
+ },
475
+ {
476
+ "epoch": 1.4363688595231254,
477
+ "grad_norm": 4.049122333526611,
478
+ "learning_rate": 9.868572249353634e-06,
479
+ "loss": 0.9992,
480
+ "step": 5000
481
+ },
482
+ {
483
+ "epoch": 1.465096236713588,
484
+ "grad_norm": 7.8539509773254395,
485
+ "learning_rate": 9.36584314852054e-06,
486
+ "loss": 0.9932,
487
+ "step": 5100
488
+ },
489
+ {
490
+ "epoch": 1.4938236139040506,
491
+ "grad_norm": 3.6276633739471436,
492
+ "learning_rate": 8.863114047687445e-06,
493
+ "loss": 1.0533,
494
+ "step": 5200
495
+ },
496
+ {
497
+ "epoch": 1.4938236139040506,
498
+ "eval_bleu": 43.312959923626096,
499
+ "eval_chrf": 68.37463399791737,
500
+ "eval_loss": 1.0536953210830688,
501
+ "eval_runtime": 92.7398,
502
+ "eval_samples_per_second": 8.799,
503
+ "eval_steps_per_second": 1.1,
504
+ "step": 5200
505
+ },
506
+ {
507
+ "epoch": 1.522550991094513,
508
+ "grad_norm": 5.274589538574219,
509
+ "learning_rate": 8.360384946854352e-06,
510
+ "loss": 1.0569,
511
+ "step": 5300
512
+ },
513
+ {
514
+ "epoch": 1.5512783682849756,
515
+ "grad_norm": 4.3048553466796875,
516
+ "learning_rate": 7.857655846021257e-06,
517
+ "loss": 0.9795,
518
+ "step": 5400
519
+ },
520
+ {
521
+ "epoch": 1.5800057454754382,
522
+ "grad_norm": 2.943560838699341,
523
+ "learning_rate": 7.354926745188164e-06,
524
+ "loss": 1.0684,
525
+ "step": 5500
526
+ },
527
+ {
528
+ "epoch": 1.6087331226659005,
529
+ "grad_norm": 4.310853481292725,
530
+ "learning_rate": 6.852197644355069e-06,
531
+ "loss": 1.0273,
532
+ "step": 5600
533
+ },
534
+ {
535
+ "epoch": 1.6087331226659005,
536
+ "eval_bleu": 43.54734511736152,
537
+ "eval_chrf": 68.38126240205835,
538
+ "eval_loss": 1.0513123273849487,
539
+ "eval_runtime": 93.5642,
540
+ "eval_samples_per_second": 8.721,
541
+ "eval_steps_per_second": 1.09,
542
+ "step": 5600
543
+ },
544
+ {
545
+ "epoch": 1.637460499856363,
546
+ "grad_norm": 5.702296733856201,
547
+ "learning_rate": 6.349468543521976e-06,
548
+ "loss": 1.0453,
549
+ "step": 5700
550
+ },
551
+ {
552
+ "epoch": 1.6661878770468257,
553
+ "grad_norm": 4.328929424285889,
554
+ "learning_rate": 5.846739442688882e-06,
555
+ "loss": 1.0025,
556
+ "step": 5800
557
+ },
558
+ {
559
+ "epoch": 1.694915254237288,
560
+ "grad_norm": 3.448596954345703,
561
+ "learning_rate": 5.344010341855789e-06,
562
+ "loss": 1.0255,
563
+ "step": 5900
564
+ },
565
+ {
566
+ "epoch": 1.7236426314277506,
567
+ "grad_norm": 5.398486614227295,
568
+ "learning_rate": 4.841281241022694e-06,
569
+ "loss": 1.0557,
570
+ "step": 6000
571
+ },
572
+ {
573
+ "epoch": 1.7236426314277506,
574
+ "eval_bleu": 43.809297513184895,
575
+ "eval_chrf": 68.57608398234109,
576
+ "eval_loss": 1.0441299676895142,
577
+ "eval_runtime": 93.6873,
578
+ "eval_samples_per_second": 8.71,
579
+ "eval_steps_per_second": 1.089,
580
+ "step": 6000
581
+ },
582
+ {
583
+ "epoch": 1.7523700086182132,
584
+ "grad_norm": 6.588860511779785,
585
+ "learning_rate": 4.338552140189601e-06,
586
+ "loss": 0.9614,
587
+ "step": 6100
588
+ },
589
+ {
590
+ "epoch": 1.7810973858086756,
591
+ "grad_norm": 3.7804524898529053,
592
+ "learning_rate": 3.835823039356507e-06,
593
+ "loss": 0.975,
594
+ "step": 6200
595
+ },
596
+ {
597
+ "epoch": 1.8098247629991382,
598
+ "grad_norm": 7.301486968994141,
599
+ "learning_rate": 3.3330939385234123e-06,
600
+ "loss": 0.9646,
601
+ "step": 6300
602
+ },
603
+ {
604
+ "epoch": 1.8385521401896008,
605
+ "grad_norm": 4.416522026062012,
606
+ "learning_rate": 2.8303648376903186e-06,
607
+ "loss": 1.084,
608
+ "step": 6400
609
+ },
610
+ {
611
+ "epoch": 1.8385521401896008,
612
+ "eval_bleu": 43.701603906200226,
613
+ "eval_chrf": 68.50928352812315,
614
+ "eval_loss": 1.0416936874389648,
615
+ "eval_runtime": 94.7088,
616
+ "eval_samples_per_second": 8.616,
617
+ "eval_steps_per_second": 1.077,
618
+ "step": 6400
619
+ },
620
+ {
621
+ "epoch": 1.8672795173800631,
622
+ "grad_norm": 4.443411827087402,
623
+ "learning_rate": 2.327635736857225e-06,
624
+ "loss": 1.0306,
625
+ "step": 6500
626
+ },
627
+ {
628
+ "epoch": 1.8960068945705257,
629
+ "grad_norm": 2.7506167888641357,
630
+ "learning_rate": 1.824906636024131e-06,
631
+ "loss": 0.9817,
632
+ "step": 6600
633
+ },
634
+ {
635
+ "epoch": 1.9247342717609883,
636
+ "grad_norm": 3.5802366733551025,
637
+ "learning_rate": 1.3221775351910368e-06,
638
+ "loss": 1.0187,
639
+ "step": 6700
640
+ },
641
+ {
642
+ "epoch": 1.9534616489514507,
643
+ "grad_norm": 3.9516446590423584,
644
+ "learning_rate": 8.194484343579431e-07,
645
+ "loss": 0.9603,
646
+ "step": 6800
647
+ },
648
+ {
649
+ "epoch": 1.9534616489514507,
650
+ "eval_bleu": 43.77821033710757,
651
+ "eval_chrf": 68.65385549549458,
652
+ "eval_loss": 1.0408196449279785,
653
+ "eval_runtime": 93.305,
654
+ "eval_samples_per_second": 8.746,
655
+ "eval_steps_per_second": 1.093,
656
+ "step": 6800
657
+ },
658
+ {
659
+ "epoch": 1.9821890261419133,
660
+ "grad_norm": 3.300943374633789,
661
+ "learning_rate": 1.7658359666762423e-05,
662
+ "loss": 0.9946,
663
+ "step": 6900
664
+ },
665
+ {
666
+ "epoch": 2.010916403332376,
667
+ "grad_norm": 4.883905410766602,
668
+ "learning_rate": 1.7406995116345875e-05,
669
+ "loss": 0.9878,
670
+ "step": 7000
671
+ },
672
+ {
673
+ "epoch": 2.0396437805228382,
674
+ "grad_norm": 8.941852569580078,
675
+ "learning_rate": 1.715563056592933e-05,
676
+ "loss": 1.0303,
677
+ "step": 7100
678
+ },
679
+ {
680
+ "epoch": 2.0683711577133006,
681
+ "grad_norm": 5.8202619552612305,
682
+ "learning_rate": 1.6904266015512782e-05,
683
+ "loss": 1.024,
684
+ "step": 7200
685
+ },
686
+ {
687
+ "epoch": 2.0683711577133006,
688
+ "eval_bleu": 44.776518325274054,
689
+ "eval_chrf": 69.47421963436383,
690
+ "eval_loss": 1.011445164680481,
691
+ "eval_runtime": 96.566,
692
+ "eval_samples_per_second": 8.45,
693
+ "eval_steps_per_second": 1.056,
694
+ "step": 7200
695
+ },
696
+ {
697
+ "epoch": 2.0970985349037634,
698
+ "grad_norm": 4.4640212059021,
699
+ "learning_rate": 1.6652901465096237e-05,
700
+ "loss": 0.9985,
701
+ "step": 7300
702
+ },
703
+ {
704
+ "epoch": 2.125825912094226,
705
+ "grad_norm": 5.426873207092285,
706
+ "learning_rate": 1.640153691467969e-05,
707
+ "loss": 1.0666,
708
+ "step": 7400
709
+ },
710
+ {
711
+ "epoch": 2.154553289284688,
712
+ "grad_norm": 4.508958339691162,
713
+ "learning_rate": 1.615017236426314e-05,
714
+ "loss": 1.0056,
715
+ "step": 7500
716
+ },
717
+ {
718
+ "epoch": 2.183280666475151,
719
+ "grad_norm": 4.3101115226745605,
720
+ "learning_rate": 1.5898807813846596e-05,
721
+ "loss": 0.9447,
722
+ "step": 7600
723
+ },
724
+ {
725
+ "epoch": 2.183280666475151,
726
+ "eval_bleu": 44.62186321013662,
727
+ "eval_chrf": 69.46358431357127,
728
+ "eval_loss": 1.0075112581253052,
729
+ "eval_runtime": 97.4195,
730
+ "eval_samples_per_second": 8.376,
731
+ "eval_steps_per_second": 1.047,
732
+ "step": 7600
733
+ },
734
+ {
735
+ "epoch": 2.2120080436656133,
736
+ "grad_norm": 6.016995906829834,
737
+ "learning_rate": 1.5647443263430048e-05,
738
+ "loss": 0.9619,
739
+ "step": 7700
740
+ },
741
+ {
742
+ "epoch": 2.2407354208560757,
743
+ "grad_norm": 4.525808811187744,
744
+ "learning_rate": 1.53960787130135e-05,
745
+ "loss": 0.9508,
746
+ "step": 7800
747
+ },
748
+ {
749
+ "epoch": 2.2694627980465385,
750
+ "grad_norm": 6.449229717254639,
751
+ "learning_rate": 1.5144714162596954e-05,
752
+ "loss": 1.0076,
753
+ "step": 7900
754
+ },
755
+ {
756
+ "epoch": 2.298190175237001,
757
+ "grad_norm": 5.163827419281006,
758
+ "learning_rate": 1.4893349612180407e-05,
759
+ "loss": 0.9704,
760
+ "step": 8000
761
+ },
762
+ {
763
+ "epoch": 2.298190175237001,
764
+ "eval_bleu": 44.871701979454734,
765
+ "eval_chrf": 69.5808210141665,
766
+ "eval_loss": 1.0056633949279785,
767
+ "eval_runtime": 99.6577,
768
+ "eval_samples_per_second": 8.188,
769
+ "eval_steps_per_second": 1.024,
770
+ "step": 8000
771
+ },
772
+ {
773
+ "epoch": 2.3269175524274632,
774
+ "grad_norm": 5.531907081604004,
775
+ "learning_rate": 1.4641985061763859e-05,
776
+ "loss": 1.0062,
777
+ "step": 8100
778
+ },
779
+ {
780
+ "epoch": 2.355644929617926,
781
+ "grad_norm": 3.4357688426971436,
782
+ "learning_rate": 1.4390620511347313e-05,
783
+ "loss": 0.9495,
784
+ "step": 8200
785
+ },
786
+ {
787
+ "epoch": 2.3843723068083884,
788
+ "grad_norm": 4.002674102783203,
789
+ "learning_rate": 1.4139255960930766e-05,
790
+ "loss": 1.0155,
791
+ "step": 8300
792
+ },
793
+ {
794
+ "epoch": 2.4130996839988508,
795
+ "grad_norm": 4.969707489013672,
796
+ "learning_rate": 1.3887891410514218e-05,
797
+ "loss": 0.9888,
798
+ "step": 8400
799
+ },
800
+ {
801
+ "epoch": 2.4130996839988508,
802
+ "eval_bleu": 44.87312977165901,
803
+ "eval_chrf": 69.59913415031703,
804
+ "eval_loss": 1.0011674165725708,
805
+ "eval_runtime": 96.8255,
806
+ "eval_samples_per_second": 8.428,
807
+ "eval_steps_per_second": 1.053,
808
+ "step": 8400
809
+ },
810
+ {
811
+ "epoch": 2.4418270611893136,
812
+ "grad_norm": 4.432431221008301,
813
+ "learning_rate": 1.3636526860097672e-05,
814
+ "loss": 0.9735,
815
+ "step": 8500
816
+ },
817
+ {
818
+ "epoch": 2.470554438379776,
819
+ "grad_norm": 3.4303410053253174,
820
+ "learning_rate": 1.3385162309681125e-05,
821
+ "loss": 1.0241,
822
+ "step": 8600
823
+ },
824
+ {
825
+ "epoch": 2.4992818155702383,
826
+ "grad_norm": 6.358030796051025,
827
+ "learning_rate": 1.3133797759264577e-05,
828
+ "loss": 0.9798,
829
+ "step": 8700
830
+ },
831
+ {
832
+ "epoch": 2.5280091927607007,
833
+ "grad_norm": 3.427170991897583,
834
+ "learning_rate": 1.288243320884803e-05,
835
+ "loss": 0.9448,
836
+ "step": 8800
837
+ },
838
+ {
839
+ "epoch": 2.5280091927607007,
840
+ "eval_bleu": 45.198620007189945,
841
+ "eval_chrf": 69.6731219507204,
842
+ "eval_loss": 1.0011487007141113,
843
+ "eval_runtime": 96.2525,
844
+ "eval_samples_per_second": 8.478,
845
+ "eval_steps_per_second": 1.06,
846
+ "step": 8800
847
+ },
848
+ {
849
+ "epoch": 2.5567365699511635,
850
+ "grad_norm": 4.5513715744018555,
851
+ "learning_rate": 1.2631068658431484e-05,
852
+ "loss": 1.0314,
853
+ "step": 8900
854
+ },
855
+ {
856
+ "epoch": 2.585463947141626,
857
+ "grad_norm": 4.186106204986572,
858
+ "learning_rate": 1.2379704108014938e-05,
859
+ "loss": 1.0178,
860
+ "step": 9000
861
+ },
862
+ {
863
+ "epoch": 2.6141913243320882,
864
+ "grad_norm": 6.664104461669922,
865
+ "learning_rate": 1.2128339557598391e-05,
866
+ "loss": 0.9677,
867
+ "step": 9100
868
+ },
869
+ {
870
+ "epoch": 2.642918701522551,
871
+ "grad_norm": 4.591300964355469,
872
+ "learning_rate": 1.1876975007181843e-05,
873
+ "loss": 0.9438,
874
+ "step": 9200
875
+ },
876
+ {
877
+ "epoch": 2.642918701522551,
878
+ "eval_bleu": 45.157866699798426,
879
+ "eval_chrf": 69.66915651414561,
880
+ "eval_loss": 1.0011706352233887,
881
+ "eval_runtime": 96.261,
882
+ "eval_samples_per_second": 8.477,
883
+ "eval_steps_per_second": 1.06,
884
+ "step": 9200
885
+ },
886
+ {
887
+ "epoch": 2.6716460787130134,
888
+ "grad_norm": 3.8000681400299072,
889
+ "learning_rate": 1.1625610456765297e-05,
890
+ "loss": 0.9528,
891
+ "step": 9300
892
+ },
893
+ {
894
+ "epoch": 2.7003734559034758,
895
+ "grad_norm": 4.306599140167236,
896
+ "learning_rate": 1.137424590634875e-05,
897
+ "loss": 0.9654,
898
+ "step": 9400
899
+ },
900
+ {
901
+ "epoch": 2.7291008330939386,
902
+ "grad_norm": 2.658061981201172,
903
+ "learning_rate": 1.1122881355932202e-05,
904
+ "loss": 0.9606,
905
+ "step": 9500
906
+ },
907
+ {
908
+ "epoch": 2.757828210284401,
909
+ "grad_norm": 3.8488523960113525,
910
+ "learning_rate": 1.0871516805515656e-05,
911
+ "loss": 0.9975,
912
+ "step": 9600
913
+ },
914
+ {
915
+ "epoch": 2.757828210284401,
916
+ "eval_bleu": 45.270832494636274,
917
+ "eval_chrf": 69.71396054828098,
918
+ "eval_loss": 0.9907030463218689,
919
+ "eval_runtime": 95.5189,
920
+ "eval_samples_per_second": 8.543,
921
+ "eval_steps_per_second": 1.068,
922
+ "step": 9600
923
+ },
924
+ {
925
+ "epoch": 2.7865555874748633,
926
+ "grad_norm": 6.573736190795898,
927
+ "learning_rate": 1.062015225509911e-05,
928
+ "loss": 0.9221,
929
+ "step": 9700
930
+ },
931
+ {
932
+ "epoch": 2.815282964665326,
933
+ "grad_norm": 5.980939865112305,
934
+ "learning_rate": 1.0368787704682561e-05,
935
+ "loss": 0.9592,
936
+ "step": 9800
937
+ },
938
+ {
939
+ "epoch": 2.8440103418557885,
940
+ "grad_norm": 7.583348274230957,
941
+ "learning_rate": 1.0117423154266015e-05,
942
+ "loss": 1.0261,
943
+ "step": 9900
944
+ },
945
+ {
946
+ "epoch": 2.872737719046251,
947
+ "grad_norm": 4.932117938995361,
948
+ "learning_rate": 9.866058603849468e-06,
949
+ "loss": 0.9585,
950
+ "step": 10000
951
+ },
952
+ {
953
+ "epoch": 2.872737719046251,
954
+ "eval_bleu": 45.452688800881376,
955
+ "eval_chrf": 69.95442745584451,
956
+ "eval_loss": 0.994485080242157,
957
+ "eval_runtime": 95.7902,
958
+ "eval_samples_per_second": 8.519,
959
+ "eval_steps_per_second": 1.065,
960
+ "step": 10000
961
+ },
962
+ {
963
+ "epoch": 2.9014650962367137,
964
+ "grad_norm": 4.127469062805176,
965
+ "learning_rate": 9.61469405343292e-06,
966
+ "loss": 0.9333,
967
+ "step": 10100
968
+ },
969
+ {
970
+ "epoch": 2.930192473427176,
971
+ "grad_norm": 4.6181559562683105,
972
+ "learning_rate": 9.363329503016374e-06,
973
+ "loss": 0.8895,
974
+ "step": 10200
975
+ },
976
+ {
977
+ "epoch": 2.9589198506176384,
978
+ "grad_norm": 3.028794050216675,
979
+ "learning_rate": 9.111964952599827e-06,
980
+ "loss": 0.9395,
981
+ "step": 10300
982
+ },
983
+ {
984
+ "epoch": 2.987647227808101,
985
+ "grad_norm": 5.18999719619751,
986
+ "learning_rate": 8.86060040218328e-06,
987
+ "loss": 0.9686,
988
+ "step": 10400
989
+ },
990
+ {
991
+ "epoch": 2.987647227808101,
992
+ "eval_bleu": 45.433727848087464,
993
+ "eval_chrf": 69.79153825463808,
994
+ "eval_loss": 0.993137001991272,
995
+ "eval_runtime": 96.9358,
996
+ "eval_samples_per_second": 8.418,
997
+ "eval_steps_per_second": 1.052,
998
+ "step": 10400
999
  }
1000
  ],
1001
  "logging_steps": 100,
1002
+ "max_steps": 13924,
1003
  "num_input_tokens_seen": 0,
1004
+ "num_train_epochs": 4,
1005
  "save_steps": 400,
1006
  "stateful_callbacks": {
1007
  "TrainerControl": {
 
1015
  "attributes": {}
1016
  }
1017
  },
1018
+ "total_flos": 2.8481299959250944e+16,
1019
  "train_batch_size": 8,
1020
  "trial_name": null,
1021
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c7095dd3f16a053228927e43b0533849042f7097c1f117a57fdb94e71d9c4e7
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c569ee8bfa824e15f7ea34dff282d6cfba522dc5377663ba7e9fb590f959a53d
3
  size 5905
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad15d12fa229f1794a8dd10223f571b0c4044b188c2edb0e066b7a18688f6132
3
  size 990185320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ed261aaa3c3ec74fccf2ba187e24f5eb03a30d3d2c0e6edb64a19100c1aaa77
3
  size 990185320