Sabbir772 commited on
Commit
26e92bc
·
verified ·
1 Parent(s): 451478d

Training in progress, step 3420

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:239d42f619e9c7ef78a589e8878755b7ff05452678bd76c3387d95b909c97859
3
  size 990185320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc6b220a7ddc1693f8f9890cc5852a7b7e0f32a9fe123095302817600f977b31
3
  size 990185320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc13493b7d44375cc8cae4d02fa249621dda652ef7ed46c67218c1d0e8ed9508
3
  size 1980545291
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b3dcd6af690fe95d99438cd0881674cf89b2c2354b97a0b8dbee8da92dc4dfe
3
  size 1980545291
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46665dae441c58ae29353356f7aac4a3e2cfc255bb6a3218b134f74d51910343
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:640f46eb39b244e7bfc751c4f25ad8e0191fac059c44f640e0680bc644444835
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d718bae36912f89382302637b1931f50f428ec0f6caf053ec44b1bbc42dc924d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c693fb2ea94de3ab99065331d59deff37c4c392b1695d7781427a4e71d863f58
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,320 +2,396 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.871345029239766,
6
  "eval_steps": 400,
7
- "global_step": 3200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.05847953216374269,
14
- "grad_norm": 3.643113851547241,
15
- "learning_rate": 4.3697368421052636e-05,
16
- "loss": 1.2757,
17
  "step": 100
18
  },
19
  {
20
- "epoch": 0.11695906432748537,
21
- "grad_norm": 3.530352830886841,
22
- "learning_rate": 4.2381578947368426e-05,
23
- "loss": 1.337,
24
  "step": 200
25
  },
26
  {
27
- "epoch": 0.17543859649122806,
28
- "grad_norm": 5.174887657165527,
29
- "learning_rate": 4.1065789473684215e-05,
30
- "loss": 1.3173,
31
  "step": 300
32
  },
33
  {
34
- "epoch": 0.23391812865497075,
35
- "grad_norm": 3.0970983505249023,
36
- "learning_rate": 3.9750000000000004e-05,
37
- "loss": 1.3895,
38
  "step": 400
39
  },
40
  {
41
- "epoch": 0.23391812865497075,
42
- "eval_bleu": 39.47529188348838,
43
- "eval_chrf": 65.3378689248108,
44
- "eval_loss": 1.3580824136734009,
45
- "eval_runtime": 56.39,
46
- "eval_samples_per_second": 9.115,
47
- "eval_steps_per_second": 1.153,
48
  "step": 400
49
  },
50
  {
51
- "epoch": 0.29239766081871343,
52
- "grad_norm": 5.711610794067383,
53
- "learning_rate": 3.843421052631579e-05,
54
- "loss": 1.3278,
55
  "step": 500
56
  },
57
  {
58
- "epoch": 0.3508771929824561,
59
- "grad_norm": 3.5587220191955566,
60
- "learning_rate": 3.711842105263158e-05,
61
- "loss": 1.412,
62
  "step": 600
63
  },
64
  {
65
- "epoch": 0.4093567251461988,
66
- "grad_norm": 3.550055980682373,
67
- "learning_rate": 3.580263157894737e-05,
68
- "loss": 1.3132,
69
  "step": 700
70
  },
71
  {
72
- "epoch": 0.4678362573099415,
73
- "grad_norm": 3.751816511154175,
74
- "learning_rate": 3.448684210526316e-05,
75
- "loss": 1.3409,
76
  "step": 800
77
  },
78
  {
79
- "epoch": 0.4678362573099415,
80
- "eval_bleu": 39.42249073542359,
81
- "eval_chrf": 65.4339382658089,
82
- "eval_loss": 1.34471595287323,
83
- "eval_runtime": 56.6324,
84
- "eval_samples_per_second": 9.076,
85
- "eval_steps_per_second": 1.148,
86
  "step": 800
87
  },
88
  {
89
- "epoch": 0.5263157894736842,
90
- "grad_norm": 3.660346508026123,
91
- "learning_rate": 3.317105263157895e-05,
92
- "loss": 1.3487,
93
  "step": 900
94
  },
95
  {
96
- "epoch": 0.5847953216374269,
97
- "grad_norm": 6.453419208526611,
98
- "learning_rate": 3.185526315789474e-05,
99
- "loss": 1.3217,
100
  "step": 1000
101
  },
102
  {
103
- "epoch": 0.6432748538011696,
104
- "grad_norm": 4.79046630859375,
105
- "learning_rate": 3.053947368421053e-05,
106
- "loss": 1.3755,
107
  "step": 1100
108
  },
109
  {
110
- "epoch": 0.7017543859649122,
111
- "grad_norm": 4.565652847290039,
112
- "learning_rate": 2.9223684210526318e-05,
113
- "loss": 1.3523,
114
  "step": 1200
115
  },
116
  {
117
- "epoch": 0.7017543859649122,
118
- "eval_bleu": 40.520638506139925,
119
- "eval_chrf": 65.95395212452995,
120
- "eval_loss": 1.3250073194503784,
121
- "eval_runtime": 56.4151,
122
- "eval_samples_per_second": 9.111,
123
- "eval_steps_per_second": 1.152,
124
  "step": 1200
125
  },
126
  {
127
- "epoch": 0.7602339181286549,
128
- "grad_norm": 3.959063768386841,
129
- "learning_rate": 2.790789473684211e-05,
130
- "loss": 1.3696,
131
  "step": 1300
132
  },
133
  {
134
- "epoch": 0.8187134502923976,
135
- "grad_norm": 5.297095775604248,
136
- "learning_rate": 2.6592105263157896e-05,
137
- "loss": 1.33,
138
  "step": 1400
139
  },
140
  {
141
- "epoch": 0.8771929824561403,
142
- "grad_norm": 4.624599933624268,
143
- "learning_rate": 2.527631578947369e-05,
144
- "loss": 1.3566,
145
  "step": 1500
146
  },
147
  {
148
- "epoch": 0.935672514619883,
149
- "grad_norm": 4.531945705413818,
150
- "learning_rate": 2.3960526315789475e-05,
151
- "loss": 1.3527,
152
  "step": 1600
153
  },
154
  {
155
- "epoch": 0.935672514619883,
156
- "eval_bleu": 40.56125811248909,
157
- "eval_chrf": 65.81915372622016,
158
- "eval_loss": 1.3156400918960571,
159
- "eval_runtime": 56.9807,
160
- "eval_samples_per_second": 9.021,
161
- "eval_steps_per_second": 1.141,
162
  "step": 1600
163
  },
164
  {
165
- "epoch": 0.9941520467836257,
166
- "grad_norm": 4.245266437530518,
167
- "learning_rate": 2.2644736842105267e-05,
168
- "loss": 1.3855,
169
  "step": 1700
170
  },
171
  {
172
- "epoch": 1.0526315789473684,
173
- "grad_norm": 5.7170867919921875,
174
- "learning_rate": 2.1328947368421053e-05,
175
- "loss": 1.3195,
176
  "step": 1800
177
  },
178
  {
179
- "epoch": 1.1111111111111112,
180
- "grad_norm": 4.01872444152832,
181
- "learning_rate": 2.0013157894736842e-05,
182
- "loss": 1.2638,
183
  "step": 1900
184
  },
185
  {
186
- "epoch": 1.1695906432748537,
187
- "grad_norm": 5.8071441650390625,
188
- "learning_rate": 1.869736842105263e-05,
189
- "loss": 1.3409,
190
  "step": 2000
191
  },
192
  {
193
- "epoch": 1.1695906432748537,
194
- "eval_bleu": 40.43566420212,
195
- "eval_chrf": 66.0648038736721,
196
- "eval_loss": 1.3172210454940796,
197
- "eval_runtime": 57.0925,
198
- "eval_samples_per_second": 9.003,
199
- "eval_steps_per_second": 1.139,
200
  "step": 2000
201
  },
202
  {
203
- "epoch": 1.2280701754385965,
204
- "grad_norm": 5.8771162033081055,
205
- "learning_rate": 1.738157894736842e-05,
206
- "loss": 1.2521,
207
  "step": 2100
208
  },
209
  {
210
- "epoch": 1.286549707602339,
211
- "grad_norm": 3.5148508548736572,
212
- "learning_rate": 1.606578947368421e-05,
213
- "loss": 1.2719,
214
  "step": 2200
215
  },
216
  {
217
- "epoch": 1.345029239766082,
218
- "grad_norm": 3.7268385887145996,
219
- "learning_rate": 1.4750000000000001e-05,
220
- "loss": 1.3249,
221
  "step": 2300
222
  },
223
  {
224
- "epoch": 1.4035087719298245,
225
- "grad_norm": 6.356854438781738,
226
- "learning_rate": 1.343421052631579e-05,
227
- "loss": 1.2397,
228
  "step": 2400
229
  },
230
  {
231
- "epoch": 1.4035087719298245,
232
- "eval_bleu": 40.582868133203924,
233
- "eval_chrf": 65.94447550029828,
234
- "eval_loss": 1.321014642715454,
235
- "eval_runtime": 56.2984,
236
- "eval_samples_per_second": 9.13,
237
- "eval_steps_per_second": 1.155,
238
  "step": 2400
239
  },
240
  {
241
- "epoch": 1.4619883040935673,
242
- "grad_norm": 4.95539665222168,
243
- "learning_rate": 1.211842105263158e-05,
244
- "loss": 1.3243,
245
  "step": 2500
246
  },
247
  {
248
- "epoch": 1.52046783625731,
249
- "grad_norm": 3.4667985439300537,
250
- "learning_rate": 1.0802631578947369e-05,
251
- "loss": 1.2947,
252
  "step": 2600
253
  },
254
  {
255
- "epoch": 1.5789473684210527,
256
- "grad_norm": 3.2790651321411133,
257
- "learning_rate": 9.486842105263158e-06,
258
- "loss": 1.2674,
259
  "step": 2700
260
  },
261
  {
262
- "epoch": 1.6374269005847952,
263
- "grad_norm": 4.367522716522217,
264
- "learning_rate": 8.171052631578949e-06,
265
- "loss": 1.3275,
266
  "step": 2800
267
  },
268
  {
269
- "epoch": 1.6374269005847952,
270
- "eval_bleu": 40.196557878597055,
271
- "eval_chrf": 66.05405159197521,
272
- "eval_loss": 1.3174731731414795,
273
- "eval_runtime": 55.9756,
274
- "eval_samples_per_second": 9.183,
275
- "eval_steps_per_second": 1.161,
276
  "step": 2800
277
  },
278
  {
279
- "epoch": 1.695906432748538,
280
- "grad_norm": 3.6613388061523438,
281
- "learning_rate": 6.855263157894738e-06,
282
- "loss": 1.2249,
283
  "step": 2900
284
  },
285
  {
286
- "epoch": 1.7543859649122808,
287
- "grad_norm": 4.5111002922058105,
288
- "learning_rate": 5.5394736842105266e-06,
289
- "loss": 1.2362,
290
  "step": 3000
291
  },
292
  {
293
- "epoch": 1.8128654970760234,
294
- "grad_norm": 5.002144813537598,
295
- "learning_rate": 4.223684210526316e-06,
296
- "loss": 1.2918,
297
  "step": 3100
298
  },
299
  {
300
- "epoch": 1.871345029239766,
301
- "grad_norm": 3.4458835124969482,
302
- "learning_rate": 2.9078947368421054e-06,
303
- "loss": 1.2348,
304
  "step": 3200
305
  },
306
  {
307
- "epoch": 1.871345029239766,
308
- "eval_bleu": 39.817105604910104,
309
- "eval_chrf": 65.83702875844537,
310
- "eval_loss": 1.3151392936706543,
311
- "eval_runtime": 56.5728,
312
- "eval_samples_per_second": 9.086,
313
- "eval_steps_per_second": 1.149,
314
  "step": 3200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
  }
316
  ],
317
  "logging_steps": 100,
318
- "max_steps": 3420,
319
  "num_input_tokens_seen": 0,
320
  "num_train_epochs": 2,
321
  "save_steps": 400,
@@ -331,7 +407,7 @@
331
  "attributes": {}
332
  }
333
  },
334
- "total_flos": 8764108937625600.0,
335
  "train_batch_size": 8,
336
  "trial_name": null,
337
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1918951132300357,
6
  "eval_steps": 400,
7
+ "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.029797377830750895,
14
+ "grad_norm": 4.59083890914917,
15
+ "learning_rate": 3.448376042908224e-05,
16
+ "loss": 1.4613,
17
  "step": 100
18
  },
19
  {
20
+ "epoch": 0.05959475566150179,
21
+ "grad_norm": 6.048128604888916,
22
+ "learning_rate": 3.39623063170441e-05,
23
+ "loss": 1.3655,
24
  "step": 200
25
  },
26
  {
27
+ "epoch": 0.08939213349225268,
28
+ "grad_norm": 5.665088176727295,
29
+ "learning_rate": 3.344085220500596e-05,
30
+ "loss": 1.3917,
31
  "step": 300
32
  },
33
  {
34
+ "epoch": 0.11918951132300358,
35
+ "grad_norm": 5.817560195922852,
36
+ "learning_rate": 3.291939809296781e-05,
37
+ "loss": 1.3542,
38
  "step": 400
39
  },
40
  {
41
+ "epoch": 0.11918951132300358,
42
+ "eval_bleu": 37.614257412118484,
43
+ "eval_chrf": 64.19881893879953,
44
+ "eval_loss": 1.4418301582336426,
45
+ "eval_runtime": 59.317,
46
+ "eval_samples_per_second": 8.665,
47
+ "eval_steps_per_second": 1.096,
48
  "step": 400
49
  },
50
  {
51
+ "epoch": 0.14898688915375446,
52
+ "grad_norm": 5.501027584075928,
53
+ "learning_rate": 3.2397943980929674e-05,
54
+ "loss": 1.2908,
55
  "step": 500
56
  },
57
  {
58
+ "epoch": 0.17878426698450536,
59
+ "grad_norm": 7.59045934677124,
60
+ "learning_rate": 3.1876489868891536e-05,
61
+ "loss": 1.2857,
62
  "step": 600
63
  },
64
  {
65
+ "epoch": 0.20858164481525626,
66
+ "grad_norm": 5.808661937713623,
67
+ "learning_rate": 3.13550357568534e-05,
68
+ "loss": 1.3866,
69
  "step": 700
70
  },
71
  {
72
+ "epoch": 0.23837902264600716,
73
+ "grad_norm": 4.547348499298096,
74
+ "learning_rate": 3.083358164481525e-05,
75
+ "loss": 1.3608,
76
  "step": 800
77
  },
78
  {
79
+ "epoch": 0.23837902264600716,
80
+ "eval_bleu": 37.30665125911135,
81
+ "eval_chrf": 64.10679124808756,
82
+ "eval_loss": 1.4419689178466797,
83
+ "eval_runtime": 60.8532,
84
+ "eval_samples_per_second": 8.447,
85
+ "eval_steps_per_second": 1.068,
86
  "step": 800
87
  },
88
  {
89
+ "epoch": 0.26817640047675806,
90
+ "grad_norm": 5.98895263671875,
91
+ "learning_rate": 3.031212753277711e-05,
92
+ "loss": 1.2325,
93
  "step": 900
94
  },
95
  {
96
+ "epoch": 0.29797377830750893,
97
+ "grad_norm": 3.8415517807006836,
98
+ "learning_rate": 2.9790673420738973e-05,
99
+ "loss": 1.3219,
100
  "step": 1000
101
  },
102
  {
103
+ "epoch": 0.32777115613825986,
104
+ "grad_norm": 4.784970283508301,
105
+ "learning_rate": 2.9269219308700832e-05,
106
+ "loss": 1.2864,
107
  "step": 1100
108
  },
109
  {
110
+ "epoch": 0.3575685339690107,
111
+ "grad_norm": 5.327032089233398,
112
+ "learning_rate": 2.8747765196662694e-05,
113
+ "loss": 1.2509,
114
  "step": 1200
115
  },
116
  {
117
+ "epoch": 0.3575685339690107,
118
+ "eval_bleu": 37.69409614318079,
119
+ "eval_chrf": 64.1652126034879,
120
+ "eval_loss": 1.4356836080551147,
121
+ "eval_runtime": 61.5641,
122
+ "eval_samples_per_second": 8.349,
123
+ "eval_steps_per_second": 1.056,
124
  "step": 1200
125
  },
126
  {
127
+ "epoch": 0.3873659117997616,
128
+ "grad_norm": 6.48747444152832,
129
+ "learning_rate": 2.822631108462455e-05,
130
+ "loss": 1.3008,
131
  "step": 1300
132
  },
133
  {
134
+ "epoch": 0.4171632896305125,
135
+ "grad_norm": 10.605354309082031,
136
+ "learning_rate": 2.7704856972586408e-05,
137
+ "loss": 1.1936,
138
  "step": 1400
139
  },
140
  {
141
+ "epoch": 0.4469606674612634,
142
+ "grad_norm": 5.0671796798706055,
143
+ "learning_rate": 2.718340286054827e-05,
144
+ "loss": 1.2895,
145
  "step": 1500
146
  },
147
  {
148
+ "epoch": 0.4767580452920143,
149
+ "grad_norm": 3.872220516204834,
150
+ "learning_rate": 2.6661948748510128e-05,
151
+ "loss": 1.3018,
152
  "step": 1600
153
  },
154
  {
155
+ "epoch": 0.4767580452920143,
156
+ "eval_bleu": 37.773400824938314,
157
+ "eval_chrf": 64.3403222198125,
158
+ "eval_loss": 1.4320310354232788,
159
+ "eval_runtime": 61.3981,
160
+ "eval_samples_per_second": 8.372,
161
+ "eval_steps_per_second": 1.059,
162
  "step": 1600
163
  },
164
  {
165
+ "epoch": 0.5065554231227652,
166
+ "grad_norm": 5.484747409820557,
167
+ "learning_rate": 2.614049463647199e-05,
168
+ "loss": 1.2971,
169
  "step": 1700
170
  },
171
  {
172
+ "epoch": 0.5363528009535161,
173
+ "grad_norm": 5.198516368865967,
174
+ "learning_rate": 2.561904052443385e-05,
175
+ "loss": 1.2132,
176
  "step": 1800
177
  },
178
  {
179
+ "epoch": 0.566150178784267,
180
+ "grad_norm": 4.3765363693237305,
181
+ "learning_rate": 2.5097586412395707e-05,
182
+ "loss": 1.3134,
183
  "step": 1900
184
  },
185
  {
186
+ "epoch": 0.5959475566150179,
187
+ "grad_norm": 5.041851997375488,
188
+ "learning_rate": 2.4576132300357566e-05,
189
+ "loss": 1.2662,
190
  "step": 2000
191
  },
192
  {
193
+ "epoch": 0.5959475566150179,
194
+ "eval_bleu": 38.239729003886104,
195
+ "eval_chrf": 64.08872351391828,
196
+ "eval_loss": 1.4284172058105469,
197
+ "eval_runtime": 61.8577,
198
+ "eval_samples_per_second": 8.309,
199
+ "eval_steps_per_second": 1.051,
200
  "step": 2000
201
  },
202
  {
203
+ "epoch": 0.6257449344457687,
204
+ "grad_norm": 5.912806987762451,
205
+ "learning_rate": 2.4054678188319424e-05,
206
+ "loss": 1.2808,
207
  "step": 2100
208
  },
209
  {
210
+ "epoch": 0.6555423122765197,
211
+ "grad_norm": 7.626971244812012,
212
+ "learning_rate": 2.3533224076281286e-05,
213
+ "loss": 1.265,
214
  "step": 2200
215
  },
216
  {
217
+ "epoch": 0.6853396901072706,
218
+ "grad_norm": 6.593811511993408,
219
+ "learning_rate": 2.3011769964243145e-05,
220
+ "loss": 1.236,
221
  "step": 2300
222
  },
223
  {
224
+ "epoch": 0.7151370679380215,
225
+ "grad_norm": 5.527437686920166,
226
+ "learning_rate": 2.2490315852205003e-05,
227
+ "loss": 1.2324,
228
  "step": 2400
229
  },
230
  {
231
+ "epoch": 0.7151370679380215,
232
+ "eval_bleu": 38.382663636027985,
233
+ "eval_chrf": 64.41707679357026,
234
+ "eval_loss": 1.4260649681091309,
235
+ "eval_runtime": 61.867,
236
+ "eval_samples_per_second": 8.308,
237
+ "eval_steps_per_second": 1.051,
238
  "step": 2400
239
  },
240
  {
241
+ "epoch": 0.7449344457687723,
242
+ "grad_norm": 6.579026222229004,
243
+ "learning_rate": 2.1968861740166865e-05,
244
+ "loss": 1.2627,
245
  "step": 2500
246
  },
247
  {
248
+ "epoch": 0.7747318235995232,
249
+ "grad_norm": 5.275082111358643,
250
+ "learning_rate": 2.1447407628128724e-05,
251
+ "loss": 1.2462,
252
  "step": 2600
253
  },
254
  {
255
+ "epoch": 0.8045292014302742,
256
+ "grad_norm": 8.691877365112305,
257
+ "learning_rate": 2.0925953516090586e-05,
258
+ "loss": 1.1717,
259
  "step": 2700
260
  },
261
  {
262
+ "epoch": 0.834326579261025,
263
+ "grad_norm": 6.7765913009643555,
264
+ "learning_rate": 2.040449940405244e-05,
265
+ "loss": 1.2225,
266
  "step": 2800
267
  },
268
  {
269
+ "epoch": 0.834326579261025,
270
+ "eval_bleu": 38.71626609686758,
271
+ "eval_chrf": 64.87666686265256,
272
+ "eval_loss": 1.4296128749847412,
273
+ "eval_runtime": 60.8774,
274
+ "eval_samples_per_second": 8.443,
275
+ "eval_steps_per_second": 1.068,
276
  "step": 2800
277
  },
278
  {
279
+ "epoch": 0.8641239570917759,
280
+ "grad_norm": 7.38505744934082,
281
+ "learning_rate": 1.98830452920143e-05,
282
+ "loss": 1.2582,
283
  "step": 2900
284
  },
285
  {
286
+ "epoch": 0.8939213349225268,
287
+ "grad_norm": 5.248499870300293,
288
+ "learning_rate": 1.936159117997616e-05,
289
+ "loss": 1.2588,
290
  "step": 3000
291
  },
292
  {
293
+ "epoch": 0.9237187127532777,
294
+ "grad_norm": 4.9279069900512695,
295
+ "learning_rate": 1.884013706793802e-05,
296
+ "loss": 1.2679,
297
  "step": 3100
298
  },
299
  {
300
+ "epoch": 0.9535160905840286,
301
+ "grad_norm": 7.552145004272461,
302
+ "learning_rate": 1.831868295589988e-05,
303
+ "loss": 1.2384,
304
  "step": 3200
305
  },
306
  {
307
+ "epoch": 0.9535160905840286,
308
+ "eval_bleu": 38.518218505947324,
309
+ "eval_chrf": 64.89625356432627,
310
+ "eval_loss": 1.417944312095642,
311
+ "eval_runtime": 61.2351,
312
+ "eval_samples_per_second": 8.394,
313
+ "eval_steps_per_second": 1.061,
314
  "step": 3200
315
+ },
316
+ {
317
+ "epoch": 0.9833134684147795,
318
+ "grad_norm": 5.52092170715332,
319
+ "learning_rate": 1.779722884386174e-05,
320
+ "loss": 1.2863,
321
+ "step": 3300
322
+ },
323
+ {
324
+ "epoch": 1.0131108462455305,
325
+ "grad_norm": 5.495266437530518,
326
+ "learning_rate": 1.72757747318236e-05,
327
+ "loss": 1.1856,
328
+ "step": 3400
329
+ },
330
+ {
331
+ "epoch": 1.0429082240762813,
332
+ "grad_norm": 7.5805511474609375,
333
+ "learning_rate": 1.6754320619785457e-05,
334
+ "loss": 1.147,
335
+ "step": 3500
336
+ },
337
+ {
338
+ "epoch": 1.0727056019070322,
339
+ "grad_norm": 5.156323432922363,
340
+ "learning_rate": 1.6232866507747316e-05,
341
+ "loss": 1.1485,
342
+ "step": 3600
343
+ },
344
+ {
345
+ "epoch": 1.0727056019070322,
346
+ "eval_bleu": 38.668867292632044,
347
+ "eval_chrf": 64.79576644290259,
348
+ "eval_loss": 1.4277405738830566,
349
+ "eval_runtime": 60.538,
350
+ "eval_samples_per_second": 8.491,
351
+ "eval_steps_per_second": 1.074,
352
+ "step": 3600
353
+ },
354
+ {
355
+ "epoch": 1.102502979737783,
356
+ "grad_norm": 4.9568867683410645,
357
+ "learning_rate": 1.5711412395709178e-05,
358
+ "loss": 1.157,
359
+ "step": 3700
360
+ },
361
+ {
362
+ "epoch": 1.132300357568534,
363
+ "grad_norm": 4.534054279327393,
364
+ "learning_rate": 1.5189958283671036e-05,
365
+ "loss": 1.2088,
366
+ "step": 3800
367
+ },
368
+ {
369
+ "epoch": 1.162097735399285,
370
+ "grad_norm": 5.1630635261535645,
371
+ "learning_rate": 1.4668504171632896e-05,
372
+ "loss": 1.1492,
373
+ "step": 3900
374
+ },
375
+ {
376
+ "epoch": 1.1918951132300357,
377
+ "grad_norm": 6.447033405303955,
378
+ "learning_rate": 1.4147050059594753e-05,
379
+ "loss": 1.1213,
380
+ "step": 4000
381
+ },
382
+ {
383
+ "epoch": 1.1918951132300357,
384
+ "eval_bleu": 39.1303880259221,
385
+ "eval_chrf": 64.98118735600528,
386
+ "eval_loss": 1.4224255084991455,
387
+ "eval_runtime": 59.6926,
388
+ "eval_samples_per_second": 8.611,
389
+ "eval_steps_per_second": 1.089,
390
+ "step": 4000
391
  }
392
  ],
393
  "logging_steps": 100,
394
+ "max_steps": 6712,
395
  "num_input_tokens_seen": 0,
396
  "num_train_epochs": 2,
397
  "save_steps": 400,
 
407
  "attributes": {}
408
  }
409
  },
410
+ "total_flos": 1.0955136172032e+16,
411
  "train_batch_size": 8,
412
  "trial_name": null,
413
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62b6e79a3c177bbec778e900c4bdda3fb42784f15e16b06128f5ef92437fb78c
3
  size 5905
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1296a4c4cbbfb511129c5fc25155a089f73ccf8250b3689b0cf311c5f320fd65
3
  size 5905
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:239d42f619e9c7ef78a589e8878755b7ff05452678bd76c3387d95b909c97859
3
  size 990185320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87b5562ff57c9718f87f2bf333235b1a1bd61d2e3cd5b19f6faf8e9b411d4c66
3
  size 990185320