QomSSLab commited on
Commit
c77e1db
·
verified ·
1 Parent(s): 88aa90b

Upload model and tokenizer

Browse files
Files changed (1) hide show
  1. checkpoint-52896/trainer_state.json +1511 -0
checkpoint-52896/trainer_state.json ADDED
@@ -0,0 +1,1511 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9999432860086583,
6
+ "eval_steps": 500,
7
+ "global_step": 52896,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00945233189027733,
14
+ "grad_norm": 29.5,
15
+ "learning_rate": 9.960000000000001e-06,
16
+ "loss": 4.1004,
17
+ "step": 250
18
+ },
19
+ {
20
+ "epoch": 0.01890466378055466,
21
+ "grad_norm": 20.125,
22
+ "learning_rate": 9.999448050049255e-06,
23
+ "loss": 4.0386,
24
+ "step": 500
25
+ },
26
+ {
27
+ "epoch": 0.028356995670831994,
28
+ "grad_norm": 16.625,
29
+ "learning_rate": 9.997783447634044e-06,
30
+ "loss": 4.1263,
31
+ "step": 750
32
+ },
33
+ {
34
+ "epoch": 0.03780932756110932,
35
+ "grad_norm": 18.5,
36
+ "learning_rate": 9.995006554320588e-06,
37
+ "loss": 3.9826,
38
+ "step": 1000
39
+ },
40
+ {
41
+ "epoch": 0.047261659451386655,
42
+ "grad_norm": 18.625,
43
+ "learning_rate": 9.991117988125487e-06,
44
+ "loss": 4.0093,
45
+ "step": 1250
46
+ },
47
+ {
48
+ "epoch": 0.05671399134166399,
49
+ "grad_norm": 12.375,
50
+ "learning_rate": 9.986118614475757e-06,
51
+ "loss": 3.9503,
52
+ "step": 1500
53
+ },
54
+ {
55
+ "epoch": 0.06616632323194133,
56
+ "grad_norm": 15.8125,
57
+ "learning_rate": 9.980009546016204e-06,
58
+ "loss": 4.0245,
59
+ "step": 1750
60
+ },
61
+ {
62
+ "epoch": 0.07561865512221864,
63
+ "grad_norm": 17.5,
64
+ "learning_rate": 9.972792142361807e-06,
65
+ "loss": 3.9901,
66
+ "step": 2000
67
+ },
68
+ {
69
+ "epoch": 0.08507098701249598,
70
+ "grad_norm": 22.75,
71
+ "learning_rate": 9.964468009795128e-06,
72
+ "loss": 3.9098,
73
+ "step": 2250
74
+ },
75
+ {
76
+ "epoch": 0.09452331890277331,
77
+ "grad_norm": 19.5,
78
+ "learning_rate": 9.95503900090882e-06,
79
+ "loss": 3.9788,
80
+ "step": 2500
81
+ },
82
+ {
83
+ "epoch": 0.10397565079305064,
84
+ "grad_norm": 17.375,
85
+ "learning_rate": 9.944507214193314e-06,
86
+ "loss": 4.0492,
87
+ "step": 2750
88
+ },
89
+ {
90
+ "epoch": 0.11342798268332797,
91
+ "grad_norm": 16.75,
92
+ "learning_rate": 9.932874993569803e-06,
93
+ "loss": 3.9152,
94
+ "step": 3000
95
+ },
96
+ {
97
+ "epoch": 0.1228803145736053,
98
+ "grad_norm": 19.625,
99
+ "learning_rate": 9.92014492786856e-06,
100
+ "loss": 4.0289,
101
+ "step": 3250
102
+ },
103
+ {
104
+ "epoch": 0.13233264646388265,
105
+ "grad_norm": 26.75,
106
+ "learning_rate": 9.906319850252806e-06,
107
+ "loss": 3.9419,
108
+ "step": 3500
109
+ },
110
+ {
111
+ "epoch": 0.14178497835415999,
112
+ "grad_norm": 17.875,
113
+ "learning_rate": 9.891402837588142e-06,
114
+ "loss": 3.9255,
115
+ "step": 3750
116
+ },
117
+ {
118
+ "epoch": 0.1512373102444373,
119
+ "grad_norm": 17.0,
120
+ "learning_rate": 9.875397209757793e-06,
121
+ "loss": 4.068,
122
+ "step": 4000
123
+ },
124
+ {
125
+ "epoch": 0.16068964213471462,
126
+ "grad_norm": 22.375,
127
+ "learning_rate": 9.858306528923734e-06,
128
+ "loss": 3.9229,
129
+ "step": 4250
130
+ },
131
+ {
132
+ "epoch": 0.17014197402499195,
133
+ "grad_norm": 15.8125,
134
+ "learning_rate": 9.840134598733906e-06,
135
+ "loss": 3.8975,
136
+ "step": 4500
137
+ },
138
+ {
139
+ "epoch": 0.1795943059152693,
140
+ "grad_norm": 17.875,
141
+ "learning_rate": 9.8208854634757e-06,
142
+ "loss": 3.9159,
143
+ "step": 4750
144
+ },
145
+ {
146
+ "epoch": 0.18904663780554662,
147
+ "grad_norm": 19.0,
148
+ "learning_rate": 9.800563407175856e-06,
149
+ "loss": 3.9892,
150
+ "step": 5000
151
+ },
152
+ {
153
+ "epoch": 0.19849896969582395,
154
+ "grad_norm": 24.0,
155
+ "learning_rate": 9.779172952647035e-06,
156
+ "loss": 3.9846,
157
+ "step": 5250
158
+ },
159
+ {
160
+ "epoch": 0.20795130158610128,
161
+ "grad_norm": 15.9375,
162
+ "learning_rate": 9.756718860481235e-06,
163
+ "loss": 4.0746,
164
+ "step": 5500
165
+ },
166
+ {
167
+ "epoch": 0.21740363347637862,
168
+ "grad_norm": 19.25,
169
+ "learning_rate": 9.733206127990285e-06,
170
+ "loss": 3.9736,
171
+ "step": 5750
172
+ },
173
+ {
174
+ "epoch": 0.22685596536665595,
175
+ "grad_norm": 17.25,
176
+ "learning_rate": 9.708639988093663e-06,
177
+ "loss": 3.9673,
178
+ "step": 6000
179
+ },
180
+ {
181
+ "epoch": 0.23630829725693328,
182
+ "grad_norm": 17.375,
183
+ "learning_rate": 9.683025908153868e-06,
184
+ "loss": 3.9672,
185
+ "step": 6250
186
+ },
187
+ {
188
+ "epoch": 0.2457606291472106,
189
+ "grad_norm": 17.375,
190
+ "learning_rate": 9.656369588759628e-06,
191
+ "loss": 3.9812,
192
+ "step": 6500
193
+ },
194
+ {
195
+ "epoch": 0.255212961037488,
196
+ "grad_norm": 17.625,
197
+ "learning_rate": 9.628676962457194e-06,
198
+ "loss": 3.9659,
199
+ "step": 6750
200
+ },
201
+ {
202
+ "epoch": 0.2646652929277653,
203
+ "grad_norm": 18.0,
204
+ "learning_rate": 9.599954192430004e-06,
205
+ "loss": 3.9614,
206
+ "step": 7000
207
+ },
208
+ {
209
+ "epoch": 0.27411762481804264,
210
+ "grad_norm": 17.125,
211
+ "learning_rate": 9.570207671127034e-06,
212
+ "loss": 3.9424,
213
+ "step": 7250
214
+ },
215
+ {
216
+ "epoch": 0.28356995670831997,
217
+ "grad_norm": 24.125,
218
+ "learning_rate": 9.539444018840107e-06,
219
+ "loss": 3.9533,
220
+ "step": 7500
221
+ },
222
+ {
223
+ "epoch": 0.29302228859859725,
224
+ "grad_norm": 20.375,
225
+ "learning_rate": 9.507670082230507e-06,
226
+ "loss": 4.0344,
227
+ "step": 7750
228
+ },
229
+ {
230
+ "epoch": 0.3024746204888746,
231
+ "grad_norm": 18.375,
232
+ "learning_rate": 9.474892932805209e-06,
233
+ "loss": 3.8986,
234
+ "step": 8000
235
+ },
236
+ {
237
+ "epoch": 0.3119269523791519,
238
+ "grad_norm": 19.5,
239
+ "learning_rate": 9.441119865343054e-06,
240
+ "loss": 3.9415,
241
+ "step": 8250
242
+ },
243
+ {
244
+ "epoch": 0.32137928426942924,
245
+ "grad_norm": 20.125,
246
+ "learning_rate": 9.406358396271266e-06,
247
+ "loss": 3.9542,
248
+ "step": 8500
249
+ },
250
+ {
251
+ "epoch": 0.3308316161597066,
252
+ "grad_norm": 25.375,
253
+ "learning_rate": 9.370616261992605e-06,
254
+ "loss": 4.0098,
255
+ "step": 8750
256
+ },
257
+ {
258
+ "epoch": 0.3402839480499839,
259
+ "grad_norm": 19.625,
260
+ "learning_rate": 9.33390141716358e-06,
261
+ "loss": 3.9839,
262
+ "step": 9000
263
+ },
264
+ {
265
+ "epoch": 0.34973627994026124,
266
+ "grad_norm": 19.5,
267
+ "learning_rate": 9.296222032924092e-06,
268
+ "loss": 3.9886,
269
+ "step": 9250
270
+ },
271
+ {
272
+ "epoch": 0.3591886118305386,
273
+ "grad_norm": 16.75,
274
+ "learning_rate": 9.257586495078882e-06,
275
+ "loss": 3.8992,
276
+ "step": 9500
277
+ },
278
+ {
279
+ "epoch": 0.3686409437208159,
280
+ "grad_norm": 17.5,
281
+ "learning_rate": 9.21800340223122e-06,
282
+ "loss": 4.0108,
283
+ "step": 9750
284
+ },
285
+ {
286
+ "epoch": 0.37809327561109324,
287
+ "grad_norm": 17.375,
288
+ "learning_rate": 9.177481563869226e-06,
289
+ "loss": 3.9957,
290
+ "step": 10000
291
+ },
292
+ {
293
+ "epoch": 0.38754560750137057,
294
+ "grad_norm": 19.625,
295
+ "learning_rate": 9.136029998405253e-06,
296
+ "loss": 3.966,
297
+ "step": 10250
298
+ },
299
+ {
300
+ "epoch": 0.3969979393916479,
301
+ "grad_norm": 20.125,
302
+ "learning_rate": 9.093657931168782e-06,
303
+ "loss": 4.0057,
304
+ "step": 10500
305
+ },
306
+ {
307
+ "epoch": 0.40645027128192524,
308
+ "grad_norm": 25.5,
309
+ "learning_rate": 9.050374792353265e-06,
310
+ "loss": 4.0049,
311
+ "step": 10750
312
+ },
313
+ {
314
+ "epoch": 0.41590260317220257,
315
+ "grad_norm": 17.5,
316
+ "learning_rate": 9.006190214917363e-06,
317
+ "loss": 4.046,
318
+ "step": 11000
319
+ },
320
+ {
321
+ "epoch": 0.4253549350624799,
322
+ "grad_norm": 15.25,
323
+ "learning_rate": 8.961114032441067e-06,
324
+ "loss": 4.0138,
325
+ "step": 11250
326
+ },
327
+ {
328
+ "epoch": 0.43480726695275723,
329
+ "grad_norm": 58.5,
330
+ "learning_rate": 8.915156276937175e-06,
331
+ "loss": 4.0145,
332
+ "step": 11500
333
+ },
334
+ {
335
+ "epoch": 0.44425959884303456,
336
+ "grad_norm": 18.25,
337
+ "learning_rate": 8.868327176618592e-06,
338
+ "loss": 3.9748,
339
+ "step": 11750
340
+ },
341
+ {
342
+ "epoch": 0.4537119307333119,
343
+ "grad_norm": 17.0,
344
+ "learning_rate": 8.82063715362197e-06,
345
+ "loss": 4.0039,
346
+ "step": 12000
347
+ },
348
+ {
349
+ "epoch": 0.46316426262358923,
350
+ "grad_norm": 17.875,
351
+ "learning_rate": 8.772096821688194e-06,
352
+ "loss": 4.1231,
353
+ "step": 12250
354
+ },
355
+ {
356
+ "epoch": 0.47261659451386656,
357
+ "grad_norm": 19.25,
358
+ "learning_rate": 8.722716983800226e-06,
359
+ "loss": 4.0778,
360
+ "step": 12500
361
+ },
362
+ {
363
+ "epoch": 0.4820689264041439,
364
+ "grad_norm": 14.0625,
365
+ "learning_rate": 8.672508629778809e-06,
366
+ "loss": 3.9998,
367
+ "step": 12750
368
+ },
369
+ {
370
+ "epoch": 0.4915212582944212,
371
+ "grad_norm": 20.0,
372
+ "learning_rate": 8.621482933836634e-06,
373
+ "loss": 4.0298,
374
+ "step": 13000
375
+ },
376
+ {
377
+ "epoch": 0.5009735901846986,
378
+ "grad_norm": 21.5,
379
+ "learning_rate": 8.569651252091418e-06,
380
+ "loss": 3.9807,
381
+ "step": 13250
382
+ },
383
+ {
384
+ "epoch": 0.510425922074976,
385
+ "grad_norm": 17.75,
386
+ "learning_rate": 8.517025120038536e-06,
387
+ "loss": 4.084,
388
+ "step": 13500
389
+ },
390
+ {
391
+ "epoch": 0.5198782539652532,
392
+ "grad_norm": 19.0,
393
+ "learning_rate": 8.463616249983718e-06,
394
+ "loss": 4.1373,
395
+ "step": 13750
396
+ },
397
+ {
398
+ "epoch": 0.5293305858555306,
399
+ "grad_norm": 19.75,
400
+ "learning_rate": 8.409436528436381e-06,
401
+ "loss": 4.0691,
402
+ "step": 14000
403
+ },
404
+ {
405
+ "epoch": 0.5387829177458079,
406
+ "grad_norm": 17.25,
407
+ "learning_rate": 8.354498013464228e-06,
408
+ "loss": 4.0686,
409
+ "step": 14250
410
+ },
411
+ {
412
+ "epoch": 0.5482352496360853,
413
+ "grad_norm": 16.875,
414
+ "learning_rate": 8.298812932009622e-06,
415
+ "loss": 4.0066,
416
+ "step": 14500
417
+ },
418
+ {
419
+ "epoch": 0.5576875815263626,
420
+ "grad_norm": 19.625,
421
+ "learning_rate": 8.242393677168406e-06,
422
+ "loss": 4.0525,
423
+ "step": 14750
424
+ },
425
+ {
426
+ "epoch": 0.5671399134166399,
427
+ "grad_norm": 20.5,
428
+ "learning_rate": 8.185252805431732e-06,
429
+ "loss": 4.0993,
430
+ "step": 15000
431
+ },
432
+ {
433
+ "epoch": 0.5765922453069172,
434
+ "grad_norm": 20.375,
435
+ "learning_rate": 8.127403033891532e-06,
436
+ "loss": 4.0902,
437
+ "step": 15250
438
+ },
439
+ {
440
+ "epoch": 0.5860445771971945,
441
+ "grad_norm": 21.125,
442
+ "learning_rate": 8.068857237410237e-06,
443
+ "loss": 4.0273,
444
+ "step": 15500
445
+ },
446
+ {
447
+ "epoch": 0.5954969090874719,
448
+ "grad_norm": 19.875,
449
+ "learning_rate": 8.00962844575539e-06,
450
+ "loss": 4.0831,
451
+ "step": 15750
452
+ },
453
+ {
454
+ "epoch": 0.6049492409777492,
455
+ "grad_norm": 243.0,
456
+ "learning_rate": 7.949729840699784e-06,
457
+ "loss": 4.0758,
458
+ "step": 16000
459
+ },
460
+ {
461
+ "epoch": 0.6144015728680265,
462
+ "grad_norm": 17.875,
463
+ "learning_rate": 7.889174753087767e-06,
464
+ "loss": 4.0918,
465
+ "step": 16250
466
+ },
467
+ {
468
+ "epoch": 0.6238539047583038,
469
+ "grad_norm": 19.125,
470
+ "learning_rate": 7.827976659868368e-06,
471
+ "loss": 4.0538,
472
+ "step": 16500
473
+ },
474
+ {
475
+ "epoch": 0.6333062366485812,
476
+ "grad_norm": 19.625,
477
+ "learning_rate": 7.766149181095916e-06,
478
+ "loss": 4.1164,
479
+ "step": 16750
480
+ },
481
+ {
482
+ "epoch": 0.6427585685388585,
483
+ "grad_norm": 16.5,
484
+ "learning_rate": 7.703706076898803e-06,
485
+ "loss": 4.0626,
486
+ "step": 17000
487
+ },
488
+ {
489
+ "epoch": 0.6522109004291359,
490
+ "grad_norm": 14.375,
491
+ "learning_rate": 7.640661244417064e-06,
492
+ "loss": 4.0444,
493
+ "step": 17250
494
+ },
495
+ {
496
+ "epoch": 0.6616632323194132,
497
+ "grad_norm": 19.0,
498
+ "learning_rate": 7.577028714709484e-06,
499
+ "loss": 4.0429,
500
+ "step": 17500
501
+ },
502
+ {
503
+ "epoch": 0.6711155642096905,
504
+ "grad_norm": 17.875,
505
+ "learning_rate": 7.512822649630893e-06,
506
+ "loss": 4.0362,
507
+ "step": 17750
508
+ },
509
+ {
510
+ "epoch": 0.6805678960999678,
511
+ "grad_norm": 15.625,
512
+ "learning_rate": 7.44805733868033e-06,
513
+ "loss": 4.0806,
514
+ "step": 18000
515
+ },
516
+ {
517
+ "epoch": 0.6900202279902452,
518
+ "grad_norm": 17.625,
519
+ "learning_rate": 7.382747195820834e-06,
520
+ "loss": 4.0933,
521
+ "step": 18250
522
+ },
523
+ {
524
+ "epoch": 0.6994725598805225,
525
+ "grad_norm": 16.875,
526
+ "learning_rate": 7.316906756271515e-06,
527
+ "loss": 4.0495,
528
+ "step": 18500
529
+ },
530
+ {
531
+ "epoch": 0.7089248917707999,
532
+ "grad_norm": 20.375,
533
+ "learning_rate": 7.250550673272639e-06,
534
+ "loss": 4.0599,
535
+ "step": 18750
536
+ },
537
+ {
538
+ "epoch": 0.7183772236610771,
539
+ "grad_norm": 19.375,
540
+ "learning_rate": 7.1836937148244445e-06,
541
+ "loss": 4.0653,
542
+ "step": 19000
543
+ },
544
+ {
545
+ "epoch": 0.7278295555513545,
546
+ "grad_norm": 16.5,
547
+ "learning_rate": 7.1163507604004326e-06,
548
+ "loss": 4.0266,
549
+ "step": 19250
550
+ },
551
+ {
552
+ "epoch": 0.7372818874416318,
553
+ "grad_norm": 20.75,
554
+ "learning_rate": 7.048536797635832e-06,
555
+ "loss": 4.0484,
556
+ "step": 19500
557
+ },
558
+ {
559
+ "epoch": 0.7467342193319092,
560
+ "grad_norm": 18.25,
561
+ "learning_rate": 6.9802669189920005e-06,
562
+ "loss": 4.043,
563
+ "step": 19750
564
+ },
565
+ {
566
+ "epoch": 0.7561865512221865,
567
+ "grad_norm": 47.0,
568
+ "learning_rate": 6.911556318397493e-06,
569
+ "loss": 4.0716,
570
+ "step": 20000
571
+ },
572
+ {
573
+ "epoch": 0.7656388831124639,
574
+ "grad_norm": 15.625,
575
+ "learning_rate": 6.8424202878665515e-06,
576
+ "loss": 4.059,
577
+ "step": 20250
578
+ },
579
+ {
580
+ "epoch": 0.7750912150027411,
581
+ "grad_norm": 18.5,
582
+ "learning_rate": 6.772874214095761e-06,
583
+ "loss": 3.9974,
584
+ "step": 20500
585
+ },
586
+ {
587
+ "epoch": 0.7845435468930185,
588
+ "grad_norm": 16.0,
589
+ "learning_rate": 6.702933575039631e-06,
590
+ "loss": 4.0551,
591
+ "step": 20750
592
+ },
593
+ {
594
+ "epoch": 0.7939958787832958,
595
+ "grad_norm": 15.5,
596
+ "learning_rate": 6.6326139364658795e-06,
597
+ "loss": 4.1337,
598
+ "step": 21000
599
+ },
600
+ {
601
+ "epoch": 0.8034482106735732,
602
+ "grad_norm": 19.125,
603
+ "learning_rate": 6.561930948491155e-06,
604
+ "loss": 4.0849,
605
+ "step": 21250
606
+ },
607
+ {
608
+ "epoch": 0.8129005425638505,
609
+ "grad_norm": 19.125,
610
+ "learning_rate": 6.4909003420980065e-06,
611
+ "loss": 4.069,
612
+ "step": 21500
613
+ },
614
+ {
615
+ "epoch": 0.8223528744541279,
616
+ "grad_norm": 18.875,
617
+ "learning_rate": 6.419537925633836e-06,
618
+ "loss": 4.0218,
619
+ "step": 21750
620
+ },
621
+ {
622
+ "epoch": 0.8318052063444051,
623
+ "grad_norm": 17.125,
624
+ "learning_rate": 6.34785958129265e-06,
625
+ "loss": 3.9901,
626
+ "step": 22000
627
+ },
628
+ {
629
+ "epoch": 0.8412575382346825,
630
+ "grad_norm": 17.625,
631
+ "learning_rate": 6.275881261580363e-06,
632
+ "loss": 4.0088,
633
+ "step": 22250
634
+ },
635
+ {
636
+ "epoch": 0.8507098701249598,
637
+ "grad_norm": 17.0,
638
+ "learning_rate": 6.2036189857644616e-06,
639
+ "loss": 4.0448,
640
+ "step": 22500
641
+ },
642
+ {
643
+ "epoch": 0.8601622020152372,
644
+ "grad_norm": 15.8125,
645
+ "learning_rate": 6.131088836308805e-06,
646
+ "loss": 4.0443,
647
+ "step": 22750
648
+ },
649
+ {
650
+ "epoch": 0.8696145339055145,
651
+ "grad_norm": 22.875,
652
+ "learning_rate": 6.058306955294365e-06,
653
+ "loss": 4.0573,
654
+ "step": 23000
655
+ },
656
+ {
657
+ "epoch": 0.8790668657957919,
658
+ "grad_norm": 54.0,
659
+ "learning_rate": 5.9852895408266955e-06,
660
+ "loss": 4.0054,
661
+ "step": 23250
662
+ },
663
+ {
664
+ "epoch": 0.8885191976860691,
665
+ "grad_norm": 21.0,
666
+ "learning_rate": 5.9120528434309245e-06,
667
+ "loss": 4.0112,
668
+ "step": 23500
669
+ },
670
+ {
671
+ "epoch": 0.8979715295763465,
672
+ "grad_norm": 19.5,
673
+ "learning_rate": 5.838613162435106e-06,
674
+ "loss": 4.0095,
675
+ "step": 23750
676
+ },
677
+ {
678
+ "epoch": 0.9074238614666238,
679
+ "grad_norm": 22.375,
680
+ "learning_rate": 5.764986842342675e-06,
681
+ "loss": 3.9941,
682
+ "step": 24000
683
+ },
684
+ {
685
+ "epoch": 0.9168761933569012,
686
+ "grad_norm": 15.875,
687
+ "learning_rate": 5.6911902691948786e-06,
688
+ "loss": 3.9703,
689
+ "step": 24250
690
+ },
691
+ {
692
+ "epoch": 0.9263285252471785,
693
+ "grad_norm": 18.0,
694
+ "learning_rate": 5.617239866923945e-06,
695
+ "loss": 3.9949,
696
+ "step": 24500
697
+ },
698
+ {
699
+ "epoch": 0.9357808571374558,
700
+ "grad_norm": 18.25,
701
+ "learning_rate": 5.543152093697826e-06,
702
+ "loss": 4.0225,
703
+ "step": 24750
704
+ },
705
+ {
706
+ "epoch": 0.9452331890277331,
707
+ "grad_norm": 17.625,
708
+ "learning_rate": 5.4689434382573156e-06,
709
+ "loss": 3.998,
710
+ "step": 25000
711
+ },
712
+ {
713
+ "epoch": 0.9546855209180105,
714
+ "grad_norm": 16.5,
715
+ "learning_rate": 5.39463041624638e-06,
716
+ "loss": 3.9813,
717
+ "step": 25250
718
+ },
719
+ {
720
+ "epoch": 0.9641378528082878,
721
+ "grad_norm": 17.125,
722
+ "learning_rate": 5.320229566536474e-06,
723
+ "loss": 3.9089,
724
+ "step": 25500
725
+ },
726
+ {
727
+ "epoch": 0.9735901846985652,
728
+ "grad_norm": 17.875,
729
+ "learning_rate": 5.245757447545706e-06,
730
+ "loss": 4.0302,
731
+ "step": 25750
732
+ },
733
+ {
734
+ "epoch": 0.9830425165888425,
735
+ "grad_norm": 17.25,
736
+ "learning_rate": 5.171230633553656e-06,
737
+ "loss": 3.9841,
738
+ "step": 26000
739
+ },
740
+ {
741
+ "epoch": 0.9924948484791198,
742
+ "grad_norm": 16.125,
743
+ "learning_rate": 5.096665711012646e-06,
744
+ "loss": 3.9648,
745
+ "step": 26250
746
+ },
747
+ {
748
+ "epoch": 1.0019282757056165,
749
+ "grad_norm": 21.125,
750
+ "learning_rate": 5.0220792748563195e-06,
751
+ "loss": 3.8978,
752
+ "step": 26500
753
+ },
754
+ {
755
+ "epoch": 1.011380607595894,
756
+ "grad_norm": 25.75,
757
+ "learning_rate": 4.94748792480632e-06,
758
+ "loss": 3.4963,
759
+ "step": 26750
760
+ },
761
+ {
762
+ "epoch": 1.0208329394861713,
763
+ "grad_norm": 17.375,
764
+ "learning_rate": 4.872908261677911e-06,
765
+ "loss": 3.6178,
766
+ "step": 27000
767
+ },
768
+ {
769
+ "epoch": 1.0302852713764485,
770
+ "grad_norm": 25.375,
771
+ "learning_rate": 4.7983568836853564e-06,
772
+ "loss": 3.5309,
773
+ "step": 27250
774
+ },
775
+ {
776
+ "epoch": 1.0397376032667258,
777
+ "grad_norm": 26.625,
778
+ "learning_rate": 4.723850382747863e-06,
779
+ "loss": 3.4875,
780
+ "step": 27500
781
+ },
782
+ {
783
+ "epoch": 1.0491899351570033,
784
+ "grad_norm": 23.25,
785
+ "learning_rate": 4.649405340796947e-06,
786
+ "loss": 3.5433,
787
+ "step": 27750
788
+ },
789
+ {
790
+ "epoch": 1.0586422670472806,
791
+ "grad_norm": 17.125,
792
+ "learning_rate": 4.575038326086007e-06,
793
+ "loss": 3.5867,
794
+ "step": 28000
795
+ },
796
+ {
797
+ "epoch": 1.0680945989375579,
798
+ "grad_norm": 17.875,
799
+ "learning_rate": 4.500765889502937e-06,
800
+ "loss": 3.5986,
801
+ "step": 28250
802
+ },
803
+ {
804
+ "epoch": 1.0775469308278351,
805
+ "grad_norm": 18.0,
806
+ "learning_rate": 4.426604560886636e-06,
807
+ "loss": 3.5402,
808
+ "step": 28500
809
+ },
810
+ {
811
+ "epoch": 1.0869992627181126,
812
+ "grad_norm": 17.75,
813
+ "learning_rate": 4.3525708453481505e-06,
814
+ "loss": 3.5184,
815
+ "step": 28750
816
+ },
817
+ {
818
+ "epoch": 1.09645159460839,
819
+ "grad_norm": 19.875,
820
+ "learning_rate": 4.278681219597375e-06,
821
+ "loss": 3.5137,
822
+ "step": 29000
823
+ },
824
+ {
825
+ "epoch": 1.1059039264986672,
826
+ "grad_norm": 22.375,
827
+ "learning_rate": 4.204952128276027e-06,
828
+ "loss": 3.6344,
829
+ "step": 29250
830
+ },
831
+ {
832
+ "epoch": 1.1153562583889445,
833
+ "grad_norm": 19.25,
834
+ "learning_rate": 4.131399980297796e-06,
835
+ "loss": 3.5423,
836
+ "step": 29500
837
+ },
838
+ {
839
+ "epoch": 1.124808590279222,
840
+ "grad_norm": 24.25,
841
+ "learning_rate": 4.058041145196414e-06,
842
+ "loss": 3.5973,
843
+ "step": 29750
844
+ },
845
+ {
846
+ "epoch": 1.1342609221694993,
847
+ "grad_norm": 20.875,
848
+ "learning_rate": 3.98489194948251e-06,
849
+ "loss": 3.5454,
850
+ "step": 30000
851
+ },
852
+ {
853
+ "epoch": 1.1437132540597765,
854
+ "grad_norm": 19.25,
855
+ "learning_rate": 3.911968673010038e-06,
856
+ "loss": 3.5508,
857
+ "step": 30250
858
+ },
859
+ {
860
+ "epoch": 1.1531655859500538,
861
+ "grad_norm": 20.0,
862
+ "learning_rate": 3.839287545353076e-06,
863
+ "loss": 3.4747,
864
+ "step": 30500
865
+ },
866
+ {
867
+ "epoch": 1.1626179178403313,
868
+ "grad_norm": 21.25,
869
+ "learning_rate": 3.7668647421938275e-06,
870
+ "loss": 3.538,
871
+ "step": 30750
872
+ },
873
+ {
874
+ "epoch": 1.1720702497306086,
875
+ "grad_norm": 26.125,
876
+ "learning_rate": 3.694716381722609e-06,
877
+ "loss": 3.5677,
878
+ "step": 31000
879
+ },
880
+ {
881
+ "epoch": 1.1815225816208859,
882
+ "grad_norm": 25.75,
883
+ "learning_rate": 3.6228585210506427e-06,
884
+ "loss": 3.5912,
885
+ "step": 31250
886
+ },
887
+ {
888
+ "epoch": 1.1909749135111631,
889
+ "grad_norm": 17.5,
890
+ "learning_rate": 3.551307152636431e-06,
891
+ "loss": 3.5178,
892
+ "step": 31500
893
+ },
894
+ {
895
+ "epoch": 1.2004272454014404,
896
+ "grad_norm": 21.875,
897
+ "learning_rate": 3.4800782007265265e-06,
898
+ "loss": 3.5475,
899
+ "step": 31750
900
+ },
901
+ {
902
+ "epoch": 1.209879577291718,
903
+ "grad_norm": 28.0,
904
+ "learning_rate": 3.409187517811486e-06,
905
+ "loss": 3.5383,
906
+ "step": 32000
907
+ },
908
+ {
909
+ "epoch": 1.2193319091819952,
910
+ "grad_norm": 29.625,
911
+ "learning_rate": 3.3386508810977856e-06,
912
+ "loss": 3.5525,
913
+ "step": 32250
914
+ },
915
+ {
916
+ "epoch": 1.2287842410722725,
917
+ "grad_norm": 18.75,
918
+ "learning_rate": 3.2684839889964988e-06,
919
+ "loss": 3.5202,
920
+ "step": 32500
921
+ },
922
+ {
923
+ "epoch": 1.23823657296255,
924
+ "grad_norm": 24.0,
925
+ "learning_rate": 3.1987024576295012e-06,
926
+ "loss": 3.5926,
927
+ "step": 32750
928
+ },
929
+ {
930
+ "epoch": 1.2476889048528272,
931
+ "grad_norm": 21.0,
932
+ "learning_rate": 3.1293218173540074e-06,
933
+ "loss": 3.4852,
934
+ "step": 33000
935
+ },
936
+ {
937
+ "epoch": 1.2571412367431045,
938
+ "grad_norm": 20.125,
939
+ "learning_rate": 3.060357509306171e-06,
940
+ "loss": 3.4994,
941
+ "step": 33250
942
+ },
943
+ {
944
+ "epoch": 1.2665935686333818,
945
+ "grad_norm": 20.875,
946
+ "learning_rate": 2.9918248819645624e-06,
947
+ "loss": 3.566,
948
+ "step": 33500
949
+ },
950
+ {
951
+ "epoch": 1.276045900523659,
952
+ "grad_norm": 39.75,
953
+ "learning_rate": 2.923739187734258e-06,
954
+ "loss": 3.517,
955
+ "step": 33750
956
+ },
957
+ {
958
+ "epoch": 1.2854982324139366,
959
+ "grad_norm": 18.875,
960
+ "learning_rate": 2.8561155795523133e-06,
961
+ "loss": 3.4938,
962
+ "step": 34000
963
+ },
964
+ {
965
+ "epoch": 1.2949505643042138,
966
+ "grad_norm": 20.625,
967
+ "learning_rate": 2.788969107515369e-06,
968
+ "loss": 3.501,
969
+ "step": 34250
970
+ },
971
+ {
972
+ "epoch": 1.3044028961944911,
973
+ "grad_norm": 22.5,
974
+ "learning_rate": 2.722314715530156e-06,
975
+ "loss": 3.574,
976
+ "step": 34500
977
+ },
978
+ {
979
+ "epoch": 1.3138552280847686,
980
+ "grad_norm": 22.375,
981
+ "learning_rate": 2.6561672379876236e-06,
982
+ "loss": 3.4953,
983
+ "step": 34750
984
+ },
985
+ {
986
+ "epoch": 1.323307559975046,
987
+ "grad_norm": 20.375,
988
+ "learning_rate": 2.590541396461438e-06,
989
+ "loss": 3.4766,
990
+ "step": 35000
991
+ },
992
+ {
993
+ "epoch": 1.3327598918653232,
994
+ "grad_norm": 23.0,
995
+ "learning_rate": 2.5254517964316084e-06,
996
+ "loss": 3.4905,
997
+ "step": 35250
998
+ },
999
+ {
1000
+ "epoch": 1.3422122237556005,
1001
+ "grad_norm": 24.0,
1002
+ "learning_rate": 2.4609129240339253e-06,
1003
+ "loss": 3.5543,
1004
+ "step": 35500
1005
+ },
1006
+ {
1007
+ "epoch": 1.3516645556458777,
1008
+ "grad_norm": 23.125,
1009
+ "learning_rate": 2.39693914283598e-06,
1010
+ "loss": 3.5577,
1011
+ "step": 35750
1012
+ },
1013
+ {
1014
+ "epoch": 1.3611168875361552,
1015
+ "grad_norm": 19.125,
1016
+ "learning_rate": 2.333544690640451e-06,
1017
+ "loss": 3.515,
1018
+ "step": 36000
1019
+ },
1020
+ {
1021
+ "epoch": 1.3705692194264325,
1022
+ "grad_norm": 22.625,
1023
+ "learning_rate": 2.270743676316383e-06,
1024
+ "loss": 3.506,
1025
+ "step": 36250
1026
+ },
1027
+ {
1028
+ "epoch": 1.3800215513167098,
1029
+ "grad_norm": 18.625,
1030
+ "learning_rate": 2.20855007665916e-06,
1031
+ "loss": 3.5909,
1032
+ "step": 36500
1033
+ },
1034
+ {
1035
+ "epoch": 1.3894738832069873,
1036
+ "grad_norm": 20.875,
1037
+ "learning_rate": 2.1469777332798804e-06,
1038
+ "loss": 3.5444,
1039
+ "step": 36750
1040
+ },
1041
+ {
1042
+ "epoch": 1.3989262150972646,
1043
+ "grad_norm": 19.375,
1044
+ "learning_rate": 2.086040349524807e-06,
1045
+ "loss": 3.5885,
1046
+ "step": 37000
1047
+ },
1048
+ {
1049
+ "epoch": 1.4083785469875418,
1050
+ "grad_norm": 20.5,
1051
+ "learning_rate": 2.025751487425591e-06,
1052
+ "loss": 3.5437,
1053
+ "step": 37250
1054
+ },
1055
+ {
1056
+ "epoch": 1.4178308788778191,
1057
+ "grad_norm": 22.75,
1058
+ "learning_rate": 1.9661245646809546e-06,
1059
+ "loss": 3.5815,
1060
+ "step": 37500
1061
+ },
1062
+ {
1063
+ "epoch": 1.4272832107680964,
1064
+ "grad_norm": 24.375,
1065
+ "learning_rate": 1.9071728516704897e-06,
1066
+ "loss": 3.5147,
1067
+ "step": 37750
1068
+ },
1069
+ {
1070
+ "epoch": 1.4367355426583739,
1071
+ "grad_norm": 25.75,
1072
+ "learning_rate": 1.8489094685012394e-06,
1073
+ "loss": 3.5288,
1074
+ "step": 38000
1075
+ },
1076
+ {
1077
+ "epoch": 1.4461878745486512,
1078
+ "grad_norm": 16.25,
1079
+ "learning_rate": 1.7913473820877353e-06,
1080
+ "loss": 3.5381,
1081
+ "step": 38250
1082
+ },
1083
+ {
1084
+ "epoch": 1.4556402064389284,
1085
+ "grad_norm": 22.375,
1086
+ "learning_rate": 1.7344994032661116e-06,
1087
+ "loss": 3.5954,
1088
+ "step": 38500
1089
+ },
1090
+ {
1091
+ "epoch": 1.465092538329206,
1092
+ "grad_norm": 19.125,
1093
+ "learning_rate": 1.6783781839429785e-06,
1094
+ "loss": 3.5212,
1095
+ "step": 38750
1096
+ },
1097
+ {
1098
+ "epoch": 1.4745448702194832,
1099
+ "grad_norm": 23.25,
1100
+ "learning_rate": 1.6229962142796469e-06,
1101
+ "loss": 3.4585,
1102
+ "step": 39000
1103
+ },
1104
+ {
1105
+ "epoch": 1.4839972021097605,
1106
+ "grad_norm": 18.375,
1107
+ "learning_rate": 1.5683658199123524e-06,
1108
+ "loss": 3.5231,
1109
+ "step": 39250
1110
+ },
1111
+ {
1112
+ "epoch": 1.4934495340000378,
1113
+ "grad_norm": 25.0,
1114
+ "learning_rate": 1.5144991592091162e-06,
1115
+ "loss": 3.5881,
1116
+ "step": 39500
1117
+ },
1118
+ {
1119
+ "epoch": 1.502901865890315,
1120
+ "grad_norm": 22.25,
1121
+ "learning_rate": 1.461408220563803e-06,
1122
+ "loss": 3.5278,
1123
+ "step": 39750
1124
+ },
1125
+ {
1126
+ "epoch": 1.5123541977805925,
1127
+ "grad_norm": 18.625,
1128
+ "learning_rate": 1.4091048197280227e-06,
1129
+ "loss": 3.5224,
1130
+ "step": 40000
1131
+ },
1132
+ {
1133
+ "epoch": 1.5218065296708698,
1134
+ "grad_norm": 21.5,
1135
+ "learning_rate": 1.3576005971814627e-06,
1136
+ "loss": 3.5465,
1137
+ "step": 40250
1138
+ },
1139
+ {
1140
+ "epoch": 1.531258861561147,
1141
+ "grad_norm": 19.375,
1142
+ "learning_rate": 1.3069070155412145e-06,
1143
+ "loss": 3.5318,
1144
+ "step": 40500
1145
+ },
1146
+ {
1147
+ "epoch": 1.5407111934514246,
1148
+ "grad_norm": 19.5,
1149
+ "learning_rate": 1.2570353570106864e-06,
1150
+ "loss": 3.5316,
1151
+ "step": 40750
1152
+ },
1153
+ {
1154
+ "epoch": 1.5501635253417017,
1155
+ "grad_norm": 23.5,
1156
+ "learning_rate": 1.2079967208686787e-06,
1157
+ "loss": 3.5112,
1158
+ "step": 41000
1159
+ },
1160
+ {
1161
+ "epoch": 1.5596158572319792,
1162
+ "grad_norm": 21.25,
1163
+ "learning_rate": 1.159802020999159e-06,
1164
+ "loss": 3.5891,
1165
+ "step": 41250
1166
+ },
1167
+ {
1168
+ "epoch": 1.5690681891222564,
1169
+ "grad_norm": 22.375,
1170
+ "learning_rate": 1.112461983462304e-06,
1171
+ "loss": 3.5365,
1172
+ "step": 41500
1173
+ },
1174
+ {
1175
+ "epoch": 1.5785205210125337,
1176
+ "grad_norm": 18.25,
1177
+ "learning_rate": 1.0659871441073422e-06,
1178
+ "loss": 3.5665,
1179
+ "step": 41750
1180
+ },
1181
+ {
1182
+ "epoch": 1.5879728529028112,
1183
+ "grad_norm": 37.75,
1184
+ "learning_rate": 1.020387846227724e-06,
1185
+ "loss": 3.5764,
1186
+ "step": 42000
1187
+ },
1188
+ {
1189
+ "epoch": 1.5974251847930885,
1190
+ "grad_norm": 21.75,
1191
+ "learning_rate": 9.756742382591577e-07,
1192
+ "loss": 3.6041,
1193
+ "step": 42250
1194
+ },
1195
+ {
1196
+ "epoch": 1.6068775166833658,
1197
+ "grad_norm": 24.375,
1198
+ "learning_rate": 9.318562715210039e-07,
1199
+ "loss": 3.6046,
1200
+ "step": 42500
1201
+ },
1202
+ {
1203
+ "epoch": 1.6163298485736433,
1204
+ "grad_norm": 18.375,
1205
+ "learning_rate": 8.889436980015336e-07,
1206
+ "loss": 3.5789,
1207
+ "step": 42750
1208
+ },
1209
+ {
1210
+ "epoch": 1.6257821804639203,
1211
+ "grad_norm": 19.875,
1212
+ "learning_rate": 8.469460681875674e-07,
1213
+ "loss": 3.588,
1214
+ "step": 43000
1215
+ },
1216
+ {
1217
+ "epoch": 1.6352345123541978,
1218
+ "grad_norm": 20.375,
1219
+ "learning_rate": 8.058727289389485e-07,
1220
+ "loss": 3.571,
1221
+ "step": 43250
1222
+ },
1223
+ {
1224
+ "epoch": 1.644686844244475,
1225
+ "grad_norm": 33.25,
1226
+ "learning_rate": 7.657328214083226e-07,
1227
+ "loss": 3.5252,
1228
+ "step": 43500
1229
+ },
1230
+ {
1231
+ "epoch": 1.6541391761347524,
1232
+ "grad_norm": 20.625,
1233
+ "learning_rate": 7.26535279006727e-07,
1234
+ "loss": 3.5418,
1235
+ "step": 43750
1236
+ },
1237
+ {
1238
+ "epoch": 1.6635915080250299,
1239
+ "grad_norm": 20.25,
1240
+ "learning_rate": 6.882888254153902e-07,
1241
+ "loss": 3.475,
1242
+ "step": 44000
1243
+ },
1244
+ {
1245
+ "epoch": 1.6730438399153071,
1246
+ "grad_norm": 22.25,
1247
+ "learning_rate": 6.51001972644218e-07,
1248
+ "loss": 3.6097,
1249
+ "step": 44250
1250
+ },
1251
+ {
1252
+ "epoch": 1.6824961718055844,
1253
+ "grad_norm": 20.125,
1254
+ "learning_rate": 6.146830191373909e-07,
1255
+ "loss": 3.5361,
1256
+ "step": 44500
1257
+ },
1258
+ {
1259
+ "epoch": 1.691948503695862,
1260
+ "grad_norm": 19.375,
1261
+ "learning_rate": 5.793400479264849e-07,
1262
+ "loss": 3.5127,
1263
+ "step": 44750
1264
+ },
1265
+ {
1266
+ "epoch": 1.701400835586139,
1267
+ "grad_norm": 22.5,
1268
+ "learning_rate": 5.449809248315402e-07,
1269
+ "loss": 3.5631,
1270
+ "step": 45000
1271
+ },
1272
+ {
1273
+ "epoch": 1.7108531674764165,
1274
+ "grad_norm": 20.125,
1275
+ "learning_rate": 5.11613296710467e-07,
1276
+ "loss": 3.4704,
1277
+ "step": 45250
1278
+ },
1279
+ {
1280
+ "epoch": 1.7203054993666937,
1281
+ "grad_norm": 25.75,
1282
+ "learning_rate": 4.792445897571845e-07,
1283
+ "loss": 3.5528,
1284
+ "step": 45500
1285
+ },
1286
+ {
1287
+ "epoch": 1.729757831256971,
1288
+ "grad_norm": 24.0,
1289
+ "learning_rate": 4.478820078488749e-07,
1290
+ "loss": 3.515,
1291
+ "step": 45750
1292
+ },
1293
+ {
1294
+ "epoch": 1.7392101631472485,
1295
+ "grad_norm": 17.375,
1296
+ "learning_rate": 4.175325309427064e-07,
1297
+ "loss": 3.5246,
1298
+ "step": 46000
1299
+ },
1300
+ {
1301
+ "epoch": 1.7486624950375258,
1302
+ "grad_norm": 21.875,
1303
+ "learning_rate": 3.882029135223975e-07,
1304
+ "loss": 3.504,
1305
+ "step": 46250
1306
+ },
1307
+ {
1308
+ "epoch": 1.758114826927803,
1309
+ "grad_norm": 23.0,
1310
+ "learning_rate": 3.598996830949619e-07,
1311
+ "loss": 3.5885,
1312
+ "step": 46500
1313
+ },
1314
+ {
1315
+ "epoch": 1.7675671588180806,
1316
+ "grad_norm": 29.0,
1317
+ "learning_rate": 3.326291387379654e-07,
1318
+ "loss": 3.5235,
1319
+ "step": 46750
1320
+ },
1321
+ {
1322
+ "epoch": 1.7770194907083576,
1323
+ "grad_norm": 20.0,
1324
+ "learning_rate": 3.0639734969762524e-07,
1325
+ "loss": 3.5873,
1326
+ "step": 47000
1327
+ },
1328
+ {
1329
+ "epoch": 1.7864718225986351,
1330
+ "grad_norm": 20.75,
1331
+ "learning_rate": 2.8121015403805406e-07,
1332
+ "loss": 3.501,
1333
+ "step": 47250
1334
+ },
1335
+ {
1336
+ "epoch": 1.7959241544889124,
1337
+ "grad_norm": 19.375,
1338
+ "learning_rate": 2.570731573419638e-07,
1339
+ "loss": 3.4923,
1340
+ "step": 47500
1341
+ },
1342
+ {
1343
+ "epoch": 1.8053764863791897,
1344
+ "grad_norm": 23.625,
1345
+ "learning_rate": 2.3399173146309906e-07,
1346
+ "loss": 3.4967,
1347
+ "step": 47750
1348
+ },
1349
+ {
1350
+ "epoch": 1.8148288182694672,
1351
+ "grad_norm": 16.875,
1352
+ "learning_rate": 2.119710133306996e-07,
1353
+ "loss": 3.5254,
1354
+ "step": 48000
1355
+ },
1356
+ {
1357
+ "epoch": 1.8242811501597445,
1358
+ "grad_norm": 19.625,
1359
+ "learning_rate": 1.9101590380623925e-07,
1360
+ "loss": 3.4922,
1361
+ "step": 48250
1362
+ },
1363
+ {
1364
+ "epoch": 1.8337334820500217,
1365
+ "grad_norm": 23.125,
1366
+ "learning_rate": 1.711310665927046e-07,
1367
+ "loss": 3.5446,
1368
+ "step": 48500
1369
+ },
1370
+ {
1371
+ "epoch": 1.8431858139402992,
1372
+ "grad_norm": 25.375,
1373
+ "learning_rate": 1.5232092719666025e-07,
1374
+ "loss": 3.5395,
1375
+ "step": 48750
1376
+ },
1377
+ {
1378
+ "epoch": 1.8526381458305763,
1379
+ "grad_norm": 20.0,
1380
+ "learning_rate": 1.3458967194331485e-07,
1381
+ "loss": 3.5714,
1382
+ "step": 49000
1383
+ },
1384
+ {
1385
+ "epoch": 1.8620904777208538,
1386
+ "grad_norm": 23.0,
1387
+ "learning_rate": 1.1794124704483324e-07,
1388
+ "loss": 3.5383,
1389
+ "step": 49250
1390
+ },
1391
+ {
1392
+ "epoch": 1.871542809611131,
1393
+ "grad_norm": 16.875,
1394
+ "learning_rate": 1.0237935772207608e-07,
1395
+ "loss": 3.5393,
1396
+ "step": 49500
1397
+ },
1398
+ {
1399
+ "epoch": 1.8809951415014083,
1400
+ "grad_norm": 18.25,
1401
+ "learning_rate": 8.790746737997569e-08,
1402
+ "loss": 3.5028,
1403
+ "step": 49750
1404
+ },
1405
+ {
1406
+ "epoch": 1.8904474733916858,
1407
+ "grad_norm": 19.5,
1408
+ "learning_rate": 7.452879683673809e-08,
1409
+ "loss": 3.5613,
1410
+ "step": 50000
1411
+ },
1412
+ {
1413
+ "epoch": 1.899899805281963,
1414
+ "grad_norm": 21.25,
1415
+ "learning_rate": 6.224632360702143e-08,
1416
+ "loss": 3.5304,
1417
+ "step": 50250
1418
+ },
1419
+ {
1420
+ "epoch": 1.9093521371722404,
1421
+ "grad_norm": 21.75,
1422
+ "learning_rate": 5.1062781239271665e-08,
1423
+ "loss": 3.5906,
1424
+ "step": 50500
1425
+ },
1426
+ {
1427
+ "epoch": 1.9188044690625177,
1428
+ "grad_norm": 21.75,
1429
+ "learning_rate": 4.0980658707355234e-08,
1430
+ "loss": 3.5713,
1431
+ "step": 50750
1432
+ },
1433
+ {
1434
+ "epoch": 1.928256800952795,
1435
+ "grad_norm": 25.125,
1436
+ "learning_rate": 3.2002199856617236e-08,
1437
+ "loss": 3.5918,
1438
+ "step": 51000
1439
+ },
1440
+ {
1441
+ "epoch": 1.9377091328430724,
1442
+ "grad_norm": 19.625,
1443
+ "learning_rate": 2.412940290450083e-08,
1444
+ "loss": 3.4949,
1445
+ "step": 51250
1446
+ },
1447
+ {
1448
+ "epoch": 1.9471614647333497,
1449
+ "grad_norm": 21.5,
1450
+ "learning_rate": 1.736401999582804e-08,
1451
+ "loss": 3.5472,
1452
+ "step": 51500
1453
+ },
1454
+ {
1455
+ "epoch": 1.956613796623627,
1456
+ "grad_norm": 23.375,
1457
+ "learning_rate": 1.1707556812851074e-08,
1458
+ "loss": 3.5542,
1459
+ "step": 51750
1460
+ },
1461
+ {
1462
+ "epoch": 1.9660661285139045,
1463
+ "grad_norm": 24.25,
1464
+ "learning_rate": 7.161272240148731e-09,
1465
+ "loss": 3.5946,
1466
+ "step": 52000
1467
+ },
1468
+ {
1469
+ "epoch": 1.9755184604041816,
1470
+ "grad_norm": 20.875,
1471
+ "learning_rate": 3.726178084456078e-09,
1472
+ "loss": 3.5614,
1473
+ "step": 52250
1474
+ },
1475
+ {
1476
+ "epoch": 1.984970792294459,
1477
+ "grad_norm": 19.625,
1478
+ "learning_rate": 1.4030388494790104e-09,
1479
+ "loss": 3.5535,
1480
+ "step": 52500
1481
+ },
1482
+ {
1483
+ "epoch": 1.9944231241847363,
1484
+ "grad_norm": 24.25,
1485
+ "learning_rate": 1.923715657464742e-10,
1486
+ "loss": 3.5299,
1487
+ "step": 52750
1488
+ }
1489
+ ],
1490
+ "logging_steps": 250,
1491
+ "max_steps": 52896,
1492
+ "num_input_tokens_seen": 0,
1493
+ "num_train_epochs": 2,
1494
+ "save_steps": 250,
1495
+ "stateful_callbacks": {
1496
+ "TrainerControl": {
1497
+ "args": {
1498
+ "should_epoch_stop": false,
1499
+ "should_evaluate": false,
1500
+ "should_log": false,
1501
+ "should_save": true,
1502
+ "should_training_stop": true
1503
+ },
1504
+ "attributes": {}
1505
+ }
1506
+ },
1507
+ "total_flos": 2.8374993889748353e+19,
1508
+ "train_batch_size": 1,
1509
+ "trial_name": null,
1510
+ "trial_params": null
1511
+ }