bimabk commited on
Commit
8153d3e
·
verified ·
1 Parent(s): 8b0cc07

Upload task output 8ca8a9ea-9ae3-4938-9713-015819984d61

Browse files
config.json CHANGED
@@ -24,7 +24,7 @@
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.51.3",
26
  "unsloth_version": "2024.9",
27
- "use_cache": true,
28
  "use_sliding_window": false,
29
  "vocab_size": 151936
30
  }
 
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.51.3",
26
  "unsloth_version": "2024.9",
27
+ "use_cache": false,
28
  "use_sliding_window": false,
29
  "vocab_size": 151936
30
  }
generation_config.json CHANGED
@@ -1,5 +1,8 @@
1
  {
2
- "temperature": null,
3
- "top_p": null,
 
 
 
4
  "transformers_version": "4.51.3"
5
  }
 
1
  {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_length": 131072,
5
+ "max_new_tokens": 2048,
6
+ "pad_token_id": 151646,
7
  "transformers_version": "4.51.3"
8
  }
loss.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 444,0.5547934770584106
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d12c9b0441f04996569a163227f7ac467be83bf83be7575d05183a90fdef5eb
3
  size 3087467144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:559a2b218642a05221baaaedf97fdf7dda92c3b667aa94d81ec33b470850c9b3
3
  size 3087467144
trainer_state.json ADDED
@@ -0,0 +1,666 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.9910313901345291,
6
+ "eval_steps": 500,
7
+ "global_step": 444,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02242152466367713,
14
+ "grad_norm": 2.15625,
15
+ "learning_rate": 2.305093438544906e-05,
16
+ "loss": 0.9112,
17
+ "step": 5
18
+ },
19
+ {
20
+ "epoch": 0.04484304932735426,
21
+ "grad_norm": 1.4765625,
22
+ "learning_rate": 5.1864602367260384e-05,
23
+ "loss": 0.7142,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.06726457399103139,
28
+ "grad_norm": 1.09375,
29
+ "learning_rate": 8.067827034907172e-05,
30
+ "loss": 0.6696,
31
+ "step": 15
32
+ },
33
+ {
34
+ "epoch": 0.08968609865470852,
35
+ "grad_norm": 0.9921875,
36
+ "learning_rate": 0.00010949193833088303,
37
+ "loss": 0.652,
38
+ "step": 20
39
+ },
40
+ {
41
+ "epoch": 0.11210762331838565,
42
+ "grad_norm": 0.87109375,
43
+ "learning_rate": 0.00013830560631269436,
44
+ "loss": 0.6583,
45
+ "step": 25
46
+ },
47
+ {
48
+ "epoch": 0.13452914798206278,
49
+ "grad_norm": 1.2578125,
50
+ "learning_rate": 0.00016711927429450567,
51
+ "loss": 0.6354,
52
+ "step": 30
53
+ },
54
+ {
55
+ "epoch": 0.15695067264573992,
56
+ "grad_norm": 1.1875,
57
+ "learning_rate": 0.000195932942276317,
58
+ "loss": 0.6436,
59
+ "step": 35
60
+ },
61
+ {
62
+ "epoch": 0.17937219730941703,
63
+ "grad_norm": 1.1484375,
64
+ "learning_rate": 0.00022474661025812833,
65
+ "loss": 0.6563,
66
+ "step": 40
67
+ },
68
+ {
69
+ "epoch": 0.20179372197309417,
70
+ "grad_norm": 9.5,
71
+ "learning_rate": 0.00025356027823993967,
72
+ "loss": 0.8176,
73
+ "step": 45
74
+ },
75
+ {
76
+ "epoch": 0.2242152466367713,
77
+ "grad_norm": 1.2890625,
78
+ "learning_rate": 0.00028237394622175095,
79
+ "loss": 0.8045,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.24663677130044842,
84
+ "grad_norm": 1.3125,
85
+ "learning_rate": 0.00028811441478558746,
86
+ "loss": 0.7267,
87
+ "step": 55
88
+ },
89
+ {
90
+ "epoch": 0.26905829596412556,
91
+ "grad_norm": 1.2578125,
92
+ "learning_rate": 0.0002880239788169296,
93
+ "loss": 0.7135,
94
+ "step": 60
95
+ },
96
+ {
97
+ "epoch": 0.2914798206278027,
98
+ "grad_norm": 0.6953125,
99
+ "learning_rate": 0.0002878640385367635,
100
+ "loss": 0.6942,
101
+ "step": 65
102
+ },
103
+ {
104
+ "epoch": 0.31390134529147984,
105
+ "grad_norm": 0.640625,
106
+ "learning_rate": 0.0002876346969346014,
107
+ "loss": 0.6893,
108
+ "step": 70
109
+ },
110
+ {
111
+ "epoch": 0.336322869955157,
112
+ "grad_norm": 0.66015625,
113
+ "learning_rate": 0.0002873361016891883,
114
+ "loss": 0.6921,
115
+ "step": 75
116
+ },
117
+ {
118
+ "epoch": 0.35874439461883406,
119
+ "grad_norm": 0.60546875,
120
+ "learning_rate": 0.000286968445073407,
121
+ "loss": 0.6859,
122
+ "step": 80
123
+ },
124
+ {
125
+ "epoch": 0.3811659192825112,
126
+ "grad_norm": 0.57421875,
127
+ "learning_rate": 0.00028653196383046967,
128
+ "loss": 0.6619,
129
+ "step": 85
130
+ },
131
+ {
132
+ "epoch": 0.40358744394618834,
133
+ "grad_norm": 0.54296875,
134
+ "learning_rate": 0.0002860269390214723,
135
+ "loss": 0.6673,
136
+ "step": 90
137
+ },
138
+ {
139
+ "epoch": 0.4260089686098655,
140
+ "grad_norm": 0.55859375,
141
+ "learning_rate": 0.00028545369584441287,
142
+ "loss": 0.6669,
143
+ "step": 95
144
+ },
145
+ {
146
+ "epoch": 0.4484304932735426,
147
+ "grad_norm": 0.6328125,
148
+ "learning_rate": 0.00028481260342478823,
149
+ "loss": 0.6705,
150
+ "step": 100
151
+ },
152
+ {
153
+ "epoch": 0.47085201793721976,
154
+ "grad_norm": 0.53515625,
155
+ "learning_rate": 0.000284104074577905,
156
+ "loss": 0.6551,
157
+ "step": 105
158
+ },
159
+ {
160
+ "epoch": 0.49327354260089684,
161
+ "grad_norm": 0.5390625,
162
+ "learning_rate": 0.00028332856554305765,
163
+ "loss": 0.6536,
164
+ "step": 110
165
+ },
166
+ {
167
+ "epoch": 0.515695067264574,
168
+ "grad_norm": 0.484375,
169
+ "learning_rate": 0.0002824865756897446,
170
+ "loss": 0.6496,
171
+ "step": 115
172
+ },
173
+ {
174
+ "epoch": 0.5381165919282511,
175
+ "grad_norm": 0.5,
176
+ "learning_rate": 0.0002815786471961118,
177
+ "loss": 0.6421,
178
+ "step": 120
179
+ },
180
+ {
181
+ "epoch": 0.5605381165919282,
182
+ "grad_norm": 0.44921875,
183
+ "learning_rate": 0.00028060536469983084,
184
+ "loss": 0.6333,
185
+ "step": 125
186
+ },
187
+ {
188
+ "epoch": 0.5829596412556054,
189
+ "grad_norm": 0.46484375,
190
+ "learning_rate": 0.0002795673549216364,
191
+ "loss": 0.6506,
192
+ "step": 130
193
+ },
194
+ {
195
+ "epoch": 0.6053811659192825,
196
+ "grad_norm": 0.515625,
197
+ "learning_rate": 0.0002784652862617649,
198
+ "loss": 0.6242,
199
+ "step": 135
200
+ },
201
+ {
202
+ "epoch": 0.6278026905829597,
203
+ "grad_norm": 0.5234375,
204
+ "learning_rate": 0.0002772998683695552,
205
+ "loss": 0.6467,
206
+ "step": 140
207
+ },
208
+ {
209
+ "epoch": 0.6502242152466368,
210
+ "grad_norm": 0.43359375,
211
+ "learning_rate": 0.00027607185168648785,
212
+ "loss": 0.634,
213
+ "step": 145
214
+ },
215
+ {
216
+ "epoch": 0.672645739910314,
217
+ "grad_norm": 0.5078125,
218
+ "learning_rate": 0.0002747820269629572,
219
+ "loss": 0.6367,
220
+ "step": 150
221
+ },
222
+ {
223
+ "epoch": 0.695067264573991,
224
+ "grad_norm": 0.478515625,
225
+ "learning_rate": 0.0002734312247490874,
226
+ "loss": 0.6091,
227
+ "step": 155
228
+ },
229
+ {
230
+ "epoch": 0.7174887892376681,
231
+ "grad_norm": 0.421875,
232
+ "learning_rate": 0.0002720203148599208,
233
+ "loss": 0.6293,
234
+ "step": 160
235
+ },
236
+ {
237
+ "epoch": 0.7399103139013453,
238
+ "grad_norm": 0.4140625,
239
+ "learning_rate": 0.00027055020581532246,
240
+ "loss": 0.6168,
241
+ "step": 165
242
+ },
243
+ {
244
+ "epoch": 0.7623318385650224,
245
+ "grad_norm": 0.462890625,
246
+ "learning_rate": 0.00026902184425496155,
247
+ "loss": 0.6284,
248
+ "step": 170
249
+ },
250
+ {
251
+ "epoch": 0.7847533632286996,
252
+ "grad_norm": 0.435546875,
253
+ "learning_rate": 0.0002674362143287467,
254
+ "loss": 0.6163,
255
+ "step": 175
256
+ },
257
+ {
258
+ "epoch": 0.8071748878923767,
259
+ "grad_norm": 0.47265625,
260
+ "learning_rate": 0.0002657943370631075,
261
+ "loss": 0.6099,
262
+ "step": 180
263
+ },
264
+ {
265
+ "epoch": 0.8295964125560538,
266
+ "grad_norm": 0.53125,
267
+ "learning_rate": 0.00026409726970353,
268
+ "loss": 0.6159,
269
+ "step": 185
270
+ },
271
+ {
272
+ "epoch": 0.852017937219731,
273
+ "grad_norm": 0.48828125,
274
+ "learning_rate": 0.00026234610503377063,
275
+ "loss": 0.6001,
276
+ "step": 190
277
+ },
278
+ {
279
+ "epoch": 0.874439461883408,
280
+ "grad_norm": 0.41796875,
281
+ "learning_rate": 0.00026054197067218514,
282
+ "loss": 0.5967,
283
+ "step": 195
284
+ },
285
+ {
286
+ "epoch": 0.8968609865470852,
287
+ "grad_norm": 0.435546875,
288
+ "learning_rate": 0.0002586860283456274,
289
+ "loss": 0.6301,
290
+ "step": 200
291
+ },
292
+ {
293
+ "epoch": 0.9192825112107623,
294
+ "grad_norm": 0.447265625,
295
+ "learning_rate": 0.00025677947314138464,
296
+ "loss": 0.6061,
297
+ "step": 205
298
+ },
299
+ {
300
+ "epoch": 0.9417040358744395,
301
+ "grad_norm": 0.447265625,
302
+ "learning_rate": 0.00025482353273763113,
303
+ "loss": 0.5923,
304
+ "step": 210
305
+ },
306
+ {
307
+ "epoch": 0.9641255605381166,
308
+ "grad_norm": 0.494140625,
309
+ "learning_rate": 0.0002528194666128958,
310
+ "loss": 0.6004,
311
+ "step": 215
312
+ },
313
+ {
314
+ "epoch": 0.9865470852017937,
315
+ "grad_norm": 0.419921875,
316
+ "learning_rate": 0.0002507685652350527,
317
+ "loss": 0.612,
318
+ "step": 220
319
+ },
320
+ {
321
+ "epoch": 0.9955156950672646,
322
+ "eval_loss": 0.6039933562278748,
323
+ "eval_runtime": 2.2333,
324
+ "eval_samples_per_second": 18.806,
325
+ "eval_steps_per_second": 18.806,
326
+ "step": 222
327
+ },
328
+ {
329
+ "epoch": 1.0089686098654709,
330
+ "grad_norm": 0.55859375,
331
+ "learning_rate": 0.0002486721492303566,
332
+ "loss": 0.533,
333
+ "step": 225
334
+ },
335
+ {
336
+ "epoch": 1.031390134529148,
337
+ "grad_norm": 0.451171875,
338
+ "learning_rate": 0.0002465315685330595,
339
+ "loss": 0.4381,
340
+ "step": 230
341
+ },
342
+ {
343
+ "epoch": 1.053811659192825,
344
+ "grad_norm": 0.466796875,
345
+ "learning_rate": 0.0002443482015161539,
346
+ "loss": 0.4257,
347
+ "step": 235
348
+ },
349
+ {
350
+ "epoch": 1.0762331838565022,
351
+ "grad_norm": 0.490234375,
352
+ "learning_rate": 0.0002421234541038045,
353
+ "loss": 0.4324,
354
+ "step": 240
355
+ },
356
+ {
357
+ "epoch": 1.0986547085201794,
358
+ "grad_norm": 0.404296875,
359
+ "learning_rate": 0.00023985875886603888,
360
+ "loss": 0.4228,
361
+ "step": 245
362
+ },
363
+ {
364
+ "epoch": 1.1210762331838564,
365
+ "grad_norm": 0.40234375,
366
+ "learning_rate": 0.00023755557409627998,
367
+ "loss": 0.4219,
368
+ "step": 250
369
+ },
370
+ {
371
+ "epoch": 1.1434977578475336,
372
+ "grad_norm": 0.373046875,
373
+ "learning_rate": 0.00023521538287231476,
374
+ "loss": 0.4365,
375
+ "step": 255
376
+ },
377
+ {
378
+ "epoch": 1.1659192825112108,
379
+ "grad_norm": 0.42578125,
380
+ "learning_rate": 0.0002328396921013038,
381
+ "loss": 0.4315,
382
+ "step": 260
383
+ },
384
+ {
385
+ "epoch": 1.188340807174888,
386
+ "grad_norm": 0.384765625,
387
+ "learning_rate": 0.00023043003154944643,
388
+ "loss": 0.428,
389
+ "step": 265
390
+ },
391
+ {
392
+ "epoch": 1.210762331838565,
393
+ "grad_norm": 0.41796875,
394
+ "learning_rate": 0.000227987952856926,
395
+ "loss": 0.4248,
396
+ "step": 270
397
+ },
398
+ {
399
+ "epoch": 1.2331838565022422,
400
+ "grad_norm": 0.365234375,
401
+ "learning_rate": 0.00022551502853877082,
402
+ "loss": 0.4306,
403
+ "step": 275
404
+ },
405
+ {
406
+ "epoch": 1.2556053811659194,
407
+ "grad_norm": 0.376953125,
408
+ "learning_rate": 0.00022301285097227208,
409
+ "loss": 0.4423,
410
+ "step": 280
411
+ },
412
+ {
413
+ "epoch": 1.2780269058295963,
414
+ "grad_norm": 0.39453125,
415
+ "learning_rate": 0.00022048303137161342,
416
+ "loss": 0.4351,
417
+ "step": 285
418
+ },
419
+ {
420
+ "epoch": 1.3004484304932735,
421
+ "grad_norm": 0.3828125,
422
+ "learning_rate": 0.00021792719875036935,
423
+ "loss": 0.4371,
424
+ "step": 290
425
+ },
426
+ {
427
+ "epoch": 1.3228699551569507,
428
+ "grad_norm": 0.380859375,
429
+ "learning_rate": 0.00021534699887254367,
430
+ "loss": 0.434,
431
+ "step": 295
432
+ },
433
+ {
434
+ "epoch": 1.3452914798206277,
435
+ "grad_norm": 0.375,
436
+ "learning_rate": 0.00021274409319282082,
437
+ "loss": 0.4292,
438
+ "step": 300
439
+ },
440
+ {
441
+ "epoch": 1.3677130044843049,
442
+ "grad_norm": 0.37890625,
443
+ "learning_rate": 0.00021012015778671412,
444
+ "loss": 0.4345,
445
+ "step": 305
446
+ },
447
+ {
448
+ "epoch": 1.390134529147982,
449
+ "grad_norm": 0.390625,
450
+ "learning_rate": 0.00020747688227129932,
451
+ "loss": 0.4313,
452
+ "step": 310
453
+ },
454
+ {
455
+ "epoch": 1.4125560538116593,
456
+ "grad_norm": 0.3671875,
457
+ "learning_rate": 0.00020481596871722803,
458
+ "loss": 0.4319,
459
+ "step": 315
460
+ },
461
+ {
462
+ "epoch": 1.4349775784753362,
463
+ "grad_norm": 0.42578125,
464
+ "learning_rate": 0.0002021391305527223,
465
+ "loss": 0.4306,
466
+ "step": 320
467
+ },
468
+ {
469
+ "epoch": 1.4573991031390134,
470
+ "grad_norm": 0.359375,
471
+ "learning_rate": 0.00019944809146025586,
472
+ "loss": 0.4415,
473
+ "step": 325
474
+ },
475
+ {
476
+ "epoch": 1.4798206278026906,
477
+ "grad_norm": 0.376953125,
478
+ "learning_rate": 0.00019674458426663204,
479
+ "loss": 0.4288,
480
+ "step": 330
481
+ },
482
+ {
483
+ "epoch": 1.5022421524663678,
484
+ "grad_norm": 0.365234375,
485
+ "learning_rate": 0.0001940303498271737,
486
+ "loss": 0.4263,
487
+ "step": 335
488
+ },
489
+ {
490
+ "epoch": 1.5246636771300448,
491
+ "grad_norm": 0.361328125,
492
+ "learning_rate": 0.00019130713590474326,
493
+ "loss": 0.429,
494
+ "step": 340
495
+ },
496
+ {
497
+ "epoch": 1.547085201793722,
498
+ "grad_norm": 0.3671875,
499
+ "learning_rate": 0.00018857669604431496,
500
+ "loss": 0.4147,
501
+ "step": 345
502
+ },
503
+ {
504
+ "epoch": 1.5695067264573992,
505
+ "grad_norm": 0.369140625,
506
+ "learning_rate": 0.0001858407884438237,
507
+ "loss": 0.4215,
508
+ "step": 350
509
+ },
510
+ {
511
+ "epoch": 1.5919282511210762,
512
+ "grad_norm": 0.373046875,
513
+ "learning_rate": 0.0001831011748220177,
514
+ "loss": 0.4232,
515
+ "step": 355
516
+ },
517
+ {
518
+ "epoch": 1.6143497757847534,
519
+ "grad_norm": 0.359375,
520
+ "learning_rate": 0.00018035961928404432,
521
+ "loss": 0.4133,
522
+ "step": 360
523
+ },
524
+ {
525
+ "epoch": 1.6367713004484306,
526
+ "grad_norm": 0.3515625,
527
+ "learning_rate": 0.0001776178871854989,
528
+ "loss": 0.4198,
529
+ "step": 365
530
+ },
531
+ {
532
+ "epoch": 1.6591928251121075,
533
+ "grad_norm": 0.35546875,
534
+ "learning_rate": 0.00017487774399566828,
535
+ "loss": 0.4253,
536
+ "step": 370
537
+ },
538
+ {
539
+ "epoch": 1.6816143497757847,
540
+ "grad_norm": 0.36328125,
541
+ "learning_rate": 0.00017214095416070196,
542
+ "loss": 0.4247,
543
+ "step": 375
544
+ },
545
+ {
546
+ "epoch": 1.704035874439462,
547
+ "grad_norm": 0.3515625,
548
+ "learning_rate": 0.00016940927996744084,
549
+ "loss": 0.4173,
550
+ "step": 380
551
+ },
552
+ {
553
+ "epoch": 1.726457399103139,
554
+ "grad_norm": 0.337890625,
555
+ "learning_rate": 0.00016668448040863722,
556
+ "loss": 0.4197,
557
+ "step": 385
558
+ },
559
+ {
560
+ "epoch": 1.7488789237668163,
561
+ "grad_norm": 0.357421875,
562
+ "learning_rate": 0.000163968310050296,
563
+ "loss": 0.4171,
564
+ "step": 390
565
+ },
566
+ {
567
+ "epoch": 1.7713004484304933,
568
+ "grad_norm": 0.33203125,
569
+ "learning_rate": 0.00016126251790186578,
570
+ "loss": 0.3982,
571
+ "step": 395
572
+ },
573
+ {
574
+ "epoch": 1.7937219730941703,
575
+ "grad_norm": 0.330078125,
576
+ "learning_rate": 0.0001585688462900084,
577
+ "loss": 0.4102,
578
+ "step": 400
579
+ },
580
+ {
581
+ "epoch": 1.8161434977578477,
582
+ "grad_norm": 0.341796875,
583
+ "learning_rate": 0.0001558890297366718,
584
+ "loss": 0.4096,
585
+ "step": 405
586
+ },
587
+ {
588
+ "epoch": 1.8385650224215246,
589
+ "grad_norm": 0.330078125,
590
+ "learning_rate": 0.000153224793842188,
591
+ "loss": 0.4118,
592
+ "step": 410
593
+ },
594
+ {
595
+ "epoch": 1.8609865470852018,
596
+ "grad_norm": 0.328125,
597
+ "learning_rate": 0.0001505778541741166,
598
+ "loss": 0.3986,
599
+ "step": 415
600
+ },
601
+ {
602
+ "epoch": 1.883408071748879,
603
+ "grad_norm": 0.341796875,
604
+ "learning_rate": 0.00014794991516254793,
605
+ "loss": 0.399,
606
+ "step": 420
607
+ },
608
+ {
609
+ "epoch": 1.905829596412556,
610
+ "grad_norm": 0.345703125,
611
+ "learning_rate": 0.0001453426690025785,
612
+ "loss": 0.402,
613
+ "step": 425
614
+ },
615
+ {
616
+ "epoch": 1.9282511210762332,
617
+ "grad_norm": 0.34765625,
618
+ "learning_rate": 0.00014275779456466455,
619
+ "loss": 0.4102,
620
+ "step": 430
621
+ },
622
+ {
623
+ "epoch": 1.9506726457399104,
624
+ "grad_norm": 0.32421875,
625
+ "learning_rate": 0.00014019695631355567,
626
+ "loss": 0.3942,
627
+ "step": 435
628
+ },
629
+ {
630
+ "epoch": 1.9730941704035874,
631
+ "grad_norm": 0.333984375,
632
+ "learning_rate": 0.0001376618032365048,
633
+ "loss": 0.4017,
634
+ "step": 440
635
+ },
636
+ {
637
+ "epoch": 1.9910313901345291,
638
+ "eval_loss": 0.5547934770584106,
639
+ "eval_runtime": 2.0196,
640
+ "eval_samples_per_second": 20.796,
641
+ "eval_steps_per_second": 20.796,
642
+ "step": 444
643
+ }
644
+ ],
645
+ "logging_steps": 5,
646
+ "max_steps": 669,
647
+ "num_input_tokens_seen": 0,
648
+ "num_train_epochs": 3,
649
+ "save_steps": 500,
650
+ "stateful_callbacks": {
651
+ "TrainerControl": {
652
+ "args": {
653
+ "should_epoch_stop": false,
654
+ "should_evaluate": false,
655
+ "should_log": false,
656
+ "should_save": true,
657
+ "should_training_stop": false
658
+ },
659
+ "attributes": {}
660
+ }
661
+ },
662
+ "total_flos": 7.145669325918044e+17,
663
+ "train_batch_size": 100,
664
+ "trial_name": null,
665
+ "trial_params": null
666
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ede999dce3b0ae7cde633fe6179008d038ba6714ae128a79622dd376393a435
3
+ size 5688