VladShash committed on
Commit
05bd547
·
verified ·
1 Parent(s): 77acfd9

Upload 14 files

Browse files
Files changed (5) hide show
  1. model.safetensors +1 -1
  2. scheduler.pt +1 -1
  3. tokenizer.json +2 -2
  4. trainer_state.json +196 -406
  5. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:799aa3e96e60f5340d777ed0661bebe1ced4b6229995e9d6934bb2d9459c379e
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8362b3238b6d8ad257157ad1a100b5775db124e0f16793d6ae9bf9f0af7f4aab
3
  size 2384234968
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1b083c39fe6388e575235f5e9fc0de1f1f66b59a6209788eb53663fc005d70f1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dc6f2eb7585a3157fdebf182056facd58efd07c6d4a2f75bf05bb9c2bc6f807
3
  size 1064
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c760c4e715b8fa39254607191eca619d9b0612d5b29d3002ac512a5b6cad7d55
3
- size 11422934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
trainer_state.json CHANGED
@@ -4,535 +4,325 @@
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 3735,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.013386880856760375,
14
- "grad_norm": 10.444840431213379,
15
- "learning_rate": 9.995753909720682e-06,
16
- "loss": 0.6863,
17
- "step": 50
18
- },
19
- {
20
- "epoch": 0.02677376171352075,
21
- "grad_norm": 6.0297532081604,
22
- "learning_rate": 9.98267481169144e-06,
23
- "loss": 0.5107,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.040160642570281124,
28
- "grad_norm": 10.061092376708984,
29
- "learning_rate": 9.960784067018357e-06,
30
- "loss": 0.461,
31
- "step": 150
32
- },
33
- {
34
- "epoch": 0.0535475234270415,
35
- "grad_norm": 6.345846176147461,
36
- "learning_rate": 9.930120388544258e-06,
37
- "loss": 0.464,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.06693440428380187,
42
- "grad_norm": 11.498032569885254,
43
- "learning_rate": 9.890738003669029e-06,
44
- "loss": 0.4628,
45
- "step": 250
46
- },
47
- {
48
- "epoch": 0.08032128514056225,
49
- "grad_norm": 7.820000648498535,
50
- "learning_rate": 9.842706558450787e-06,
51
- "loss": 0.4477,
52
  "step": 300
53
  },
54
  {
55
- "epoch": 0.09370816599732262,
56
- "grad_norm": 11.710145950317383,
57
- "learning_rate": 9.786110994439813e-06,
58
- "loss": 0.4411,
59
- "step": 350
60
- },
61
- {
62
- "epoch": 0.107095046854083,
63
- "grad_norm": 7.274962902069092,
64
- "learning_rate": 9.721051398463065e-06,
65
- "loss": 0.4735,
66
  "step": 400
67
  },
68
  {
69
- "epoch": 0.12048192771084337,
70
- "grad_norm": 8.92360782623291,
71
- "learning_rate": 9.647642825624925e-06,
72
- "loss": 0.4501,
73
- "step": 450
74
- },
75
- {
76
- "epoch": 0.13386880856760375,
77
- "grad_norm": 8.818862915039062,
78
- "learning_rate": 9.56601509583717e-06,
79
- "loss": 0.457,
80
  "step": 500
81
  },
82
  {
83
- "epoch": 0.14725568942436412,
84
- "grad_norm": 12.081962585449219,
85
- "learning_rate": 9.476312564238035e-06,
86
- "loss": 0.4308,
87
- "step": 550
88
- },
89
- {
90
- "epoch": 0.1606425702811245,
91
- "grad_norm": 10.395655632019043,
92
- "learning_rate": 9.378693865906347e-06,
93
- "loss": 0.4408,
94
  "step": 600
95
  },
96
  {
97
- "epoch": 0.17402945113788487,
98
- "grad_norm": 8.816801071166992,
99
- "learning_rate": 9.273331635322185e-06,
100
- "loss": 0.4497,
101
- "step": 650
102
- },
103
- {
104
- "epoch": 0.18741633199464525,
105
- "grad_norm": 5.368481636047363,
106
- "learning_rate": 9.160412201070217e-06,
107
- "loss": 0.4447,
108
  "step": 700
109
  },
110
  {
111
- "epoch": 0.20080321285140562,
112
- "grad_norm": 6.392587184906006,
113
- "learning_rate": 9.040135256325584e-06,
114
- "loss": 0.4407,
115
- "step": 750
116
- },
117
- {
118
- "epoch": 0.214190093708166,
119
- "grad_norm": 7.844484806060791,
120
- "learning_rate": 8.912713505705093e-06,
121
- "loss": 0.4528,
122
  "step": 800
123
  },
124
  {
125
- "epoch": 0.22757697456492637,
126
- "grad_norm": 8.644186019897461,
127
- "learning_rate": 8.778372289108224e-06,
128
- "loss": 0.4218,
129
- "step": 850
130
- },
131
- {
132
- "epoch": 0.24096385542168675,
133
- "grad_norm": 10.462806701660156,
134
- "learning_rate": 8.637349183213186e-06,
135
- "loss": 0.4194,
136
  "step": 900
137
  },
138
  {
139
- "epoch": 0.2543507362784471,
140
- "grad_norm": 8.724151611328125,
141
- "learning_rate": 8.489893581332753e-06,
142
- "loss": 0.4529,
143
- "step": 950
144
- },
145
- {
146
- "epoch": 0.2677376171352075,
147
- "grad_norm": 8.977086067199707,
148
- "learning_rate": 8.33626625237289e-06,
149
- "loss": 0.4354,
150
  "step": 1000
151
  },
152
  {
153
- "epoch": 0.28112449799196787,
154
- "grad_norm": 7.824547290802002,
155
- "learning_rate": 8.176738879674129e-06,
156
- "loss": 0.4314,
157
- "step": 1050
158
- },
159
- {
160
- "epoch": 0.29451137884872824,
161
- "grad_norm": 5.790531635284424,
162
- "learning_rate": 8.01159358055124e-06,
163
- "loss": 0.4286,
164
  "step": 1100
165
  },
166
  {
167
- "epoch": 0.3078982597054886,
168
- "grad_norm": 9.854269027709961,
169
- "learning_rate": 7.84112240738086e-06,
170
- "loss": 0.4201,
171
- "step": 1150
172
- },
173
- {
174
- "epoch": 0.321285140562249,
175
- "grad_norm": 10.543339729309082,
176
- "learning_rate": 7.665626831119396e-06,
177
- "loss": 0.4405,
178
  "step": 1200
179
  },
180
  {
181
- "epoch": 0.33467202141900937,
182
- "grad_norm": 4.100381851196289,
183
- "learning_rate": 7.485417208164567e-06,
184
- "loss": 0.4256,
185
- "step": 1250
186
- },
187
- {
188
- "epoch": 0.34805890227576974,
189
- "grad_norm": 6.328100681304932,
190
- "learning_rate": 7.300812231503435e-06,
191
- "loss": 0.4333,
192
  "step": 1300
193
  },
194
  {
195
- "epoch": 0.3614457831325301,
196
- "grad_norm": 5.430217742919922,
197
- "learning_rate": 7.112138367117529e-06,
198
- "loss": 0.3958,
199
- "step": 1350
200
- },
201
- {
202
- "epoch": 0.3748326639892905,
203
- "grad_norm": 10.457396507263184,
204
- "learning_rate": 6.91972927664176e-06,
205
- "loss": 0.3899,
206
  "step": 1400
207
  },
208
  {
209
- "epoch": 0.38821954484605087,
210
- "grad_norm": 6.345710754394531,
211
- "learning_rate": 6.723925227298133e-06,
212
- "loss": 0.3759,
213
- "step": 1450
214
- },
215
- {
216
- "epoch": 0.40160642570281124,
217
- "grad_norm": 9.008039474487305,
218
- "learning_rate": 6.525072490147766e-06,
219
- "loss": 0.3912,
220
  "step": 1500
221
  },
222
  {
223
- "epoch": 0.4149933065595716,
224
- "grad_norm": 7.655404090881348,
225
- "learning_rate": 6.323522727725371e-06,
226
- "loss": 0.3992,
227
- "step": 1550
228
- },
229
- {
230
- "epoch": 0.428380187416332,
231
- "grad_norm": 7.3834967613220215,
232
- "learning_rate": 6.119632372139152e-06,
233
- "loss": 0.3655,
234
  "step": 1600
235
  },
236
  {
237
- "epoch": 0.44176706827309237,
238
- "grad_norm": 8.772950172424316,
239
- "learning_rate": 5.913761994735908e-06,
240
- "loss": 0.4217,
241
- "step": 1650
242
- },
243
- {
244
- "epoch": 0.45515394912985274,
245
- "grad_norm": 5.111259937286377,
246
- "learning_rate": 5.706275668446074e-06,
247
- "loss": 0.4281,
248
  "step": 1700
249
  },
250
  {
251
- "epoch": 0.4685408299866131,
252
- "grad_norm": 5.894604682922363,
253
- "learning_rate": 5.497540323936371e-06,
254
- "loss": 0.4276,
255
- "step": 1750
256
- },
257
- {
258
- "epoch": 0.4819277108433735,
259
- "grad_norm": 9.776211738586426,
260
- "learning_rate": 5.2879251007086555e-06,
261
- "loss": 0.4073,
262
  "step": 1800
263
  },
264
  {
265
- "epoch": 0.49531459170013387,
266
- "grad_norm": 8.807998657226562,
267
- "learning_rate": 5.077800694292546e-06,
268
- "loss": 0.4339,
269
- "step": 1850
270
- },
271
- {
272
- "epoch": 0.5087014725568942,
273
- "grad_norm": 8.668907165527344,
274
- "learning_rate": 4.867538700686292e-06,
275
- "loss": 0.432,
276
  "step": 1900
277
  },
278
  {
279
- "epoch": 0.5220883534136547,
280
- "grad_norm": 7.573647499084473,
281
- "learning_rate": 4.657510959205182e-06,
282
- "loss": 0.402,
283
- "step": 1950
284
- },
285
- {
286
- "epoch": 0.535475234270415,
287
- "grad_norm": 7.700588703155518,
288
- "learning_rate": 4.448088894899669e-06,
289
- "loss": 0.3972,
290
  "step": 2000
291
  },
292
  {
293
- "epoch": 0.5488621151271754,
294
- "grad_norm": 7.340878009796143,
295
- "learning_rate": 4.23964286170611e-06,
296
- "loss": 0.3288,
297
- "step": 2050
298
- },
299
- {
300
- "epoch": 0.5622489959839357,
301
- "grad_norm": 7.881528377532959,
302
- "learning_rate": 4.032541487491709e-06,
303
- "loss": 0.4066,
304
  "step": 2100
305
  },
306
  {
307
- "epoch": 0.5756358768406962,
308
- "grad_norm": 7.409366607666016,
309
- "learning_rate": 3.827151022151955e-06,
310
- "loss": 0.3516,
311
- "step": 2150
312
- },
313
- {
314
- "epoch": 0.5890227576974565,
315
- "grad_norm": 7.922168731689453,
316
- "learning_rate": 3.623834689913387e-06,
317
- "loss": 0.3729,
318
  "step": 2200
319
  },
320
  {
321
- "epoch": 0.6024096385542169,
322
- "grad_norm": 8.52475643157959,
323
- "learning_rate": 3.4229520469871445e-06,
324
- "loss": 0.4058,
325
- "step": 2250
326
- },
327
- {
328
- "epoch": 0.6157965194109772,
329
- "grad_norm": 8.120729446411133,
330
- "learning_rate": 3.2248583457092163e-06,
331
- "loss": 0.3451,
332
  "step": 2300
333
  },
334
  {
335
- "epoch": 0.6291834002677377,
336
- "grad_norm": 8.336807250976562,
337
- "learning_rate": 3.0299039062919417e-06,
338
- "loss": 0.3691,
339
- "step": 2350
340
- },
341
- {
342
- "epoch": 0.642570281124498,
343
- "grad_norm": 8.232279777526855,
344
- "learning_rate": 2.8384334972977275e-06,
345
- "loss": 0.412,
346
  "step": 2400
347
  },
348
  {
349
- "epoch": 0.6559571619812584,
350
- "grad_norm": 5.542270183563232,
351
- "learning_rate": 2.650785725930657e-06,
352
- "loss": 0.3768,
353
- "step": 2450
354
- },
355
- {
356
- "epoch": 0.6693440428380187,
357
- "grad_norm": 5.975722312927246,
358
- "learning_rate": 2.4672924392241493e-06,
359
- "loss": 0.3842,
360
  "step": 2500
361
  },
362
  {
363
- "epoch": 0.6827309236947792,
364
- "grad_norm": 6.6057868003845215,
365
- "learning_rate": 2.288278137183748e-06,
366
- "loss": 0.3278,
367
- "step": 2550
368
- },
369
- {
370
- "epoch": 0.6961178045515395,
371
- "grad_norm": 7.060661792755127,
372
- "learning_rate": 2.11405939892275e-06,
373
- "loss": 0.3673,
374
  "step": 2600
375
  },
376
  {
377
- "epoch": 0.7095046854082999,
378
- "grad_norm": 5.489977836608887,
379
- "learning_rate": 1.9449443228056565e-06,
380
- "loss": 0.3522,
381
- "step": 2650
382
- },
383
- {
384
- "epoch": 0.7228915662650602,
385
- "grad_norm": 7.412909507751465,
386
- "learning_rate": 1.7812319815894096e-06,
387
- "loss": 0.366,
388
  "step": 2700
389
  },
390
  {
391
- "epoch": 0.7362784471218207,
392
- "grad_norm": 5.627942085266113,
393
- "learning_rate": 1.6232118935260628e-06,
394
- "loss": 0.3906,
395
- "step": 2750
396
- },
397
- {
398
- "epoch": 0.749665327978581,
399
- "grad_norm": 6.9760212898254395,
400
- "learning_rate": 1.4711635103621718e-06,
401
- "loss": 0.3912,
402
  "step": 2800
403
  },
404
  {
405
- "epoch": 0.7630522088353414,
406
- "grad_norm": 7.291072368621826,
407
- "learning_rate": 1.325355723140367e-06,
408
- "loss": 0.3753,
409
- "step": 2850
410
- },
411
- {
412
- "epoch": 0.7764390896921017,
413
- "grad_norm": 6.7273478507995605,
414
- "learning_rate": 1.1860463866770826e-06,
415
- "loss": 0.3694,
416
  "step": 2900
417
  },
418
  {
419
- "epoch": 0.7898259705488622,
420
- "grad_norm": 7.016156196594238,
421
- "learning_rate": 1.0534818635573751e-06,
422
- "loss": 0.3636,
423
- "step": 2950
424
- },
425
- {
426
- "epoch": 0.8032128514056225,
427
- "grad_norm": 7.053563117980957,
428
- "learning_rate": 9.278965884532598e-07,
429
- "loss": 0.4107,
430
  "step": 3000
431
  },
432
  {
433
- "epoch": 0.8165997322623829,
434
- "grad_norm": 6.060437202453613,
435
- "learning_rate": 8.09512653536052e-07,
436
- "loss": 0.3908,
437
- "step": 3050
438
  },
439
  {
440
- "epoch": 0.8299866131191432,
441
- "grad_norm": 7.361605644226074,
442
- "learning_rate": 6.985394157158893e-07,
443
- "loss": 0.3662,
444
- "step": 3100
445
  },
446
  {
447
- "epoch": 0.8433734939759037,
448
- "grad_norm": 5.547077655792236,
449
- "learning_rate": 5.951731264030202e-07,
450
- "loss": 0.3513,
451
- "step": 3150
452
  },
453
  {
454
- "epoch": 0.856760374832664,
455
- "grad_norm": 9.102234840393066,
456
- "learning_rate": 4.995965844455969e-07,
457
- "loss": 0.409,
458
- "step": 3200
459
  },
460
  {
461
- "epoch": 0.8701472556894244,
462
- "grad_norm": 6.737813472747803,
463
- "learning_rate": 4.1197881285776675e-07,
464
- "loss": 0.3838,
465
- "step": 3250
466
  },
467
  {
468
- "epoch": 0.8835341365461847,
469
- "grad_norm": 6.354872226715088,
470
- "learning_rate": 3.324747599097078e-07,
471
- "loss": 0.4013,
472
- "step": 3300
473
  },
474
  {
475
- "epoch": 0.8969210174029452,
476
- "grad_norm": 7.315886497497559,
477
- "learning_rate": 2.61225025108276e-07,
478
- "loss": 0.4003,
479
- "step": 3350
480
  },
481
  {
482
- "epoch": 0.9103078982597055,
483
- "grad_norm": 7.8258562088012695,
484
- "learning_rate": 1.9835561055279728e-07,
485
- "loss": 0.3364,
486
- "step": 3400
487
  },
488
  {
489
- "epoch": 0.9236947791164659,
490
- "grad_norm": 3.8167738914489746,
491
- "learning_rate": 1.4397769810576668e-07,
492
- "loss": 0.3611,
493
- "step": 3450
494
  },
495
  {
496
- "epoch": 0.9370816599732262,
497
- "grad_norm": 9.796895980834961,
498
- "learning_rate": 9.818745277249076e-08,
499
- "loss": 0.363,
500
- "step": 3500
501
  },
502
  {
503
- "epoch": 0.9504685408299867,
504
- "grad_norm": 9.241850852966309,
505
- "learning_rate": 6.106585263740528e-08,
506
- "loss": 0.4065,
507
- "step": 3550
508
  },
509
  {
510
- "epoch": 0.963855421686747,
511
- "grad_norm": 6.8581624031066895,
512
- "learning_rate": 3.267854565780326e-08,
513
- "loss": 0.361,
514
- "step": 3600
515
  },
516
  {
517
- "epoch": 0.9772423025435074,
518
- "grad_norm": 6.801017761230469,
519
- "learning_rate": 1.3075733568240212e-08,
520
- "loss": 0.3531,
521
- "step": 3650
522
  },
523
  {
524
- "epoch": 0.9906291834002677,
525
- "grad_norm": 7.733842372894287,
526
- "learning_rate": 2.2920831009209944e-09,
527
- "loss": 0.4049,
528
- "step": 3700
529
  }
530
  ],
531
- "logging_steps": 50,
532
- "max_steps": 3735,
533
  "num_input_tokens_seen": 0,
534
  "num_train_epochs": 1,
535
- "save_steps": 50000,
536
  "stateful_callbacks": {
537
  "TrainerControl": {
538
  "args": {
@@ -545,8 +335,8 @@
545
  "attributes": {}
546
  }
547
  },
548
- "total_flos": 1.010777670549504e+16,
549
- "train_batch_size": 2,
550
  "trial_name": null,
551
  "trial_params": null
552
  }
 
4
  "best_model_checkpoint": null,
5
  "epoch": 1.0,
6
  "eval_steps": 500,
7
+ "global_step": 4467,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.02238638907544213,
14
+ "grad_norm": 8.243054389953613,
15
+ "learning_rate": 4.993942787771599e-06,
16
+ "loss": 1.1762,
 
 
 
 
 
 
 
17
  "step": 100
18
  },
19
  {
20
+ "epoch": 0.04477277815088426,
21
+ "grad_norm": 13.294663429260254,
22
+ "learning_rate": 4.975555846433033e-06,
23
+ "loss": 1.1404,
 
 
 
 
 
 
 
24
  "step": 200
25
  },
26
  {
27
+ "epoch": 0.0671591672263264,
28
+ "grad_norm": 8.65041446685791,
29
+ "learning_rate": 4.9449294649220665e-06,
30
+ "loss": 1.0161,
 
 
 
 
 
 
 
31
  "step": 300
32
  },
33
  {
34
+ "epoch": 0.08954555630176853,
35
+ "grad_norm": 10.313958168029785,
36
+ "learning_rate": 4.902215063682208e-06,
37
+ "loss": 0.9855,
 
 
 
 
 
 
 
38
  "step": 400
39
  },
40
  {
41
+ "epoch": 0.11193194537721066,
42
+ "grad_norm": 10.495174407958984,
43
+ "learning_rate": 4.847623827752661e-06,
44
+ "loss": 0.9682,
 
 
 
 
 
 
 
45
  "step": 500
46
  },
47
  {
48
+ "epoch": 0.1343183344526528,
49
+ "grad_norm": 12.217084884643555,
50
+ "learning_rate": 4.781425662644569e-06,
51
+ "loss": 0.988,
 
 
 
 
 
 
 
52
  "step": 600
53
  },
54
  {
55
+ "epoch": 0.15670472352809492,
56
+ "grad_norm": 10.601492881774902,
57
+ "learning_rate": 4.703947859896326e-06,
58
+ "loss": 1.0177,
 
 
 
 
 
 
 
59
  "step": 700
60
  },
61
  {
62
+ "epoch": 0.17909111260353705,
63
+ "grad_norm": 7.260547637939453,
64
+ "learning_rate": 4.615573478905602e-06,
65
+ "loss": 0.9601,
 
 
 
 
 
 
 
66
  "step": 800
67
  },
68
  {
69
+ "epoch": 0.20147750167897918,
70
+ "grad_norm": 10.645676612854004,
71
+ "learning_rate": 4.5167394530384775e-06,
72
+ "loss": 0.9422,
 
 
 
 
 
 
 
73
  "step": 900
74
  },
75
  {
76
+ "epoch": 0.2238638907544213,
77
+ "grad_norm": 10.681941986083984,
78
+ "learning_rate": 4.407934429379341e-06,
79
+ "loss": 1.0196,
 
 
 
 
 
 
 
80
  "step": 1000
81
  },
82
  {
83
+ "epoch": 0.24625027982986344,
84
+ "grad_norm": 9.875200271606445,
85
+ "learning_rate": 4.28969635280205e-06,
86
+ "loss": 1.0191,
 
 
 
 
 
 
 
87
  "step": 1100
88
  },
89
  {
90
+ "epoch": 0.2686366689053056,
91
+ "grad_norm": 10.363883018493652,
92
+ "learning_rate": 4.162609806307003e-06,
93
+ "loss": 0.9449,
 
 
 
 
 
 
 
94
  "step": 1200
95
  },
96
  {
97
+ "epoch": 0.2910230579807477,
98
+ "grad_norm": 8.477913856506348,
99
+ "learning_rate": 4.027303120773824e-06,
100
+ "loss": 0.9039,
 
 
 
 
 
 
 
101
  "step": 1300
102
  },
103
  {
104
+ "epoch": 0.31340944705618984,
105
+ "grad_norm": 9.964315414428711,
106
+ "learning_rate": 3.884445268419355e-06,
107
+ "loss": 1.0146,
 
 
 
 
 
 
 
108
  "step": 1400
109
  },
110
  {
111
+ "epoch": 0.33579583613163194,
112
+ "grad_norm": 10.20909309387207,
113
+ "learning_rate": 3.734742555320098e-06,
114
+ "loss": 1.0387,
 
 
 
 
 
 
 
115
  "step": 1500
116
  },
117
  {
118
+ "epoch": 0.3581822252070741,
119
+ "grad_norm": 12.675057411193848,
120
+ "learning_rate": 3.578935129351634e-06,
121
+ "loss": 1.0002,
 
 
 
 
 
 
 
122
  "step": 1600
123
  },
124
  {
125
+ "epoch": 0.3805686142825162,
126
+ "grad_norm": 11.391951560974121,
127
+ "learning_rate": 3.4177933208102103e-06,
128
+ "loss": 0.9976,
 
 
 
 
 
 
 
129
  "step": 1700
130
  },
131
  {
132
+ "epoch": 0.40295500335795836,
133
+ "grad_norm": 10.763954162597656,
134
+ "learning_rate": 3.2521138338088676e-06,
135
+ "loss": 0.9535,
 
 
 
 
 
 
 
136
  "step": 1800
137
  },
138
  {
139
+ "epoch": 0.42534139243340047,
140
+ "grad_norm": 9.04262638092041,
141
+ "learning_rate": 3.0827158072783113e-06,
142
+ "loss": 1.0119,
 
 
 
 
 
 
 
143
  "step": 1900
144
  },
145
  {
146
+ "epoch": 0.4477277815088426,
147
+ "grad_norm": 7.757719993591309,
148
+ "learning_rate": 2.9104367650473923e-06,
149
+ "loss": 0.9353,
 
 
 
 
 
 
 
150
  "step": 2000
151
  },
152
  {
153
+ "epoch": 0.47011417058428473,
154
+ "grad_norm": 8.288910865783691,
155
+ "learning_rate": 2.7361284750264927e-06,
156
+ "loss": 1.0068,
 
 
 
 
 
 
 
157
  "step": 2100
158
  },
159
  {
160
+ "epoch": 0.4925005596597269,
161
+ "grad_norm": 10.480790138244629,
162
+ "learning_rate": 2.5606527379664746e-06,
163
+ "loss": 0.9621,
 
 
 
 
 
 
 
164
  "step": 2200
165
  },
166
  {
167
+ "epoch": 0.514886948735169,
168
+ "grad_norm": 12.541658401489258,
169
+ "learning_rate": 2.384877126614103e-06,
170
+ "loss": 0.9756,
 
 
 
 
 
 
 
171
  "step": 2300
172
  },
173
  {
174
+ "epoch": 0.5372733378106112,
175
+ "grad_norm": 13.887962341308594,
176
+ "learning_rate": 2.20967069633002e-06,
177
+ "loss": 0.9665,
 
 
 
 
 
 
 
178
  "step": 2400
179
  },
180
  {
181
+ "epoch": 0.5596597268860533,
182
+ "grad_norm": 11.765827178955078,
183
+ "learning_rate": 2.035899688376515e-06,
184
+ "loss": 0.9511,
 
 
 
 
 
 
 
185
  "step": 2500
186
  },
187
  {
188
+ "epoch": 0.5820461159614954,
189
+ "grad_norm": 11.959718704223633,
190
+ "learning_rate": 1.8644232471185239e-06,
191
+ "loss": 0.9713,
 
 
 
 
 
 
 
192
  "step": 2600
193
  },
194
  {
195
+ "epoch": 0.6044325050369376,
196
+ "grad_norm": 8.351862907409668,
197
+ "learning_rate": 1.6960891723125235e-06,
198
+ "loss": 1.0633,
 
 
 
 
 
 
 
199
  "step": 2700
200
  },
201
  {
202
+ "epoch": 0.6268188941123797,
203
+ "grad_norm": 7.508358478546143,
204
+ "learning_rate": 1.5317297274845156e-06,
205
+ "loss": 0.895,
 
 
 
 
 
 
 
206
  "step": 2800
207
  },
208
  {
209
+ "epoch": 0.6492052831878218,
210
+ "grad_norm": 12.4747314453125,
211
+ "learning_rate": 1.372157525120959e-06,
212
+ "loss": 0.9394,
 
 
 
 
 
 
 
213
  "step": 2900
214
  },
215
  {
216
+ "epoch": 0.6715916722632639,
217
+ "grad_norm": 12.205965042114258,
218
+ "learning_rate": 1.2181615090167711e-06,
219
+ "loss": 0.9485,
 
 
 
 
 
 
 
220
  "step": 3000
221
  },
222
  {
223
+ "epoch": 0.6939780613387061,
224
+ "grad_norm": 6.783346652984619,
225
+ "learning_rate": 1.0705030536441147e-06,
226
+ "loss": 0.8932,
227
+ "step": 3100
228
  },
229
  {
230
+ "epoch": 0.7163644504141482,
231
+ "grad_norm": 9.786669731140137,
232
+ "learning_rate": 9.299121998271918e-07,
233
+ "loss": 0.9629,
234
+ "step": 3200
235
  },
236
  {
237
+ "epoch": 0.7387508394895903,
238
+ "grad_norm": 8.05592155456543,
239
+ "learning_rate": 7.970840453342679e-07,
240
+ "loss": 0.9799,
241
+ "step": 3300
242
  },
243
  {
244
+ "epoch": 0.7611372285650324,
245
+ "grad_norm": 11.015290260314941,
246
+ "learning_rate": 6.726753082323087e-07,
247
+ "loss": 0.8752,
248
+ "step": 3400
249
  },
250
  {
251
+ "epoch": 0.7835236176404746,
252
+ "grad_norm": 8.78658390045166,
253
+ "learning_rate": 5.573010799953652e-07,
254
+ "loss": 0.8905,
255
+ "step": 3500
256
  },
257
  {
258
+ "epoch": 0.8059100067159167,
259
+ "grad_norm": 11.667632102966309,
260
+ "learning_rate": 4.515317844197653e-07,
261
+ "loss": 0.9166,
262
+ "step": 3600
263
  },
264
  {
265
+ "epoch": 0.8282963957913588,
266
+ "grad_norm": 8.403421401977539,
267
+ "learning_rate": 3.5589035738156305e-07,
268
+ "loss": 0.9322,
269
+ "step": 3700
270
  },
271
  {
272
+ "epoch": 0.8506827848668009,
273
+ "grad_norm": 11.51596450805664,
274
+ "learning_rate": 2.708496613798717e-07,
275
+ "loss": 0.909,
276
+ "step": 3800
277
  },
278
  {
279
+ "epoch": 0.8730691739422431,
280
+ "grad_norm": 6.4879045486450195,
281
+ "learning_rate": 1.9683014764887682e-07,
282
+ "loss": 0.9552,
283
+ "step": 3900
284
  },
285
  {
286
+ "epoch": 0.8954555630176853,
287
+ "grad_norm": 11.149494171142578,
288
+ "learning_rate": 1.3419777739733408e-07,
289
+ "loss": 0.9242,
290
+ "step": 4000
291
  },
292
  {
293
+ "epoch": 0.9178419520931274,
294
+ "grad_norm": 8.041584014892578,
295
+ "learning_rate": 8.326221245317373e-08,
296
+ "loss": 0.9278,
297
+ "step": 4100
298
  },
299
  {
300
+ "epoch": 0.9402283411685695,
301
+ "grad_norm": 9.19898796081543,
302
+ "learning_rate": 4.427528425888977e-08,
303
+ "loss": 0.8939,
304
+ "step": 4200
305
  },
306
  {
307
+ "epoch": 0.9626147302440117,
308
+ "grad_norm": 9.239889144897461,
309
+ "learning_rate": 1.7429748787176626e-08,
310
+ "loss": 0.9563,
311
+ "step": 4300
312
  },
313
  {
314
+ "epoch": 0.9850011193194538,
315
+ "grad_norm": 6.8510565757751465,
316
+ "learning_rate": 2.8583335326598516e-09,
317
+ "loss": 0.961,
318
+ "step": 4400
319
  }
320
  ],
321
+ "logging_steps": 100,
322
+ "max_steps": 4467,
323
  "num_input_tokens_seen": 0,
324
  "num_train_epochs": 1,
325
+ "save_steps": 100000,
326
  "stateful_callbacks": {
327
  "TrainerControl": {
328
  "args": {
 
335
  "attributes": {}
336
  }
337
  },
338
+ "total_flos": 2.417341778362368e+16,
339
+ "train_batch_size": 4,
340
  "trial_name": null,
341
  "trial_params": null
342
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd3bc88ff5b25f40cd4e4bf31742ef38e7a91fb605a571a3ce68503effacfe00
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18779069295d4fb496b6ef1c071438cc814f890abd5a4efec426d22b7fe46a44
3
  size 5304