cools commited on
Commit
1d6caff
·
1 Parent(s): fa5b897

Training in progress, step 25, checkpoint

Browse files
last-checkpoint/README.md CHANGED
@@ -18,7 +18,6 @@ base_model: bn22/Mistral-7B-Instruct-v0.1-sharded
18
 
19
 
20
  - **Developed by:** [More Information Needed]
21
- - **Funded by [optional]:** [More Information Needed]
22
  - **Shared by [optional]:** [More Information Needed]
23
  - **Model type:** [More Information Needed]
24
  - **Language(s) (NLP):** [More Information Needed]
@@ -77,7 +76,7 @@ Use the code below to get started with the model.
77
 
78
  ### Training Data
79
 
80
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
 
82
  [More Information Needed]
83
 
@@ -108,7 +107,7 @@ Use the code below to get started with the model.
108
 
109
  #### Testing Data
110
 
111
- <!-- This should link to a Dataset Card if possible. -->
112
 
113
  [More Information Needed]
114
 
@@ -212,7 +211,7 @@ The following `bitsandbytes` quantization config was used during training:
212
  - llm_int8_has_fp16_weight: True
213
  - bnb_4bit_quant_type: nf4
214
  - bnb_4bit_use_double_quant: True
215
- - bnb_4bit_compute_dtype: float16
216
 
217
  ### Framework versions
218
 
 
18
 
19
 
20
  - **Developed by:** [More Information Needed]
 
21
  - **Shared by [optional]:** [More Information Needed]
22
  - **Model type:** [More Information Needed]
23
  - **Language(s) (NLP):** [More Information Needed]
 
76
 
77
  ### Training Data
78
 
79
+ <!-- This should link to a Data Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
80
 
81
  [More Information Needed]
82
 
 
107
 
108
  #### Testing Data
109
 
110
+ <!-- This should link to a Data Card if possible. -->
111
 
112
  [More Information Needed]
113
 
 
211
  - llm_int8_has_fp16_weight: True
212
  - bnb_4bit_quant_type: nf4
213
  - bnb_4bit_use_double_quant: True
214
+ - bnb_4bit_compute_dtype: bfloat16
215
 
216
  ### Framework versions
217
 
last-checkpoint/adapter_config.json CHANGED
@@ -16,13 +16,13 @@
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
- "o_proj",
20
- "gate_proj",
21
  "up_proj",
 
 
22
  "down_proj",
23
- "q_proj",
24
- "v_proj",
25
- "k_proj"
26
  ],
27
  "task_type": "CAUSAL_LM"
28
  }
 
16
  "rank_pattern": {},
17
  "revision": null,
18
  "target_modules": [
19
+ "v_proj",
20
+ "q_proj",
21
  "up_proj",
22
+ "gate_proj",
23
+ "k_proj",
24
  "down_proj",
25
+ "o_proj"
 
 
26
  ],
27
  "task_type": "CAUSAL_LM"
28
  }
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d91107f5c242c8df5342db2e311f2309392ecf0548153ba97e37db1ac8814530
3
  size 83945296
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80fa3a57d9708cddd9142d3819011418aedaa524c0b53b7c786d849cb8e3c617
3
  size 83945296
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24377944ce0dd27b0697f70f0d52d926be551d7d662470a7bb5e641270e05441
3
  size 42545748
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0d237a9a2cb2bf345606e0f288a49c7998e790371004a83437ab48963c73264
3
  size 42545748
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:79438b1731d80b13c3e856d3e6ada7b258fe0028bd55a30293421a89363cfff2
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d0d220e3ffb9cbe7aa8b7ee81ee25441366727f842c08121cff70e1f43b7a50
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16ad2180f5b27e9af3551807b0da763fbb16c05640f8b967bf27f34341ae1380
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:350b7a6f8159717ef2af74e68b6e9c8ed2775fab22debcfdf6f728bfa1bef4ad
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,619 +1,169 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 100.0,
5
  "eval_steps": 1000,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "learning_rate": 0.0,
14
- "loss": 1.399,
15
- "step": 1
16
- },
17
- {
18
- "epoch": 2.0,
19
- "learning_rate": 0.0,
20
- "loss": 1.399,
21
- "step": 2
22
- },
23
- {
24
- "epoch": 3.0,
25
- "learning_rate": 0.0,
26
- "loss": 1.399,
27
- "step": 3
28
- },
29
- {
30
- "epoch": 4.0,
31
- "learning_rate": 0.0,
32
- "loss": 1.399,
33
- "step": 4
34
- },
35
- {
36
- "epoch": 5.0,
37
  "learning_rate": 4e-05,
38
- "loss": 1.399,
39
- "step": 5
40
  },
41
  {
42
- "epoch": 6.0,
43
  "learning_rate": 8e-05,
44
- "loss": 1.399,
45
- "step": 6
46
  },
47
  {
48
- "epoch": 7.0,
49
  "learning_rate": 0.00012,
50
- "loss": 1.2225,
51
- "step": 7
52
  },
53
  {
54
- "epoch": 8.0,
55
  "learning_rate": 0.00016,
56
- "loss": 0.9757,
57
- "step": 8
58
  },
59
  {
60
- "epoch": 9.0,
61
  "learning_rate": 0.0002,
62
- "loss": 0.6616,
63
- "step": 9
64
  },
65
  {
66
- "epoch": 10.0,
67
  "learning_rate": 0.00019789473684210526,
68
- "loss": 0.34,
69
- "step": 10
70
  },
71
  {
72
- "epoch": 11.0,
73
  "learning_rate": 0.00019578947368421054,
74
- "loss": 0.1851,
75
- "step": 11
76
  },
77
  {
78
- "epoch": 12.0,
79
  "learning_rate": 0.0001936842105263158,
80
- "loss": 0.18,
81
- "step": 12
82
  },
83
  {
84
- "epoch": 13.0,
85
  "learning_rate": 0.00019157894736842104,
86
- "loss": 0.1289,
87
- "step": 13
88
  },
89
  {
90
- "epoch": 14.0,
91
  "learning_rate": 0.00018947368421052632,
92
- "loss": 0.0939,
93
- "step": 14
94
  },
95
  {
96
- "epoch": 15.0,
97
  "learning_rate": 0.0001873684210526316,
98
- "loss": 0.0693,
99
- "step": 15
100
  },
101
  {
102
- "epoch": 16.0,
103
  "learning_rate": 0.00018526315789473685,
104
- "loss": 0.0616,
105
- "step": 16
106
  },
107
  {
108
- "epoch": 17.0,
109
  "learning_rate": 0.0001831578947368421,
110
- "loss": 0.0507,
111
- "step": 17
112
  },
113
  {
114
- "epoch": 18.0,
115
  "learning_rate": 0.00018105263157894739,
116
- "loss": 0.0416,
117
- "step": 18
118
  },
119
  {
120
- "epoch": 19.0,
121
  "learning_rate": 0.00017894736842105264,
122
- "loss": 0.0413,
123
- "step": 19
124
  },
125
  {
126
- "epoch": 20.0,
127
  "learning_rate": 0.0001768421052631579,
128
- "loss": 0.0545,
129
- "step": 20
130
  },
131
  {
132
- "epoch": 21.0,
133
  "learning_rate": 0.00017473684210526317,
134
- "loss": 0.0323,
135
- "step": 21
136
  },
137
  {
138
- "epoch": 22.0,
139
  "learning_rate": 0.00017263157894736842,
140
- "loss": 0.0254,
141
- "step": 22
142
  },
143
  {
144
- "epoch": 23.0,
145
  "learning_rate": 0.0001705263157894737,
146
- "loss": 0.0052,
147
- "step": 23
148
  },
149
  {
150
- "epoch": 24.0,
151
  "learning_rate": 0.00016842105263157895,
152
- "loss": 0.0047,
153
- "step": 24
154
  },
155
  {
156
- "epoch": 25.0,
157
  "learning_rate": 0.00016631578947368423,
158
- "loss": 0.0127,
159
- "step": 25
160
  },
161
  {
162
- "epoch": 26.0,
163
  "learning_rate": 0.00016421052631578948,
164
- "loss": 0.0049,
165
- "step": 26
166
  },
167
  {
168
- "epoch": 27.0,
169
  "learning_rate": 0.00016210526315789473,
170
- "loss": 0.0042,
171
- "step": 27
172
  },
173
  {
174
- "epoch": 28.0,
175
  "learning_rate": 0.00016,
176
- "loss": 0.0052,
177
- "step": 28
178
  },
179
  {
180
- "epoch": 29.0,
181
  "learning_rate": 0.00015789473684210527,
182
- "loss": 0.0041,
183
- "step": 29
184
- },
185
- {
186
- "epoch": 30.0,
187
- "learning_rate": 0.00015578947368421052,
188
- "loss": 0.0044,
189
- "step": 30
190
- },
191
- {
192
- "epoch": 31.0,
193
- "learning_rate": 0.0001536842105263158,
194
- "loss": 0.0044,
195
- "step": 31
196
- },
197
- {
198
- "epoch": 32.0,
199
- "learning_rate": 0.00015157894736842108,
200
- "loss": 0.0043,
201
- "step": 32
202
- },
203
- {
204
- "epoch": 33.0,
205
- "learning_rate": 0.00014947368421052633,
206
- "loss": 0.0043,
207
- "step": 33
208
- },
209
- {
210
- "epoch": 34.0,
211
- "learning_rate": 0.00014736842105263158,
212
- "loss": 0.0043,
213
- "step": 34
214
- },
215
- {
216
- "epoch": 35.0,
217
- "learning_rate": 0.00014526315789473686,
218
- "loss": 0.0044,
219
- "step": 35
220
- },
221
- {
222
- "epoch": 36.0,
223
- "learning_rate": 0.0001431578947368421,
224
- "loss": 0.0035,
225
- "step": 36
226
- },
227
- {
228
- "epoch": 37.0,
229
- "learning_rate": 0.00014105263157894736,
230
- "loss": 0.004,
231
- "step": 37
232
- },
233
- {
234
- "epoch": 38.0,
235
- "learning_rate": 0.00013894736842105264,
236
- "loss": 0.0044,
237
- "step": 38
238
- },
239
- {
240
- "epoch": 39.0,
241
- "learning_rate": 0.0001368421052631579,
242
- "loss": 0.0041,
243
- "step": 39
244
- },
245
- {
246
- "epoch": 40.0,
247
- "learning_rate": 0.00013473684210526317,
248
- "loss": 0.0043,
249
- "step": 40
250
- },
251
- {
252
- "epoch": 41.0,
253
- "learning_rate": 0.00013263157894736842,
254
- "loss": 0.0041,
255
- "step": 41
256
- },
257
- {
258
- "epoch": 42.0,
259
- "learning_rate": 0.0001305263157894737,
260
- "loss": 0.0042,
261
- "step": 42
262
- },
263
- {
264
- "epoch": 43.0,
265
- "learning_rate": 0.00012842105263157895,
266
- "loss": 0.0036,
267
- "step": 43
268
- },
269
- {
270
- "epoch": 44.0,
271
- "learning_rate": 0.0001263157894736842,
272
- "loss": 0.0038,
273
- "step": 44
274
- },
275
- {
276
- "epoch": 45.0,
277
- "learning_rate": 0.00012421052631578949,
278
- "loss": 0.0039,
279
- "step": 45
280
- },
281
- {
282
- "epoch": 46.0,
283
- "learning_rate": 0.00012210526315789474,
284
- "loss": 0.0045,
285
- "step": 46
286
- },
287
- {
288
- "epoch": 47.0,
289
- "learning_rate": 0.00012,
290
- "loss": 0.0053,
291
- "step": 47
292
- },
293
- {
294
- "epoch": 48.0,
295
- "learning_rate": 0.00011789473684210525,
296
- "loss": 0.0042,
297
- "step": 48
298
- },
299
- {
300
- "epoch": 49.0,
301
- "learning_rate": 0.00011578947368421053,
302
- "loss": 0.0029,
303
- "step": 49
304
- },
305
- {
306
- "epoch": 50.0,
307
- "learning_rate": 0.0001136842105263158,
308
- "loss": 0.004,
309
- "step": 50
310
- },
311
- {
312
- "epoch": 51.0,
313
- "learning_rate": 0.00011157894736842105,
314
- "loss": 0.0042,
315
- "step": 51
316
- },
317
- {
318
- "epoch": 52.0,
319
- "learning_rate": 0.00010947368421052633,
320
- "loss": 0.0039,
321
- "step": 52
322
- },
323
- {
324
- "epoch": 53.0,
325
- "learning_rate": 0.00010736842105263158,
326
- "loss": 0.0046,
327
- "step": 53
328
- },
329
- {
330
- "epoch": 54.0,
331
- "learning_rate": 0.00010526315789473685,
332
- "loss": 0.0041,
333
- "step": 54
334
- },
335
- {
336
- "epoch": 55.0,
337
- "learning_rate": 0.00010315789473684211,
338
- "loss": 0.0042,
339
- "step": 55
340
- },
341
- {
342
- "epoch": 56.0,
343
- "learning_rate": 0.00010105263157894738,
344
- "loss": 0.0042,
345
- "step": 56
346
- },
347
- {
348
- "epoch": 57.0,
349
- "learning_rate": 9.894736842105263e-05,
350
- "loss": 0.0037,
351
- "step": 57
352
- },
353
- {
354
- "epoch": 58.0,
355
- "learning_rate": 9.68421052631579e-05,
356
- "loss": 0.0038,
357
- "step": 58
358
- },
359
- {
360
- "epoch": 59.0,
361
- "learning_rate": 9.473684210526316e-05,
362
- "loss": 0.0042,
363
- "step": 59
364
- },
365
- {
366
- "epoch": 60.0,
367
- "learning_rate": 9.263157894736843e-05,
368
- "loss": 0.0042,
369
- "step": 60
370
- },
371
- {
372
- "epoch": 61.0,
373
- "learning_rate": 9.052631578947369e-05,
374
- "loss": 0.0042,
375
- "step": 61
376
- },
377
- {
378
- "epoch": 62.0,
379
- "learning_rate": 8.842105263157894e-05,
380
- "loss": 0.0038,
381
- "step": 62
382
- },
383
- {
384
- "epoch": 63.0,
385
- "learning_rate": 8.631578947368421e-05,
386
- "loss": 0.0041,
387
- "step": 63
388
- },
389
- {
390
- "epoch": 64.0,
391
- "learning_rate": 8.421052631578948e-05,
392
- "loss": 0.0036,
393
- "step": 64
394
- },
395
- {
396
- "epoch": 65.0,
397
- "learning_rate": 8.210526315789474e-05,
398
- "loss": 0.0042,
399
- "step": 65
400
- },
401
- {
402
- "epoch": 66.0,
403
- "learning_rate": 8e-05,
404
- "loss": 0.004,
405
- "step": 66
406
- },
407
- {
408
- "epoch": 67.0,
409
- "learning_rate": 7.789473684210526e-05,
410
- "loss": 0.0036,
411
- "step": 67
412
- },
413
- {
414
- "epoch": 68.0,
415
- "learning_rate": 7.578947368421054e-05,
416
- "loss": 0.0043,
417
- "step": 68
418
- },
419
- {
420
- "epoch": 69.0,
421
- "learning_rate": 7.368421052631579e-05,
422
- "loss": 0.0037,
423
- "step": 69
424
- },
425
- {
426
- "epoch": 70.0,
427
- "learning_rate": 7.157894736842105e-05,
428
- "loss": 0.0038,
429
- "step": 70
430
- },
431
- {
432
- "epoch": 71.0,
433
- "learning_rate": 6.947368421052632e-05,
434
- "loss": 0.0043,
435
- "step": 71
436
- },
437
- {
438
- "epoch": 72.0,
439
- "learning_rate": 6.736842105263159e-05,
440
- "loss": 0.004,
441
- "step": 72
442
- },
443
- {
444
- "epoch": 73.0,
445
- "learning_rate": 6.526315789473685e-05,
446
- "loss": 0.0043,
447
- "step": 73
448
- },
449
- {
450
- "epoch": 74.0,
451
- "learning_rate": 6.31578947368421e-05,
452
- "loss": 0.0041,
453
- "step": 74
454
- },
455
- {
456
- "epoch": 75.0,
457
- "learning_rate": 6.105263157894737e-05,
458
- "loss": 0.0034,
459
- "step": 75
460
- },
461
- {
462
- "epoch": 76.0,
463
- "learning_rate": 5.894736842105263e-05,
464
- "loss": 0.0041,
465
- "step": 76
466
- },
467
- {
468
- "epoch": 77.0,
469
- "learning_rate": 5.68421052631579e-05,
470
- "loss": 0.0041,
471
- "step": 77
472
- },
473
- {
474
- "epoch": 78.0,
475
- "learning_rate": 5.4736842105263165e-05,
476
- "loss": 0.0034,
477
- "step": 78
478
- },
479
- {
480
- "epoch": 79.0,
481
- "learning_rate": 5.2631578947368424e-05,
482
- "loss": 0.0038,
483
- "step": 79
484
- },
485
- {
486
- "epoch": 80.0,
487
- "learning_rate": 5.052631578947369e-05,
488
- "loss": 0.0037,
489
- "step": 80
490
- },
491
- {
492
- "epoch": 81.0,
493
- "learning_rate": 4.842105263157895e-05,
494
- "loss": 0.0044,
495
- "step": 81
496
- },
497
- {
498
- "epoch": 82.0,
499
- "learning_rate": 4.6315789473684214e-05,
500
- "loss": 0.0044,
501
- "step": 82
502
- },
503
- {
504
- "epoch": 83.0,
505
- "learning_rate": 4.421052631578947e-05,
506
- "loss": 0.0042,
507
- "step": 83
508
- },
509
- {
510
- "epoch": 84.0,
511
- "learning_rate": 4.210526315789474e-05,
512
- "loss": 0.0037,
513
- "step": 84
514
- },
515
- {
516
- "epoch": 85.0,
517
- "learning_rate": 4e-05,
518
- "loss": 0.0039,
519
- "step": 85
520
- },
521
- {
522
- "epoch": 86.0,
523
- "learning_rate": 3.789473684210527e-05,
524
- "loss": 0.0029,
525
- "step": 86
526
- },
527
- {
528
- "epoch": 87.0,
529
- "learning_rate": 3.578947368421053e-05,
530
- "loss": 0.0039,
531
- "step": 87
532
- },
533
- {
534
- "epoch": 88.0,
535
- "learning_rate": 3.368421052631579e-05,
536
- "loss": 0.004,
537
- "step": 88
538
- },
539
- {
540
- "epoch": 89.0,
541
- "learning_rate": 3.157894736842105e-05,
542
- "loss": 0.0042,
543
- "step": 89
544
- },
545
- {
546
- "epoch": 90.0,
547
- "learning_rate": 2.9473684210526314e-05,
548
- "loss": 0.0038,
549
- "step": 90
550
- },
551
- {
552
- "epoch": 91.0,
553
- "learning_rate": 2.7368421052631583e-05,
554
- "loss": 0.0035,
555
- "step": 91
556
- },
557
- {
558
- "epoch": 92.0,
559
- "learning_rate": 2.5263157894736845e-05,
560
- "loss": 0.0041,
561
- "step": 92
562
- },
563
- {
564
- "epoch": 93.0,
565
- "learning_rate": 2.3157894736842107e-05,
566
- "loss": 0.0039,
567
- "step": 93
568
- },
569
- {
570
- "epoch": 94.0,
571
- "learning_rate": 2.105263157894737e-05,
572
- "loss": 0.0034,
573
- "step": 94
574
- },
575
- {
576
- "epoch": 95.0,
577
- "learning_rate": 1.8947368421052634e-05,
578
- "loss": 0.0044,
579
- "step": 95
580
- },
581
- {
582
- "epoch": 96.0,
583
- "learning_rate": 1.6842105263157896e-05,
584
- "loss": 0.0041,
585
- "step": 96
586
- },
587
- {
588
- "epoch": 97.0,
589
- "learning_rate": 1.4736842105263157e-05,
590
- "loss": 0.0036,
591
- "step": 97
592
- },
593
- {
594
- "epoch": 98.0,
595
- "learning_rate": 1.2631578947368422e-05,
596
- "loss": 0.0037,
597
- "step": 98
598
- },
599
- {
600
- "epoch": 99.0,
601
- "learning_rate": 1.0526315789473684e-05,
602
- "loss": 0.006,
603
- "step": 99
604
- },
605
- {
606
- "epoch": 100.0,
607
- "learning_rate": 8.421052631578948e-06,
608
- "loss": 0.0041,
609
- "step": 100
610
  }
611
  ],
612
  "logging_steps": 1,
613
  "max_steps": 100,
614
- "num_train_epochs": 100,
615
  "save_steps": 25,
616
- "total_flos": 1069744742400000.0,
617
  "trial_name": null,
618
  "trial_params": null
619
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.005755064456721915,
5
  "eval_steps": 1000,
6
+ "global_step": 25,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  "learning_rate": 4e-05,
14
+ "loss": 3.6137,
15
+ "step": 1
16
  },
17
  {
18
+ "epoch": 0.0,
19
  "learning_rate": 8e-05,
20
+ "loss": 3.4233,
21
+ "step": 2
22
  },
23
  {
24
+ "epoch": 0.0,
25
  "learning_rate": 0.00012,
26
+ "loss": 3.3927,
27
+ "step": 3
28
  },
29
  {
30
+ "epoch": 0.0,
31
  "learning_rate": 0.00016,
32
+ "loss": 2.8163,
33
+ "step": 4
34
  },
35
  {
36
+ "epoch": 0.0,
37
  "learning_rate": 0.0002,
38
+ "loss": 2.4858,
39
+ "step": 5
40
  },
41
  {
42
+ "epoch": 0.0,
43
  "learning_rate": 0.00019789473684210526,
44
+ "loss": 1.9139,
45
+ "step": 6
46
  },
47
  {
48
+ "epoch": 0.0,
49
  "learning_rate": 0.00019578947368421054,
50
+ "loss": 2.0399,
51
+ "step": 7
52
  },
53
  {
54
+ "epoch": 0.0,
55
  "learning_rate": 0.0001936842105263158,
56
+ "loss": 1.9175,
57
+ "step": 8
58
  },
59
  {
60
+ "epoch": 0.0,
61
  "learning_rate": 0.00019157894736842104,
62
+ "loss": 2.2095,
63
+ "step": 9
64
  },
65
  {
66
+ "epoch": 0.0,
67
  "learning_rate": 0.00018947368421052632,
68
+ "loss": 1.9044,
69
+ "step": 10
70
  },
71
  {
72
+ "epoch": 0.0,
73
  "learning_rate": 0.0001873684210526316,
74
+ "loss": 2.013,
75
+ "step": 11
76
  },
77
  {
78
+ "epoch": 0.0,
79
  "learning_rate": 0.00018526315789473685,
80
+ "loss": 2.1313,
81
+ "step": 12
82
  },
83
  {
84
+ "epoch": 0.0,
85
  "learning_rate": 0.0001831578947368421,
86
+ "loss": 1.9865,
87
+ "step": 13
88
  },
89
  {
90
+ "epoch": 0.0,
91
  "learning_rate": 0.00018105263157894739,
92
+ "loss": 1.7398,
93
+ "step": 14
94
  },
95
  {
96
+ "epoch": 0.0,
97
  "learning_rate": 0.00017894736842105264,
98
+ "loss": 1.693,
99
+ "step": 15
100
  },
101
  {
102
+ "epoch": 0.0,
103
  "learning_rate": 0.0001768421052631579,
104
+ "loss": 1.6588,
105
+ "step": 16
106
  },
107
  {
108
+ "epoch": 0.0,
109
  "learning_rate": 0.00017473684210526317,
110
+ "loss": 1.989,
111
+ "step": 17
112
  },
113
  {
114
+ "epoch": 0.0,
115
  "learning_rate": 0.00017263157894736842,
116
+ "loss": 1.9944,
117
+ "step": 18
118
  },
119
  {
120
+ "epoch": 0.0,
121
  "learning_rate": 0.0001705263157894737,
122
+ "loss": 1.7393,
123
+ "step": 19
124
  },
125
  {
126
+ "epoch": 0.0,
127
  "learning_rate": 0.00016842105263157895,
128
+ "loss": 1.7573,
129
+ "step": 20
130
  },
131
  {
132
+ "epoch": 0.0,
133
  "learning_rate": 0.00016631578947368423,
134
+ "loss": 1.546,
135
+ "step": 21
136
  },
137
  {
138
+ "epoch": 0.01,
139
  "learning_rate": 0.00016421052631578948,
140
+ "loss": 1.6286,
141
+ "step": 22
142
  },
143
  {
144
+ "epoch": 0.01,
145
  "learning_rate": 0.00016210526315789473,
146
+ "loss": 1.61,
147
+ "step": 23
148
  },
149
  {
150
+ "epoch": 0.01,
151
  "learning_rate": 0.00016,
152
+ "loss": 1.7942,
153
+ "step": 24
154
  },
155
  {
156
+ "epoch": 0.01,
157
  "learning_rate": 0.00015789473684210527,
158
+ "loss": 1.4316,
159
+ "step": 25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  }
161
  ],
162
  "logging_steps": 1,
163
  "max_steps": 100,
164
+ "num_train_epochs": 1,
165
  "save_steps": 25,
166
+ "total_flos": 505946473365504.0,
167
  "trial_name": null,
168
  "trial_params": null
169
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b905f14608904b19068aed8f635b4021ef5a4d4a28a5ffd1e2cb0eebda395135
3
- size 4600
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e140806e2e2e31fba82f540cf5fc839f41cb8ed4903678e59fb42ac85a482011
3
+ size 4536