TinyPixel commited on
Commit
273d9da
·
1 Parent(s): d1b4d6a

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "EleutherAI/pythia-1b-deduped",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
@@ -14,10 +14,13 @@
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
- "query_key_value",
18
- "dense",
19
- "dense_h_to_4h",
20
- "dense_4h_to_h"
 
 
 
21
  ],
22
  "task_type": "CAUSAL_LM"
23
  }
 
1
  {
2
  "auto_mapping": null,
3
+ "base_model_name_or_path": "TinyPixel/Llama-2-7B-bf16-sharded",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
 
14
  "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
+ "q_proj",
18
+ "up_proj",
19
+ "o_proj",
20
+ "k_proj",
21
+ "down_proj",
22
+ "gate_proj",
23
+ "v_proj"
24
  ],
25
  "task_type": "CAUSAL_LM"
26
  }
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:acdf9bf2d1e91072e64a7f34b57befb4fe35fcebac7f0f316e1b5d7c7750159f
3
- size 134264397
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9d0201f6519602fdb0a1e9bfe74c23bde2f75aa4974d9727a3ba7b8831c3c52
3
+ size 639792909
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7849318e6cd75c739b240ec4f1c2782c36331c50946af580e1fb1c6390abe257
3
- size 268514565
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f828fd98d557668603427f374c6ed59c1b219cf948e90247cfcb37e20166445a
3
+ size 1279539973
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:116c4798dde8668dcde32e823b3f8efd0009e1b99a81b1da986aece5126726ad
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:275d9de1c3e5b37f047ae8355f292a2419092f9ef0035a764a03e5322591e86f
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82033db06e63ad2bc52542b0b543acf8729d9027beecc78d15289871f2007b09
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2c1b52c97a7a8218f3e120cb3e6f62943bb3c9d99f61adb685463e5b497c02d
3
  size 627
special_tokens_map.json CHANGED
@@ -1,6 +1,24 @@
1
  {
2
- "bos_token": "<|endoftext|>",
3
- "eos_token": "<|endoftext|>",
4
- "pad_token": "<|endoftext|>",
5
- "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,9 +1,31 @@
1
  {
2
- "add_prefix_space": false,
3
- "bos_token": "<|endoftext|>",
4
- "clean_up_tokenization_spaces": true,
5
- "eos_token": "<|endoftext|>",
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  "model_max_length": 1000000000000000019884624838656,
7
- "tokenizer_class": "GPTNeoXTokenizer",
8
- "unk_token": "<|endoftext|>"
 
 
 
 
 
 
 
 
 
9
  }
 
1
  {
2
+ "bos_token": {
3
+ "__type": "AddedToken",
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "clean_up_tokenization_spaces": false,
11
+ "eos_token": {
12
+ "__type": "AddedToken",
13
+ "content": "</s>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
  "model_max_length": 1000000000000000019884624838656,
20
+ "pad_token": null,
21
+ "sp_model_kwargs": {},
22
+ "tokenizer_class": "LlamaTokenizer",
23
+ "unk_token": {
24
+ "__type": "AddedToken",
25
+ "content": "<unk>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
  }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9668842156151801,
5
- "global_step": 500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -10,1507 +10,907 @@
10
  {
11
  "epoch": 0.0,
12
  "learning_rate": 2e-05,
13
- "loss": 1.9786,
14
  "step": 2
15
  },
16
  {
17
  "epoch": 0.01,
18
  "learning_rate": 2e-05,
19
- "loss": 1.7708,
20
  "step": 4
21
  },
22
  {
23
  "epoch": 0.01,
24
  "learning_rate": 2e-05,
25
- "loss": 1.7948,
26
  "step": 6
27
  },
28
  {
29
  "epoch": 0.02,
30
  "learning_rate": 2e-05,
31
- "loss": 1.8149,
32
  "step": 8
33
  },
34
  {
35
  "epoch": 0.02,
36
  "learning_rate": 2e-05,
37
- "loss": 1.9953,
38
  "step": 10
39
  },
40
  {
41
  "epoch": 0.02,
42
  "learning_rate": 2e-05,
43
- "loss": 1.8709,
44
  "step": 12
45
  },
46
  {
47
  "epoch": 0.03,
48
  "learning_rate": 2e-05,
49
- "loss": 1.9072,
50
  "step": 14
51
  },
52
  {
53
  "epoch": 0.03,
54
  "learning_rate": 2e-05,
55
- "loss": 1.9104,
56
  "step": 16
57
  },
58
  {
59
  "epoch": 0.03,
60
  "learning_rate": 2e-05,
61
- "loss": 1.9629,
62
  "step": 18
63
  },
64
  {
65
  "epoch": 0.04,
66
  "learning_rate": 2e-05,
67
- "loss": 2.0541,
68
  "step": 20
69
  },
70
  {
71
  "epoch": 0.04,
72
  "learning_rate": 2e-05,
73
- "loss": 2.1711,
74
  "step": 22
75
  },
76
  {
77
  "epoch": 0.05,
78
  "learning_rate": 2e-05,
79
- "loss": 2.0025,
80
  "step": 24
81
  },
82
  {
83
  "epoch": 0.05,
84
  "learning_rate": 2e-05,
85
- "loss": 2.0005,
86
  "step": 26
87
  },
88
  {
89
  "epoch": 0.05,
90
  "learning_rate": 2e-05,
91
- "loss": 2.1043,
92
  "step": 28
93
  },
94
  {
95
  "epoch": 0.06,
96
  "learning_rate": 2e-05,
97
- "loss": 2.3104,
98
  "step": 30
99
  },
100
  {
101
  "epoch": 0.06,
102
  "learning_rate": 2e-05,
103
- "loss": 2.2821,
104
  "step": 32
105
  },
106
  {
107
  "epoch": 0.07,
108
  "learning_rate": 2e-05,
109
- "loss": 2.1946,
110
  "step": 34
111
  },
112
  {
113
  "epoch": 0.07,
114
  "learning_rate": 2e-05,
115
- "loss": 2.2659,
116
  "step": 36
117
  },
118
  {
119
  "epoch": 0.07,
120
  "learning_rate": 2e-05,
121
- "loss": 2.168,
122
  "step": 38
123
  },
124
  {
125
  "epoch": 0.08,
126
  "learning_rate": 2e-05,
127
- "loss": 2.2006,
128
  "step": 40
129
  },
130
  {
131
  "epoch": 0.08,
132
  "learning_rate": 2e-05,
133
- "loss": 2.2207,
134
  "step": 42
135
  },
136
  {
137
  "epoch": 0.09,
138
  "learning_rate": 2e-05,
139
- "loss": 2.1845,
140
  "step": 44
141
  },
142
  {
143
  "epoch": 0.09,
144
  "learning_rate": 2e-05,
145
- "loss": 2.0855,
146
  "step": 46
147
  },
148
  {
149
  "epoch": 0.09,
150
  "learning_rate": 2e-05,
151
- "loss": 1.772,
152
  "step": 48
153
  },
154
  {
155
  "epoch": 0.1,
156
  "learning_rate": 2e-05,
157
- "loss": 1.2171,
158
  "step": 50
159
  },
160
  {
161
  "epoch": 0.1,
162
  "learning_rate": 2e-05,
163
- "loss": 1.4885,
164
  "step": 52
165
  },
166
  {
167
  "epoch": 0.1,
168
  "learning_rate": 2e-05,
169
- "loss": 1.5829,
170
  "step": 54
171
  },
172
  {
173
  "epoch": 0.11,
174
  "learning_rate": 2e-05,
175
- "loss": 1.8063,
176
  "step": 56
177
  },
178
  {
179
  "epoch": 0.11,
180
  "learning_rate": 2e-05,
181
- "loss": 1.7111,
182
  "step": 58
183
  },
184
  {
185
  "epoch": 0.12,
186
  "learning_rate": 2e-05,
187
- "loss": 1.641,
188
  "step": 60
189
  },
190
  {
191
  "epoch": 0.12,
192
  "learning_rate": 2e-05,
193
- "loss": 1.5854,
194
  "step": 62
195
  },
196
  {
197
  "epoch": 0.12,
198
  "learning_rate": 2e-05,
199
- "loss": 1.7189,
200
  "step": 64
201
  },
202
  {
203
  "epoch": 0.13,
204
  "learning_rate": 2e-05,
205
- "loss": 1.7115,
206
  "step": 66
207
  },
208
  {
209
  "epoch": 0.13,
210
  "learning_rate": 2e-05,
211
- "loss": 1.7226,
212
  "step": 68
213
  },
214
  {
215
  "epoch": 0.14,
216
  "learning_rate": 2e-05,
217
- "loss": 1.7819,
218
  "step": 70
219
  },
220
  {
221
  "epoch": 0.14,
222
  "learning_rate": 2e-05,
223
- "loss": 1.775,
224
  "step": 72
225
  },
226
  {
227
  "epoch": 0.14,
228
  "learning_rate": 2e-05,
229
- "loss": 1.7143,
230
  "step": 74
231
  },
232
  {
233
  "epoch": 0.15,
234
  "learning_rate": 2e-05,
235
- "loss": 1.9604,
236
  "step": 76
237
  },
238
  {
239
  "epoch": 0.15,
240
  "learning_rate": 2e-05,
241
- "loss": 2.005,
242
  "step": 78
243
  },
244
  {
245
  "epoch": 0.15,
246
  "learning_rate": 2e-05,
247
- "loss": 1.9415,
248
  "step": 80
249
  },
250
  {
251
  "epoch": 0.16,
252
  "learning_rate": 2e-05,
253
- "loss": 1.9649,
254
  "step": 82
255
  },
256
  {
257
  "epoch": 0.16,
258
  "learning_rate": 2e-05,
259
- "loss": 1.8703,
260
  "step": 84
261
  },
262
  {
263
  "epoch": 0.17,
264
  "learning_rate": 2e-05,
265
- "loss": 1.8023,
266
  "step": 86
267
  },
268
  {
269
  "epoch": 0.17,
270
  "learning_rate": 2e-05,
271
- "loss": 1.9498,
272
  "step": 88
273
  },
274
  {
275
  "epoch": 0.17,
276
  "learning_rate": 2e-05,
277
- "loss": 2.1438,
278
  "step": 90
279
  },
280
  {
281
  "epoch": 0.18,
282
  "learning_rate": 2e-05,
283
- "loss": 1.9555,
284
  "step": 92
285
  },
286
  {
287
  "epoch": 0.18,
288
  "learning_rate": 2e-05,
289
- "loss": 1.8544,
290
  "step": 94
291
  },
292
  {
293
  "epoch": 0.19,
294
  "learning_rate": 2e-05,
295
- "loss": 1.8175,
296
  "step": 96
297
  },
298
  {
299
  "epoch": 0.19,
300
  "learning_rate": 2e-05,
301
- "loss": 1.7122,
302
  "step": 98
303
  },
304
  {
305
  "epoch": 0.19,
306
  "learning_rate": 2e-05,
307
- "loss": 1.0217,
308
  "step": 100
309
  },
310
  {
311
  "epoch": 0.2,
312
  "learning_rate": 2e-05,
313
- "loss": 1.706,
314
  "step": 102
315
  },
316
  {
317
  "epoch": 0.2,
318
  "learning_rate": 2e-05,
319
- "loss": 1.6285,
320
  "step": 104
321
  },
322
  {
323
  "epoch": 0.2,
324
  "learning_rate": 2e-05,
325
- "loss": 1.7356,
326
  "step": 106
327
  },
328
  {
329
  "epoch": 0.21,
330
  "learning_rate": 2e-05,
331
- "loss": 1.7032,
332
  "step": 108
333
  },
334
  {
335
  "epoch": 0.21,
336
  "learning_rate": 2e-05,
337
- "loss": 1.5121,
338
  "step": 110
339
  },
340
  {
341
  "epoch": 0.22,
342
  "learning_rate": 2e-05,
343
- "loss": 1.7203,
344
  "step": 112
345
  },
346
  {
347
  "epoch": 0.22,
348
  "learning_rate": 2e-05,
349
- "loss": 1.713,
350
  "step": 114
351
  },
352
  {
353
  "epoch": 0.22,
354
  "learning_rate": 2e-05,
355
- "loss": 1.8506,
356
  "step": 116
357
  },
358
  {
359
  "epoch": 0.23,
360
  "learning_rate": 2e-05,
361
- "loss": 1.7168,
362
  "step": 118
363
  },
364
  {
365
  "epoch": 0.23,
366
  "learning_rate": 2e-05,
367
- "loss": 1.766,
368
  "step": 120
369
  },
370
  {
371
  "epoch": 0.24,
372
  "learning_rate": 2e-05,
373
- "loss": 1.7612,
374
  "step": 122
375
  },
376
  {
377
  "epoch": 0.24,
378
  "learning_rate": 2e-05,
379
- "loss": 1.6406,
380
  "step": 124
381
  },
382
  {
383
  "epoch": 0.24,
384
  "learning_rate": 2e-05,
385
- "loss": 1.9361,
386
  "step": 126
387
  },
388
  {
389
  "epoch": 0.25,
390
  "learning_rate": 2e-05,
391
- "loss": 1.7824,
392
  "step": 128
393
  },
394
  {
395
  "epoch": 0.25,
396
  "learning_rate": 2e-05,
397
- "loss": 1.9262,
398
  "step": 130
399
  },
400
  {
401
  "epoch": 0.26,
402
  "learning_rate": 2e-05,
403
- "loss": 1.792,
404
  "step": 132
405
  },
406
  {
407
  "epoch": 0.26,
408
  "learning_rate": 2e-05,
409
- "loss": 1.9787,
410
  "step": 134
411
  },
412
  {
413
  "epoch": 0.26,
414
  "learning_rate": 2e-05,
415
- "loss": 2.0559,
416
  "step": 136
417
  },
418
  {
419
  "epoch": 0.27,
420
  "learning_rate": 2e-05,
421
- "loss": 2.0093,
422
  "step": 138
423
  },
424
  {
425
  "epoch": 0.27,
426
  "learning_rate": 2e-05,
427
- "loss": 2.0827,
428
  "step": 140
429
  },
430
  {
431
  "epoch": 0.27,
432
  "learning_rate": 2e-05,
433
- "loss": 1.9098,
434
  "step": 142
435
  },
436
  {
437
  "epoch": 0.28,
438
  "learning_rate": 2e-05,
439
- "loss": 1.8169,
440
  "step": 144
441
  },
442
  {
443
  "epoch": 0.28,
444
  "learning_rate": 2e-05,
445
- "loss": 1.6815,
446
  "step": 146
447
  },
448
  {
449
  "epoch": 0.29,
450
  "learning_rate": 2e-05,
451
- "loss": 1.5554,
452
  "step": 148
453
  },
454
  {
455
  "epoch": 0.29,
456
  "learning_rate": 2e-05,
457
- "loss": 1.0749,
458
  "step": 150
459
  },
460
  {
461
  "epoch": 0.29,
462
  "learning_rate": 2e-05,
463
- "loss": 1.5625,
464
  "step": 152
465
  },
466
  {
467
  "epoch": 0.3,
468
  "learning_rate": 2e-05,
469
- "loss": 1.5828,
470
  "step": 154
471
  },
472
  {
473
  "epoch": 0.3,
474
  "learning_rate": 2e-05,
475
- "loss": 1.8176,
476
  "step": 156
477
  },
478
  {
479
  "epoch": 0.31,
480
  "learning_rate": 2e-05,
481
- "loss": 1.5725,
482
  "step": 158
483
  },
484
  {
485
  "epoch": 0.31,
486
  "learning_rate": 2e-05,
487
- "loss": 1.6738,
488
  "step": 160
489
  },
490
  {
491
  "epoch": 0.31,
492
  "learning_rate": 2e-05,
493
- "loss": 1.6085,
494
  "step": 162
495
  },
496
  {
497
  "epoch": 0.32,
498
  "learning_rate": 2e-05,
499
- "loss": 1.6153,
500
  "step": 164
501
  },
502
  {
503
  "epoch": 0.32,
504
  "learning_rate": 2e-05,
505
- "loss": 1.6836,
506
  "step": 166
507
  },
508
  {
509
  "epoch": 0.32,
510
  "learning_rate": 2e-05,
511
- "loss": 1.529,
512
  "step": 168
513
  },
514
  {
515
  "epoch": 0.33,
516
  "learning_rate": 2e-05,
517
- "loss": 1.764,
518
  "step": 170
519
  },
520
  {
521
  "epoch": 0.33,
522
  "learning_rate": 2e-05,
523
- "loss": 1.7149,
524
  "step": 172
525
  },
526
  {
527
  "epoch": 0.34,
528
  "learning_rate": 2e-05,
529
- "loss": 1.723,
530
  "step": 174
531
  },
532
  {
533
  "epoch": 0.34,
534
  "learning_rate": 2e-05,
535
- "loss": 1.9239,
536
  "step": 176
537
  },
538
  {
539
  "epoch": 0.34,
540
  "learning_rate": 2e-05,
541
- "loss": 1.7521,
542
  "step": 178
543
  },
544
  {
545
  "epoch": 0.35,
546
  "learning_rate": 2e-05,
547
- "loss": 1.7628,
548
  "step": 180
549
  },
550
  {
551
  "epoch": 0.35,
552
  "learning_rate": 2e-05,
553
- "loss": 1.8299,
554
  "step": 182
555
  },
556
  {
557
  "epoch": 0.36,
558
  "learning_rate": 2e-05,
559
- "loss": 1.9278,
560
  "step": 184
561
  },
562
  {
563
  "epoch": 0.36,
564
  "learning_rate": 2e-05,
565
- "loss": 1.8865,
566
  "step": 186
567
  },
568
  {
569
  "epoch": 0.36,
570
  "learning_rate": 2e-05,
571
- "loss": 1.815,
572
  "step": 188
573
  },
574
  {
575
  "epoch": 0.37,
576
  "learning_rate": 2e-05,
577
- "loss": 1.8266,
578
  "step": 190
579
  },
580
  {
581
  "epoch": 0.37,
582
  "learning_rate": 2e-05,
583
- "loss": 1.9384,
584
  "step": 192
585
  },
586
  {
587
  "epoch": 0.38,
588
  "learning_rate": 2e-05,
589
- "loss": 1.9409,
590
  "step": 194
591
  },
592
  {
593
  "epoch": 0.38,
594
  "learning_rate": 2e-05,
595
- "loss": 1.8486,
596
  "step": 196
597
  },
598
  {
599
  "epoch": 0.38,
600
  "learning_rate": 2e-05,
601
- "loss": 1.4427,
602
  "step": 198
603
  },
604
  {
605
  "epoch": 0.39,
606
  "learning_rate": 2e-05,
607
- "loss": 1.134,
608
  "step": 200
609
  },
610
  {
611
  "epoch": 0.39,
612
  "learning_rate": 2e-05,
613
- "loss": 1.5885,
614
  "step": 202
615
  },
616
  {
617
  "epoch": 0.39,
618
  "learning_rate": 2e-05,
619
- "loss": 1.5889,
620
  "step": 204
621
  },
622
  {
623
  "epoch": 0.4,
624
  "learning_rate": 2e-05,
625
- "loss": 1.6148,
626
  "step": 206
627
  },
628
  {
629
  "epoch": 0.4,
630
  "learning_rate": 2e-05,
631
- "loss": 1.5699,
632
  "step": 208
633
  },
634
  {
635
  "epoch": 0.41,
636
  "learning_rate": 2e-05,
637
- "loss": 1.5963,
638
  "step": 210
639
  },
640
  {
641
  "epoch": 0.41,
642
  "learning_rate": 2e-05,
643
- "loss": 1.6684,
644
  "step": 212
645
  },
646
  {
647
  "epoch": 0.41,
648
  "learning_rate": 2e-05,
649
- "loss": 1.5738,
650
  "step": 214
651
  },
652
  {
653
  "epoch": 0.42,
654
  "learning_rate": 2e-05,
655
- "loss": 1.6024,
656
  "step": 216
657
  },
658
  {
659
  "epoch": 0.42,
660
  "learning_rate": 2e-05,
661
- "loss": 1.7576,
662
  "step": 218
663
  },
664
  {
665
  "epoch": 0.43,
666
  "learning_rate": 2e-05,
667
- "loss": 1.7024,
668
  "step": 220
669
  },
670
  {
671
  "epoch": 0.43,
672
  "learning_rate": 2e-05,
673
- "loss": 1.816,
674
  "step": 222
675
  },
676
  {
677
  "epoch": 0.43,
678
  "learning_rate": 2e-05,
679
- "loss": 1.7472,
680
  "step": 224
681
  },
682
  {
683
  "epoch": 0.44,
684
  "learning_rate": 2e-05,
685
- "loss": 1.6804,
686
  "step": 226
687
  },
688
  {
689
  "epoch": 0.44,
690
  "learning_rate": 2e-05,
691
- "loss": 1.7602,
692
  "step": 228
693
  },
694
  {
695
  "epoch": 0.44,
696
  "learning_rate": 2e-05,
697
- "loss": 1.7963,
698
  "step": 230
699
  },
700
  {
701
  "epoch": 0.45,
702
  "learning_rate": 2e-05,
703
- "loss": 1.9796,
704
  "step": 232
705
  },
706
  {
707
  "epoch": 0.45,
708
  "learning_rate": 2e-05,
709
- "loss": 1.8835,
710
  "step": 234
711
  },
712
  {
713
  "epoch": 0.46,
714
  "learning_rate": 2e-05,
715
- "loss": 1.8678,
716
  "step": 236
717
  },
718
  {
719
  "epoch": 0.46,
720
  "learning_rate": 2e-05,
721
- "loss": 1.9907,
722
  "step": 238
723
  },
724
  {
725
  "epoch": 0.46,
726
  "learning_rate": 2e-05,
727
- "loss": 1.8836,
728
  "step": 240
729
  },
730
  {
731
  "epoch": 0.47,
732
  "learning_rate": 2e-05,
733
- "loss": 1.9357,
734
  "step": 242
735
  },
736
  {
737
  "epoch": 0.47,
738
  "learning_rate": 2e-05,
739
- "loss": 1.8493,
740
  "step": 244
741
  },
742
  {
743
  "epoch": 0.48,
744
  "learning_rate": 2e-05,
745
- "loss": 1.8133,
746
  "step": 246
747
  },
748
  {
749
  "epoch": 0.48,
750
  "learning_rate": 2e-05,
751
- "loss": 1.4859,
752
  "step": 248
753
  },
754
  {
755
  "epoch": 0.48,
756
  "learning_rate": 2e-05,
757
- "loss": 0.9864,
758
  "step": 250
759
  },
760
  {
761
  "epoch": 0.49,
762
  "learning_rate": 2e-05,
763
- "loss": 1.5172,
764
  "step": 252
765
  },
766
  {
767
  "epoch": 0.49,
768
  "learning_rate": 2e-05,
769
- "loss": 1.6246,
770
  "step": 254
771
  },
772
  {
773
  "epoch": 0.5,
774
  "learning_rate": 2e-05,
775
- "loss": 1.6437,
776
  "step": 256
777
  },
778
  {
779
  "epoch": 0.5,
780
  "learning_rate": 2e-05,
781
- "loss": 1.5835,
782
  "step": 258
783
  },
784
  {
785
  "epoch": 0.5,
786
  "learning_rate": 2e-05,
787
- "loss": 1.7483,
788
  "step": 260
789
  },
790
  {
791
  "epoch": 0.51,
792
  "learning_rate": 2e-05,
793
- "loss": 1.4925,
794
  "step": 262
795
  },
796
  {
797
  "epoch": 0.51,
798
  "learning_rate": 2e-05,
799
- "loss": 1.694,
800
  "step": 264
801
  },
802
  {
803
  "epoch": 0.51,
804
  "learning_rate": 2e-05,
805
- "loss": 1.7077,
806
  "step": 266
807
  },
808
  {
809
  "epoch": 0.52,
810
  "learning_rate": 2e-05,
811
- "loss": 1.6526,
812
  "step": 268
813
  },
814
  {
815
  "epoch": 0.52,
816
  "learning_rate": 2e-05,
817
- "loss": 1.6859,
818
  "step": 270
819
  },
820
  {
821
  "epoch": 0.53,
822
  "learning_rate": 2e-05,
823
- "loss": 1.7182,
824
  "step": 272
825
  },
826
  {
827
  "epoch": 0.53,
828
  "learning_rate": 2e-05,
829
- "loss": 1.6757,
830
  "step": 274
831
  },
832
  {
833
  "epoch": 0.53,
834
  "learning_rate": 2e-05,
835
- "loss": 1.766,
836
  "step": 276
837
  },
838
  {
839
  "epoch": 0.54,
840
  "learning_rate": 2e-05,
841
- "loss": 1.936,
842
  "step": 278
843
  },
844
  {
845
  "epoch": 0.54,
846
  "learning_rate": 2e-05,
847
- "loss": 1.8934,
848
  "step": 280
849
  },
850
  {
851
  "epoch": 0.55,
852
  "learning_rate": 2e-05,
853
- "loss": 1.9231,
854
  "step": 282
855
  },
856
  {
857
  "epoch": 0.55,
858
  "learning_rate": 2e-05,
859
- "loss": 1.9038,
860
  "step": 284
861
  },
862
  {
863
  "epoch": 0.55,
864
  "learning_rate": 2e-05,
865
- "loss": 2.0763,
866
  "step": 286
867
  },
868
  {
869
  "epoch": 0.56,
870
  "learning_rate": 2e-05,
871
- "loss": 1.973,
872
  "step": 288
873
  },
874
  {
875
  "epoch": 0.56,
876
  "learning_rate": 2e-05,
877
- "loss": 1.8104,
878
  "step": 290
879
  },
880
  {
881
  "epoch": 0.56,
882
  "learning_rate": 2e-05,
883
- "loss": 1.9657,
884
  "step": 292
885
  },
886
  {
887
  "epoch": 0.57,
888
  "learning_rate": 2e-05,
889
- "loss": 1.7721,
890
  "step": 294
891
  },
892
  {
893
  "epoch": 0.57,
894
  "learning_rate": 2e-05,
895
- "loss": 1.6366,
896
  "step": 296
897
  },
898
  {
899
  "epoch": 0.58,
900
  "learning_rate": 2e-05,
901
- "loss": 1.4167,
902
  "step": 298
903
  },
904
  {
905
  "epoch": 0.58,
906
  "learning_rate": 2e-05,
907
- "loss": 1.0393,
908
  "step": 300
909
- },
910
- {
911
- "epoch": 0.58,
912
- "learning_rate": 2e-05,
913
- "loss": 1.6262,
914
- "step": 302
915
- },
916
- {
917
- "epoch": 0.59,
918
- "learning_rate": 2e-05,
919
- "loss": 1.7146,
920
- "step": 304
921
- },
922
- {
923
- "epoch": 0.59,
924
- "learning_rate": 2e-05,
925
- "loss": 1.5797,
926
- "step": 306
927
- },
928
- {
929
- "epoch": 0.6,
930
- "learning_rate": 2e-05,
931
- "loss": 1.6793,
932
- "step": 308
933
- },
934
- {
935
- "epoch": 0.6,
936
- "learning_rate": 2e-05,
937
- "loss": 1.5755,
938
- "step": 310
939
- },
940
- {
941
- "epoch": 0.6,
942
- "learning_rate": 2e-05,
943
- "loss": 1.5813,
944
- "step": 312
945
- },
946
- {
947
- "epoch": 0.61,
948
- "learning_rate": 2e-05,
949
- "loss": 1.6024,
950
- "step": 314
951
- },
952
- {
953
- "epoch": 0.61,
954
- "learning_rate": 2e-05,
955
- "loss": 1.7725,
956
- "step": 316
957
- },
958
- {
959
- "epoch": 0.61,
960
- "learning_rate": 2e-05,
961
- "loss": 1.6265,
962
- "step": 318
963
- },
964
- {
965
- "epoch": 0.62,
966
- "learning_rate": 2e-05,
967
- "loss": 1.5773,
968
- "step": 320
969
- },
970
- {
971
- "epoch": 0.62,
972
- "learning_rate": 2e-05,
973
- "loss": 1.6883,
974
- "step": 322
975
- },
976
- {
977
- "epoch": 0.63,
978
- "learning_rate": 2e-05,
979
- "loss": 1.7982,
980
- "step": 324
981
- },
982
- {
983
- "epoch": 0.63,
984
- "learning_rate": 2e-05,
985
- "loss": 1.7559,
986
- "step": 326
987
- },
988
- {
989
- "epoch": 0.63,
990
- "learning_rate": 2e-05,
991
- "loss": 1.8074,
992
- "step": 328
993
- },
994
- {
995
- "epoch": 0.64,
996
- "learning_rate": 2e-05,
997
- "loss": 2.0078,
998
- "step": 330
999
- },
1000
- {
1001
- "epoch": 0.64,
1002
- "learning_rate": 2e-05,
1003
- "loss": 1.9758,
1004
- "step": 332
1005
- },
1006
- {
1007
- "epoch": 0.65,
1008
- "learning_rate": 2e-05,
1009
- "loss": 1.8103,
1010
- "step": 334
1011
- },
1012
- {
1013
- "epoch": 0.65,
1014
- "learning_rate": 2e-05,
1015
- "loss": 2.0364,
1016
- "step": 336
1017
- },
1018
- {
1019
- "epoch": 0.65,
1020
- "learning_rate": 2e-05,
1021
- "loss": 1.8717,
1022
- "step": 338
1023
- },
1024
- {
1025
- "epoch": 0.66,
1026
- "learning_rate": 2e-05,
1027
- "loss": 1.8783,
1028
- "step": 340
1029
- },
1030
- {
1031
- "epoch": 0.66,
1032
- "learning_rate": 2e-05,
1033
- "loss": 1.954,
1034
- "step": 342
1035
- },
1036
- {
1037
- "epoch": 0.67,
1038
- "learning_rate": 2e-05,
1039
- "loss": 1.8943,
1040
- "step": 344
1041
- },
1042
- {
1043
- "epoch": 0.67,
1044
- "learning_rate": 2e-05,
1045
- "loss": 1.7534,
1046
- "step": 346
1047
- },
1048
- {
1049
- "epoch": 0.67,
1050
- "learning_rate": 2e-05,
1051
- "loss": 1.606,
1052
- "step": 348
1053
- },
1054
- {
1055
- "epoch": 0.68,
1056
- "learning_rate": 2e-05,
1057
- "loss": 1.0254,
1058
- "step": 350
1059
- },
1060
- {
1061
- "epoch": 0.68,
1062
- "learning_rate": 2e-05,
1063
- "loss": 1.5422,
1064
- "step": 352
1065
- },
1066
- {
1067
- "epoch": 0.68,
1068
- "learning_rate": 2e-05,
1069
- "loss": 1.6437,
1070
- "step": 354
1071
- },
1072
- {
1073
- "epoch": 0.69,
1074
- "learning_rate": 2e-05,
1075
- "loss": 1.5021,
1076
- "step": 356
1077
- },
1078
- {
1079
- "epoch": 0.69,
1080
- "learning_rate": 2e-05,
1081
- "loss": 1.7269,
1082
- "step": 358
1083
- },
1084
- {
1085
- "epoch": 0.7,
1086
- "learning_rate": 2e-05,
1087
- "loss": 1.6228,
1088
- "step": 360
1089
- },
1090
- {
1091
- "epoch": 0.7,
1092
- "learning_rate": 2e-05,
1093
- "loss": 1.631,
1094
- "step": 362
1095
- },
1096
- {
1097
- "epoch": 0.7,
1098
- "learning_rate": 2e-05,
1099
- "loss": 1.7287,
1100
- "step": 364
1101
- },
1102
- {
1103
- "epoch": 0.71,
1104
- "learning_rate": 2e-05,
1105
- "loss": 1.6234,
1106
- "step": 366
1107
- },
1108
- {
1109
- "epoch": 0.71,
1110
- "learning_rate": 2e-05,
1111
- "loss": 1.6457,
1112
- "step": 368
1113
- },
1114
- {
1115
- "epoch": 0.72,
1116
- "learning_rate": 2e-05,
1117
- "loss": 1.6345,
1118
- "step": 370
1119
- },
1120
- {
1121
- "epoch": 0.72,
1122
- "learning_rate": 2e-05,
1123
- "loss": 1.6274,
1124
- "step": 372
1125
- },
1126
- {
1127
- "epoch": 0.72,
1128
- "learning_rate": 2e-05,
1129
- "loss": 1.65,
1130
- "step": 374
1131
- },
1132
- {
1133
- "epoch": 0.73,
1134
- "learning_rate": 2e-05,
1135
- "loss": 1.7637,
1136
- "step": 376
1137
- },
1138
- {
1139
- "epoch": 0.73,
1140
- "learning_rate": 2e-05,
1141
- "loss": 1.8289,
1142
- "step": 378
1143
- },
1144
- {
1145
- "epoch": 0.73,
1146
- "learning_rate": 2e-05,
1147
- "loss": 1.9179,
1148
- "step": 380
1149
- },
1150
- {
1151
- "epoch": 0.74,
1152
- "learning_rate": 2e-05,
1153
- "loss": 2.0096,
1154
- "step": 382
1155
- },
1156
- {
1157
- "epoch": 0.74,
1158
- "learning_rate": 2e-05,
1159
- "loss": 1.9448,
1160
- "step": 384
1161
- },
1162
- {
1163
- "epoch": 0.75,
1164
- "learning_rate": 2e-05,
1165
- "loss": 1.9791,
1166
- "step": 386
1167
- },
1168
- {
1169
- "epoch": 0.75,
1170
- "learning_rate": 2e-05,
1171
- "loss": 1.9646,
1172
- "step": 388
1173
- },
1174
- {
1175
- "epoch": 0.75,
1176
- "learning_rate": 2e-05,
1177
- "loss": 1.9525,
1178
- "step": 390
1179
- },
1180
- {
1181
- "epoch": 0.76,
1182
- "learning_rate": 2e-05,
1183
- "loss": 1.9584,
1184
- "step": 392
1185
- },
1186
- {
1187
- "epoch": 0.76,
1188
- "learning_rate": 2e-05,
1189
- "loss": 1.8401,
1190
- "step": 394
1191
- },
1192
- {
1193
- "epoch": 0.77,
1194
- "learning_rate": 2e-05,
1195
- "loss": 1.7071,
1196
- "step": 396
1197
- },
1198
- {
1199
- "epoch": 0.77,
1200
- "learning_rate": 2e-05,
1201
- "loss": 1.5105,
1202
- "step": 398
1203
- },
1204
- {
1205
- "epoch": 0.77,
1206
- "learning_rate": 2e-05,
1207
- "loss": 0.9903,
1208
- "step": 400
1209
- },
1210
- {
1211
- "epoch": 0.78,
1212
- "learning_rate": 2e-05,
1213
- "loss": 1.6691,
1214
- "step": 402
1215
- },
1216
- {
1217
- "epoch": 0.78,
1218
- "learning_rate": 2e-05,
1219
- "loss": 1.5754,
1220
- "step": 404
1221
- },
1222
- {
1223
- "epoch": 0.79,
1224
- "learning_rate": 2e-05,
1225
- "loss": 1.7317,
1226
- "step": 406
1227
- },
1228
- {
1229
- "epoch": 0.79,
1230
- "learning_rate": 2e-05,
1231
- "loss": 1.6006,
1232
- "step": 408
1233
- },
1234
- {
1235
- "epoch": 0.79,
1236
- "learning_rate": 2e-05,
1237
- "loss": 1.6372,
1238
- "step": 410
1239
- },
1240
- {
1241
- "epoch": 0.8,
1242
- "learning_rate": 2e-05,
1243
- "loss": 1.4847,
1244
- "step": 412
1245
- },
1246
- {
1247
- "epoch": 0.8,
1248
- "learning_rate": 2e-05,
1249
- "loss": 1.5862,
1250
- "step": 414
1251
- },
1252
- {
1253
- "epoch": 0.8,
1254
- "learning_rate": 2e-05,
1255
- "loss": 1.6069,
1256
- "step": 416
1257
- },
1258
- {
1259
- "epoch": 0.81,
1260
- "learning_rate": 2e-05,
1261
- "loss": 1.6557,
1262
- "step": 418
1263
- },
1264
- {
1265
- "epoch": 0.81,
1266
- "learning_rate": 2e-05,
1267
- "loss": 1.7094,
1268
- "step": 420
1269
- },
1270
- {
1271
- "epoch": 0.82,
1272
- "learning_rate": 2e-05,
1273
- "loss": 1.8511,
1274
- "step": 422
1275
- },
1276
- {
1277
- "epoch": 0.82,
1278
- "learning_rate": 2e-05,
1279
- "loss": 1.8803,
1280
- "step": 424
1281
- },
1282
- {
1283
- "epoch": 0.82,
1284
- "learning_rate": 2e-05,
1285
- "loss": 1.857,
1286
- "step": 426
1287
- },
1288
- {
1289
- "epoch": 0.83,
1290
- "learning_rate": 2e-05,
1291
- "loss": 1.7882,
1292
- "step": 428
1293
- },
1294
- {
1295
- "epoch": 0.83,
1296
- "learning_rate": 2e-05,
1297
- "loss": 1.7939,
1298
- "step": 430
1299
- },
1300
- {
1301
- "epoch": 0.84,
1302
- "learning_rate": 2e-05,
1303
- "loss": 1.8605,
1304
- "step": 432
1305
- },
1306
- {
1307
- "epoch": 0.84,
1308
- "learning_rate": 2e-05,
1309
- "loss": 1.9402,
1310
- "step": 434
1311
- },
1312
- {
1313
- "epoch": 0.84,
1314
- "learning_rate": 2e-05,
1315
- "loss": 1.7993,
1316
- "step": 436
1317
- },
1318
- {
1319
- "epoch": 0.85,
1320
- "learning_rate": 2e-05,
1321
- "loss": 2.0032,
1322
- "step": 438
1323
- },
1324
- {
1325
- "epoch": 0.85,
1326
- "learning_rate": 2e-05,
1327
- "loss": 1.9246,
1328
- "step": 440
1329
- },
1330
- {
1331
- "epoch": 0.85,
1332
- "learning_rate": 2e-05,
1333
- "loss": 1.8394,
1334
- "step": 442
1335
- },
1336
- {
1337
- "epoch": 0.86,
1338
- "learning_rate": 2e-05,
1339
- "loss": 1.7374,
1340
- "step": 444
1341
- },
1342
- {
1343
- "epoch": 0.86,
1344
- "learning_rate": 2e-05,
1345
- "loss": 1.6768,
1346
- "step": 446
1347
- },
1348
- {
1349
- "epoch": 0.87,
1350
- "learning_rate": 2e-05,
1351
- "loss": 1.4662,
1352
- "step": 448
1353
- },
1354
- {
1355
- "epoch": 0.87,
1356
- "learning_rate": 2e-05,
1357
- "loss": 0.9718,
1358
- "step": 450
1359
- },
1360
- {
1361
- "epoch": 0.87,
1362
- "learning_rate": 2e-05,
1363
- "loss": 1.4755,
1364
- "step": 452
1365
- },
1366
- {
1367
- "epoch": 0.88,
1368
- "learning_rate": 2e-05,
1369
- "loss": 1.6846,
1370
- "step": 454
1371
- },
1372
- {
1373
- "epoch": 0.88,
1374
- "learning_rate": 2e-05,
1375
- "loss": 1.6875,
1376
- "step": 456
1377
- },
1378
- {
1379
- "epoch": 0.89,
1380
- "learning_rate": 2e-05,
1381
- "loss": 1.4627,
1382
- "step": 458
1383
- },
1384
- {
1385
- "epoch": 0.89,
1386
- "learning_rate": 2e-05,
1387
- "loss": 1.6094,
1388
- "step": 460
1389
- },
1390
- {
1391
- "epoch": 0.89,
1392
- "learning_rate": 2e-05,
1393
- "loss": 1.4555,
1394
- "step": 462
1395
- },
1396
- {
1397
- "epoch": 0.9,
1398
- "learning_rate": 2e-05,
1399
- "loss": 1.6874,
1400
- "step": 464
1401
- },
1402
- {
1403
- "epoch": 0.9,
1404
- "learning_rate": 2e-05,
1405
- "loss": 1.5308,
1406
- "step": 466
1407
- },
1408
- {
1409
- "epoch": 0.91,
1410
- "learning_rate": 2e-05,
1411
- "loss": 1.7519,
1412
- "step": 468
1413
- },
1414
- {
1415
- "epoch": 0.91,
1416
- "learning_rate": 2e-05,
1417
- "loss": 1.6585,
1418
- "step": 470
1419
- },
1420
- {
1421
- "epoch": 0.91,
1422
- "learning_rate": 2e-05,
1423
- "loss": 1.726,
1424
- "step": 472
1425
- },
1426
- {
1427
- "epoch": 0.92,
1428
- "learning_rate": 2e-05,
1429
- "loss": 1.6701,
1430
- "step": 474
1431
- },
1432
- {
1433
- "epoch": 0.92,
1434
- "learning_rate": 2e-05,
1435
- "loss": 1.8123,
1436
- "step": 476
1437
- },
1438
- {
1439
- "epoch": 0.92,
1440
- "learning_rate": 2e-05,
1441
- "loss": 1.7762,
1442
- "step": 478
1443
- },
1444
- {
1445
- "epoch": 0.93,
1446
- "learning_rate": 2e-05,
1447
- "loss": 1.796,
1448
- "step": 480
1449
- },
1450
- {
1451
- "epoch": 0.93,
1452
- "learning_rate": 2e-05,
1453
- "loss": 1.8989,
1454
- "step": 482
1455
- },
1456
- {
1457
- "epoch": 0.94,
1458
- "learning_rate": 2e-05,
1459
- "loss": 1.7805,
1460
- "step": 484
1461
- },
1462
- {
1463
- "epoch": 0.94,
1464
- "learning_rate": 2e-05,
1465
- "loss": 1.911,
1466
- "step": 486
1467
- },
1468
- {
1469
- "epoch": 0.94,
1470
- "learning_rate": 2e-05,
1471
- "loss": 1.9393,
1472
- "step": 488
1473
- },
1474
- {
1475
- "epoch": 0.95,
1476
- "learning_rate": 2e-05,
1477
- "loss": 1.9401,
1478
- "step": 490
1479
- },
1480
- {
1481
- "epoch": 0.95,
1482
- "learning_rate": 2e-05,
1483
- "loss": 1.8289,
1484
- "step": 492
1485
- },
1486
- {
1487
- "epoch": 0.96,
1488
- "learning_rate": 2e-05,
1489
- "loss": 1.8784,
1490
- "step": 494
1491
- },
1492
- {
1493
- "epoch": 0.96,
1494
- "learning_rate": 2e-05,
1495
- "loss": 1.7339,
1496
- "step": 496
1497
- },
1498
- {
1499
- "epoch": 0.96,
1500
- "learning_rate": 2e-05,
1501
- "loss": 1.4148,
1502
- "step": 498
1503
- },
1504
- {
1505
- "epoch": 0.97,
1506
- "learning_rate": 2e-05,
1507
- "loss": 0.9561,
1508
- "step": 500
1509
  }
1510
  ],
1511
  "max_steps": 1000,
1512
  "num_train_epochs": 2,
1513
- "total_flos": 7103278540800000.0,
1514
  "trial_name": null,
1515
  "trial_params": null
1516
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.580130529369108,
5
+ "global_step": 300,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
10
  {
11
  "epoch": 0.0,
12
  "learning_rate": 2e-05,
13
+ "loss": 1.3732,
14
  "step": 2
15
  },
16
  {
17
  "epoch": 0.01,
18
  "learning_rate": 2e-05,
19
+ "loss": 1.1638,
20
  "step": 4
21
  },
22
  {
23
  "epoch": 0.01,
24
  "learning_rate": 2e-05,
25
+ "loss": 1.1545,
26
  "step": 6
27
  },
28
  {
29
  "epoch": 0.02,
30
  "learning_rate": 2e-05,
31
+ "loss": 1.2039,
32
  "step": 8
33
  },
34
  {
35
  "epoch": 0.02,
36
  "learning_rate": 2e-05,
37
+ "loss": 1.2867,
38
  "step": 10
39
  },
40
  {
41
  "epoch": 0.02,
42
  "learning_rate": 2e-05,
43
+ "loss": 1.2554,
44
  "step": 12
45
  },
46
  {
47
  "epoch": 0.03,
48
  "learning_rate": 2e-05,
49
+ "loss": 1.2596,
50
  "step": 14
51
  },
52
  {
53
  "epoch": 0.03,
54
  "learning_rate": 2e-05,
55
+ "loss": 1.2628,
56
  "step": 16
57
  },
58
  {
59
  "epoch": 0.03,
60
  "learning_rate": 2e-05,
61
+ "loss": 1.3257,
62
  "step": 18
63
  },
64
  {
65
  "epoch": 0.04,
66
  "learning_rate": 2e-05,
67
+ "loss": 1.3706,
68
  "step": 20
69
  },
70
  {
71
  "epoch": 0.04,
72
  "learning_rate": 2e-05,
73
+ "loss": 1.6044,
74
  "step": 22
75
  },
76
  {
77
  "epoch": 0.05,
78
  "learning_rate": 2e-05,
79
+ "loss": 1.2826,
80
  "step": 24
81
  },
82
  {
83
  "epoch": 0.05,
84
  "learning_rate": 2e-05,
85
+ "loss": 1.3722,
86
  "step": 26
87
  },
88
  {
89
  "epoch": 0.05,
90
  "learning_rate": 2e-05,
91
+ "loss": 1.4654,
92
  "step": 28
93
  },
94
  {
95
  "epoch": 0.06,
96
  "learning_rate": 2e-05,
97
+ "loss": 1.6323,
98
  "step": 30
99
  },
100
  {
101
  "epoch": 0.06,
102
  "learning_rate": 2e-05,
103
+ "loss": 1.683,
104
  "step": 32
105
  },
106
  {
107
  "epoch": 0.07,
108
  "learning_rate": 2e-05,
109
+ "loss": 1.5204,
110
  "step": 34
111
  },
112
  {
113
  "epoch": 0.07,
114
  "learning_rate": 2e-05,
115
+ "loss": 1.679,
116
  "step": 36
117
  },
118
  {
119
  "epoch": 0.07,
120
  "learning_rate": 2e-05,
121
+ "loss": 1.4497,
122
  "step": 38
123
  },
124
  {
125
  "epoch": 0.08,
126
  "learning_rate": 2e-05,
127
+ "loss": 1.5898,
128
  "step": 40
129
  },
130
  {
131
  "epoch": 0.08,
132
  "learning_rate": 2e-05,
133
+ "loss": 1.5762,
134
  "step": 42
135
  },
136
  {
137
  "epoch": 0.09,
138
  "learning_rate": 2e-05,
139
+ "loss": 1.6473,
140
  "step": 44
141
  },
142
  {
143
  "epoch": 0.09,
144
  "learning_rate": 2e-05,
145
+ "loss": 1.4771,
146
  "step": 46
147
  },
148
  {
149
  "epoch": 0.09,
150
  "learning_rate": 2e-05,
151
+ "loss": 1.3425,
152
  "step": 48
153
  },
154
  {
155
  "epoch": 0.1,
156
  "learning_rate": 2e-05,
157
+ "loss": 0.8668,
158
  "step": 50
159
  },
160
  {
161
  "epoch": 0.1,
162
  "learning_rate": 2e-05,
163
+ "loss": 0.9341,
164
  "step": 52
165
  },
166
  {
167
  "epoch": 0.1,
168
  "learning_rate": 2e-05,
169
+ "loss": 1.0335,
170
  "step": 54
171
  },
172
  {
173
  "epoch": 0.11,
174
  "learning_rate": 2e-05,
175
+ "loss": 1.1254,
176
  "step": 56
177
  },
178
  {
179
  "epoch": 0.11,
180
  "learning_rate": 2e-05,
181
+ "loss": 1.0025,
182
  "step": 58
183
  },
184
  {
185
  "epoch": 0.12,
186
  "learning_rate": 2e-05,
187
+ "loss": 1.081,
188
  "step": 60
189
  },
190
  {
191
  "epoch": 0.12,
192
  "learning_rate": 2e-05,
193
+ "loss": 1.029,
194
  "step": 62
195
  },
196
  {
197
  "epoch": 0.12,
198
  "learning_rate": 2e-05,
199
+ "loss": 1.0352,
200
  "step": 64
201
  },
202
  {
203
  "epoch": 0.13,
204
  "learning_rate": 2e-05,
205
+ "loss": 1.156,
206
  "step": 66
207
  },
208
  {
209
  "epoch": 0.13,
210
  "learning_rate": 2e-05,
211
+ "loss": 1.0304,
212
  "step": 68
213
  },
214
  {
215
  "epoch": 0.14,
216
  "learning_rate": 2e-05,
217
+ "loss": 1.0913,
218
  "step": 70
219
  },
220
  {
221
  "epoch": 0.14,
222
  "learning_rate": 2e-05,
223
+ "loss": 1.239,
224
  "step": 72
225
  },
226
  {
227
  "epoch": 0.14,
228
  "learning_rate": 2e-05,
229
+ "loss": 1.0982,
230
  "step": 74
231
  },
232
  {
233
  "epoch": 0.15,
234
  "learning_rate": 2e-05,
235
+ "loss": 1.2554,
236
  "step": 76
237
  },
238
  {
239
  "epoch": 0.15,
240
  "learning_rate": 2e-05,
241
+ "loss": 1.3172,
242
  "step": 78
243
  },
244
  {
245
  "epoch": 0.15,
246
  "learning_rate": 2e-05,
247
+ "loss": 1.3916,
248
  "step": 80
249
  },
250
  {
251
  "epoch": 0.16,
252
  "learning_rate": 2e-05,
253
+ "loss": 1.1953,
254
  "step": 82
255
  },
256
  {
257
  "epoch": 0.16,
258
  "learning_rate": 2e-05,
259
+ "loss": 1.3974,
260
  "step": 84
261
  },
262
  {
263
  "epoch": 0.17,
264
  "learning_rate": 2e-05,
265
+ "loss": 1.3862,
266
  "step": 86
267
  },
268
  {
269
  "epoch": 0.17,
270
  "learning_rate": 2e-05,
271
+ "loss": 1.2707,
272
  "step": 88
273
  },
274
  {
275
  "epoch": 0.17,
276
  "learning_rate": 2e-05,
277
+ "loss": 1.4924,
278
  "step": 90
279
  },
280
  {
281
  "epoch": 0.18,
282
  "learning_rate": 2e-05,
283
+ "loss": 1.5437,
284
  "step": 92
285
  },
286
  {
287
  "epoch": 0.18,
288
  "learning_rate": 2e-05,
289
+ "loss": 1.35,
290
  "step": 94
291
  },
292
  {
293
  "epoch": 0.19,
294
  "learning_rate": 2e-05,
295
+ "loss": 1.3366,
296
  "step": 96
297
  },
298
  {
299
  "epoch": 0.19,
300
  "learning_rate": 2e-05,
301
+ "loss": 1.2631,
302
  "step": 98
303
  },
304
  {
305
  "epoch": 0.19,
306
  "learning_rate": 2e-05,
307
+ "loss": 0.814,
308
  "step": 100
309
  },
310
  {
311
  "epoch": 0.2,
312
  "learning_rate": 2e-05,
313
+ "loss": 1.1012,
314
  "step": 102
315
  },
316
  {
317
  "epoch": 0.2,
318
  "learning_rate": 2e-05,
319
+ "loss": 1.0838,
320
  "step": 104
321
  },
322
  {
323
  "epoch": 0.2,
324
  "learning_rate": 2e-05,
325
+ "loss": 1.026,
326
  "step": 106
327
  },
328
  {
329
  "epoch": 0.21,
330
  "learning_rate": 2e-05,
331
+ "loss": 1.0317,
332
  "step": 108
333
  },
334
  {
335
  "epoch": 0.21,
336
  "learning_rate": 2e-05,
337
+ "loss": 1.0449,
338
  "step": 110
339
  },
340
  {
341
  "epoch": 0.22,
342
  "learning_rate": 2e-05,
343
+ "loss": 1.1489,
344
  "step": 112
345
  },
346
  {
347
  "epoch": 0.22,
348
  "learning_rate": 2e-05,
349
+ "loss": 1.1748,
350
  "step": 114
351
  },
352
  {
353
  "epoch": 0.22,
354
  "learning_rate": 2e-05,
355
+ "loss": 0.9617,
356
  "step": 116
357
  },
358
  {
359
  "epoch": 0.23,
360
  "learning_rate": 2e-05,
361
+ "loss": 1.2164,
362
  "step": 118
363
  },
364
  {
365
  "epoch": 0.23,
366
  "learning_rate": 2e-05,
367
+ "loss": 1.1345,
368
  "step": 120
369
  },
370
  {
371
  "epoch": 0.24,
372
  "learning_rate": 2e-05,
373
+ "loss": 1.1359,
374
  "step": 122
375
  },
376
  {
377
  "epoch": 0.24,
378
  "learning_rate": 2e-05,
379
+ "loss": 1.2059,
380
  "step": 124
381
  },
382
  {
383
  "epoch": 0.24,
384
  "learning_rate": 2e-05,
385
+ "loss": 1.1603,
386
  "step": 126
387
  },
388
  {
389
  "epoch": 0.25,
390
  "learning_rate": 2e-05,
391
+ "loss": 1.298,
392
  "step": 128
393
  },
394
  {
395
  "epoch": 0.25,
396
  "learning_rate": 2e-05,
397
+ "loss": 1.1893,
398
  "step": 130
399
  },
400
  {
401
  "epoch": 0.26,
402
  "learning_rate": 2e-05,
403
+ "loss": 1.3392,
404
  "step": 132
405
  },
406
  {
407
  "epoch": 0.26,
408
  "learning_rate": 2e-05,
409
+ "loss": 1.4176,
410
  "step": 134
411
  },
412
  {
413
  "epoch": 0.26,
414
  "learning_rate": 2e-05,
415
+ "loss": 1.4475,
416
  "step": 136
417
  },
418
  {
419
  "epoch": 0.27,
420
  "learning_rate": 2e-05,
421
+ "loss": 1.4052,
422
  "step": 138
423
  },
424
  {
425
  "epoch": 0.27,
426
  "learning_rate": 2e-05,
427
+ "loss": 1.464,
428
  "step": 140
429
  },
430
  {
431
  "epoch": 0.27,
432
  "learning_rate": 2e-05,
433
+ "loss": 1.4211,
434
  "step": 142
435
  },
436
  {
437
  "epoch": 0.28,
438
  "learning_rate": 2e-05,
439
+ "loss": 1.2525,
440
  "step": 144
441
  },
442
  {
443
  "epoch": 0.28,
444
  "learning_rate": 2e-05,
445
+ "loss": 1.3329,
446
  "step": 146
447
  },
448
  {
449
  "epoch": 0.29,
450
  "learning_rate": 2e-05,
451
+ "loss": 1.135,
452
  "step": 148
453
  },
454
  {
455
  "epoch": 0.29,
456
  "learning_rate": 2e-05,
457
+ "loss": 0.8317,
458
  "step": 150
459
  },
460
  {
461
  "epoch": 0.29,
462
  "learning_rate": 2e-05,
463
+ "loss": 1.03,
464
  "step": 152
465
  },
466
  {
467
  "epoch": 0.3,
468
  "learning_rate": 2e-05,
469
+ "loss": 1.0428,
470
  "step": 154
471
  },
472
  {
473
  "epoch": 0.3,
474
  "learning_rate": 2e-05,
475
+ "loss": 1.1478,
476
  "step": 156
477
  },
478
  {
479
  "epoch": 0.31,
480
  "learning_rate": 2e-05,
481
+ "loss": 1.0514,
482
  "step": 158
483
  },
484
  {
485
  "epoch": 0.31,
486
  "learning_rate": 2e-05,
487
+ "loss": 1.0284,
488
  "step": 160
489
  },
490
  {
491
  "epoch": 0.31,
492
  "learning_rate": 2e-05,
493
+ "loss": 1.0399,
494
  "step": 162
495
  },
496
  {
497
  "epoch": 0.32,
498
  "learning_rate": 2e-05,
499
+ "loss": 0.9585,
500
  "step": 164
501
  },
502
  {
503
  "epoch": 0.32,
504
  "learning_rate": 2e-05,
505
+ "loss": 0.984,
506
  "step": 166
507
  },
508
  {
509
  "epoch": 0.32,
510
  "learning_rate": 2e-05,
511
+ "loss": 1.0095,
512
  "step": 168
513
  },
514
  {
515
  "epoch": 0.33,
516
  "learning_rate": 2e-05,
517
+ "loss": 1.0371,
518
  "step": 170
519
  },
520
  {
521
  "epoch": 0.33,
522
  "learning_rate": 2e-05,
523
+ "loss": 1.1832,
524
  "step": 172
525
  },
526
  {
527
  "epoch": 0.34,
528
  "learning_rate": 2e-05,
529
+ "loss": 1.1151,
530
  "step": 174
531
  },
532
  {
533
  "epoch": 0.34,
534
  "learning_rate": 2e-05,
535
+ "loss": 1.2616,
536
  "step": 176
537
  },
538
  {
539
  "epoch": 0.34,
540
  "learning_rate": 2e-05,
541
+ "loss": 1.2923,
542
  "step": 178
543
  },
544
  {
545
  "epoch": 0.35,
546
  "learning_rate": 2e-05,
547
+ "loss": 1.1316,
548
  "step": 180
549
  },
550
  {
551
  "epoch": 0.35,
552
  "learning_rate": 2e-05,
553
+ "loss": 1.2669,
554
  "step": 182
555
  },
556
  {
557
  "epoch": 0.36,
558
  "learning_rate": 2e-05,
559
+ "loss": 1.3682,
560
  "step": 184
561
  },
562
  {
563
  "epoch": 0.36,
564
  "learning_rate": 2e-05,
565
+ "loss": 1.297,
566
  "step": 186
567
  },
568
  {
569
  "epoch": 0.36,
570
  "learning_rate": 2e-05,
571
+ "loss": 1.2828,
572
  "step": 188
573
  },
574
  {
575
  "epoch": 0.37,
576
  "learning_rate": 2e-05,
577
+ "loss": 1.2625,
578
  "step": 190
579
  },
580
  {
581
  "epoch": 0.37,
582
  "learning_rate": 2e-05,
583
+ "loss": 1.4606,
584
  "step": 192
585
  },
586
  {
587
  "epoch": 0.38,
588
  "learning_rate": 2e-05,
589
+ "loss": 1.2795,
590
  "step": 194
591
  },
592
  {
593
  "epoch": 0.38,
594
  "learning_rate": 2e-05,
595
+ "loss": 1.4136,
596
  "step": 196
597
  },
598
  {
599
  "epoch": 0.38,
600
  "learning_rate": 2e-05,
601
+ "loss": 1.0876,
602
  "step": 198
603
  },
604
  {
605
  "epoch": 0.39,
606
  "learning_rate": 2e-05,
607
+ "loss": 0.7762,
608
  "step": 200
609
  },
610
  {
611
  "epoch": 0.39,
612
  "learning_rate": 2e-05,
613
+ "loss": 1.0827,
614
  "step": 202
615
  },
616
  {
617
  "epoch": 0.39,
618
  "learning_rate": 2e-05,
619
+ "loss": 0.9613,
620
  "step": 204
621
  },
622
  {
623
  "epoch": 0.4,
624
  "learning_rate": 2e-05,
625
+ "loss": 1.0078,
626
  "step": 206
627
  },
628
  {
629
  "epoch": 0.4,
630
  "learning_rate": 2e-05,
631
+ "loss": 1.0128,
632
  "step": 208
633
  },
634
  {
635
  "epoch": 0.41,
636
  "learning_rate": 2e-05,
637
+ "loss": 1.063,
638
  "step": 210
639
  },
640
  {
641
  "epoch": 0.41,
642
  "learning_rate": 2e-05,
643
+ "loss": 1.0805,
644
  "step": 212
645
  },
646
  {
647
  "epoch": 0.41,
648
  "learning_rate": 2e-05,
649
+ "loss": 0.9611,
650
  "step": 214
651
  },
652
  {
653
  "epoch": 0.42,
654
  "learning_rate": 2e-05,
655
+ "loss": 1.0106,
656
  "step": 216
657
  },
658
  {
659
  "epoch": 0.42,
660
  "learning_rate": 2e-05,
661
+ "loss": 1.0785,
662
  "step": 218
663
  },
664
  {
665
  "epoch": 0.43,
666
  "learning_rate": 2e-05,
667
+ "loss": 1.1339,
668
  "step": 220
669
  },
670
  {
671
  "epoch": 0.43,
672
  "learning_rate": 2e-05,
673
+ "loss": 1.1942,
674
  "step": 222
675
  },
676
  {
677
  "epoch": 0.43,
678
  "learning_rate": 2e-05,
679
+ "loss": 1.1281,
680
  "step": 224
681
  },
682
  {
683
  "epoch": 0.44,
684
  "learning_rate": 2e-05,
685
+ "loss": 1.1421,
686
  "step": 226
687
  },
688
  {
689
  "epoch": 0.44,
690
  "learning_rate": 2e-05,
691
+ "loss": 1.1084,
692
  "step": 228
693
  },
694
  {
695
  "epoch": 0.44,
696
  "learning_rate": 2e-05,
697
+ "loss": 1.3067,
698
  "step": 230
699
  },
700
  {
701
  "epoch": 0.45,
702
  "learning_rate": 2e-05,
703
+ "loss": 1.3573,
704
  "step": 232
705
  },
706
  {
707
  "epoch": 0.45,
708
  "learning_rate": 2e-05,
709
+ "loss": 1.2424,
710
  "step": 234
711
  },
712
  {
713
  "epoch": 0.46,
714
  "learning_rate": 2e-05,
715
+ "loss": 1.2454,
716
  "step": 236
717
  },
718
  {
719
  "epoch": 0.46,
720
  "learning_rate": 2e-05,
721
+ "loss": 1.3976,
722
  "step": 238
723
  },
724
  {
725
  "epoch": 0.46,
726
  "learning_rate": 2e-05,
727
+ "loss": 1.2691,
728
  "step": 240
729
  },
730
  {
731
  "epoch": 0.47,
732
  "learning_rate": 2e-05,
733
+ "loss": 1.4862,
734
  "step": 242
735
  },
736
  {
737
  "epoch": 0.47,
738
  "learning_rate": 2e-05,
739
+ "loss": 1.3623,
740
  "step": 244
741
  },
742
  {
743
  "epoch": 0.48,
744
  "learning_rate": 2e-05,
745
+ "loss": 1.2709,
746
  "step": 246
747
  },
748
  {
749
  "epoch": 0.48,
750
  "learning_rate": 2e-05,
751
+ "loss": 1.0707,
752
  "step": 248
753
  },
754
  {
755
  "epoch": 0.48,
756
  "learning_rate": 2e-05,
757
+ "loss": 0.6945,
758
  "step": 250
759
  },
760
  {
761
  "epoch": 0.49,
762
  "learning_rate": 2e-05,
763
+ "loss": 0.9516,
764
  "step": 252
765
  },
766
  {
767
  "epoch": 0.49,
768
  "learning_rate": 2e-05,
769
+ "loss": 1.0244,
770
  "step": 254
771
  },
772
  {
773
  "epoch": 0.5,
774
  "learning_rate": 2e-05,
775
+ "loss": 1.1423,
776
  "step": 256
777
  },
778
  {
779
  "epoch": 0.5,
780
  "learning_rate": 2e-05,
781
+ "loss": 0.9696,
782
  "step": 258
783
  },
784
  {
785
  "epoch": 0.5,
786
  "learning_rate": 2e-05,
787
+ "loss": 1.0191,
788
  "step": 260
789
  },
790
  {
791
  "epoch": 0.51,
792
  "learning_rate": 2e-05,
793
+ "loss": 1.0291,
794
  "step": 262
795
  },
796
  {
797
  "epoch": 0.51,
798
  "learning_rate": 2e-05,
799
+ "loss": 1.0665,
800
  "step": 264
801
  },
802
  {
803
  "epoch": 0.51,
804
  "learning_rate": 2e-05,
805
+ "loss": 1.0518,
806
  "step": 266
807
  },
808
  {
809
  "epoch": 0.52,
810
  "learning_rate": 2e-05,
811
+ "loss": 1.0239,
812
  "step": 268
813
  },
814
  {
815
  "epoch": 0.52,
816
  "learning_rate": 2e-05,
817
+ "loss": 1.12,
818
  "step": 270
819
  },
820
  {
821
  "epoch": 0.53,
822
  "learning_rate": 2e-05,
823
+ "loss": 1.126,
824
  "step": 272
825
  },
826
  {
827
  "epoch": 0.53,
828
  "learning_rate": 2e-05,
829
+ "loss": 1.0559,
830
  "step": 274
831
  },
832
  {
833
  "epoch": 0.53,
834
  "learning_rate": 2e-05,
835
+ "loss": 1.257,
836
  "step": 276
837
  },
838
  {
839
  "epoch": 0.54,
840
  "learning_rate": 2e-05,
841
+ "loss": 1.3178,
842
  "step": 278
843
  },
844
  {
845
  "epoch": 0.54,
846
  "learning_rate": 2e-05,
847
+ "loss": 1.32,
848
  "step": 280
849
  },
850
  {
851
  "epoch": 0.55,
852
  "learning_rate": 2e-05,
853
+ "loss": 1.153,
854
  "step": 282
855
  },
856
  {
857
  "epoch": 0.55,
858
  "learning_rate": 2e-05,
859
+ "loss": 1.4046,
860
  "step": 284
861
  },
862
  {
863
  "epoch": 0.55,
864
  "learning_rate": 2e-05,
865
+ "loss": 1.4531,
866
  "step": 286
867
  },
868
  {
869
  "epoch": 0.56,
870
  "learning_rate": 2e-05,
871
+ "loss": 1.3236,
872
  "step": 288
873
  },
874
  {
875
  "epoch": 0.56,
876
  "learning_rate": 2e-05,
877
+ "loss": 1.3427,
878
  "step": 290
879
  },
880
  {
881
  "epoch": 0.56,
882
  "learning_rate": 2e-05,
883
+ "loss": 1.4536,
884
  "step": 292
885
  },
886
  {
887
  "epoch": 0.57,
888
  "learning_rate": 2e-05,
889
+ "loss": 1.2831,
890
  "step": 294
891
  },
892
  {
893
  "epoch": 0.57,
894
  "learning_rate": 2e-05,
895
+ "loss": 1.2384,
896
  "step": 296
897
  },
898
  {
899
  "epoch": 0.58,
900
  "learning_rate": 2e-05,
901
+ "loss": 1.0424,
902
  "step": 298
903
  },
904
  {
905
  "epoch": 0.58,
906
  "learning_rate": 2e-05,
907
+ "loss": 0.7491,
908
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
909
  }
910
  ],
911
  "max_steps": 1000,
912
  "num_train_epochs": 2,
913
+ "total_flos": 3.0851676306210816e+16,
914
  "trial_name": null,
915
  "trial_params": null
916
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65bd61ef1c7e6c258f8363ecc436710acb30568eee6e50b6b4c27e5166d1da58
3
  size 3963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b982e90943b5865c4d2d304aa9bfce32dad4e18f6ed906af992890773d4badb
3
  size 3963