IRI2070 commited on
Commit
820e58c
·
verified ·
1 Parent(s): a978a1c

Upload 10 files

Browse files
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.5545159532889594,
4
+ "eval_loss": 2.853837490081787,
5
+ "eval_runtime": 100.33,
6
+ "eval_samples": 27721,
7
+ "eval_samples_per_second": 276.298,
8
+ "eval_steps_per_second": 8.641,
9
+ "perplexity": 17.354250987692804,
10
+ "total_flos": 3692222600970240.0,
11
+ "train_loss": 3.0354640246975806,
12
+ "train_runtime": 19788.2132,
13
+ "train_samples": 867980,
14
+ "train_samples_per_second": 131.59,
15
+ "train_steps_per_second": 4.112
16
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.5545159532889594,
4
+ "eval_loss": 2.853837490081787,
5
+ "eval_runtime": 100.33,
6
+ "eval_samples": 27721,
7
+ "eval_samples_per_second": 276.298,
8
+ "eval_steps_per_second": 8.641,
9
+ "perplexity": 17.354250987692804
10
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7a4ab5c612128a7524818797f72377a297776cb3302443c45bafca858308748
3
  size 26690208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79e8367cbafae9d789adc9218817dc7e19e5238a8cb24a0e7f8139803803710b
3
  size 26690208
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 3692222600970240.0,
4
+ "train_loss": 3.0354640246975806,
5
+ "train_runtime": 19788.2132,
6
+ "train_samples": 867980,
7
+ "train_samples_per_second": 131.59,
8
+ "train_steps_per_second": 4.112
9
+ }
trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8294930875576036,
6
  "eval_steps": 500,
7
- "global_step": 22500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -323,6 +323,834 @@
323
  "learning_rate": 3.617572964669739e-05,
324
  "loss": 3.075,
325
  "step": 22500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  }
327
  ],
328
  "logging_steps": 500,
@@ -337,12 +1165,12 @@
337
  "should_evaluate": false,
338
  "should_log": false,
339
  "should_save": true,
340
- "should_training_stop": false
341
  },
342
  "attributes": {}
343
  }
344
  },
345
- "total_flos": 1020914565120000.0,
346
  "train_batch_size": 32,
347
  "trial_name": null,
348
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 81375,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
323
  "learning_rate": 3.617572964669739e-05,
324
  "loss": 3.075,
325
  "step": 22500
326
+ },
327
+ {
328
+ "epoch": 0.847926267281106,
329
+ "grad_norm": 2.820953130722046,
330
+ "learning_rate": 3.586850998463902e-05,
331
+ "loss": 3.068,
332
+ "step": 23000
333
+ },
334
+ {
335
+ "epoch": 0.8663594470046083,
336
+ "grad_norm": 2.321009397506714,
337
+ "learning_rate": 3.556129032258065e-05,
338
+ "loss": 3.0588,
339
+ "step": 23500
340
+ },
341
+ {
342
+ "epoch": 0.8847926267281107,
343
+ "grad_norm": 2.54306697845459,
344
+ "learning_rate": 3.525407066052228e-05,
345
+ "loss": 3.0475,
346
+ "step": 24000
347
+ },
348
+ {
349
+ "epoch": 0.9032258064516129,
350
+ "grad_norm": 2.3935065269470215,
351
+ "learning_rate": 3.4946850998463904e-05,
352
+ "loss": 3.0174,
353
+ "step": 24500
354
+ },
355
+ {
356
+ "epoch": 0.9216589861751152,
357
+ "grad_norm": 2.3906099796295166,
358
+ "learning_rate": 3.463963133640553e-05,
359
+ "loss": 3.0189,
360
+ "step": 25000
361
+ },
362
+ {
363
+ "epoch": 0.9400921658986175,
364
+ "grad_norm": 2.480583906173706,
365
+ "learning_rate": 3.433241167434716e-05,
366
+ "loss": 3.0467,
367
+ "step": 25500
368
+ },
369
+ {
370
+ "epoch": 0.9585253456221198,
371
+ "grad_norm": 2.1853816509246826,
372
+ "learning_rate": 3.402519201228879e-05,
373
+ "loss": 3.0324,
374
+ "step": 26000
375
+ },
376
+ {
377
+ "epoch": 0.9769585253456221,
378
+ "grad_norm": 2.525022506713867,
379
+ "learning_rate": 3.371797235023041e-05,
380
+ "loss": 3.0046,
381
+ "step": 26500
382
+ },
383
+ {
384
+ "epoch": 0.9953917050691244,
385
+ "grad_norm": 2.310753345489502,
386
+ "learning_rate": 3.3410752688172044e-05,
387
+ "loss": 3.0123,
388
+ "step": 27000
389
+ },
390
+ {
391
+ "epoch": 1.0138248847926268,
392
+ "grad_norm": 2.827805757522583,
393
+ "learning_rate": 3.3103533026113676e-05,
394
+ "loss": 3.0073,
395
+ "step": 27500
396
+ },
397
+ {
398
+ "epoch": 1.032258064516129,
399
+ "grad_norm": 2.4869062900543213,
400
+ "learning_rate": 3.27963133640553e-05,
401
+ "loss": 2.9999,
402
+ "step": 28000
403
+ },
404
+ {
405
+ "epoch": 1.0506912442396312,
406
+ "grad_norm": 2.9428353309631348,
407
+ "learning_rate": 3.2489093701996933e-05,
408
+ "loss": 3.0065,
409
+ "step": 28500
410
+ },
411
+ {
412
+ "epoch": 1.0691244239631337,
413
+ "grad_norm": 2.4092352390289307,
414
+ "learning_rate": 3.218187403993856e-05,
415
+ "loss": 2.97,
416
+ "step": 29000
417
+ },
418
+ {
419
+ "epoch": 1.087557603686636,
420
+ "grad_norm": 2.185153007507324,
421
+ "learning_rate": 3.1874654377880184e-05,
422
+ "loss": 2.9744,
423
+ "step": 29500
424
+ },
425
+ {
426
+ "epoch": 1.1059907834101383,
427
+ "grad_norm": 2.547611713409424,
428
+ "learning_rate": 3.1567434715821816e-05,
429
+ "loss": 2.9755,
430
+ "step": 30000
431
+ },
432
+ {
433
+ "epoch": 1.1244239631336406,
434
+ "grad_norm": 2.3823814392089844,
435
+ "learning_rate": 3.126021505376344e-05,
436
+ "loss": 2.9597,
437
+ "step": 30500
438
+ },
439
+ {
440
+ "epoch": 1.1428571428571428,
441
+ "grad_norm": 2.282871961593628,
442
+ "learning_rate": 3.095299539170507e-05,
443
+ "loss": 2.9871,
444
+ "step": 31000
445
+ },
446
+ {
447
+ "epoch": 1.1612903225806452,
448
+ "grad_norm": 2.517770767211914,
449
+ "learning_rate": 3.06457757296467e-05,
450
+ "loss": 2.9971,
451
+ "step": 31500
452
+ },
453
+ {
454
+ "epoch": 1.1797235023041475,
455
+ "grad_norm": 2.8500301837921143,
456
+ "learning_rate": 3.0338556067588324e-05,
457
+ "loss": 2.9692,
458
+ "step": 32000
459
+ },
460
+ {
461
+ "epoch": 1.1981566820276497,
462
+ "grad_norm": 2.3024988174438477,
463
+ "learning_rate": 3.0031336405529953e-05,
464
+ "loss": 2.9512,
465
+ "step": 32500
466
+ },
467
+ {
468
+ "epoch": 1.2165898617511521,
469
+ "grad_norm": 2.4389448165893555,
470
+ "learning_rate": 2.9724116743471585e-05,
471
+ "loss": 2.9743,
472
+ "step": 33000
473
+ },
474
+ {
475
+ "epoch": 1.2350230414746544,
476
+ "grad_norm": 2.6087846755981445,
477
+ "learning_rate": 2.9416897081413213e-05,
478
+ "loss": 2.9634,
479
+ "step": 33500
480
+ },
481
+ {
482
+ "epoch": 1.2534562211981566,
483
+ "grad_norm": 2.1963679790496826,
484
+ "learning_rate": 2.9109677419354842e-05,
485
+ "loss": 2.9408,
486
+ "step": 34000
487
+ },
488
+ {
489
+ "epoch": 1.271889400921659,
490
+ "grad_norm": 2.6434950828552246,
491
+ "learning_rate": 2.880245775729647e-05,
492
+ "loss": 2.9334,
493
+ "step": 34500
494
+ },
495
+ {
496
+ "epoch": 1.2903225806451613,
497
+ "grad_norm": 2.5725350379943848,
498
+ "learning_rate": 2.8495238095238096e-05,
499
+ "loss": 2.9443,
500
+ "step": 35000
501
+ },
502
+ {
503
+ "epoch": 1.3087557603686637,
504
+ "grad_norm": 2.343334674835205,
505
+ "learning_rate": 2.8188018433179725e-05,
506
+ "loss": 2.9587,
507
+ "step": 35500
508
+ },
509
+ {
510
+ "epoch": 1.327188940092166,
511
+ "grad_norm": 2.673114776611328,
512
+ "learning_rate": 2.7880798771121353e-05,
513
+ "loss": 2.956,
514
+ "step": 36000
515
+ },
516
+ {
517
+ "epoch": 1.3456221198156681,
518
+ "grad_norm": 2.481757640838623,
519
+ "learning_rate": 2.757357910906298e-05,
520
+ "loss": 2.9332,
521
+ "step": 36500
522
+ },
523
+ {
524
+ "epoch": 1.3640552995391704,
525
+ "grad_norm": 3.0299792289733887,
526
+ "learning_rate": 2.7266359447004607e-05,
527
+ "loss": 2.947,
528
+ "step": 37000
529
+ },
530
+ {
531
+ "epoch": 1.3824884792626728,
532
+ "grad_norm": 3.3357937335968018,
533
+ "learning_rate": 2.6959139784946236e-05,
534
+ "loss": 2.9236,
535
+ "step": 37500
536
+ },
537
+ {
538
+ "epoch": 1.400921658986175,
539
+ "grad_norm": 2.214954376220703,
540
+ "learning_rate": 2.6651920122887865e-05,
541
+ "loss": 2.9334,
542
+ "step": 38000
543
+ },
544
+ {
545
+ "epoch": 1.4193548387096775,
546
+ "grad_norm": 2.7208831310272217,
547
+ "learning_rate": 2.6344700460829497e-05,
548
+ "loss": 2.9001,
549
+ "step": 38500
550
+ },
551
+ {
552
+ "epoch": 1.4377880184331797,
553
+ "grad_norm": 2.822230577468872,
554
+ "learning_rate": 2.6037480798771125e-05,
555
+ "loss": 2.9563,
556
+ "step": 39000
557
+ },
558
+ {
559
+ "epoch": 1.456221198156682,
560
+ "grad_norm": 2.5907464027404785,
561
+ "learning_rate": 2.573026113671275e-05,
562
+ "loss": 2.9352,
563
+ "step": 39500
564
+ },
565
+ {
566
+ "epoch": 1.4746543778801844,
567
+ "grad_norm": 2.509422540664673,
568
+ "learning_rate": 2.542304147465438e-05,
569
+ "loss": 2.9388,
570
+ "step": 40000
571
+ },
572
+ {
573
+ "epoch": 1.4930875576036866,
574
+ "grad_norm": 2.8918466567993164,
575
+ "learning_rate": 2.5115821812596008e-05,
576
+ "loss": 2.9072,
577
+ "step": 40500
578
+ },
579
+ {
580
+ "epoch": 1.511520737327189,
581
+ "grad_norm": 2.3461146354675293,
582
+ "learning_rate": 2.4808602150537637e-05,
583
+ "loss": 2.9013,
584
+ "step": 41000
585
+ },
586
+ {
587
+ "epoch": 1.5299539170506913,
588
+ "grad_norm": 2.3494646549224854,
589
+ "learning_rate": 2.4501382488479262e-05,
590
+ "loss": 2.8947,
591
+ "step": 41500
592
+ },
593
+ {
594
+ "epoch": 1.5483870967741935,
595
+ "grad_norm": 2.2246921062469482,
596
+ "learning_rate": 2.419416282642089e-05,
597
+ "loss": 2.9193,
598
+ "step": 42000
599
+ },
600
+ {
601
+ "epoch": 1.5668202764976957,
602
+ "grad_norm": 2.4895882606506348,
603
+ "learning_rate": 2.3886943164362523e-05,
604
+ "loss": 2.9209,
605
+ "step": 42500
606
+ },
607
+ {
608
+ "epoch": 1.5852534562211982,
609
+ "grad_norm": 2.234105110168457,
610
+ "learning_rate": 2.3579723502304148e-05,
611
+ "loss": 2.9104,
612
+ "step": 43000
613
+ },
614
+ {
615
+ "epoch": 1.6036866359447006,
616
+ "grad_norm": 2.2471518516540527,
617
+ "learning_rate": 2.3272503840245777e-05,
618
+ "loss": 2.8924,
619
+ "step": 43500
620
+ },
621
+ {
622
+ "epoch": 1.6221198156682028,
623
+ "grad_norm": 2.6903395652770996,
624
+ "learning_rate": 2.2965284178187405e-05,
625
+ "loss": 2.9108,
626
+ "step": 44000
627
+ },
628
+ {
629
+ "epoch": 1.640552995391705,
630
+ "grad_norm": 2.5113911628723145,
631
+ "learning_rate": 2.265806451612903e-05,
632
+ "loss": 2.9167,
633
+ "step": 44500
634
+ },
635
+ {
636
+ "epoch": 1.6589861751152073,
637
+ "grad_norm": 2.295367956161499,
638
+ "learning_rate": 2.2350844854070663e-05,
639
+ "loss": 2.91,
640
+ "step": 45000
641
+ },
642
+ {
643
+ "epoch": 1.6774193548387095,
644
+ "grad_norm": 2.705887794494629,
645
+ "learning_rate": 2.204362519201229e-05,
646
+ "loss": 2.9193,
647
+ "step": 45500
648
+ },
649
+ {
650
+ "epoch": 1.695852534562212,
651
+ "grad_norm": 2.490004777908325,
652
+ "learning_rate": 2.1736405529953917e-05,
653
+ "loss": 2.8773,
654
+ "step": 46000
655
+ },
656
+ {
657
+ "epoch": 1.7142857142857144,
658
+ "grad_norm": 2.564751148223877,
659
+ "learning_rate": 2.1429185867895545e-05,
660
+ "loss": 2.8979,
661
+ "step": 46500
662
+ },
663
+ {
664
+ "epoch": 1.7327188940092166,
665
+ "grad_norm": 2.5527286529541016,
666
+ "learning_rate": 2.1121966205837174e-05,
667
+ "loss": 2.9022,
668
+ "step": 47000
669
+ },
670
+ {
671
+ "epoch": 1.7511520737327189,
672
+ "grad_norm": 2.6402347087860107,
673
+ "learning_rate": 2.0814746543778803e-05,
674
+ "loss": 2.8751,
675
+ "step": 47500
676
+ },
677
+ {
678
+ "epoch": 1.769585253456221,
679
+ "grad_norm": 2.415748357772827,
680
+ "learning_rate": 2.050752688172043e-05,
681
+ "loss": 2.8786,
682
+ "step": 48000
683
+ },
684
+ {
685
+ "epoch": 1.7880184331797235,
686
+ "grad_norm": 2.6750245094299316,
687
+ "learning_rate": 2.020030721966206e-05,
688
+ "loss": 2.8853,
689
+ "step": 48500
690
+ },
691
+ {
692
+ "epoch": 1.8064516129032258,
693
+ "grad_norm": 2.4245858192443848,
694
+ "learning_rate": 1.989308755760369e-05,
695
+ "loss": 2.875,
696
+ "step": 49000
697
+ },
698
+ {
699
+ "epoch": 1.8248847926267282,
700
+ "grad_norm": 2.660170078277588,
701
+ "learning_rate": 1.9585867895545314e-05,
702
+ "loss": 2.8737,
703
+ "step": 49500
704
+ },
705
+ {
706
+ "epoch": 1.8433179723502304,
707
+ "grad_norm": 2.4194977283477783,
708
+ "learning_rate": 1.9278648233486943e-05,
709
+ "loss": 2.8654,
710
+ "step": 50000
711
+ },
712
+ {
713
+ "epoch": 1.8617511520737327,
714
+ "grad_norm": 2.4706435203552246,
715
+ "learning_rate": 1.8971428571428575e-05,
716
+ "loss": 2.8302,
717
+ "step": 50500
718
+ },
719
+ {
720
+ "epoch": 1.8801843317972349,
721
+ "grad_norm": 2.8965485095977783,
722
+ "learning_rate": 1.86642089093702e-05,
723
+ "loss": 2.874,
724
+ "step": 51000
725
+ },
726
+ {
727
+ "epoch": 1.8986175115207373,
728
+ "grad_norm": 2.812009811401367,
729
+ "learning_rate": 1.835698924731183e-05,
730
+ "loss": 2.8833,
731
+ "step": 51500
732
+ },
733
+ {
734
+ "epoch": 1.9170506912442398,
735
+ "grad_norm": 2.7252895832061768,
736
+ "learning_rate": 1.8049769585253457e-05,
737
+ "loss": 2.8639,
738
+ "step": 52000
739
+ },
740
+ {
741
+ "epoch": 1.935483870967742,
742
+ "grad_norm": 2.5407986640930176,
743
+ "learning_rate": 1.7742549923195083e-05,
744
+ "loss": 2.8637,
745
+ "step": 52500
746
+ },
747
+ {
748
+ "epoch": 1.9539170506912442,
749
+ "grad_norm": 2.5381031036376953,
750
+ "learning_rate": 1.7435330261136715e-05,
751
+ "loss": 2.8636,
752
+ "step": 53000
753
+ },
754
+ {
755
+ "epoch": 1.9723502304147464,
756
+ "grad_norm": 2.5974888801574707,
757
+ "learning_rate": 1.7128110599078343e-05,
758
+ "loss": 2.8563,
759
+ "step": 53500
760
+ },
761
+ {
762
+ "epoch": 1.9907834101382489,
763
+ "grad_norm": 2.5476796627044678,
764
+ "learning_rate": 1.682089093701997e-05,
765
+ "loss": 2.8651,
766
+ "step": 54000
767
+ },
768
+ {
769
+ "epoch": 2.0092165898617513,
770
+ "grad_norm": 2.4536616802215576,
771
+ "learning_rate": 1.6513671274961597e-05,
772
+ "loss": 2.8788,
773
+ "step": 54500
774
+ },
775
+ {
776
+ "epoch": 2.0276497695852536,
777
+ "grad_norm": 3.109189510345459,
778
+ "learning_rate": 1.6206451612903226e-05,
779
+ "loss": 2.8355,
780
+ "step": 55000
781
+ },
782
+ {
783
+ "epoch": 2.046082949308756,
784
+ "grad_norm": 2.727445363998413,
785
+ "learning_rate": 1.5899231950844855e-05,
786
+ "loss": 2.8582,
787
+ "step": 55500
788
+ },
789
+ {
790
+ "epoch": 2.064516129032258,
791
+ "grad_norm": 2.809833288192749,
792
+ "learning_rate": 1.5592012288786483e-05,
793
+ "loss": 2.8763,
794
+ "step": 56000
795
+ },
796
+ {
797
+ "epoch": 2.0829493087557602,
798
+ "grad_norm": 2.9285683631896973,
799
+ "learning_rate": 1.5284792626728112e-05,
800
+ "loss": 2.8524,
801
+ "step": 56500
802
+ },
803
+ {
804
+ "epoch": 2.1013824884792625,
805
+ "grad_norm": 2.599776268005371,
806
+ "learning_rate": 1.4977572964669739e-05,
807
+ "loss": 2.8235,
808
+ "step": 57000
809
+ },
810
+ {
811
+ "epoch": 2.119815668202765,
812
+ "grad_norm": 2.367570638656616,
813
+ "learning_rate": 1.4670353302611368e-05,
814
+ "loss": 2.8624,
815
+ "step": 57500
816
+ },
817
+ {
818
+ "epoch": 2.1382488479262673,
819
+ "grad_norm": 2.971496820449829,
820
+ "learning_rate": 1.4363133640552995e-05,
821
+ "loss": 2.8542,
822
+ "step": 58000
823
+ },
824
+ {
825
+ "epoch": 2.1566820276497696,
826
+ "grad_norm": 2.530744791030884,
827
+ "learning_rate": 1.4055913978494625e-05,
828
+ "loss": 2.878,
829
+ "step": 58500
830
+ },
831
+ {
832
+ "epoch": 2.175115207373272,
833
+ "grad_norm": 2.3559248447418213,
834
+ "learning_rate": 1.3748694316436254e-05,
835
+ "loss": 2.8378,
836
+ "step": 59000
837
+ },
838
+ {
839
+ "epoch": 2.193548387096774,
840
+ "grad_norm": 2.6017301082611084,
841
+ "learning_rate": 1.344147465437788e-05,
842
+ "loss": 2.8526,
843
+ "step": 59500
844
+ },
845
+ {
846
+ "epoch": 2.2119815668202767,
847
+ "grad_norm": 2.727224349975586,
848
+ "learning_rate": 1.313425499231951e-05,
849
+ "loss": 2.8457,
850
+ "step": 60000
851
+ },
852
+ {
853
+ "epoch": 2.230414746543779,
854
+ "grad_norm": 2.7515804767608643,
855
+ "learning_rate": 1.2827035330261136e-05,
856
+ "loss": 2.8422,
857
+ "step": 60500
858
+ },
859
+ {
860
+ "epoch": 2.248847926267281,
861
+ "grad_norm": 2.1259450912475586,
862
+ "learning_rate": 1.2519815668202767e-05,
863
+ "loss": 2.8552,
864
+ "step": 61000
865
+ },
866
+ {
867
+ "epoch": 2.2672811059907834,
868
+ "grad_norm": 2.3828954696655273,
869
+ "learning_rate": 1.2212596006144395e-05,
870
+ "loss": 2.833,
871
+ "step": 61500
872
+ },
873
+ {
874
+ "epoch": 2.2857142857142856,
875
+ "grad_norm": 2.588263988494873,
876
+ "learning_rate": 1.1905376344086022e-05,
877
+ "loss": 2.8259,
878
+ "step": 62000
879
+ },
880
+ {
881
+ "epoch": 2.3041474654377883,
882
+ "grad_norm": 2.4910430908203125,
883
+ "learning_rate": 1.159815668202765e-05,
884
+ "loss": 2.8322,
885
+ "step": 62500
886
+ },
887
+ {
888
+ "epoch": 2.3225806451612905,
889
+ "grad_norm": 2.4441442489624023,
890
+ "learning_rate": 1.129093701996928e-05,
891
+ "loss": 2.8368,
892
+ "step": 63000
893
+ },
894
+ {
895
+ "epoch": 2.3410138248847927,
896
+ "grad_norm": 2.8292665481567383,
897
+ "learning_rate": 1.0983717357910907e-05,
898
+ "loss": 2.8541,
899
+ "step": 63500
900
+ },
901
+ {
902
+ "epoch": 2.359447004608295,
903
+ "grad_norm": 3.0737709999084473,
904
+ "learning_rate": 1.0676497695852535e-05,
905
+ "loss": 2.8302,
906
+ "step": 64000
907
+ },
908
+ {
909
+ "epoch": 2.377880184331797,
910
+ "grad_norm": 2.6240386962890625,
911
+ "learning_rate": 1.0369278033794164e-05,
912
+ "loss": 2.8581,
913
+ "step": 64500
914
+ },
915
+ {
916
+ "epoch": 2.3963133640552994,
917
+ "grad_norm": 2.7089595794677734,
918
+ "learning_rate": 1.0062058371735791e-05,
919
+ "loss": 2.8291,
920
+ "step": 65000
921
+ },
922
+ {
923
+ "epoch": 2.4147465437788016,
924
+ "grad_norm": 2.5590097904205322,
925
+ "learning_rate": 9.754838709677421e-06,
926
+ "loss": 2.8423,
927
+ "step": 65500
928
+ },
929
+ {
930
+ "epoch": 2.4331797235023043,
931
+ "grad_norm": 2.529129981994629,
932
+ "learning_rate": 9.447619047619048e-06,
933
+ "loss": 2.8313,
934
+ "step": 66000
935
+ },
936
+ {
937
+ "epoch": 2.4516129032258065,
938
+ "grad_norm": 2.3878464698791504,
939
+ "learning_rate": 9.140399385560675e-06,
940
+ "loss": 2.8396,
941
+ "step": 66500
942
+ },
943
+ {
944
+ "epoch": 2.4700460829493087,
945
+ "grad_norm": 2.324528217315674,
946
+ "learning_rate": 8.833179723502306e-06,
947
+ "loss": 2.8406,
948
+ "step": 67000
949
+ },
950
+ {
951
+ "epoch": 2.488479262672811,
952
+ "grad_norm": 2.531818389892578,
953
+ "learning_rate": 8.525960061443933e-06,
954
+ "loss": 2.8283,
955
+ "step": 67500
956
+ },
957
+ {
958
+ "epoch": 2.506912442396313,
959
+ "grad_norm": 2.370063066482544,
960
+ "learning_rate": 8.218740399385561e-06,
961
+ "loss": 2.8218,
962
+ "step": 68000
963
+ },
964
+ {
965
+ "epoch": 2.525345622119816,
966
+ "grad_norm": 2.7173168659210205,
967
+ "learning_rate": 7.91152073732719e-06,
968
+ "loss": 2.8193,
969
+ "step": 68500
970
+ },
971
+ {
972
+ "epoch": 2.543778801843318,
973
+ "grad_norm": 2.893047571182251,
974
+ "learning_rate": 7.604301075268818e-06,
975
+ "loss": 2.8789,
976
+ "step": 69000
977
+ },
978
+ {
979
+ "epoch": 2.5622119815668203,
980
+ "grad_norm": 2.3326709270477295,
981
+ "learning_rate": 7.297081413210446e-06,
982
+ "loss": 2.82,
983
+ "step": 69500
984
+ },
985
+ {
986
+ "epoch": 2.5806451612903225,
987
+ "grad_norm": 2.6681976318359375,
988
+ "learning_rate": 6.989861751152074e-06,
989
+ "loss": 2.8304,
990
+ "step": 70000
991
+ },
992
+ {
993
+ "epoch": 2.5990783410138247,
994
+ "grad_norm": 2.398226261138916,
995
+ "learning_rate": 6.682642089093702e-06,
996
+ "loss": 2.8588,
997
+ "step": 70500
998
+ },
999
+ {
1000
+ "epoch": 2.6175115207373274,
1001
+ "grad_norm": 2.898515462875366,
1002
+ "learning_rate": 6.375422427035331e-06,
1003
+ "loss": 2.8197,
1004
+ "step": 71000
1005
+ },
1006
+ {
1007
+ "epoch": 2.6359447004608296,
1008
+ "grad_norm": 2.739598035812378,
1009
+ "learning_rate": 6.0682027649769585e-06,
1010
+ "loss": 2.8118,
1011
+ "step": 71500
1012
+ },
1013
+ {
1014
+ "epoch": 2.654377880184332,
1015
+ "grad_norm": 2.643958806991577,
1016
+ "learning_rate": 5.760983102918587e-06,
1017
+ "loss": 2.8237,
1018
+ "step": 72000
1019
+ },
1020
+ {
1021
+ "epoch": 2.672811059907834,
1022
+ "grad_norm": 2.4359323978424072,
1023
+ "learning_rate": 5.453763440860216e-06,
1024
+ "loss": 2.849,
1025
+ "step": 72500
1026
+ },
1027
+ {
1028
+ "epoch": 2.6912442396313363,
1029
+ "grad_norm": 2.789459228515625,
1030
+ "learning_rate": 5.146543778801844e-06,
1031
+ "loss": 2.8295,
1032
+ "step": 73000
1033
+ },
1034
+ {
1035
+ "epoch": 2.709677419354839,
1036
+ "grad_norm": 2.7510650157928467,
1037
+ "learning_rate": 4.8393241167434715e-06,
1038
+ "loss": 2.8261,
1039
+ "step": 73500
1040
+ },
1041
+ {
1042
+ "epoch": 2.7281105990783407,
1043
+ "grad_norm": 2.687920570373535,
1044
+ "learning_rate": 4.5321044546851e-06,
1045
+ "loss": 2.8247,
1046
+ "step": 74000
1047
+ },
1048
+ {
1049
+ "epoch": 2.7465437788018434,
1050
+ "grad_norm": 2.563568592071533,
1051
+ "learning_rate": 4.224884792626729e-06,
1052
+ "loss": 2.8077,
1053
+ "step": 74500
1054
+ },
1055
+ {
1056
+ "epoch": 2.7649769585253456,
1057
+ "grad_norm": 2.5233335494995117,
1058
+ "learning_rate": 3.917665130568357e-06,
1059
+ "loss": 2.8249,
1060
+ "step": 75000
1061
+ },
1062
+ {
1063
+ "epoch": 2.783410138248848,
1064
+ "grad_norm": 2.1638145446777344,
1065
+ "learning_rate": 3.610445468509985e-06,
1066
+ "loss": 2.8117,
1067
+ "step": 75500
1068
+ },
1069
+ {
1070
+ "epoch": 2.80184331797235,
1071
+ "grad_norm": 2.3863344192504883,
1072
+ "learning_rate": 3.303225806451613e-06,
1073
+ "loss": 2.8267,
1074
+ "step": 76000
1075
+ },
1076
+ {
1077
+ "epoch": 2.8202764976958523,
1078
+ "grad_norm": 2.6310081481933594,
1079
+ "learning_rate": 2.9960061443932414e-06,
1080
+ "loss": 2.7984,
1081
+ "step": 76500
1082
+ },
1083
+ {
1084
+ "epoch": 2.838709677419355,
1085
+ "grad_norm": 2.5833451747894287,
1086
+ "learning_rate": 2.6887864823348697e-06,
1087
+ "loss": 2.828,
1088
+ "step": 77000
1089
+ },
1090
+ {
1091
+ "epoch": 2.857142857142857,
1092
+ "grad_norm": 2.796090602874756,
1093
+ "learning_rate": 2.381566820276498e-06,
1094
+ "loss": 2.8328,
1095
+ "step": 77500
1096
+ },
1097
+ {
1098
+ "epoch": 2.8755760368663594,
1099
+ "grad_norm": 2.4523568153381348,
1100
+ "learning_rate": 2.074347158218126e-06,
1101
+ "loss": 2.8262,
1102
+ "step": 78000
1103
+ },
1104
+ {
1105
+ "epoch": 2.8940092165898617,
1106
+ "grad_norm": 2.7820804119110107,
1107
+ "learning_rate": 1.7671274961597542e-06,
1108
+ "loss": 2.8492,
1109
+ "step": 78500
1110
+ },
1111
+ {
1112
+ "epoch": 2.912442396313364,
1113
+ "grad_norm": 2.6446750164031982,
1114
+ "learning_rate": 1.4599078341013825e-06,
1115
+ "loss": 2.8004,
1116
+ "step": 79000
1117
+ },
1118
+ {
1119
+ "epoch": 2.9308755760368665,
1120
+ "grad_norm": 2.688173532485962,
1121
+ "learning_rate": 1.1526881720430107e-06,
1122
+ "loss": 2.8324,
1123
+ "step": 79500
1124
+ },
1125
+ {
1126
+ "epoch": 2.9493087557603688,
1127
+ "grad_norm": 2.5094845294952393,
1128
+ "learning_rate": 8.454685099846391e-07,
1129
+ "loss": 2.8299,
1130
+ "step": 80000
1131
+ },
1132
+ {
1133
+ "epoch": 2.967741935483871,
1134
+ "grad_norm": 3.0129997730255127,
1135
+ "learning_rate": 5.382488479262673e-07,
1136
+ "loss": 2.8397,
1137
+ "step": 80500
1138
+ },
1139
+ {
1140
+ "epoch": 2.986175115207373,
1141
+ "grad_norm": 2.273484706878662,
1142
+ "learning_rate": 2.3102918586789556e-07,
1143
+ "loss": 2.8117,
1144
+ "step": 81000
1145
+ },
1146
+ {
1147
+ "epoch": 3.0,
1148
+ "step": 81375,
1149
+ "total_flos": 3692222600970240.0,
1150
+ "train_loss": 3.0354640246975806,
1151
+ "train_runtime": 19788.2132,
1152
+ "train_samples_per_second": 131.59,
1153
+ "train_steps_per_second": 4.112
1154
  }
1155
  ],
1156
  "logging_steps": 500,
 
1165
  "should_evaluate": false,
1166
  "should_log": false,
1167
  "should_save": true,
1168
+ "should_training_stop": true
1169
  },
1170
  "attributes": {}
1171
  }
1172
  },
1173
+ "total_flos": 3692222600970240.0,
1174
  "train_batch_size": 32,
1175
  "trial_name": null,
1176
  "trial_params": null