Sabbir772 commited on
Commit
e32615e
·
verified ·
1 Parent(s): 2bb2506

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe2a15bf99c0f458b2206d6f35220f4c958e95e187565eba49da4fb8564ae369
3
  size 990197608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bd739e3030cabd9261b2fb8ca1428184e2711c13722ac619bc934519bf3051c
3
  size 990197608
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c91c5b91e574bf8d32168fe23a0d8f97d2c0d7a1ad5ad3fe64951d9602e1bdc7
3
  size 1980569867
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da0a7cfc818dd03b695689292cf139f33a9b667b8a8e134f4c54f6a29163bbd4
3
  size 1980569867
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2276a80239d2bec0f27af2510173a92b0a5242a76a5b11dff11d2bba9784d26
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e86f46b4125143fd9131eedafad3b1c11fa3edb1d514dbfb16332dc9646a522
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa9bf41974fd17e7e1e3fb4258cf1a9ed2a23fdde64d539be88a5a088f814ff5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:723dfd3f2d8a568ebabe037a79a9ffd4dd726a3f691a794a96a82766663bf137
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0,
6
  "eval_steps": 500,
7
- "global_step": 5241,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -380,6 +380,378 @@
380
  "eval_samples_per_second": 23.052,
381
  "eval_steps_per_second": 2.885,
382
  "step": 5241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
383
  }
384
  ],
385
  "logging_steps": 100,
@@ -399,7 +771,7 @@
399
  "attributes": {}
400
  }
401
  },
402
- "total_flos": 1.4352011987779584e+16,
403
  "train_batch_size": 8,
404
  "trial_name": null,
405
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
+ "global_step": 10482,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
380
  "eval_samples_per_second": 23.052,
381
  "eval_steps_per_second": 2.885,
382
  "step": 5241
383
+ },
384
+ {
385
+ "epoch": 1.0112573936271705,
386
+ "grad_norm": 6.653578281402588,
387
+ "learning_rate": 4.4944667048273234e-05,
388
+ "loss": 2.8934,
389
+ "step": 5300
390
+ },
391
+ {
392
+ "epoch": 1.0303377218088152,
393
+ "grad_norm": 4.845913887023926,
394
+ "learning_rate": 4.484926540736501e-05,
395
+ "loss": 2.6501,
396
+ "step": 5400
397
+ },
398
+ {
399
+ "epoch": 1.0494180499904597,
400
+ "grad_norm": 5.232843399047852,
401
+ "learning_rate": 4.475386376645678e-05,
402
+ "loss": 2.7856,
403
+ "step": 5500
404
+ },
405
+ {
406
+ "epoch": 1.0684983781721045,
407
+ "grad_norm": 6.629660129547119,
408
+ "learning_rate": 4.465846212554856e-05,
409
+ "loss": 2.8155,
410
+ "step": 5600
411
+ },
412
+ {
413
+ "epoch": 1.0875787063537492,
414
+ "grad_norm": 4.096762180328369,
415
+ "learning_rate": 4.456306048464034e-05,
416
+ "loss": 2.789,
417
+ "step": 5700
418
+ },
419
+ {
420
+ "epoch": 1.106659034535394,
421
+ "grad_norm": 6.521574020385742,
422
+ "learning_rate": 4.4467658843732114e-05,
423
+ "loss": 2.749,
424
+ "step": 5800
425
+ },
426
+ {
427
+ "epoch": 1.1257393627170387,
428
+ "grad_norm": 4.849175930023193,
429
+ "learning_rate": 4.4372257202823894e-05,
430
+ "loss": 2.7133,
431
+ "step": 5900
432
+ },
433
+ {
434
+ "epoch": 1.1448196908986834,
435
+ "grad_norm": 5.988980770111084,
436
+ "learning_rate": 4.427685556191567e-05,
437
+ "loss": 2.704,
438
+ "step": 6000
439
+ },
440
+ {
441
+ "epoch": 1.1639000190803281,
442
+ "grad_norm": 4.623400688171387,
443
+ "learning_rate": 4.418145392100744e-05,
444
+ "loss": 2.6576,
445
+ "step": 6100
446
+ },
447
+ {
448
+ "epoch": 1.1829803472619729,
449
+ "grad_norm": 4.707732200622559,
450
+ "learning_rate": 4.408605228009922e-05,
451
+ "loss": 2.7199,
452
+ "step": 6200
453
+ },
454
+ {
455
+ "epoch": 1.2020606754436176,
456
+ "grad_norm": 6.403053283691406,
457
+ "learning_rate": 4.399065063919099e-05,
458
+ "loss": 2.6899,
459
+ "step": 6300
460
+ },
461
+ {
462
+ "epoch": 1.2211410036252623,
463
+ "grad_norm": 6.6001152992248535,
464
+ "learning_rate": 4.389524899828277e-05,
465
+ "loss": 2.7606,
466
+ "step": 6400
467
+ },
468
+ {
469
+ "epoch": 1.240221331806907,
470
+ "grad_norm": 5.213536739349365,
471
+ "learning_rate": 4.379984735737455e-05,
472
+ "loss": 2.7052,
473
+ "step": 6500
474
+ },
475
+ {
476
+ "epoch": 1.2593016599885518,
477
+ "grad_norm": 4.794836044311523,
478
+ "learning_rate": 4.3704445716466326e-05,
479
+ "loss": 2.7579,
480
+ "step": 6600
481
+ },
482
+ {
483
+ "epoch": 1.2783819881701965,
484
+ "grad_norm": 5.8824143409729,
485
+ "learning_rate": 4.3609044075558106e-05,
486
+ "loss": 2.6076,
487
+ "step": 6700
488
+ },
489
+ {
490
+ "epoch": 1.2974623163518413,
491
+ "grad_norm": 6.338265419006348,
492
+ "learning_rate": 4.351364243464988e-05,
493
+ "loss": 2.5968,
494
+ "step": 6800
495
+ },
496
+ {
497
+ "epoch": 1.316542644533486,
498
+ "grad_norm": 6.901667594909668,
499
+ "learning_rate": 4.341824079374165e-05,
500
+ "loss": 2.6482,
501
+ "step": 6900
502
+ },
503
+ {
504
+ "epoch": 1.3356229727151308,
505
+ "grad_norm": 6.267462730407715,
506
+ "learning_rate": 4.332283915283343e-05,
507
+ "loss": 2.6434,
508
+ "step": 7000
509
+ },
510
+ {
511
+ "epoch": 1.3547033008967755,
512
+ "grad_norm": 5.581694602966309,
513
+ "learning_rate": 4.3227437511925206e-05,
514
+ "loss": 2.5478,
515
+ "step": 7100
516
+ },
517
+ {
518
+ "epoch": 1.3737836290784202,
519
+ "grad_norm": 4.70138692855835,
520
+ "learning_rate": 4.3132035871016985e-05,
521
+ "loss": 2.5637,
522
+ "step": 7200
523
+ },
524
+ {
525
+ "epoch": 1.392863957260065,
526
+ "grad_norm": 6.99065637588501,
527
+ "learning_rate": 4.303663423010876e-05,
528
+ "loss": 2.5464,
529
+ "step": 7300
530
+ },
531
+ {
532
+ "epoch": 1.4119442854417095,
533
+ "grad_norm": 6.660660743713379,
534
+ "learning_rate": 4.294123258920054e-05,
535
+ "loss": 2.5766,
536
+ "step": 7400
537
+ },
538
+ {
539
+ "epoch": 1.4310246136233542,
540
+ "grad_norm": 5.83965539932251,
541
+ "learning_rate": 4.284583094829231e-05,
542
+ "loss": 2.5757,
543
+ "step": 7500
544
+ },
545
+ {
546
+ "epoch": 1.450104941804999,
547
+ "grad_norm": 5.41910982131958,
548
+ "learning_rate": 4.2750429307384085e-05,
549
+ "loss": 2.6146,
550
+ "step": 7600
551
+ },
552
+ {
553
+ "epoch": 1.4691852699866437,
554
+ "grad_norm": 4.368034839630127,
555
+ "learning_rate": 4.2655027666475865e-05,
556
+ "loss": 2.4073,
557
+ "step": 7700
558
+ },
559
+ {
560
+ "epoch": 1.4882655981682884,
561
+ "grad_norm": 3.716670036315918,
562
+ "learning_rate": 4.255962602556764e-05,
563
+ "loss": 2.5729,
564
+ "step": 7800
565
+ },
566
+ {
567
+ "epoch": 1.5073459263499331,
568
+ "grad_norm": 4.219146251678467,
569
+ "learning_rate": 4.246422438465942e-05,
570
+ "loss": 2.4766,
571
+ "step": 7900
572
+ },
573
+ {
574
+ "epoch": 1.5264262545315779,
575
+ "grad_norm": 5.474557399749756,
576
+ "learning_rate": 4.23688227437512e-05,
577
+ "loss": 2.5176,
578
+ "step": 8000
579
+ },
580
+ {
581
+ "epoch": 1.5455065827132226,
582
+ "grad_norm": 93.11466217041016,
583
+ "learning_rate": 4.227342110284297e-05,
584
+ "loss": 2.3887,
585
+ "step": 8100
586
+ },
587
+ {
588
+ "epoch": 1.5645869108948673,
589
+ "grad_norm": 6.055609703063965,
590
+ "learning_rate": 4.217801946193475e-05,
591
+ "loss": 2.5206,
592
+ "step": 8200
593
+ },
594
+ {
595
+ "epoch": 1.583667239076512,
596
+ "grad_norm": 6.243997573852539,
597
+ "learning_rate": 4.2082617821026524e-05,
598
+ "loss": 2.4598,
599
+ "step": 8300
600
+ },
601
+ {
602
+ "epoch": 1.6027475672581568,
603
+ "grad_norm": 5.589599132537842,
604
+ "learning_rate": 4.19872161801183e-05,
605
+ "loss": 2.4932,
606
+ "step": 8400
607
+ },
608
+ {
609
+ "epoch": 1.6218278954398015,
610
+ "grad_norm": 6.761661052703857,
611
+ "learning_rate": 4.189181453921008e-05,
612
+ "loss": 2.4582,
613
+ "step": 8500
614
+ },
615
+ {
616
+ "epoch": 1.6409082236214463,
617
+ "grad_norm": 4.9730963706970215,
618
+ "learning_rate": 4.179641289830185e-05,
619
+ "loss": 2.4221,
620
+ "step": 8600
621
+ },
622
+ {
623
+ "epoch": 1.659988551803091,
624
+ "grad_norm": 6.11653995513916,
625
+ "learning_rate": 4.170101125739363e-05,
626
+ "loss": 2.4959,
627
+ "step": 8700
628
+ },
629
+ {
630
+ "epoch": 1.6790688799847358,
631
+ "grad_norm": 5.8039398193359375,
632
+ "learning_rate": 4.160560961648541e-05,
633
+ "loss": 2.4431,
634
+ "step": 8800
635
+ },
636
+ {
637
+ "epoch": 1.6981492081663805,
638
+ "grad_norm": 4.404674530029297,
639
+ "learning_rate": 4.1510207975577184e-05,
640
+ "loss": 2.5057,
641
+ "step": 8900
642
+ },
643
+ {
644
+ "epoch": 1.7172295363480252,
645
+ "grad_norm": 4.7745256423950195,
646
+ "learning_rate": 4.141480633466896e-05,
647
+ "loss": 2.4474,
648
+ "step": 9000
649
+ },
650
+ {
651
+ "epoch": 1.73630986452967,
652
+ "grad_norm": 4.619002819061279,
653
+ "learning_rate": 4.131940469376073e-05,
654
+ "loss": 2.3865,
655
+ "step": 9100
656
+ },
657
+ {
658
+ "epoch": 1.7553901927113147,
659
+ "grad_norm": 5.063472270965576,
660
+ "learning_rate": 4.122400305285251e-05,
661
+ "loss": 2.3381,
662
+ "step": 9200
663
+ },
664
+ {
665
+ "epoch": 1.7744705208929594,
666
+ "grad_norm": 5.410485744476318,
667
+ "learning_rate": 4.112860141194429e-05,
668
+ "loss": 2.4036,
669
+ "step": 9300
670
+ },
671
+ {
672
+ "epoch": 1.7935508490746042,
673
+ "grad_norm": 5.242465496063232,
674
+ "learning_rate": 4.103319977103606e-05,
675
+ "loss": 2.3774,
676
+ "step": 9400
677
+ },
678
+ {
679
+ "epoch": 1.812631177256249,
680
+ "grad_norm": 5.760533809661865,
681
+ "learning_rate": 4.093779813012784e-05,
682
+ "loss": 2.3344,
683
+ "step": 9500
684
+ },
685
+ {
686
+ "epoch": 1.8317115054378936,
687
+ "grad_norm": 6.042536735534668,
688
+ "learning_rate": 4.0842396489219616e-05,
689
+ "loss": 2.37,
690
+ "step": 9600
691
+ },
692
+ {
693
+ "epoch": 1.8507918336195384,
694
+ "grad_norm": 5.290925025939941,
695
+ "learning_rate": 4.0746994848311396e-05,
696
+ "loss": 2.4738,
697
+ "step": 9700
698
+ },
699
+ {
700
+ "epoch": 1.869872161801183,
701
+ "grad_norm": 4.640475273132324,
702
+ "learning_rate": 4.065159320740317e-05,
703
+ "loss": 2.3234,
704
+ "step": 9800
705
+ },
706
+ {
707
+ "epoch": 1.8889524899828278,
708
+ "grad_norm": 6.546270847320557,
709
+ "learning_rate": 4.055619156649494e-05,
710
+ "loss": 2.412,
711
+ "step": 9900
712
+ },
713
+ {
714
+ "epoch": 1.9080328181644726,
715
+ "grad_norm": 4.5001325607299805,
716
+ "learning_rate": 4.046078992558672e-05,
717
+ "loss": 2.3115,
718
+ "step": 10000
719
+ },
720
+ {
721
+ "epoch": 1.9271131463461173,
722
+ "grad_norm": 4.442992210388184,
723
+ "learning_rate": 4.0365388284678495e-05,
724
+ "loss": 2.3329,
725
+ "step": 10100
726
+ },
727
+ {
728
+ "epoch": 1.946193474527762,
729
+ "grad_norm": 4.229004383087158,
730
+ "learning_rate": 4.0269986643770275e-05,
731
+ "loss": 2.2672,
732
+ "step": 10200
733
+ },
734
+ {
735
+ "epoch": 1.9652738027094065,
736
+ "grad_norm": 5.293257713317871,
737
+ "learning_rate": 4.0174585002862055e-05,
738
+ "loss": 2.3171,
739
+ "step": 10300
740
+ },
741
+ {
742
+ "epoch": 1.9843541308910513,
743
+ "grad_norm": 5.781225681304932,
744
+ "learning_rate": 4.007918336195383e-05,
745
+ "loss": 2.4069,
746
+ "step": 10400
747
+ },
748
+ {
749
+ "epoch": 2.0,
750
+ "eval_loss": 1.8545597791671753,
751
+ "eval_runtime": 201.9056,
752
+ "eval_samples_per_second": 23.07,
753
+ "eval_steps_per_second": 2.887,
754
+ "step": 10482
755
  }
756
  ],
757
  "logging_steps": 100,
 
771
  "attributes": {}
772
  }
773
  },
774
+ "total_flos": 2.870402397555917e+16,
775
  "train_batch_size": 8,
776
  "trial_name": null,
777
  "trial_params": null