CocoRoF commited on
Commit
c51ed94
·
verified ·
1 Parent(s): 8272978

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6b296b786d5bd2121d4bba7dc69f589c0f5e83973a65ff1a6c67b9053cf2381
3
  size 735217848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c854e311156a00fb209c1d5b18bf088757f9e875811a4af0292d4b051e6c6446
3
  size 735217848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ec8e23938fd7c55c2ccf015da2bd80ad3d755e4ee0d6634623f07ae27d815c7
3
  size 1470521978
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de307672e8d7a91febef716662f5657dacb787fa03178bc23fb37badcafa7ed1
3
  size 1470521978
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:558a4b0fbdb033d779e0f95e05927694fea0f2ec8f7e3ce8de68c5939e6b9f27
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:038ef74d9d7647e927602a31e3ff40ed015ce2147efee9b81efc43a4be3f559b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:228d14efa38075e5075e5f3ea1c158f27661d545dab61c548dfe15e36f9e3d44
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fca3c514eb217652ef846414a7b25fe2d542ec928f14020a84d1e47090ecb880
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.23430178069353327,
5
  "eval_steps": 100,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -437,6 +437,436 @@
437
  "eval_spearman_manhattan": 0.8187222998801444,
438
  "eval_steps_per_second": 15.744,
439
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
440
  }
441
  ],
442
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.46860356138706655,
5
  "eval_steps": 100,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
437
  "eval_spearman_manhattan": 0.8187222998801444,
438
  "eval_steps_per_second": 15.744,
439
  "step": 500
440
+ },
441
+ {
442
+ "epoch": 0.23898781630740393,
443
+ "grad_norm": 1.9643880128860474,
444
+ "learning_rate": 4.850632614807873e-05,
445
+ "loss": 0.4231,
446
+ "step": 510
447
+ },
448
+ {
449
+ "epoch": 0.2436738519212746,
450
+ "grad_norm": 1.7972699403762817,
451
+ "learning_rate": 4.847703842549204e-05,
452
+ "loss": 0.3893,
453
+ "step": 520
454
+ },
455
+ {
456
+ "epoch": 0.24835988753514526,
457
+ "grad_norm": 1.6312799453735352,
458
+ "learning_rate": 4.844775070290534e-05,
459
+ "loss": 0.3869,
460
+ "step": 530
461
+ },
462
+ {
463
+ "epoch": 0.2530459231490159,
464
+ "grad_norm": 1.8009634017944336,
465
+ "learning_rate": 4.841846298031865e-05,
466
+ "loss": 0.4,
467
+ "step": 540
468
+ },
469
+ {
470
+ "epoch": 0.25773195876288657,
471
+ "grad_norm": 1.1453664302825928,
472
+ "learning_rate": 4.838917525773196e-05,
473
+ "loss": 0.3568,
474
+ "step": 550
475
+ },
476
+ {
477
+ "epoch": 0.2624179943767573,
478
+ "grad_norm": 1.683673620223999,
479
+ "learning_rate": 4.8359887535145274e-05,
480
+ "loss": 0.3674,
481
+ "step": 560
482
+ },
483
+ {
484
+ "epoch": 0.26710402999062793,
485
+ "grad_norm": 1.3695913553237915,
486
+ "learning_rate": 4.833059981255858e-05,
487
+ "loss": 0.3817,
488
+ "step": 570
489
+ },
490
+ {
491
+ "epoch": 0.2717900656044986,
492
+ "grad_norm": 1.8631620407104492,
493
+ "learning_rate": 4.8301312089971884e-05,
494
+ "loss": 0.3546,
495
+ "step": 580
496
+ },
497
+ {
498
+ "epoch": 0.27647610121836924,
499
+ "grad_norm": 1.5883185863494873,
500
+ "learning_rate": 4.827202436738519e-05,
501
+ "loss": 0.3973,
502
+ "step": 590
503
+ },
504
+ {
505
+ "epoch": 0.28116213683223995,
506
+ "grad_norm": 1.7056660652160645,
507
+ "learning_rate": 4.82427366447985e-05,
508
+ "loss": 0.3754,
509
+ "step": 600
510
+ },
511
+ {
512
+ "epoch": 0.28116213683223995,
513
+ "eval_loss": 0.059529613703489304,
514
+ "eval_pearson_cosine": 0.8104839357699305,
515
+ "eval_pearson_dot": 0.7706455959096417,
516
+ "eval_pearson_euclidean": 0.8087417307856555,
517
+ "eval_pearson_manhattan": 0.8101299227665919,
518
+ "eval_runtime": 6.0212,
519
+ "eval_samples_per_second": 249.121,
520
+ "eval_spearman_cosine": 0.8125251228747598,
521
+ "eval_spearman_dot": 0.7680727600884657,
522
+ "eval_spearman_euclidean": 0.8146128693278114,
523
+ "eval_spearman_manhattan": 0.8161621120875591,
524
+ "eval_steps_per_second": 15.612,
525
+ "step": 600
526
+ },
527
+ {
528
+ "epoch": 0.2858481724461106,
529
+ "grad_norm": 1.8564058542251587,
530
+ "learning_rate": 4.821344892221181e-05,
531
+ "loss": 0.4016,
532
+ "step": 610
533
+ },
534
+ {
535
+ "epoch": 0.29053420805998126,
536
+ "grad_norm": 1.467993974685669,
537
+ "learning_rate": 4.818416119962512e-05,
538
+ "loss": 0.3858,
539
+ "step": 620
540
+ },
541
+ {
542
+ "epoch": 0.2952202436738519,
543
+ "grad_norm": 2.3624465465545654,
544
+ "learning_rate": 4.815487347703843e-05,
545
+ "loss": 0.3796,
546
+ "step": 630
547
+ },
548
+ {
549
+ "epoch": 0.29990627928772257,
550
+ "grad_norm": 1.588629126548767,
551
+ "learning_rate": 4.8125585754451736e-05,
552
+ "loss": 0.3667,
553
+ "step": 640
554
+ },
555
+ {
556
+ "epoch": 0.3045923149015933,
557
+ "grad_norm": 1.380112886428833,
558
+ "learning_rate": 4.8096298031865044e-05,
559
+ "loss": 0.3453,
560
+ "step": 650
561
+ },
562
+ {
563
+ "epoch": 0.30927835051546393,
564
+ "grad_norm": 1.4270693063735962,
565
+ "learning_rate": 4.806701030927835e-05,
566
+ "loss": 0.3345,
567
+ "step": 660
568
+ },
569
+ {
570
+ "epoch": 0.3139643861293346,
571
+ "grad_norm": 2.204744338989258,
572
+ "learning_rate": 4.803772258669166e-05,
573
+ "loss": 0.395,
574
+ "step": 670
575
+ },
576
+ {
577
+ "epoch": 0.31865042174320524,
578
+ "grad_norm": 1.4480923414230347,
579
+ "learning_rate": 4.800843486410497e-05,
580
+ "loss": 0.3691,
581
+ "step": 680
582
+ },
583
+ {
584
+ "epoch": 0.3233364573570759,
585
+ "grad_norm": 1.8864325284957886,
586
+ "learning_rate": 4.797914714151828e-05,
587
+ "loss": 0.3986,
588
+ "step": 690
589
+ },
590
+ {
591
+ "epoch": 0.3280224929709466,
592
+ "grad_norm": 1.3784370422363281,
593
+ "learning_rate": 4.794985941893159e-05,
594
+ "loss": 0.3729,
595
+ "step": 700
596
+ },
597
+ {
598
+ "epoch": 0.3280224929709466,
599
+ "eval_loss": 0.061924997717142105,
600
+ "eval_pearson_cosine": 0.8155259960165324,
601
+ "eval_pearson_dot": 0.7761366153485074,
602
+ "eval_pearson_euclidean": 0.8127568794877789,
603
+ "eval_pearson_manhattan": 0.8144288026347226,
604
+ "eval_runtime": 6.0626,
605
+ "eval_samples_per_second": 247.42,
606
+ "eval_spearman_cosine": 0.8175981152530937,
607
+ "eval_spearman_dot": 0.7736443532595881,
608
+ "eval_spearman_euclidean": 0.8195662973032031,
609
+ "eval_spearman_manhattan": 0.8212465310439688,
610
+ "eval_steps_per_second": 15.505,
611
+ "step": 700
612
+ },
613
+ {
614
+ "epoch": 0.33270852858481725,
615
+ "grad_norm": 1.7109017372131348,
616
+ "learning_rate": 4.7920571696344895e-05,
617
+ "loss": 0.3345,
618
+ "step": 710
619
+ },
620
+ {
621
+ "epoch": 0.3373945641986879,
622
+ "grad_norm": 1.8547511100769043,
623
+ "learning_rate": 4.7891283973758204e-05,
624
+ "loss": 0.3735,
625
+ "step": 720
626
+ },
627
+ {
628
+ "epoch": 0.34208059981255856,
629
+ "grad_norm": 1.5369923114776611,
630
+ "learning_rate": 4.786199625117151e-05,
631
+ "loss": 0.3304,
632
+ "step": 730
633
+ },
634
+ {
635
+ "epoch": 0.3467666354264292,
636
+ "grad_norm": 1.308568000793457,
637
+ "learning_rate": 4.783270852858482e-05,
638
+ "loss": 0.3717,
639
+ "step": 740
640
+ },
641
+ {
642
+ "epoch": 0.3514526710402999,
643
+ "grad_norm": 1.3743574619293213,
644
+ "learning_rate": 4.780342080599813e-05,
645
+ "loss": 0.3381,
646
+ "step": 750
647
+ },
648
+ {
649
+ "epoch": 0.3561387066541706,
650
+ "grad_norm": 1.874657154083252,
651
+ "learning_rate": 4.777413308341144e-05,
652
+ "loss": 0.3193,
653
+ "step": 760
654
+ },
655
+ {
656
+ "epoch": 0.36082474226804123,
657
+ "grad_norm": 1.4700101613998413,
658
+ "learning_rate": 4.774484536082475e-05,
659
+ "loss": 0.3799,
660
+ "step": 770
661
+ },
662
+ {
663
+ "epoch": 0.3655107778819119,
664
+ "grad_norm": 1.5662988424301147,
665
+ "learning_rate": 4.771555763823805e-05,
666
+ "loss": 0.3453,
667
+ "step": 780
668
+ },
669
+ {
670
+ "epoch": 0.3701968134957826,
671
+ "grad_norm": 1.4666754007339478,
672
+ "learning_rate": 4.768626991565136e-05,
673
+ "loss": 0.3175,
674
+ "step": 790
675
+ },
676
+ {
677
+ "epoch": 0.37488284910965325,
678
+ "grad_norm": 1.3993242979049683,
679
+ "learning_rate": 4.765698219306467e-05,
680
+ "loss": 0.341,
681
+ "step": 800
682
+ },
683
+ {
684
+ "epoch": 0.37488284910965325,
685
+ "eval_loss": 0.05296875163912773,
686
+ "eval_pearson_cosine": 0.8137295797811834,
687
+ "eval_pearson_dot": 0.7695932846417932,
688
+ "eval_pearson_euclidean": 0.8189567419998482,
689
+ "eval_pearson_manhattan": 0.8200667930673546,
690
+ "eval_runtime": 6.3175,
691
+ "eval_samples_per_second": 237.436,
692
+ "eval_spearman_cosine": 0.815541427803139,
693
+ "eval_spearman_dot": 0.7663341686268886,
694
+ "eval_spearman_euclidean": 0.8233566840888671,
695
+ "eval_spearman_manhattan": 0.8246092914965037,
696
+ "eval_steps_per_second": 14.879,
697
+ "step": 800
698
+ },
699
+ {
700
+ "epoch": 0.3795688847235239,
701
+ "grad_norm": 1.4209802150726318,
702
+ "learning_rate": 4.762769447047798e-05,
703
+ "loss": 0.3831,
704
+ "step": 810
705
+ },
706
+ {
707
+ "epoch": 0.38425492033739456,
708
+ "grad_norm": 1.4097892045974731,
709
+ "learning_rate": 4.759840674789129e-05,
710
+ "loss": 0.3318,
711
+ "step": 820
712
+ },
713
+ {
714
+ "epoch": 0.3889409559512652,
715
+ "grad_norm": 1.706900715827942,
716
+ "learning_rate": 4.756911902530459e-05,
717
+ "loss": 0.3328,
718
+ "step": 830
719
+ },
720
+ {
721
+ "epoch": 0.3936269915651359,
722
+ "grad_norm": 1.610275149345398,
723
+ "learning_rate": 4.75398313027179e-05,
724
+ "loss": 0.3575,
725
+ "step": 840
726
+ },
727
+ {
728
+ "epoch": 0.3983130271790066,
729
+ "grad_norm": 1.4575105905532837,
730
+ "learning_rate": 4.751054358013121e-05,
731
+ "loss": 0.307,
732
+ "step": 850
733
+ },
734
+ {
735
+ "epoch": 0.4029990627928772,
736
+ "grad_norm": 1.638424277305603,
737
+ "learning_rate": 4.7481255857544524e-05,
738
+ "loss": 0.3504,
739
+ "step": 860
740
+ },
741
+ {
742
+ "epoch": 0.4076850984067479,
743
+ "grad_norm": 1.8157601356506348,
744
+ "learning_rate": 4.745196813495783e-05,
745
+ "loss": 0.3931,
746
+ "step": 870
747
+ },
748
+ {
749
+ "epoch": 0.41237113402061853,
750
+ "grad_norm": 1.6680104732513428,
751
+ "learning_rate": 4.7422680412371134e-05,
752
+ "loss": 0.362,
753
+ "step": 880
754
+ },
755
+ {
756
+ "epoch": 0.41705716963448924,
757
+ "grad_norm": 1.4331028461456299,
758
+ "learning_rate": 4.739339268978444e-05,
759
+ "loss": 0.3451,
760
+ "step": 890
761
+ },
762
+ {
763
+ "epoch": 0.4217432052483599,
764
+ "grad_norm": 1.3940101861953735,
765
+ "learning_rate": 4.736410496719775e-05,
766
+ "loss": 0.3161,
767
+ "step": 900
768
+ },
769
+ {
770
+ "epoch": 0.4217432052483599,
771
+ "eval_loss": 0.05680527910590172,
772
+ "eval_pearson_cosine": 0.816164907471336,
773
+ "eval_pearson_dot": 0.7659985241939467,
774
+ "eval_pearson_euclidean": 0.8198292531320703,
775
+ "eval_pearson_manhattan": 0.8209187797411488,
776
+ "eval_runtime": 6.6335,
777
+ "eval_samples_per_second": 226.126,
778
+ "eval_spearman_cosine": 0.8181742542924034,
779
+ "eval_spearman_dot": 0.7624851760530289,
780
+ "eval_spearman_euclidean": 0.8251528076462932,
781
+ "eval_spearman_manhattan": 0.8261936560831687,
782
+ "eval_steps_per_second": 14.171,
783
+ "step": 900
784
+ },
785
+ {
786
+ "epoch": 0.42642924086223055,
787
+ "grad_norm": 1.5849499702453613,
788
+ "learning_rate": 4.733481724461106e-05,
789
+ "loss": 0.2852,
790
+ "step": 910
791
+ },
792
+ {
793
+ "epoch": 0.4311152764761012,
794
+ "grad_norm": 1.8611364364624023,
795
+ "learning_rate": 4.7305529522024375e-05,
796
+ "loss": 0.3517,
797
+ "step": 920
798
+ },
799
+ {
800
+ "epoch": 0.43580131208997186,
801
+ "grad_norm": 1.759479284286499,
802
+ "learning_rate": 4.727624179943768e-05,
803
+ "loss": 0.3309,
804
+ "step": 930
805
+ },
806
+ {
807
+ "epoch": 0.44048734770384257,
808
+ "grad_norm": 1.3715683221817017,
809
+ "learning_rate": 4.7246954076850985e-05,
810
+ "loss": 0.2964,
811
+ "step": 940
812
+ },
813
+ {
814
+ "epoch": 0.4451733833177132,
815
+ "grad_norm": 1.6326545476913452,
816
+ "learning_rate": 4.7217666354264294e-05,
817
+ "loss": 0.3501,
818
+ "step": 950
819
+ },
820
+ {
821
+ "epoch": 0.4498594189315839,
822
+ "grad_norm": 1.238206148147583,
823
+ "learning_rate": 4.71883786316776e-05,
824
+ "loss": 0.3366,
825
+ "step": 960
826
+ },
827
+ {
828
+ "epoch": 0.45454545454545453,
829
+ "grad_norm": 1.6656396389007568,
830
+ "learning_rate": 4.715909090909091e-05,
831
+ "loss": 0.3594,
832
+ "step": 970
833
+ },
834
+ {
835
+ "epoch": 0.4592314901593252,
836
+ "grad_norm": 1.5264825820922852,
837
+ "learning_rate": 4.712980318650422e-05,
838
+ "loss": 0.3309,
839
+ "step": 980
840
+ },
841
+ {
842
+ "epoch": 0.4639175257731959,
843
+ "grad_norm": 1.4031989574432373,
844
+ "learning_rate": 4.710051546391753e-05,
845
+ "loss": 0.3616,
846
+ "step": 990
847
+ },
848
+ {
849
+ "epoch": 0.46860356138706655,
850
+ "grad_norm": 1.439453125,
851
+ "learning_rate": 4.7071227741330836e-05,
852
+ "loss": 0.3122,
853
+ "step": 1000
854
+ },
855
+ {
856
+ "epoch": 0.46860356138706655,
857
+ "eval_loss": 0.05414344370365143,
858
+ "eval_pearson_cosine": 0.8215390057088641,
859
+ "eval_pearson_dot": 0.7789934072191471,
860
+ "eval_pearson_euclidean": 0.8206818537339018,
861
+ "eval_pearson_manhattan": 0.8219733991381624,
862
+ "eval_runtime": 6.2607,
863
+ "eval_samples_per_second": 239.588,
864
+ "eval_spearman_cosine": 0.8235945278831797,
865
+ "eval_spearman_dot": 0.7745226194646113,
866
+ "eval_spearman_euclidean": 0.8268444005248111,
867
+ "eval_spearman_manhattan": 0.8284194308491212,
868
+ "eval_steps_per_second": 15.014,
869
+ "step": 1000
870
  }
871
  ],
872
  "logging_steps": 10,