mohammadmahdinouri commited on
Commit
5ff5a7a
·
verified ·
1 Parent(s): 31de4ff

Training in progress, step 71000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a07616faa045066ec0c6bc3a39f81fd70daf70607d96d0e781dd6c34dcb93bac
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:757633efe84a53c5ec97a90a7f4675f908dbeafb070171c08276f4ceae89bf82
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9a037d3fe4119e06bc9061da9083521b60e6c6995c76fa79acf6ccf4d47db48
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c319382408e536debfaba9985144c2b85aedc267f1adb41fa2fcd682a710d69
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce52a0fe6f5e9f18a4b09c87839e49a605cf4ac6c8b60ff37506c33748b93356
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b55180ad5c333f626bc6ef839beda747e8f0633fdb8a2329d1af0642155fcad0
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3330dae2d90b27ab31e1b3a875fe5cc81976ac373bbcaffe36a7fd41e6b0b4f7
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fb9e669a1e66d6084675ac17f9361f1d66f6538870dda5d62bb9fedf0717021
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c261298b41c9253298dd0b7fe9d0a70e7ec2c12dc3d995355a08c0fc31994d03
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93c86e46203b6a91184b0093d776c5c5cbb5568a55f409f62928f5b11605d793
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1a0b78acdb08786028f54a8bd9831bbf3ab754e9fc46f50ba80121f99f2998b
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57649dd5fae41007b8326ad8bceda3664e8263c16462c398827f7c60518777a9
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a426a703a44cb3e7bbfa24a198521fd4285b0c71838a0067a8a121cdfc2dd80
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:065fc078fd1aeeb645695c18fb1eff98c533b26302779a57f06b17d1e0565e6a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.10369202874935562,
6
  "eval_steps": 500,
7
- "global_step": 70000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -24508,6 +24508,356 @@
24508
  "learning_rate": 0.000482837447795457,
24509
  "loss": 16.578,
24510
  "step": 70000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24511
  }
24512
  ],
24513
  "logging_steps": 20,
@@ -24527,7 +24877,7 @@
24527
  "attributes": {}
24528
  }
24529
  },
24530
- "total_flos": 5.146634442721408e+19,
24531
  "train_batch_size": 48,
24532
  "trial_name": null,
24533
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.10517334344577499,
6
  "eval_steps": 500,
7
+ "global_step": 71000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
24508
  "learning_rate": 0.000482837447795457,
24509
  "loss": 16.578,
24510
  "step": 70000
24511
+ },
24512
+ {
24513
+ "epoch": 0.10372165504328401,
24514
+ "grad_norm": 7.25,
24515
+ "learning_rate": 0.0004828325088604494,
24516
+ "loss": 16.562,
24517
+ "step": 70020
24518
+ },
24519
+ {
24520
+ "epoch": 0.1037512813372124,
24521
+ "grad_norm": 6.625,
24522
+ "learning_rate": 0.00048282756992544187,
24523
+ "loss": 16.5095,
24524
+ "step": 70040
24525
+ },
24526
+ {
24527
+ "epoch": 0.10378090763114078,
24528
+ "grad_norm": 6.09375,
24529
+ "learning_rate": 0.00048282263099043427,
24530
+ "loss": 16.5207,
24531
+ "step": 70060
24532
+ },
24533
+ {
24534
+ "epoch": 0.10381053392506917,
24535
+ "grad_norm": 6.625,
24536
+ "learning_rate": 0.0004828176920554267,
24537
+ "loss": 16.5471,
24538
+ "step": 70080
24539
+ },
24540
+ {
24541
+ "epoch": 0.10384016021899757,
24542
+ "grad_norm": 6.6875,
24543
+ "learning_rate": 0.00048281275312041916,
24544
+ "loss": 16.524,
24545
+ "step": 70100
24546
+ },
24547
+ {
24548
+ "epoch": 0.10386978651292596,
24549
+ "grad_norm": 6.1875,
24550
+ "learning_rate": 0.0004828078141854116,
24551
+ "loss": 16.4794,
24552
+ "step": 70120
24553
+ },
24554
+ {
24555
+ "epoch": 0.10389941280685434,
24556
+ "grad_norm": 6.75,
24557
+ "learning_rate": 0.000482802875250404,
24558
+ "loss": 16.5229,
24559
+ "step": 70140
24560
+ },
24561
+ {
24562
+ "epoch": 0.10392903910078273,
24563
+ "grad_norm": 6.125,
24564
+ "learning_rate": 0.00048279793631539645,
24565
+ "loss": 16.5296,
24566
+ "step": 70160
24567
+ },
24568
+ {
24569
+ "epoch": 0.10395866539471112,
24570
+ "grad_norm": 6.59375,
24571
+ "learning_rate": 0.0004827929973803889,
24572
+ "loss": 16.5783,
24573
+ "step": 70180
24574
+ },
24575
+ {
24576
+ "epoch": 0.1039882916886395,
24577
+ "grad_norm": 6.8125,
24578
+ "learning_rate": 0.0004827880584453813,
24579
+ "loss": 16.5537,
24580
+ "step": 70200
24581
+ },
24582
+ {
24583
+ "epoch": 0.10401791798256789,
24584
+ "grad_norm": 6.75,
24585
+ "learning_rate": 0.00048278311951037374,
24586
+ "loss": 16.5815,
24587
+ "step": 70220
24588
+ },
24589
+ {
24590
+ "epoch": 0.10404754427649628,
24591
+ "grad_norm": 8.6875,
24592
+ "learning_rate": 0.0004827781805753662,
24593
+ "loss": 16.5732,
24594
+ "step": 70240
24595
+ },
24596
+ {
24597
+ "epoch": 0.10407717057042466,
24598
+ "grad_norm": 6.5,
24599
+ "learning_rate": 0.00048277324164035864,
24600
+ "loss": 16.5605,
24601
+ "step": 70260
24602
+ },
24603
+ {
24604
+ "epoch": 0.10410679686435305,
24605
+ "grad_norm": 6.65625,
24606
+ "learning_rate": 0.00048276830270535103,
24607
+ "loss": 16.5224,
24608
+ "step": 70280
24609
+ },
24610
+ {
24611
+ "epoch": 0.10413642315828144,
24612
+ "grad_norm": 7.0625,
24613
+ "learning_rate": 0.0004827633637703435,
24614
+ "loss": 16.5452,
24615
+ "step": 70300
24616
+ },
24617
+ {
24618
+ "epoch": 0.10416604945220982,
24619
+ "grad_norm": 6.71875,
24620
+ "learning_rate": 0.0004827584248353359,
24621
+ "loss": 16.5098,
24622
+ "step": 70320
24623
+ },
24624
+ {
24625
+ "epoch": 0.10419567574613821,
24626
+ "grad_norm": 6.25,
24627
+ "learning_rate": 0.0004827534859003284,
24628
+ "loss": 16.5676,
24629
+ "step": 70340
24630
+ },
24631
+ {
24632
+ "epoch": 0.1042253020400666,
24633
+ "grad_norm": 6.875,
24634
+ "learning_rate": 0.00048274854696532077,
24635
+ "loss": 16.4514,
24636
+ "step": 70360
24637
+ },
24638
+ {
24639
+ "epoch": 0.10425492833399498,
24640
+ "grad_norm": 6.40625,
24641
+ "learning_rate": 0.0004827436080303132,
24642
+ "loss": 16.5532,
24643
+ "step": 70380
24644
+ },
24645
+ {
24646
+ "epoch": 0.10428455462792338,
24647
+ "grad_norm": 6.65625,
24648
+ "learning_rate": 0.00048273866909530566,
24649
+ "loss": 16.5367,
24650
+ "step": 70400
24651
+ },
24652
+ {
24653
+ "epoch": 0.10431418092185177,
24654
+ "grad_norm": 6.15625,
24655
+ "learning_rate": 0.0004827337301602981,
24656
+ "loss": 16.5784,
24657
+ "step": 70420
24658
+ },
24659
+ {
24660
+ "epoch": 0.10434380721578015,
24661
+ "grad_norm": 6.59375,
24662
+ "learning_rate": 0.0004827287912252905,
24663
+ "loss": 16.5085,
24664
+ "step": 70440
24665
+ },
24666
+ {
24667
+ "epoch": 0.10437343350970854,
24668
+ "grad_norm": 6.53125,
24669
+ "learning_rate": 0.00048272385229028295,
24670
+ "loss": 16.5486,
24671
+ "step": 70460
24672
+ },
24673
+ {
24674
+ "epoch": 0.10440305980363693,
24675
+ "grad_norm": 5.75,
24676
+ "learning_rate": 0.0004827189133552754,
24677
+ "loss": 16.4874,
24678
+ "step": 70480
24679
+ },
24680
+ {
24681
+ "epoch": 0.10443268609756531,
24682
+ "grad_norm": 6.84375,
24683
+ "learning_rate": 0.00048271397442026785,
24684
+ "loss": 16.4957,
24685
+ "step": 70500
24686
+ },
24687
+ {
24688
+ "epoch": 0.1044623123914937,
24689
+ "grad_norm": 6.75,
24690
+ "learning_rate": 0.00048270903548526024,
24691
+ "loss": 16.5765,
24692
+ "step": 70520
24693
+ },
24694
+ {
24695
+ "epoch": 0.10449193868542209,
24696
+ "grad_norm": 8.125,
24697
+ "learning_rate": 0.00048270409655025263,
24698
+ "loss": 16.5136,
24699
+ "step": 70540
24700
+ },
24701
+ {
24702
+ "epoch": 0.10452156497935047,
24703
+ "grad_norm": 6.78125,
24704
+ "learning_rate": 0.00048269915761524514,
24705
+ "loss": 16.5299,
24706
+ "step": 70560
24707
+ },
24708
+ {
24709
+ "epoch": 0.10455119127327886,
24710
+ "grad_norm": 6.6875,
24711
+ "learning_rate": 0.00048269421868023753,
24712
+ "loss": 16.4576,
24713
+ "step": 70580
24714
+ },
24715
+ {
24716
+ "epoch": 0.10458081756720725,
24717
+ "grad_norm": 6.84375,
24718
+ "learning_rate": 0.00048268927974523,
24719
+ "loss": 16.4926,
24720
+ "step": 70600
24721
+ },
24722
+ {
24723
+ "epoch": 0.10461044386113563,
24724
+ "grad_norm": 6.75,
24725
+ "learning_rate": 0.0004826843408102224,
24726
+ "loss": 16.4703,
24727
+ "step": 70620
24728
+ },
24729
+ {
24730
+ "epoch": 0.10464007015506402,
24731
+ "grad_norm": 6.96875,
24732
+ "learning_rate": 0.0004826794018752149,
24733
+ "loss": 16.5149,
24734
+ "step": 70640
24735
+ },
24736
+ {
24737
+ "epoch": 0.1046696964489924,
24738
+ "grad_norm": 6.3125,
24739
+ "learning_rate": 0.00048267446294020727,
24740
+ "loss": 16.4792,
24741
+ "step": 70660
24742
+ },
24743
+ {
24744
+ "epoch": 0.10469932274292079,
24745
+ "grad_norm": 6.59375,
24746
+ "learning_rate": 0.0004826695240051997,
24747
+ "loss": 16.5295,
24748
+ "step": 70680
24749
+ },
24750
+ {
24751
+ "epoch": 0.10472894903684918,
24752
+ "grad_norm": 8.125,
24753
+ "learning_rate": 0.00048266458507019216,
24754
+ "loss": 16.5405,
24755
+ "step": 70700
24756
+ },
24757
+ {
24758
+ "epoch": 0.10475857533077758,
24759
+ "grad_norm": 7.90625,
24760
+ "learning_rate": 0.0004826596461351846,
24761
+ "loss": 16.543,
24762
+ "step": 70720
24763
+ },
24764
+ {
24765
+ "epoch": 0.10478820162470596,
24766
+ "grad_norm": 6.3125,
24767
+ "learning_rate": 0.000482654707200177,
24768
+ "loss": 16.5194,
24769
+ "step": 70740
24770
+ },
24771
+ {
24772
+ "epoch": 0.10481782791863435,
24773
+ "grad_norm": 6.75,
24774
+ "learning_rate": 0.00048264976826516945,
24775
+ "loss": 16.5217,
24776
+ "step": 70760
24777
+ },
24778
+ {
24779
+ "epoch": 0.10484745421256274,
24780
+ "grad_norm": 6.09375,
24781
+ "learning_rate": 0.0004826448293301619,
24782
+ "loss": 16.5735,
24783
+ "step": 70780
24784
+ },
24785
+ {
24786
+ "epoch": 0.10487708050649112,
24787
+ "grad_norm": 6.125,
24788
+ "learning_rate": 0.00048263989039515435,
24789
+ "loss": 16.4886,
24790
+ "step": 70800
24791
+ },
24792
+ {
24793
+ "epoch": 0.10490670680041951,
24794
+ "grad_norm": 7.71875,
24795
+ "learning_rate": 0.00048263495146014674,
24796
+ "loss": 16.4874,
24797
+ "step": 70820
24798
+ },
24799
+ {
24800
+ "epoch": 0.1049363330943479,
24801
+ "grad_norm": 6.59375,
24802
+ "learning_rate": 0.00048263001252513913,
24803
+ "loss": 16.5424,
24804
+ "step": 70840
24805
+ },
24806
+ {
24807
+ "epoch": 0.10496595938827628,
24808
+ "grad_norm": 6.84375,
24809
+ "learning_rate": 0.00048262507359013164,
24810
+ "loss": 16.5108,
24811
+ "step": 70860
24812
+ },
24813
+ {
24814
+ "epoch": 0.10499558568220467,
24815
+ "grad_norm": 6.71875,
24816
+ "learning_rate": 0.00048262013465512403,
24817
+ "loss": 16.4778,
24818
+ "step": 70880
24819
+ },
24820
+ {
24821
+ "epoch": 0.10502521197613306,
24822
+ "grad_norm": 7.21875,
24823
+ "learning_rate": 0.0004826151957201165,
24824
+ "loss": 16.4908,
24825
+ "step": 70900
24826
+ },
24827
+ {
24828
+ "epoch": 0.10505483827006144,
24829
+ "grad_norm": 6.8125,
24830
+ "learning_rate": 0.0004826102567851089,
24831
+ "loss": 16.4564,
24832
+ "step": 70920
24833
+ },
24834
+ {
24835
+ "epoch": 0.10508446456398983,
24836
+ "grad_norm": 6.625,
24837
+ "learning_rate": 0.0004826053178501014,
24838
+ "loss": 16.5258,
24839
+ "step": 70940
24840
+ },
24841
+ {
24842
+ "epoch": 0.10511409085791822,
24843
+ "grad_norm": 6.21875,
24844
+ "learning_rate": 0.00048260037891509377,
24845
+ "loss": 16.5403,
24846
+ "step": 70960
24847
+ },
24848
+ {
24849
+ "epoch": 0.1051437171518466,
24850
+ "grad_norm": 7.375,
24851
+ "learning_rate": 0.0004825954399800862,
24852
+ "loss": 16.575,
24853
+ "step": 70980
24854
+ },
24855
+ {
24856
+ "epoch": 0.10517334344577499,
24857
+ "grad_norm": 6.4375,
24858
+ "learning_rate": 0.00048259050104507866,
24859
+ "loss": 16.5599,
24860
+ "step": 71000
24861
  }
24862
  ],
24863
  "logging_steps": 20,
 
24877
  "attributes": {}
24878
  }
24879
  },
24880
+ "total_flos": 5.220171364156257e+19,
24881
  "train_batch_size": 48,
24882
  "trial_name": null,
24883
  "trial_params": null