rovdetection commited on
Commit
d496973
·
verified ·
1 Parent(s): a231b74

Training in progress, step 7000, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -26,7 +26,7 @@
26
  "rope_type": "default"
27
  },
28
  "tie_word_embeddings": false,
29
- "transformers_version": "5.8.0",
30
  "use_cache": false,
31
  "vocab_size": 50257
32
  }
 
26
  "rope_type": "default"
27
  },
28
  "tie_word_embeddings": false,
29
+ "transformers_version": "5.8.1",
30
  "use_cache": false,
31
  "vocab_size": 50257
32
  }
last-checkpoint/generation_config.json CHANGED
@@ -5,6 +5,6 @@
5
  "output_attentions": false,
6
  "output_hidden_states": false,
7
  "pad_token_id": 0,
8
- "transformers_version": "5.8.0",
9
  "use_cache": true
10
  }
 
5
  "output_attentions": false,
6
  "output_hidden_states": false,
7
  "pad_token_id": 0,
8
+ "transformers_version": "5.8.1",
9
  "use_cache": true
10
  }
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f30fded0506f5abc758d4b29697667a6c05e1a95e6c9e0686c7f108dc11e49a0
3
  size 4523108832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5357a9d40da12fb2def448cb62f5c3aed9fa0739e8f2c2c0ec7fec7354241c0
3
  size 4523108832
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62b7d9c7d41c6b73fcbd87de771044e3b905349cd837f4d94e3d38a8f1a95531
3
  size 2912179275
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:107bb493c2caa0cd6fa40a129559390214dfd1f7e807e09d56c5d2aebb1c47e9
3
  size 2912179275
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd61fdcd042c7209212ed4986c4cd6aa5d57a730e78431a78580137d7a601038
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bbe17a5fe328cdb084ec10afd49d2cb7bb9035b6736206385c2c19bf9f66c41
3
  size 14645
last-checkpoint/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb083ebf35d0e4448f128a01dd67912abe022f1092025a85ebc3f0f8d0fcb3ba
3
  size 1383
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1831663cfc396872c81aa655b9b01b2d24ce27a5ca47e7cf38d581dbb6adb922
3
  size 1383
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41630e2df6dcc25df36d789c3c5fa21ee69d78ee6210c7879940bab6e3977772
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86a40e24c6251fa4890a76644fa4c014362ca533cd573d7139b9e226b18d8d04
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 7.0128,
6
  "eval_steps": 500,
7
- "global_step": 6500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4558,6 +4558,356 @@
4558
  "learning_rate": 7.004e-05,
4559
  "loss": 0.030304345488548278,
4560
  "step": 6500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4561
  }
4562
  ],
4563
  "logging_steps": 10,
@@ -4577,7 +4927,7 @@
4577
  "attributes": {}
4578
  }
4579
  },
4580
- "total_flos": 5.501953492276101e+17,
4581
  "train_batch_size": 1,
4582
  "trial_name": null,
4583
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0188,
6
  "eval_steps": 500,
7
+ "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4558
  "learning_rate": 7.004e-05,
4559
  "loss": 0.030304345488548278,
4560
  "step": 6500
4561
+ },
4562
+ {
4563
+ "epoch": 1.001,
4564
+ "grad_norm": 0.17337968945503235,
4565
+ "learning_rate": 6.984000000000001e-05,
4566
+ "loss": 0.04341032803058624,
4567
+ "step": 6510
4568
+ },
4569
+ {
4570
+ "epoch": 1.002,
4571
+ "grad_norm": 0.14447154104709625,
4572
+ "learning_rate": 6.964e-05,
4573
+ "loss": 0.02765703499317169,
4574
+ "step": 6520
4575
+ },
4576
+ {
4577
+ "epoch": 1.003,
4578
+ "grad_norm": 0.1359182447195053,
4579
+ "learning_rate": 6.944e-05,
4580
+ "loss": 0.027671322226524353,
4581
+ "step": 6530
4582
+ },
4583
+ {
4584
+ "epoch": 1.004,
4585
+ "grad_norm": 0.1486900895833969,
4586
+ "learning_rate": 6.924000000000001e-05,
4587
+ "loss": 0.024222801625728606,
4588
+ "step": 6540
4589
+ },
4590
+ {
4591
+ "epoch": 1.005,
4592
+ "grad_norm": 0.17833292484283447,
4593
+ "learning_rate": 6.904e-05,
4594
+ "loss": 0.024685877561569213,
4595
+ "step": 6550
4596
+ },
4597
+ {
4598
+ "epoch": 1.006,
4599
+ "grad_norm": 0.1444295197725296,
4600
+ "learning_rate": 6.884e-05,
4601
+ "loss": 0.027162906527519227,
4602
+ "step": 6560
4603
+ },
4604
+ {
4605
+ "epoch": 1.007,
4606
+ "grad_norm": 0.18214137852191925,
4607
+ "learning_rate": 6.864000000000001e-05,
4608
+ "loss": 0.02484011501073837,
4609
+ "step": 6570
4610
+ },
4611
+ {
4612
+ "epoch": 1.008,
4613
+ "grad_norm": 0.16924667358398438,
4614
+ "learning_rate": 6.844e-05,
4615
+ "loss": 0.025604432821273802,
4616
+ "step": 6580
4617
+ },
4618
+ {
4619
+ "epoch": 1.009,
4620
+ "grad_norm": 0.16066834330558777,
4621
+ "learning_rate": 6.824e-05,
4622
+ "loss": 0.025216665863990784,
4623
+ "step": 6590
4624
+ },
4625
+ {
4626
+ "epoch": 1.01,
4627
+ "grad_norm": 0.14215150475502014,
4628
+ "learning_rate": 6.804e-05,
4629
+ "loss": 0.024471497535705565,
4630
+ "step": 6600
4631
+ },
4632
+ {
4633
+ "epoch": 1.011,
4634
+ "grad_norm": 0.128558948636055,
4635
+ "learning_rate": 6.784e-05,
4636
+ "loss": 0.02575332820415497,
4637
+ "step": 6610
4638
+ },
4639
+ {
4640
+ "epoch": 1.012,
4641
+ "grad_norm": 0.33806535601615906,
4642
+ "learning_rate": 6.764000000000001e-05,
4643
+ "loss": 0.025676625967025756,
4644
+ "step": 6620
4645
+ },
4646
+ {
4647
+ "epoch": 1.013,
4648
+ "grad_norm": 0.16000686585903168,
4649
+ "learning_rate": 6.744e-05,
4650
+ "loss": 0.025471991300582884,
4651
+ "step": 6630
4652
+ },
4653
+ {
4654
+ "epoch": 1.014,
4655
+ "grad_norm": 0.19417209923267365,
4656
+ "learning_rate": 6.724e-05,
4657
+ "loss": 0.0291011780500412,
4658
+ "step": 6640
4659
+ },
4660
+ {
4661
+ "epoch": 1.015,
4662
+ "grad_norm": 0.2124422937631607,
4663
+ "learning_rate": 6.704000000000001e-05,
4664
+ "loss": 0.02770865261554718,
4665
+ "step": 6650
4666
+ },
4667
+ {
4668
+ "epoch": 1.016,
4669
+ "grad_norm": 0.16012416779994965,
4670
+ "learning_rate": 6.684e-05,
4671
+ "loss": 0.03058260679244995,
4672
+ "step": 6660
4673
+ },
4674
+ {
4675
+ "epoch": 1.017,
4676
+ "grad_norm": 0.1718786507844925,
4677
+ "learning_rate": 6.664e-05,
4678
+ "loss": 0.02695387601852417,
4679
+ "step": 6670
4680
+ },
4681
+ {
4682
+ "epoch": 1.018,
4683
+ "grad_norm": 0.18105700612068176,
4684
+ "learning_rate": 6.644000000000001e-05,
4685
+ "loss": 0.026461568474769593,
4686
+ "step": 6680
4687
+ },
4688
+ {
4689
+ "epoch": 1.019,
4690
+ "grad_norm": 0.15785875916481018,
4691
+ "learning_rate": 6.624e-05,
4692
+ "loss": 0.02598581314086914,
4693
+ "step": 6690
4694
+ },
4695
+ {
4696
+ "epoch": 1.02,
4697
+ "grad_norm": 0.15605275332927704,
4698
+ "learning_rate": 6.604e-05,
4699
+ "loss": 0.026878923177719116,
4700
+ "step": 6700
4701
+ },
4702
+ {
4703
+ "epoch": 1.021,
4704
+ "grad_norm": 0.1681290715932846,
4705
+ "learning_rate": 6.584e-05,
4706
+ "loss": 0.026533681154251098,
4707
+ "step": 6710
4708
+ },
4709
+ {
4710
+ "epoch": 1.022,
4711
+ "grad_norm": 0.17714430391788483,
4712
+ "learning_rate": 6.564e-05,
4713
+ "loss": 0.026042383909225465,
4714
+ "step": 6720
4715
+ },
4716
+ {
4717
+ "epoch": 1.023,
4718
+ "grad_norm": 0.17928007245063782,
4719
+ "learning_rate": 6.544e-05,
4720
+ "loss": 0.026147887110710144,
4721
+ "step": 6730
4722
+ },
4723
+ {
4724
+ "epoch": 1.024,
4725
+ "grad_norm": 0.21016575396060944,
4726
+ "learning_rate": 6.524e-05,
4727
+ "loss": 0.026558607816696167,
4728
+ "step": 6740
4729
+ },
4730
+ {
4731
+ "epoch": 1.025,
4732
+ "grad_norm": 0.19502075016498566,
4733
+ "learning_rate": 6.504e-05,
4734
+ "loss": 0.02726261019706726,
4735
+ "step": 6750
4736
+ },
4737
+ {
4738
+ "epoch": 1.026,
4739
+ "grad_norm": 0.1563853919506073,
4740
+ "learning_rate": 6.484e-05,
4741
+ "loss": 0.025972676277160645,
4742
+ "step": 6760
4743
+ },
4744
+ {
4745
+ "epoch": 1.027,
4746
+ "grad_norm": 0.15548229217529297,
4747
+ "learning_rate": 6.464e-05,
4748
+ "loss": 0.027564069628715514,
4749
+ "step": 6770
4750
+ },
4751
+ {
4752
+ "epoch": 1.028,
4753
+ "grad_norm": 0.18457072973251343,
4754
+ "learning_rate": 6.444e-05,
4755
+ "loss": 0.027495378255844118,
4756
+ "step": 6780
4757
+ },
4758
+ {
4759
+ "epoch": 1.029,
4760
+ "grad_norm": 0.15842361748218536,
4761
+ "learning_rate": 6.424e-05,
4762
+ "loss": 0.026171448826789855,
4763
+ "step": 6790
4764
+ },
4765
+ {
4766
+ "epoch": 1.03,
4767
+ "grad_norm": 0.19476866722106934,
4768
+ "learning_rate": 6.404e-05,
4769
+ "loss": 0.031318637728691104,
4770
+ "step": 6800
4771
+ },
4772
+ {
4773
+ "epoch": 1.031,
4774
+ "grad_norm": 0.1272721141576767,
4775
+ "learning_rate": 6.384e-05,
4776
+ "loss": 0.026132452487945556,
4777
+ "step": 6810
4778
+ },
4779
+ {
4780
+ "epoch": 2.0008,
4781
+ "grad_norm": 0.14176732301712036,
4782
+ "learning_rate": 6.364e-05,
4783
+ "loss": 0.03041217029094696,
4784
+ "step": 6820
4785
+ },
4786
+ {
4787
+ "epoch": 2.0018,
4788
+ "grad_norm": 0.14128795266151428,
4789
+ "learning_rate": 6.344e-05,
4790
+ "loss": 0.02396068423986435,
4791
+ "step": 6830
4792
+ },
4793
+ {
4794
+ "epoch": 2.0028,
4795
+ "grad_norm": 0.14129574596881866,
4796
+ "learning_rate": 6.324e-05,
4797
+ "loss": 0.02366064041852951,
4798
+ "step": 6840
4799
+ },
4800
+ {
4801
+ "epoch": 2.0038,
4802
+ "grad_norm": 0.1310533732175827,
4803
+ "learning_rate": 6.303999999999999e-05,
4804
+ "loss": 0.02287290096282959,
4805
+ "step": 6850
4806
+ },
4807
+ {
4808
+ "epoch": 2.0048,
4809
+ "grad_norm": 0.11340674012899399,
4810
+ "learning_rate": 6.284e-05,
4811
+ "loss": 0.02178637236356735,
4812
+ "step": 6860
4813
+ },
4814
+ {
4815
+ "epoch": 2.0058,
4816
+ "grad_norm": 0.10936598479747772,
4817
+ "learning_rate": 6.264e-05,
4818
+ "loss": 0.02486586421728134,
4819
+ "step": 6870
4820
+ },
4821
+ {
4822
+ "epoch": 2.0068,
4823
+ "grad_norm": 0.14033988118171692,
4824
+ "learning_rate": 6.244e-05,
4825
+ "loss": 0.022671811282634735,
4826
+ "step": 6880
4827
+ },
4828
+ {
4829
+ "epoch": 2.0078,
4830
+ "grad_norm": 0.1289263367652893,
4831
+ "learning_rate": 6.224e-05,
4832
+ "loss": 0.021725392341613768,
4833
+ "step": 6890
4834
+ },
4835
+ {
4836
+ "epoch": 2.0088,
4837
+ "grad_norm": 0.13452701270580292,
4838
+ "learning_rate": 6.204e-05,
4839
+ "loss": 0.02472696304321289,
4840
+ "step": 6900
4841
+ },
4842
+ {
4843
+ "epoch": 2.0098,
4844
+ "grad_norm": 0.14499792456626892,
4845
+ "learning_rate": 6.184e-05,
4846
+ "loss": 0.022322843968868255,
4847
+ "step": 6910
4848
+ },
4849
+ {
4850
+ "epoch": 2.0108,
4851
+ "grad_norm": 0.1398196518421173,
4852
+ "learning_rate": 6.164e-05,
4853
+ "loss": 0.022544071078300476,
4854
+ "step": 6920
4855
+ },
4856
+ {
4857
+ "epoch": 2.0118,
4858
+ "grad_norm": 0.1454416811466217,
4859
+ "learning_rate": 6.144e-05,
4860
+ "loss": 0.022087126970291138,
4861
+ "step": 6930
4862
+ },
4863
+ {
4864
+ "epoch": 2.0128,
4865
+ "grad_norm": 0.12601584196090698,
4866
+ "learning_rate": 6.124e-05,
4867
+ "loss": 0.023739957809448244,
4868
+ "step": 6940
4869
+ },
4870
+ {
4871
+ "epoch": 2.0138,
4872
+ "grad_norm": 0.13553965091705322,
4873
+ "learning_rate": 6.104000000000001e-05,
4874
+ "loss": 0.024189202487468718,
4875
+ "step": 6950
4876
+ },
4877
+ {
4878
+ "epoch": 2.0148,
4879
+ "grad_norm": 0.17853738367557526,
4880
+ "learning_rate": 6.084000000000001e-05,
4881
+ "loss": 0.024137826263904573,
4882
+ "step": 6960
4883
+ },
4884
+ {
4885
+ "epoch": 2.0158,
4886
+ "grad_norm": 0.1292908936738968,
4887
+ "learning_rate": 6.064000000000001e-05,
4888
+ "loss": 0.029603716731071473,
4889
+ "step": 6970
4890
+ },
4891
+ {
4892
+ "epoch": 2.0168,
4893
+ "grad_norm": 0.13389606773853302,
4894
+ "learning_rate": 6.044000000000001e-05,
4895
+ "loss": 0.02270825654268265,
4896
+ "step": 6980
4897
+ },
4898
+ {
4899
+ "epoch": 2.0178,
4900
+ "grad_norm": 0.10673966258764267,
4901
+ "learning_rate": 6.0240000000000006e-05,
4902
+ "loss": 0.0229933500289917,
4903
+ "step": 6990
4904
+ },
4905
+ {
4906
+ "epoch": 2.0188,
4907
+ "grad_norm": 0.13245923817157745,
4908
+ "learning_rate": 6.004000000000001e-05,
4909
+ "loss": 0.02357790470123291,
4910
+ "step": 7000
4911
  }
4912
  ],
4913
  "logging_steps": 10,
 
4927
  "attributes": {}
4928
  }
4929
  },
4930
+ "total_flos": 5.92542131384107e+17,
4931
  "train_batch_size": 1,
4932
  "trial_name": null,
4933
  "trial_params": null