Azrail commited on
Commit
cac81bd
·
verified ·
1 Parent(s): 0483c23

Training in progress, step 139000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:294d2d3cfce69d5bcc552541aff1b1d0c5c39d6adabe16e718423a5d850f0d32
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6173b4bc562c2e11366705c8c76e7d31698b3a60389b9a754914d9b8842cf90f
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57a69c4accd4194b5ef200a371a59ef019db1dfd38dcb87b64dd42832f583b7c
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3048a59b63da999ae8fc02b473b5d2a50c2be60b98f1004a6c79f0035ac60f1
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdb15604f71f08bf635b865cf27878158a353a64f3dcaa6e5902e3e52c7eb375
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ef3d8a81eedcecdd331f8207cd63df8c3721e9e06bbee141ce7de5f7de358d9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d87ee32367beeb896fbea0e404a77621c8cd628a4eb1251b30dc94e06f2eb792
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0f0628bbbac738b6a9aa97ca88652280d641a00de879a3f6b83636f7c99513d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.316529329692214,
6
  "eval_steps": 500,
7
- "global_step": 138000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -24572,11 +24572,189 @@
24572
  "eval_steps_per_second": 15.219,
24573
  "num_input_tokens_seen": 72340003200,
24574
  "step": 138000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24575
  }
24576
  ],
24577
  "logging_steps": 50,
24578
  "max_steps": 140000,
24579
- "num_input_tokens_seen": 72340003200,
24580
  "num_train_epochs": 2,
24581
  "save_steps": 1000,
24582
  "stateful_callbacks": {
@@ -24591,7 +24769,7 @@
24591
  "attributes": {}
24592
  }
24593
  },
24594
- "total_flos": 1.2802861084741632e+20,
24595
  "train_batch_size": 32,
24596
  "trial_name": null,
24597
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.3260693800159795,
6
  "eval_steps": 500,
7
+ "global_step": 139000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
24572
  "eval_steps_per_second": 15.219,
24573
  "num_input_tokens_seen": 72340003200,
24574
  "step": 138000
24575
+ },
24576
+ {
24577
+ "epoch": 1.3170063322084025,
24578
+ "grad_norm": 0.11734651029109955,
24579
+ "learning_rate": 1.191954812408308e-05,
24580
+ "loss": 2.0241,
24581
+ "num_input_tokens_seen": 72366217600,
24582
+ "step": 138050
24583
+ },
24584
+ {
24585
+ "epoch": 1.3174833347245907,
24586
+ "grad_norm": 0.11315104365348816,
24587
+ "learning_rate": 1.1318413143740436e-05,
24588
+ "loss": 2.0195,
24589
+ "num_input_tokens_seen": 72392425632,
24590
+ "step": 138100
24591
+ },
24592
+ {
24593
+ "epoch": 1.3179603372407789,
24594
+ "grad_norm": 0.11212780326604843,
24595
+ "learning_rate": 1.0732657886902309e-05,
24596
+ "loss": 2.0379,
24597
+ "num_input_tokens_seen": 72418637536,
24598
+ "step": 138150
24599
+ },
24600
+ {
24601
+ "epoch": 1.3184373397569673,
24602
+ "grad_norm": 0.11390957236289978,
24603
+ "learning_rate": 1.0162300788382261e-05,
24604
+ "loss": 2.0245,
24605
+ "num_input_tokens_seen": 72444850752,
24606
+ "step": 138200
24607
+ },
24608
+ {
24609
+ "epoch": 1.3189143422731555,
24610
+ "grad_norm": 0.11521212011575699,
24611
+ "learning_rate": 9.607359798384786e-06,
24612
+ "loss": 2.0313,
24613
+ "num_input_tokens_seen": 72471060032,
24614
+ "step": 138250
24615
+ },
24616
+ {
24617
+ "epoch": 1.3193913447893437,
24618
+ "grad_norm": 0.11375854164361954,
24619
+ "learning_rate": 9.0678523819408e-06,
24620
+ "loss": 2.0313,
24621
+ "num_input_tokens_seen": 72497274432,
24622
+ "step": 138300
24623
+ },
24624
+ {
24625
+ "epoch": 1.319868347305532,
24626
+ "grad_norm": 0.11399056017398834,
24627
+ "learning_rate": 8.543795518357766e-06,
24628
+ "loss": 2.0256,
24629
+ "num_input_tokens_seen": 72523485952,
24630
+ "step": 138350
24631
+ },
24632
+ {
24633
+ "epoch": 1.3203453498217204,
24634
+ "grad_norm": 0.11128194630146027,
24635
+ "learning_rate": 8.035205700685167e-06,
24636
+ "loss": 2.0338,
24637
+ "num_input_tokens_seen": 72549700352,
24638
+ "step": 138400
24639
+ },
24640
+ {
24641
+ "epoch": 1.3208223523379086,
24642
+ "grad_norm": 0.11179857701063156,
24643
+ "learning_rate": 7.542098935195918e-06,
24644
+ "loss": 2.0362,
24645
+ "num_input_tokens_seen": 72575912992,
24646
+ "step": 138450
24647
+ },
24648
+ {
24649
+ "epoch": 1.3212993548540968,
24650
+ "grad_norm": 0.11500924825668335,
24651
+ "learning_rate": 7.064490740882057e-06,
24652
+ "loss": 2.0285,
24653
+ "num_input_tokens_seen": 72602127392,
24654
+ "step": 138500
24655
+ },
24656
+ {
24657
+ "epoch": 1.3212993548540968,
24658
+ "eval_loss": 1.951123833656311,
24659
+ "eval_runtime": 82.6672,
24660
+ "eval_samples_per_second": 60.484,
24661
+ "eval_steps_per_second": 15.121,
24662
+ "num_input_tokens_seen": 72602127392,
24663
+ "step": 138500
24664
+ },
24665
+ {
24666
+ "epoch": 1.3217763573702852,
24667
+ "grad_norm": 0.1176285520195961,
24668
+ "learning_rate": 6.602396148966794e-06,
24669
+ "loss": 2.0295,
24670
+ "num_input_tokens_seen": 72628340704,
24671
+ "step": 138550
24672
+ },
24673
+ {
24674
+ "epoch": 1.3222533598864734,
24675
+ "grad_norm": 0.11359469592571259,
24676
+ "learning_rate": 6.15582970243117e-06,
24677
+ "loss": 2.0206,
24678
+ "num_input_tokens_seen": 72654548704,
24679
+ "step": 138600
24680
+ },
24681
+ {
24682
+ "epoch": 1.3227303624026616,
24683
+ "grad_norm": 0.11230379343032837,
24684
+ "learning_rate": 5.72480545555637e-06,
24685
+ "loss": 2.0285,
24686
+ "num_input_tokens_seen": 72680760704,
24687
+ "step": 138650
24688
+ },
24689
+ {
24690
+ "epoch": 1.3232073649188498,
24691
+ "grad_norm": 0.11325126886367798,
24692
+ "learning_rate": 5.309336973481682e-06,
24693
+ "loss": 2.0316,
24694
+ "num_input_tokens_seen": 72706975104,
24695
+ "step": 138700
24696
+ },
24697
+ {
24698
+ "epoch": 1.3236843674350383,
24699
+ "grad_norm": 0.11530512571334839,
24700
+ "learning_rate": 4.909437331777178e-06,
24701
+ "loss": 2.0295,
24702
+ "num_input_tokens_seen": 72733189504,
24703
+ "step": 138750
24704
+ },
24705
+ {
24706
+ "epoch": 1.3241613699512265,
24707
+ "grad_norm": 0.11637042462825775,
24708
+ "learning_rate": 4.52511911603265e-06,
24709
+ "loss": 2.0358,
24710
+ "num_input_tokens_seen": 72759403904,
24711
+ "step": 138800
24712
+ },
24713
+ {
24714
+ "epoch": 1.324638372467415,
24715
+ "grad_norm": 0.11307495832443237,
24716
+ "learning_rate": 4.15639442146093e-06,
24717
+ "loss": 2.0256,
24718
+ "num_input_tokens_seen": 72785609280,
24719
+ "step": 138850
24720
+ },
24721
+ {
24722
+ "epoch": 1.325115374983603,
24723
+ "grad_norm": 0.11408944427967072,
24724
+ "learning_rate": 3.803274852517968e-06,
24725
+ "loss": 2.0432,
24726
+ "num_input_tokens_seen": 72811823680,
24727
+ "step": 138900
24728
+ },
24729
+ {
24730
+ "epoch": 1.3255923774997913,
24731
+ "grad_norm": 0.11304306238889694,
24732
+ "learning_rate": 3.4657715225368535e-06,
24733
+ "loss": 2.0342,
24734
+ "num_input_tokens_seen": 72838035008,
24735
+ "step": 138950
24736
+ },
24737
+ {
24738
+ "epoch": 1.3260693800159795,
24739
+ "grad_norm": 0.11682960391044617,
24740
+ "learning_rate": 3.143895053378698e-06,
24741
+ "loss": 2.0353,
24742
+ "num_input_tokens_seen": 72864248896,
24743
+ "step": 139000
24744
+ },
24745
+ {
24746
+ "epoch": 1.3260693800159795,
24747
+ "eval_loss": 1.9510550498962402,
24748
+ "eval_runtime": 82.5623,
24749
+ "eval_samples_per_second": 60.56,
24750
+ "eval_steps_per_second": 15.14,
24751
+ "num_input_tokens_seen": 72864248896,
24752
+ "step": 139000
24753
  }
24754
  ],
24755
  "logging_steps": 50,
24756
  "max_steps": 140000,
24757
+ "num_input_tokens_seen": 72864248896,
24758
  "num_train_epochs": 2,
24759
  "save_steps": 1000,
24760
  "stateful_callbacks": {
 
24769
  "attributes": {}
24770
  }
24771
  },
24772
+ "total_flos": 1.2895643010692137e+20,
24773
  "train_batch_size": 32,
24774
  "trial_name": null,
24775
  "trial_params": null