Azrail commited on
Commit
0a14e70
·
verified ·
1 Parent(s): 5005ea0

Training in progress, step 133000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9e57d7de320997016d5d2199393f3c6d5ccbb8649da9e46ae713874cd8a8e24
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ad31cc8051a774ff84bf50a2f043b12568d60c659ab713450ad489e60ff067
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f36f287a4da99bdaf6e0deca55af9eddec679234fd5785a93d74c5b7275a731
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f98fbf6f84fc645d4e9351e4872ab3409232339169c895981f2ca6168553f54
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3da6ad8ffd940afd42f47dbccd6a99fedee37b4e239b9c682223ad1635ee1326
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eda9968c0f9e110957e79edd3603196e5c46bdd8acc1a9a916fa49100e905254
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21aed170a2d0b5ca9750f891383cff878afad1161fd25ef679259c6d8c42258b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c40f5e3cc10bc35190c452a89f96d672b73ffd5edfe6d4e72f9d0b88f5a7c9a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2592890277496214,
6
  "eval_steps": 500,
7
- "global_step": 132000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -23504,11 +23504,189 @@
23504
  "eval_steps_per_second": 15.132,
23505
  "num_input_tokens_seen": 69194840608,
23506
  "step": 132000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23507
  }
23508
  ],
23509
  "logging_steps": 50,
23510
  "max_steps": 140000,
23511
- "num_input_tokens_seen": 69194840608,
23512
  "num_train_epochs": 2,
23513
  "save_steps": 1000,
23514
  "stateful_callbacks": {
@@ -23523,7 +23701,7 @@
23523
  "attributes": {}
23524
  }
23525
  },
23526
- "total_flos": 1.224622467372331e+20,
23527
  "train_batch_size": 32,
23528
  "trial_name": null,
23529
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2688290780733869,
6
  "eval_steps": 500,
7
+ "global_step": 133000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
23504
  "eval_steps_per_second": 15.132,
23505
  "num_input_tokens_seen": 69194840608,
23506
  "step": 132000
23507
+ },
23508
+ {
23509
+ "epoch": 1.2597660302658096,
23510
+ "grad_norm": 0.11991748213768005,
23511
+ "learning_rate": 0.0001860669839912626,
23512
+ "loss": 2.0354,
23513
+ "num_input_tokens_seen": 69221050496,
23514
+ "step": 132050
23515
+ },
23516
+ {
23517
+ "epoch": 1.260243032781998,
23518
+ "grad_norm": 0.11859247088432312,
23519
+ "learning_rate": 0.00018388874897104518,
23520
+ "loss": 2.0449,
23521
+ "num_input_tokens_seen": 69247257536,
23522
+ "step": 132100
23523
+ },
23524
+ {
23525
+ "epoch": 1.2607200352981862,
23526
+ "grad_norm": 0.12269642949104309,
23527
+ "learning_rate": 0.00018172046256311088,
23528
+ "loss": 2.0427,
23529
+ "num_input_tokens_seen": 69273469824,
23530
+ "step": 132150
23531
+ },
23532
+ {
23533
+ "epoch": 1.2611970378143744,
23534
+ "grad_norm": 0.11893275380134583,
23535
+ "learning_rate": 0.00017956219300748795,
23536
+ "loss": 2.0366,
23537
+ "num_input_tokens_seen": 69299684224,
23538
+ "step": 132200
23539
+ },
23540
+ {
23541
+ "epoch": 1.2616740403305626,
23542
+ "grad_norm": 0.12191104143857956,
23543
+ "learning_rate": 0.0001774140082289563,
23544
+ "loss": 2.0393,
23545
+ "num_input_tokens_seen": 69325894496,
23546
+ "step": 132250
23547
+ },
23548
+ {
23549
+ "epoch": 1.262151042846751,
23550
+ "grad_norm": 0.12704069912433624,
23551
+ "learning_rate": 0.00017527597583490823,
23552
+ "loss": 2.0551,
23553
+ "num_input_tokens_seen": 69352101952,
23554
+ "step": 132300
23555
+ },
23556
+ {
23557
+ "epoch": 1.2626280453629393,
23558
+ "grad_norm": 0.12682849168777466,
23559
+ "learning_rate": 0.00017314816311322218,
23560
+ "loss": 2.0376,
23561
+ "num_input_tokens_seen": 69378314752,
23562
+ "step": 132350
23563
+ },
23564
+ {
23565
+ "epoch": 1.2631050478791277,
23566
+ "grad_norm": 0.1246429830789566,
23567
+ "learning_rate": 0.00017103063703014372,
23568
+ "loss": 2.0402,
23569
+ "num_input_tokens_seen": 69404523776,
23570
+ "step": 132400
23571
+ },
23572
+ {
23573
+ "epoch": 1.263582050395316,
23574
+ "grad_norm": 0.12006555497646332,
23575
+ "learning_rate": 0.00016892346422817944,
23576
+ "loss": 2.0383,
23577
+ "num_input_tokens_seen": 69430732160,
23578
+ "step": 132450
23579
+ },
23580
+ {
23581
+ "epoch": 1.264059052911504,
23582
+ "grad_norm": 0.12435656785964966,
23583
+ "learning_rate": 0.00016682671102399805,
23584
+ "loss": 2.0347,
23585
+ "num_input_tokens_seen": 69456943424,
23586
+ "step": 132500
23587
+ },
23588
+ {
23589
+ "epoch": 1.264059052911504,
23590
+ "eval_loss": 1.9590063095092773,
23591
+ "eval_runtime": 82.7888,
23592
+ "eval_samples_per_second": 60.395,
23593
+ "eval_steps_per_second": 15.099,
23594
+ "num_input_tokens_seen": 69456943424,
23595
+ "step": 132500
23596
+ },
23597
+ {
23598
+ "epoch": 1.2645360554276923,
23599
+ "grad_norm": 0.12412598729133606,
23600
+ "learning_rate": 0.0001647404434063447,
23601
+ "loss": 2.0436,
23602
+ "num_input_tokens_seen": 69483146688,
23603
+ "step": 132550
23604
+ },
23605
+ {
23606
+ "epoch": 1.2650130579438805,
23607
+ "grad_norm": 0.12309623509645462,
23608
+ "learning_rate": 0.00016266472703396284,
23609
+ "loss": 2.028,
23610
+ "num_input_tokens_seen": 69509359968,
23611
+ "step": 132600
23612
+ },
23613
+ {
23614
+ "epoch": 1.265490060460069,
23615
+ "grad_norm": 0.12758532166481018,
23616
+ "learning_rate": 0.0001605996272335291,
23617
+ "loss": 2.041,
23618
+ "num_input_tokens_seen": 69535568960,
23619
+ "step": 132650
23620
+ },
23621
+ {
23622
+ "epoch": 1.2659670629762572,
23623
+ "grad_norm": 0.11922606080770493,
23624
+ "learning_rate": 0.00015854520899759655,
23625
+ "loss": 2.0308,
23626
+ "num_input_tokens_seen": 69561777024,
23627
+ "step": 132700
23628
+ },
23629
+ {
23630
+ "epoch": 1.2664440654924456,
23631
+ "grad_norm": 0.1239946112036705,
23632
+ "learning_rate": 0.00015650153698254916,
23633
+ "loss": 2.0336,
23634
+ "num_input_tokens_seen": 69587981952,
23635
+ "step": 132750
23636
+ },
23637
+ {
23638
+ "epoch": 1.2669210680086338,
23639
+ "grad_norm": 0.12584541738033295,
23640
+ "learning_rate": 0.00015446867550656767,
23641
+ "loss": 2.0376,
23642
+ "num_input_tokens_seen": 69614192832,
23643
+ "step": 132800
23644
+ },
23645
+ {
23646
+ "epoch": 1.267398070524822,
23647
+ "grad_norm": 0.12514598667621613,
23648
+ "learning_rate": 0.00015244668854760458,
23649
+ "loss": 2.0411,
23650
+ "num_input_tokens_seen": 69640405600,
23651
+ "step": 132850
23652
+ },
23653
+ {
23654
+ "epoch": 1.2678750730410102,
23655
+ "grad_norm": 0.12181352823972702,
23656
+ "learning_rate": 0.00015043563974137132,
23657
+ "loss": 2.0404,
23658
+ "num_input_tokens_seen": 69666619040,
23659
+ "step": 132900
23660
+ },
23661
+ {
23662
+ "epoch": 1.2683520755571986,
23663
+ "grad_norm": 0.11871461570262909,
23664
+ "learning_rate": 0.00014843559237933475,
23665
+ "loss": 2.0458,
23666
+ "num_input_tokens_seen": 69692833440,
23667
+ "step": 132950
23668
+ },
23669
+ {
23670
+ "epoch": 1.2688290780733869,
23671
+ "grad_norm": 0.12271245568990707,
23672
+ "learning_rate": 0.00014644660940672628,
23673
+ "loss": 2.0354,
23674
+ "num_input_tokens_seen": 69719047840,
23675
+ "step": 133000
23676
+ },
23677
+ {
23678
+ "epoch": 1.2688290780733869,
23679
+ "eval_loss": 1.9576880931854248,
23680
+ "eval_runtime": 82.558,
23681
+ "eval_samples_per_second": 60.564,
23682
+ "eval_steps_per_second": 15.141,
23683
+ "num_input_tokens_seen": 69719047840,
23684
+ "step": 133000
23685
  }
23686
  ],
23687
  "logging_steps": 50,
23688
  "max_steps": 140000,
23689
+ "num_input_tokens_seen": 69719047840,
23690
  "num_train_epochs": 2,
23691
  "save_steps": 1000,
23692
  "stateful_callbacks": {
 
23701
  "attributes": {}
23702
  }
23703
  },
23704
+ "total_flos": 1.2338999792247398e+20,
23705
  "train_batch_size": 32,
23706
  "trial_name": null,
23707
  "trial_params": null