Azrail commited on
Commit
30c36b0
·
verified ·
1 Parent(s): 28dc3e1

Training in progress, step 43000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a2fefdad4151a99e63ae166d56e8a067e9a45ca5f3593e23025024d085087cf
3
  size 563074920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f40af4f458fcf98ff975a8a67bf6d8f825776f93c4fc893bfff9e777a429186
3
  size 563074920
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d5b1d4b777cae5e9916f147902b1092566fa9ffed9791e0e98c64471a83c547
3
  size 1125916346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2806be43a2dbc11749a984f9f27c3c727021a3459f016ea43cf735de07b8e8b
3
  size 1125916346
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb738c4448993bfabe6ccb3cd4eb736ae6765e9a75360a49be850fad829fbf26
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4616c29adbb72fca86a53186f80355f9390c75c85ef3660d2db8c34d983194a4
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b25eb8da59d422374fd03939a74626640022fbe8ead6db5fc1660c4003101153
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baf398bf7bc1350249be7408612b67c8ebc4068beabac28de92ab798dacce92e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2825129527143945,
6
  "eval_steps": 500,
7
- "global_step": 42000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7484,11 +7484,189 @@
7484
  "eval_steps_per_second": 8.724,
7485
  "num_input_tokens_seen": 11010048000,
7486
  "step": 42000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7487
  }
7488
  ],
7489
  "logging_steps": 50,
7490
  "max_steps": 60000,
7491
- "num_input_tokens_seen": 11010048000,
7492
  "num_train_epochs": 1,
7493
  "save_steps": 1000,
7494
  "stateful_callbacks": {
@@ -7503,7 +7681,7 @@
7503
  "attributes": {}
7504
  }
7505
  },
7506
- "total_flos": 7.01579283922944e+18,
7507
  "train_batch_size": 64,
7508
  "trial_name": null,
7509
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2892394515885468,
6
  "eval_steps": 500,
7
+ "global_step": 43000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7484
  "eval_steps_per_second": 8.724,
7485
  "num_input_tokens_seen": 11010048000,
7486
  "step": 42000
7487
+ },
7488
+ {
7489
+ "epoch": 0.28284927765810214,
7490
+ "grad_norm": 0.6817448735237122,
7491
+ "learning_rate": 0.0005871683190370497,
7492
+ "loss": 12.0507,
7493
+ "num_input_tokens_seen": 11023155200,
7494
+ "step": 42050
7495
+ },
7496
+ {
7497
+ "epoch": 0.28318560260180975,
7498
+ "grad_norm": 1.443415641784668,
7499
+ "learning_rate": 0.0005864706724332221,
7500
+ "loss": 12.0804,
7501
+ "num_input_tokens_seen": 11036262400,
7502
+ "step": 42100
7503
+ },
7504
+ {
7505
+ "epoch": 0.28352192754551736,
7506
+ "grad_norm": 0.7497735619544983,
7507
+ "learning_rate": 0.0005857549961807582,
7508
+ "loss": 12.1135,
7509
+ "num_input_tokens_seen": 11049369600,
7510
+ "step": 42150
7511
+ },
7512
+ {
7513
+ "epoch": 0.28385825248922497,
7514
+ "grad_norm": 0.7141171097755432,
7515
+ "learning_rate": 0.0005850213353222835,
7516
+ "loss": 12.0707,
7517
+ "num_input_tokens_seen": 11062476800,
7518
+ "step": 42200
7519
+ },
7520
+ {
7521
+ "epoch": 0.2841945774329326,
7522
+ "grad_norm": 0.6800997257232666,
7523
+ "learning_rate": 0.0005842697360323246,
7524
+ "loss": 12.0946,
7525
+ "num_input_tokens_seen": 11075584000,
7526
+ "step": 42250
7527
+ },
7528
+ {
7529
+ "epoch": 0.2845309023766402,
7530
+ "grad_norm": 0.6729973554611206,
7531
+ "learning_rate": 0.0005835002456144005,
7532
+ "loss": 12.0882,
7533
+ "num_input_tokens_seen": 11088691200,
7534
+ "step": 42300
7535
+ },
7536
+ {
7537
+ "epoch": 0.2848672273203478,
7538
+ "grad_norm": 0.715886116027832,
7539
+ "learning_rate": 0.0005827129124980481,
7540
+ "loss": 12.0713,
7541
+ "num_input_tokens_seen": 11101798400,
7542
+ "step": 42350
7543
+ },
7544
+ {
7545
+ "epoch": 0.2852035522640554,
7546
+ "grad_norm": 0.7392980456352234,
7547
+ "learning_rate": 0.0005819077862357724,
7548
+ "loss": 12.0934,
7549
+ "num_input_tokens_seen": 11114905600,
7550
+ "step": 42400
7551
+ },
7552
+ {
7553
+ "epoch": 0.285539877207763,
7554
+ "grad_norm": 0.7118540406227112,
7555
+ "learning_rate": 0.0005810849174999285,
7556
+ "loss": 12.0531,
7557
+ "num_input_tokens_seen": 11128012800,
7558
+ "step": 42450
7559
+ },
7560
+ {
7561
+ "epoch": 0.28587620215147064,
7562
+ "grad_norm": 0.6643871665000916,
7563
+ "learning_rate": 0.000580244358079532,
7564
+ "loss": 12.0812,
7565
+ "num_input_tokens_seen": 11141120000,
7566
+ "step": 42500
7567
+ },
7568
+ {
7569
+ "epoch": 0.28587620215147064,
7570
+ "eval_loss": 2.9250741004943848,
7571
+ "eval_runtime": 143.6479,
7572
+ "eval_samples_per_second": 34.807,
7573
+ "eval_steps_per_second": 8.702,
7574
+ "num_input_tokens_seen": 11141120000,
7575
+ "step": 42500
7576
+ },
7577
+ {
7578
+ "epoch": 0.2862125270951783,
7579
+ "grad_norm": 0.7261589169502258,
7580
+ "learning_rate": 0.0005793861608770001,
7581
+ "loss": 12.0856,
7582
+ "num_input_tokens_seen": 11154227200,
7583
+ "step": 42550
7584
+ },
7585
+ {
7586
+ "epoch": 0.2865488520388859,
7587
+ "grad_norm": 0.7352684140205383,
7588
+ "learning_rate": 0.0005785103799048218,
7589
+ "loss": 12.094,
7590
+ "num_input_tokens_seen": 11167334400,
7591
+ "step": 42600
7592
+ },
7593
+ {
7594
+ "epoch": 0.2868851769825935,
7595
+ "grad_norm": 0.650610089302063,
7596
+ "learning_rate": 0.0005776170702821582,
7597
+ "loss": 12.0796,
7598
+ "num_input_tokens_seen": 11180441600,
7599
+ "step": 42650
7600
+ },
7601
+ {
7602
+ "epoch": 0.28722150192630114,
7603
+ "grad_norm": 0.6917529106140137,
7604
+ "learning_rate": 0.0005767062882313743,
7605
+ "loss": 12.0511,
7606
+ "num_input_tokens_seen": 11193548800,
7607
+ "step": 42700
7608
+ },
7609
+ {
7610
+ "epoch": 0.28755782687000875,
7611
+ "grad_norm": 0.8611562252044678,
7612
+ "learning_rate": 0.0005757780910744997,
7613
+ "loss": 12.0772,
7614
+ "num_input_tokens_seen": 11206656000,
7615
+ "step": 42750
7616
+ },
7617
+ {
7618
+ "epoch": 0.28789415181371636,
7619
+ "grad_norm": 0.7321364283561707,
7620
+ "learning_rate": 0.0005748325372296208,
7621
+ "loss": 12.0432,
7622
+ "num_input_tokens_seen": 11219763200,
7623
+ "step": 42800
7624
+ },
7625
+ {
7626
+ "epoch": 0.28823047675742397,
7627
+ "grad_norm": 0.6974388957023621,
7628
+ "learning_rate": 0.0005738696862072053,
7629
+ "loss": 12.0408,
7630
+ "num_input_tokens_seen": 11232870400,
7631
+ "step": 42850
7632
+ },
7633
+ {
7634
+ "epoch": 0.2885668017011316,
7635
+ "grad_norm": 0.6981905102729797,
7636
+ "learning_rate": 0.0005728895986063554,
7637
+ "loss": 12.0419,
7638
+ "num_input_tokens_seen": 11245977600,
7639
+ "step": 42900
7640
+ },
7641
+ {
7642
+ "epoch": 0.2889031266448392,
7643
+ "grad_norm": 0.7019402384757996,
7644
+ "learning_rate": 0.000571892336110995,
7645
+ "loss": 12.0206,
7646
+ "num_input_tokens_seen": 11259084800,
7647
+ "step": 42950
7648
+ },
7649
+ {
7650
+ "epoch": 0.2892394515885468,
7651
+ "grad_norm": 0.7176699042320251,
7652
+ "learning_rate": 0.0005708779614859863,
7653
+ "loss": 12.0641,
7654
+ "num_input_tokens_seen": 11272192000,
7655
+ "step": 43000
7656
+ },
7657
+ {
7658
+ "epoch": 0.2892394515885468,
7659
+ "eval_loss": 2.9219655990600586,
7660
+ "eval_runtime": 144.3813,
7661
+ "eval_samples_per_second": 34.631,
7662
+ "eval_steps_per_second": 8.658,
7663
+ "num_input_tokens_seen": 11272192000,
7664
+ "step": 43000
7665
  }
7666
  ],
7667
  "logging_steps": 50,
7668
  "max_steps": 60000,
7669
+ "num_input_tokens_seen": 11272192000,
7670
  "num_train_epochs": 1,
7671
  "save_steps": 1000,
7672
  "stateful_callbacks": {
 
7681
  "attributes": {}
7682
  }
7683
  },
7684
+ "total_flos": 7.18283552587776e+18,
7685
  "train_batch_size": 64,
7686
  "trial_name": null,
7687
  "trial_params": null