Azrail committed on
Commit
39141bc
·
verified ·
1 Parent(s): dde2809

Training in progress, step 26000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca1305d807a0d62209066bee9cbe48b75438f197b4d11307eb4ba5e592a11386
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28394340845b35c88e8a63417e18c503dadf4a251790835d2715e5a4962f656e
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed4d9687ffe945b21f6759ab92e79d3a46252bbf5731184d996dc881364e21e9
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4565341b2daf769a1d6b98280e7a99c73d3df5a11f570b225860490fa5b0252c
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a59157d1ca64ffae44fbe8134d666bfe8e12822f27ca50fb6e1f0b29f58d3b64
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b4ee497eed0fe85641f8ca254d6d7e11e60873712ef2108d29f717bef60c5dd
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8c565830d05eccabcd7df396792d29e3638ccbd6988e240ff15902ef690b7e6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ced8856b9ff194699de7fca54070bd17a17efd31d5f5d4d7e4c8ff1ec712ca9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.549151081072547,
6
  "eval_steps": 500,
7
- "global_step": 25000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4458,11 +4458,189 @@
4458
  "eval_steps_per_second": 18.595,
4459
  "num_input_tokens_seen": 26214396160,
4460
  "step": 25000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4461
  }
4462
  ],
4463
  "logging_steps": 50,
4464
  "max_steps": 200000,
4465
- "num_input_tokens_seen": 26214396160,
4466
  "num_train_epochs": 5,
4467
  "save_steps": 1000,
4468
  "stateful_callbacks": {
@@ -4477,7 +4655,7 @@
4477
  "attributes": {}
4478
  }
4479
  },
4480
- "total_flos": 1.492929071221506e+19,
4481
  "train_batch_size": 64,
4482
  "trial_name": null,
4483
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5711171243154488,
6
  "eval_steps": 500,
7
+ "global_step": 26000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4458
  "eval_steps_per_second": 18.595,
4459
  "num_input_tokens_seen": 26214396160,
4460
  "step": 25000
4461
+ },
4462
+ {
4463
+ "epoch": 0.5502493832346921,
4464
+ "grad_norm": 0.16484692692756653,
4465
+ "learning_rate": 0.001,
4466
+ "loss": 2.6843,
4467
+ "num_input_tokens_seen": 26266824960,
4468
+ "step": 25050
4469
+ },
4470
+ {
4471
+ "epoch": 0.5513476853968372,
4472
+ "grad_norm": 0.1583317369222641,
4473
+ "learning_rate": 0.001,
4474
+ "loss": 2.6825,
4475
+ "num_input_tokens_seen": 26319253760,
4476
+ "step": 25100
4477
+ },
4478
+ {
4479
+ "epoch": 0.5524459875589822,
4480
+ "grad_norm": 0.1569424867630005,
4481
+ "learning_rate": 0.001,
4482
+ "loss": 2.6787,
4483
+ "num_input_tokens_seen": 26371682560,
4484
+ "step": 25150
4485
+ },
4486
+ {
4487
+ "epoch": 0.5535442897211273,
4488
+ "grad_norm": 0.13633306324481964,
4489
+ "learning_rate": 0.001,
4490
+ "loss": 2.6872,
4491
+ "num_input_tokens_seen": 26424111360,
4492
+ "step": 25200
4493
+ },
4494
+ {
4495
+ "epoch": 0.5546425918832725,
4496
+ "grad_norm": 0.1480533927679062,
4497
+ "learning_rate": 0.001,
4498
+ "loss": 2.6842,
4499
+ "num_input_tokens_seen": 26476540160,
4500
+ "step": 25250
4501
+ },
4502
+ {
4503
+ "epoch": 0.5557408940454175,
4504
+ "grad_norm": 0.1267666518688202,
4505
+ "learning_rate": 0.001,
4506
+ "loss": 2.6839,
4507
+ "num_input_tokens_seen": 26528968960,
4508
+ "step": 25300
4509
+ },
4510
+ {
4511
+ "epoch": 0.5568391962075626,
4512
+ "grad_norm": 0.13951599597930908,
4513
+ "learning_rate": 0.001,
4514
+ "loss": 2.6799,
4515
+ "num_input_tokens_seen": 26581397760,
4516
+ "step": 25350
4517
+ },
4518
+ {
4519
+ "epoch": 0.5579374983697077,
4520
+ "grad_norm": 0.15044580399990082,
4521
+ "learning_rate": 0.001,
4522
+ "loss": 2.6846,
4523
+ "num_input_tokens_seen": 26633826560,
4524
+ "step": 25400
4525
+ },
4526
+ {
4527
+ "epoch": 0.5590358005318529,
4528
+ "grad_norm": 0.12891829013824463,
4529
+ "learning_rate": 0.001,
4530
+ "loss": 2.682,
4531
+ "num_input_tokens_seen": 26686255360,
4532
+ "step": 25450
4533
+ },
4534
+ {
4535
+ "epoch": 0.5601341026939979,
4536
+ "grad_norm": 0.12812241911888123,
4537
+ "learning_rate": 0.001,
4538
+ "loss": 2.684,
4539
+ "num_input_tokens_seen": 26738684160,
4540
+ "step": 25500
4541
+ },
4542
+ {
4543
+ "epoch": 0.5601341026939979,
4544
+ "eval_loss": 2.5832085609436035,
4545
+ "eval_runtime": 66.9038,
4546
+ "eval_samples_per_second": 74.734,
4547
+ "eval_steps_per_second": 18.684,
4548
+ "num_input_tokens_seen": 26738684160,
4549
+ "step": 25500
4550
+ },
4551
+ {
4552
+ "epoch": 0.561232404856143,
4553
+ "grad_norm": 0.14243654906749725,
4554
+ "learning_rate": 0.001,
4555
+ "loss": 2.6883,
4556
+ "num_input_tokens_seen": 26791112960,
4557
+ "step": 25550
4558
+ },
4559
+ {
4560
+ "epoch": 0.5623307070182881,
4561
+ "grad_norm": 0.14436320960521698,
4562
+ "learning_rate": 0.001,
4563
+ "loss": 2.6835,
4564
+ "num_input_tokens_seen": 26843541760,
4565
+ "step": 25600
4566
+ },
4567
+ {
4568
+ "epoch": 0.5634290091804331,
4569
+ "grad_norm": 0.1516960710287094,
4570
+ "learning_rate": 0.001,
4571
+ "loss": 2.6752,
4572
+ "num_input_tokens_seen": 26895970560,
4573
+ "step": 25650
4574
+ },
4575
+ {
4576
+ "epoch": 0.5645273113425783,
4577
+ "grad_norm": 0.14002515375614166,
4578
+ "learning_rate": 0.001,
4579
+ "loss": 2.6817,
4580
+ "num_input_tokens_seen": 26948399360,
4581
+ "step": 25700
4582
+ },
4583
+ {
4584
+ "epoch": 0.5656256135047234,
4585
+ "grad_norm": 0.1379036009311676,
4586
+ "learning_rate": 0.001,
4587
+ "loss": 2.6904,
4588
+ "num_input_tokens_seen": 27000828160,
4589
+ "step": 25750
4590
+ },
4591
+ {
4592
+ "epoch": 0.5667239156668685,
4593
+ "grad_norm": 0.16127964854240417,
4594
+ "learning_rate": 0.001,
4595
+ "loss": 2.6813,
4596
+ "num_input_tokens_seen": 27053256960,
4597
+ "step": 25800
4598
+ },
4599
+ {
4600
+ "epoch": 0.5678222178290135,
4601
+ "grad_norm": 0.15714125335216522,
4602
+ "learning_rate": 0.001,
4603
+ "loss": 2.6851,
4604
+ "num_input_tokens_seen": 27105685760,
4605
+ "step": 25850
4606
+ },
4607
+ {
4608
+ "epoch": 0.5689205199911587,
4609
+ "grad_norm": 0.15288160741329193,
4610
+ "learning_rate": 0.001,
4611
+ "loss": 2.6832,
4612
+ "num_input_tokens_seen": 27158114560,
4613
+ "step": 25900
4614
+ },
4615
+ {
4616
+ "epoch": 0.5700188221533038,
4617
+ "grad_norm": 0.1398363709449768,
4618
+ "learning_rate": 0.001,
4619
+ "loss": 2.6814,
4620
+ "num_input_tokens_seen": 27210543360,
4621
+ "step": 25950
4622
+ },
4623
+ {
4624
+ "epoch": 0.5711171243154488,
4625
+ "grad_norm": 0.15253235399723053,
4626
+ "learning_rate": 0.001,
4627
+ "loss": 2.6755,
4628
+ "num_input_tokens_seen": 27262972160,
4629
+ "step": 26000
4630
+ },
4631
+ {
4632
+ "epoch": 0.5711171243154488,
4633
+ "eval_loss": 2.5809168815612793,
4634
+ "eval_runtime": 66.151,
4635
+ "eval_samples_per_second": 75.585,
4636
+ "eval_steps_per_second": 18.896,
4637
+ "num_input_tokens_seen": 27262972160,
4638
+ "step": 26000
4639
  }
4640
  ],
4641
  "logging_steps": 50,
4642
  "max_steps": 200000,
4643
+ "num_input_tokens_seen": 27262972160,
4644
  "num_train_epochs": 5,
4645
  "save_steps": 1000,
4646
  "stateful_callbacks": {
 
4655
  "attributes": {}
4656
  }
4657
  },
4658
+ "total_flos": 1.5526462428179988e+19,
4659
  "train_batch_size": 64,
4660
  "trial_name": null,
4661
  "trial_params": null