Azrail commited on
Commit
30a1120
·
verified ·
1 Parent(s): 1f568c2

Training in progress, step 16568, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeeda4b1ee34efcb8cc9870b2c843e6649254ccabfeca77e86d631e6043a12c7
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5aeb4ed25696f7e10e7543b90c5a47b6138a594baf57483d3c889f4f1c258f4
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad619e41a755fe598f7951c93afa49e976ac527f447e6a0b8b599bd265a02a63
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe702f58a491cea98553ed6f74aecfeac0fa377b5678dc16b12b8bc2dfc14360
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce8b7e7bf463c8aae6a90e1a089b32f412ca31160cb21f8d8c8b0ee26b9ca28d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28e82f09e6e0a8d547118b1ff3e149779a831528016aa1b10eb3d2c1c802ac19
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab47855ede8e2538e1a83a739503d4b0055ccb1a5bd772cbc798bb06eb73f571
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d3bb1f3bb0340b0b65b79ab24d36ed0f271ff49414747938c92ca607d666859
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.8620788508834134,
6
  "eval_steps": 500,
7
- "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3496,11 +3496,130 @@
3496
  "eval_steps_per_second": 20.463,
3497
  "num_input_tokens_seen": 7729128049,
3498
  "step": 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3499
  }
3500
  ],
3501
  "logging_steps": 50,
3502
  "max_steps": 16568,
3503
- "num_input_tokens_seen": 7729128049,
3504
  "num_train_epochs": 4,
3505
  "save_steps": 1000,
3506
  "stateful_callbacks": {
@@ -3510,12 +3629,12 @@
3510
  "should_evaluate": false,
3511
  "should_log": false,
3512
  "should_save": true,
3513
- "should_training_stop": false
3514
  },
3515
  "attributes": {}
3516
  }
3517
  },
3518
- "total_flos": 2.0676171893972582e+18,
3519
  "train_batch_size": 16,
3520
  "trial_name": null,
3521
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.9992003259049143,
6
  "eval_steps": 500,
7
+ "global_step": 16568,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3496
  "eval_steps_per_second": 20.463,
3497
  "num_input_tokens_seen": 7729128049,
3498
  "step": 16000
3499
+ },
3500
+ {
3501
+ "epoch": 3.874149403262067,
3502
+ "grad_norm": 0.26953125,
3503
+ "learning_rate": 1.9541270559830995e-06,
3504
+ "loss": 2.0974,
3505
+ "mean_token_accuracy": 0.5552676925435662,
3506
+ "num_input_tokens_seen": 7753351249,
3507
+ "num_tokens": 3267712597.0,
3508
+ "step": 16050
3509
+ },
3510
+ {
3511
+ "epoch": 3.88621995564072,
3512
+ "grad_norm": 0.27734375,
3513
+ "learning_rate": 1.7655047532820282e-06,
3514
+ "loss": 2.0956,
3515
+ "mean_token_accuracy": 0.5543999705091118,
3516
+ "num_input_tokens_seen": 7777499137,
3517
+ "num_tokens": 3277902761.0,
3518
+ "step": 16100
3519
+ },
3520
+ {
3521
+ "epoch": 3.8982905080193735,
3522
+ "grad_norm": 0.25,
3523
+ "learning_rate": 1.576882450580957e-06,
3524
+ "loss": 2.0949,
3525
+ "mean_token_accuracy": 0.5548696434870363,
3526
+ "num_input_tokens_seen": 7801633089,
3527
+ "num_tokens": 3288082221.0,
3528
+ "step": 16150
3529
+ },
3530
+ {
3531
+ "epoch": 3.9103610603980266,
3532
+ "grad_norm": 0.24609375,
3533
+ "learning_rate": 1.3882601478798854e-06,
3534
+ "loss": 2.0986,
3535
+ "mean_token_accuracy": 0.5540741100907326,
3536
+ "num_input_tokens_seen": 7825738481,
3537
+ "num_tokens": 3298341216.0,
3538
+ "step": 16200
3539
+ },
3540
+ {
3541
+ "epoch": 3.9224316127766796,
3542
+ "grad_norm": 0.263671875,
3543
+ "learning_rate": 1.199637845178814e-06,
3544
+ "loss": 2.0917,
3545
+ "mean_token_accuracy": 0.556021711602807,
3546
+ "num_input_tokens_seen": 7849877409,
3547
+ "num_tokens": 3308431697.0,
3548
+ "step": 16250
3549
+ },
3550
+ {
3551
+ "epoch": 3.934502165155333,
3552
+ "grad_norm": 0.255859375,
3553
+ "learning_rate": 1.0110155424777427e-06,
3554
+ "loss": 2.0983,
3555
+ "mean_token_accuracy": 0.5542013296857476,
3556
+ "num_input_tokens_seen": 7873955105,
3557
+ "num_tokens": 3318516095.0,
3558
+ "step": 16300
3559
+ },
3560
+ {
3561
+ "epoch": 3.946572717533986,
3562
+ "grad_norm": 0.2490234375,
3563
+ "learning_rate": 8.223932397766712e-07,
3564
+ "loss": 2.1049,
3565
+ "mean_token_accuracy": 0.553213356398046,
3566
+ "num_input_tokens_seen": 7898153793,
3567
+ "num_tokens": 3328721862.0,
3568
+ "step": 16350
3569
+ },
3570
+ {
3571
+ "epoch": 3.9586432699126393,
3572
+ "grad_norm": 0.2734375,
3573
+ "learning_rate": 6.337709370755999e-07,
3574
+ "loss": 2.0981,
3575
+ "mean_token_accuracy": 0.5537924468889832,
3576
+ "num_input_tokens_seen": 7922266993,
3577
+ "num_tokens": 3338965333.0,
3578
+ "step": 16400
3579
+ },
3580
+ {
3581
+ "epoch": 3.9707138222912928,
3582
+ "grad_norm": 0.2431640625,
3583
+ "learning_rate": 4.4514863437452844e-07,
3584
+ "loss": 2.0952,
3585
+ "mean_token_accuracy": 0.5555017331615091,
3586
+ "num_input_tokens_seen": 7946376977,
3587
+ "num_tokens": 3349129194.0,
3588
+ "step": 16450
3589
+ },
3590
+ {
3591
+ "epoch": 3.982784374669946,
3592
+ "grad_norm": 0.279296875,
3593
+ "learning_rate": 2.565263316734571e-07,
3594
+ "loss": 2.0973,
3595
+ "num_input_tokens_seen": 7970557393,
3596
+ "step": 16500
3597
+ },
3598
+ {
3599
+ "epoch": 3.982784374669946,
3600
+ "eval_loss": 1.9680447578430176,
3601
+ "eval_mean_token_accuracy": 0.5785172289325018,
3602
+ "eval_num_tokens": 3359357384.0,
3603
+ "eval_runtime": 131.3028,
3604
+ "eval_samples_per_second": 81.582,
3605
+ "eval_steps_per_second": 20.396,
3606
+ "num_input_tokens_seen": 7970557393,
3607
+ "step": 16500
3608
+ },
3609
+ {
3610
+ "epoch": 3.994854927048599,
3611
+ "grad_norm": 0.2392578125,
3612
+ "learning_rate": 6.79040289723857e-08,
3613
+ "loss": 2.0972,
3614
+ "mean_token_accuracy": 0.5545667923986912,
3615
+ "num_input_tokens_seen": 7994787313,
3616
+ "num_tokens": 3369677528.0,
3617
+ "step": 16550
3618
  }
3619
  ],
3620
  "logging_steps": 50,
3621
  "max_steps": 16568,
3622
+ "num_input_tokens_seen": 8003589233,
3623
  "num_train_epochs": 4,
3624
  "save_steps": 1000,
3625
  "stateful_callbacks": {
 
3629
  "should_evaluate": false,
3630
  "should_log": false,
3631
  "should_save": true,
3632
+ "should_training_stop": true
3633
  },
3634
  "attributes": {}
3635
  }
3636
  },
3637
+ "total_flos": 2.141038234858414e+18,
3638
  "train_batch_size": 16,
3639
  "trial_name": null,
3640
  "trial_params": null