Azrail committed on
Commit
7674809
·
verified ·
1 Parent(s): 67f9420

Training in progress, step 54000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36284509c9073dd2ec02f720b116d0aa77896518dd97e6cb1b15bf18c5f1971e
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa0a1572ea481edcf54695292d1afeb45339e9d2c4b988649cdfc68bb148006f
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7d392d61f01d27253443cebf734042b0281783df28f92f8dae1e7a4619df1a45
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88539436cc04b7ab674c4b703cc9d0b734fa709baeed11a8b4233a791dc8b00e
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5f53c01b35d1753a4f571c1ddd2b16976530a7b71c320877f1fbd74ce1de4ed
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e516d1931a63763a7fdfb84f01f54aaada25beb218520b62969ba08ff897cee4
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3771019f4815646a43bbc09acce698c65d4ba61e6cbb0516a172314f7fbb077
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b89459823d581d70469027e8df5427d5b9a07aadbd42c55eac43368b994e74e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.25281133357978464,
6
  "eval_steps": 500,
7
- "global_step": 53000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9442,11 +9442,189 @@
9442
  "eval_steps_per_second": 23.269,
9443
  "num_input_tokens_seen": 13893627456,
9444
  "step": 53000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9445
  }
9446
  ],
9447
  "logging_steps": 50,
9448
  "max_steps": 70000,
9449
- "num_input_tokens_seen": 13893627456,
9450
  "num_train_epochs": 1,
9451
  "save_steps": 1000,
9452
  "stateful_callbacks": {
@@ -9461,7 +9639,7 @@
9461
  "attributes": {}
9462
  }
9463
  },
9464
- "total_flos": 3.7166809462839706e+18,
9465
  "train_batch_size": 64,
9466
  "trial_name": null,
9467
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2575813587416674,
6
  "eval_steps": 500,
7
+ "global_step": 54000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9442
  "eval_steps_per_second": 23.269,
9443
  "num_input_tokens_seen": 13893627456,
9444
  "step": 53000
9445
+ },
9446
+ {
9447
+ "epoch": 0.2530498348378788,
9448
+ "grad_norm": 0.20486177504062653,
9449
+ "learning_rate": 0.001,
9450
+ "loss": 2.5977,
9451
+ "num_input_tokens_seen": 13906734656,
9452
+ "step": 53050
9453
+ },
9454
+ {
9455
+ "epoch": 0.2532883360959729,
9456
+ "grad_norm": 0.18098385632038116,
9457
+ "learning_rate": 0.001,
9458
+ "loss": 2.5931,
9459
+ "num_input_tokens_seen": 13919841856,
9460
+ "step": 53100
9461
+ },
9462
+ {
9463
+ "epoch": 0.25352683735406706,
9464
+ "grad_norm": 0.1933833658695221,
9465
+ "learning_rate": 0.001,
9466
+ "loss": 2.6058,
9467
+ "num_input_tokens_seen": 13932949056,
9468
+ "step": 53150
9469
+ },
9470
+ {
9471
+ "epoch": 0.25376533861216116,
9472
+ "grad_norm": 0.29640141129493713,
9473
+ "learning_rate": 0.001,
9474
+ "loss": 2.5864,
9475
+ "num_input_tokens_seen": 13946056256,
9476
+ "step": 53200
9477
+ },
9478
+ {
9479
+ "epoch": 0.2540038398702553,
9480
+ "grad_norm": 0.2559553384780884,
9481
+ "learning_rate": 0.001,
9482
+ "loss": 2.6137,
9483
+ "num_input_tokens_seen": 13959163456,
9484
+ "step": 53250
9485
+ },
9486
+ {
9487
+ "epoch": 0.2542423411283494,
9488
+ "grad_norm": 0.21698619425296783,
9489
+ "learning_rate": 0.001,
9490
+ "loss": 2.6184,
9491
+ "num_input_tokens_seen": 13972270656,
9492
+ "step": 53300
9493
+ },
9494
+ {
9495
+ "epoch": 0.2544808423864436,
9496
+ "grad_norm": 0.19658173620700836,
9497
+ "learning_rate": 0.001,
9498
+ "loss": 2.5938,
9499
+ "num_input_tokens_seen": 13985377856,
9500
+ "step": 53350
9501
+ },
9502
+ {
9503
+ "epoch": 0.25471934364453774,
9504
+ "grad_norm": 0.2056342512369156,
9505
+ "learning_rate": 0.001,
9506
+ "loss": 2.5952,
9507
+ "num_input_tokens_seen": 13998485056,
9508
+ "step": 53400
9509
+ },
9510
+ {
9511
+ "epoch": 0.25495784490263185,
9512
+ "grad_norm": 0.1932424008846283,
9513
+ "learning_rate": 0.001,
9514
+ "loss": 2.6101,
9515
+ "num_input_tokens_seen": 14011592256,
9516
+ "step": 53450
9517
+ },
9518
+ {
9519
+ "epoch": 0.255196346160726,
9520
+ "grad_norm": 0.19347251951694489,
9521
+ "learning_rate": 0.001,
9522
+ "loss": 2.5976,
9523
+ "num_input_tokens_seen": 14024699456,
9524
+ "step": 53500
9525
+ },
9526
+ {
9527
+ "epoch": 0.255196346160726,
9528
+ "eval_loss": 2.4863245487213135,
9529
+ "eval_runtime": 53.2426,
9530
+ "eval_samples_per_second": 93.91,
9531
+ "eval_steps_per_second": 23.477,
9532
+ "num_input_tokens_seen": 14024699456,
9533
+ "step": 53500
9534
+ },
9535
+ {
9536
+ "epoch": 0.2554348474188201,
9537
+ "grad_norm": 0.1986820101737976,
9538
+ "learning_rate": 0.001,
9539
+ "loss": 2.6066,
9540
+ "num_input_tokens_seen": 14037806656,
9541
+ "step": 53550
9542
+ },
9543
+ {
9544
+ "epoch": 0.25567334867691427,
9545
+ "grad_norm": 0.21295565366744995,
9546
+ "learning_rate": 0.001,
9547
+ "loss": 2.6107,
9548
+ "num_input_tokens_seen": 14050913856,
9549
+ "step": 53600
9550
+ },
9551
+ {
9552
+ "epoch": 0.25591184993500843,
9553
+ "grad_norm": 0.21585114300251007,
9554
+ "learning_rate": 0.001,
9555
+ "loss": 2.6077,
9556
+ "num_input_tokens_seen": 14064021056,
9557
+ "step": 53650
9558
+ },
9559
+ {
9560
+ "epoch": 0.25615035119310253,
9561
+ "grad_norm": 0.19424305856227875,
9562
+ "learning_rate": 0.001,
9563
+ "loss": 2.5931,
9564
+ "num_input_tokens_seen": 14077128256,
9565
+ "step": 53700
9566
+ },
9567
+ {
9568
+ "epoch": 0.2563888524511967,
9569
+ "grad_norm": 0.20265349745750427,
9570
+ "learning_rate": 0.001,
9571
+ "loss": 2.5901,
9572
+ "num_input_tokens_seen": 14090235456,
9573
+ "step": 53750
9574
+ },
9575
+ {
9576
+ "epoch": 0.2566273537092908,
9577
+ "grad_norm": 1.037636160850525,
9578
+ "learning_rate": 0.001,
9579
+ "loss": 2.5775,
9580
+ "num_input_tokens_seen": 14103342656,
9581
+ "step": 53800
9582
+ },
9583
+ {
9584
+ "epoch": 0.25686585496738495,
9585
+ "grad_norm": 0.32030293345451355,
9586
+ "learning_rate": 0.001,
9587
+ "loss": 2.6242,
9588
+ "num_input_tokens_seen": 14116449856,
9589
+ "step": 53850
9590
+ },
9591
+ {
9592
+ "epoch": 0.2571043562254791,
9593
+ "grad_norm": 0.2339978665113449,
9594
+ "learning_rate": 0.001,
9595
+ "loss": 2.6122,
9596
+ "num_input_tokens_seen": 14129557056,
9597
+ "step": 53900
9598
+ },
9599
+ {
9600
+ "epoch": 0.2573428574835732,
9601
+ "grad_norm": 0.22179783880710602,
9602
+ "learning_rate": 0.001,
9603
+ "loss": 2.6025,
9604
+ "num_input_tokens_seen": 14142664256,
9605
+ "step": 53950
9606
+ },
9607
+ {
9608
+ "epoch": 0.2575813587416674,
9609
+ "grad_norm": 0.22616736590862274,
9610
+ "learning_rate": 0.001,
9611
+ "loss": 2.5916,
9612
+ "num_input_tokens_seen": 14155771456,
9613
+ "step": 54000
9614
+ },
9615
+ {
9616
+ "epoch": 0.2575813587416674,
9617
+ "eval_loss": 2.4871394634246826,
9618
+ "eval_runtime": 53.8695,
9619
+ "eval_samples_per_second": 92.817,
9620
+ "eval_steps_per_second": 23.204,
9621
+ "num_input_tokens_seen": 14155771456,
9622
+ "step": 54000
9623
  }
9624
  ],
9625
  "logging_steps": 50,
9626
  "max_steps": 70000,
9627
+ "num_input_tokens_seen": 14155771456,
9628
  "num_train_epochs": 1,
9629
  "save_steps": 1000,
9630
  "stateful_callbacks": {
 
9639
  "attributes": {}
9640
  }
9641
  },
9642
+ "total_flos": 3.7868070248094106e+18,
9643
  "train_batch_size": 64,
9644
  "trial_name": null,
9645
  "trial_params": null