Azrail commited on
Commit
15bc99c
·
verified ·
1 Parent(s): cbab9bf

Training in progress, step 54000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76223d6bdee171cffd4cda1d9c4bbbab95942f789f412cecdfbcec4b8715383c
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7005ee4ac699efbe46e787cdaab363f958cca84ce68e125ca53c53198e13eeac
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e3b0c1d87658be3018021f1815500a16d4cf88fae3993a3710e48c97c61995c
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5151b63ca0c165877166c8eeb6faa3b784251ae57745f30c89f3dbaf08defd7
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5f53c01b35d1753a4f571c1ddd2b16976530a7b71c320877f1fbd74ce1de4ed
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e516d1931a63763a7fdfb84f01f54aaada25beb218520b62969ba08ff897cee4
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f3771019f4815646a43bbc09acce698c65d4ba61e6cbb0516a172314f7fbb077
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b89459823d581d70469027e8df5427d5b9a07aadbd42c55eac43368b994e74e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3565044403300693,
6
  "eval_steps": 500,
7
- "global_step": 53000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9442,11 +9442,189 @@
9442
  "eval_steps_per_second": 23.488,
9443
  "num_input_tokens_seen": 13893632000,
9444
  "step": 53000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9445
  }
9446
  ],
9447
  "logging_steps": 50,
9448
  "max_steps": 60000,
9449
- "num_input_tokens_seen": 13893632000,
9450
  "num_train_epochs": 1,
9451
  "save_steps": 1000,
9452
  "stateful_callbacks": {
@@ -9461,7 +9639,7 @@
9461
  "attributes": {}
9462
  }
9463
  },
9464
- "total_flos": 3.71668216184832e+18,
9465
  "train_batch_size": 64,
9466
  "trial_name": null,
9467
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.36323093920422156,
6
  "eval_steps": 500,
7
+ "global_step": 54000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9442
  "eval_steps_per_second": 23.488,
9443
  "num_input_tokens_seen": 13893632000,
9444
  "step": 53000
9445
+ },
9446
+ {
9447
+ "epoch": 0.3568407652737769,
9448
+ "grad_norm": 0.3147699236869812,
9449
+ "learning_rate": 0.001,
9450
+ "loss": 3.0557,
9451
+ "num_input_tokens_seen": 13906739200,
9452
+ "step": 53050
9453
+ },
9454
+ {
9455
+ "epoch": 0.3571770902174845,
9456
+ "grad_norm": 0.22110533714294434,
9457
+ "learning_rate": 0.001,
9458
+ "loss": 3.0515,
9459
+ "num_input_tokens_seen": 13919846400,
9460
+ "step": 53100
9461
+ },
9462
+ {
9463
+ "epoch": 0.3575134151611921,
9464
+ "grad_norm": 0.23334212601184845,
9465
+ "learning_rate": 0.001,
9466
+ "loss": 3.0523,
9467
+ "num_input_tokens_seen": 13932953600,
9468
+ "step": 53150
9469
+ },
9470
+ {
9471
+ "epoch": 0.3578497401048997,
9472
+ "grad_norm": 0.200640469789505,
9473
+ "learning_rate": 0.001,
9474
+ "loss": 3.0621,
9475
+ "num_input_tokens_seen": 13946060800,
9476
+ "step": 53200
9477
+ },
9478
+ {
9479
+ "epoch": 0.35818606504860734,
9480
+ "grad_norm": 0.20875929296016693,
9481
+ "learning_rate": 0.001,
9482
+ "loss": 3.0591,
9483
+ "num_input_tokens_seen": 13959168000,
9484
+ "step": 53250
9485
+ },
9486
+ {
9487
+ "epoch": 0.35852238999231495,
9488
+ "grad_norm": 0.19065573811531067,
9489
+ "learning_rate": 0.001,
9490
+ "loss": 3.0591,
9491
+ "num_input_tokens_seen": 13972275200,
9492
+ "step": 53300
9493
+ },
9494
+ {
9495
+ "epoch": 0.35885871493602256,
9496
+ "grad_norm": 0.18688392639160156,
9497
+ "learning_rate": 0.001,
9498
+ "loss": 3.0475,
9499
+ "num_input_tokens_seen": 13985382400,
9500
+ "step": 53350
9501
+ },
9502
+ {
9503
+ "epoch": 0.3591950398797302,
9504
+ "grad_norm": 0.1864282786846161,
9505
+ "learning_rate": 0.001,
9506
+ "loss": 3.0485,
9507
+ "num_input_tokens_seen": 13998489600,
9508
+ "step": 53400
9509
+ },
9510
+ {
9511
+ "epoch": 0.35953136482343784,
9512
+ "grad_norm": 0.20456114411354065,
9513
+ "learning_rate": 0.001,
9514
+ "loss": 3.0529,
9515
+ "num_input_tokens_seen": 14011596800,
9516
+ "step": 53450
9517
+ },
9518
+ {
9519
+ "epoch": 0.35986768976714545,
9520
+ "grad_norm": 0.24362069368362427,
9521
+ "learning_rate": 0.001,
9522
+ "loss": 3.0444,
9523
+ "num_input_tokens_seen": 14024704000,
9524
+ "step": 53500
9525
+ },
9526
+ {
9527
+ "epoch": 0.35986768976714545,
9528
+ "eval_loss": 2.943416118621826,
9529
+ "eval_runtime": 53.1574,
9530
+ "eval_samples_per_second": 94.06,
9531
+ "eval_steps_per_second": 23.515,
9532
+ "num_input_tokens_seen": 14024704000,
9533
+ "step": 53500
9534
+ },
9535
+ {
9536
+ "epoch": 0.36020401471085306,
9537
+ "grad_norm": 0.19701169431209564,
9538
+ "learning_rate": 0.001,
9539
+ "loss": 3.0513,
9540
+ "num_input_tokens_seen": 14037811200,
9541
+ "step": 53550
9542
+ },
9543
+ {
9544
+ "epoch": 0.36054033965456067,
9545
+ "grad_norm": 0.1785692274570465,
9546
+ "learning_rate": 0.001,
9547
+ "loss": 3.0541,
9548
+ "num_input_tokens_seen": 14050918400,
9549
+ "step": 53600
9550
+ },
9551
+ {
9552
+ "epoch": 0.3608766645982683,
9553
+ "grad_norm": 0.1865462064743042,
9554
+ "learning_rate": 0.001,
9555
+ "loss": 3.0367,
9556
+ "num_input_tokens_seen": 14064025600,
9557
+ "step": 53650
9558
+ },
9559
+ {
9560
+ "epoch": 0.3612129895419759,
9561
+ "grad_norm": 0.4129047095775604,
9562
+ "learning_rate": 0.001,
9563
+ "loss": 3.043,
9564
+ "num_input_tokens_seen": 14077132800,
9565
+ "step": 53700
9566
+ },
9567
+ {
9568
+ "epoch": 0.3615493144856835,
9569
+ "grad_norm": 0.21066440641880035,
9570
+ "learning_rate": 0.001,
9571
+ "loss": 3.0585,
9572
+ "num_input_tokens_seen": 14090240000,
9573
+ "step": 53750
9574
+ },
9575
+ {
9576
+ "epoch": 0.3618856394293911,
9577
+ "grad_norm": 0.6820788383483887,
9578
+ "learning_rate": 0.001,
9579
+ "loss": 3.0534,
9580
+ "num_input_tokens_seen": 14103347200,
9581
+ "step": 53800
9582
+ },
9583
+ {
9584
+ "epoch": 0.3622219643730987,
9585
+ "grad_norm": 0.9664424657821655,
9586
+ "learning_rate": 0.001,
9587
+ "loss": 3.069,
9588
+ "num_input_tokens_seen": 14116454400,
9589
+ "step": 53850
9590
+ },
9591
+ {
9592
+ "epoch": 0.36255828931680634,
9593
+ "grad_norm": 0.35416921973228455,
9594
+ "learning_rate": 0.001,
9595
+ "loss": 3.0629,
9596
+ "num_input_tokens_seen": 14129561600,
9597
+ "step": 53900
9598
+ },
9599
+ {
9600
+ "epoch": 0.36289461426051395,
9601
+ "grad_norm": 0.3159606158733368,
9602
+ "learning_rate": 0.001,
9603
+ "loss": 3.0722,
9604
+ "num_input_tokens_seen": 14142668800,
9605
+ "step": 53950
9606
+ },
9607
+ {
9608
+ "epoch": 0.36323093920422156,
9609
+ "grad_norm": 0.2518790662288666,
9610
+ "learning_rate": 0.001,
9611
+ "loss": 3.071,
9612
+ "num_input_tokens_seen": 14155776000,
9613
+ "step": 54000
9614
+ },
9615
+ {
9616
+ "epoch": 0.36323093920422156,
9617
+ "eval_loss": 2.9483964443206787,
9618
+ "eval_runtime": 53.2042,
9619
+ "eval_samples_per_second": 93.978,
9620
+ "eval_steps_per_second": 23.494,
9621
+ "num_input_tokens_seen": 14155776000,
9622
+ "step": 54000
9623
  }
9624
  ],
9625
  "logging_steps": 50,
9626
  "max_steps": 60000,
9627
+ "num_input_tokens_seen": 14155776000,
9628
  "num_train_epochs": 1,
9629
  "save_steps": 1000,
9630
  "stateful_callbacks": {
 
9639
  "attributes": {}
9640
  }
9641
  },
9642
+ "total_flos": 3.78680824037376e+18,
9643
  "train_batch_size": 64,
9644
  "trial_name": null,
9645
  "trial_params": null