Azrail committed on
Commit
cf6f164
·
verified ·
1 Parent(s): 75e7f58

Training in progress, step 48000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f4e08ed2a6d62d28d840192a090317a05ca939879ecf26aa2b319d9c763f735
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83040a9f33c98136e5fdace56390e4f45897e63e4e108ccbd4b366bd299ccd64
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b2e15feb0f7f3fe2709a8b7d31a3a5c543a260dee03048851f465de58a0a6ac
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:437a042365907c8955a2fc7d892d047bacaeaffa159edfba16e54b9aa6d50132
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef6d6c68b31cc97d3a7886b7338b6c21c45d7ba1c6c1b89db7e0a3456d53ecda
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3179ff7b9a01d9e9850f6d8ae042cb4934a5fc48309149cb50bc43cd37884f1d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30a691323d967d54c1c0f6fb771a9863c3def8ea94c66492bb5dbdffa3e83798
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de67d78be185ea67aa4ca20dcc37ca7f9d17d76246f8cfa3148b96b4fc56902c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.22419118260848825,
6
  "eval_steps": 500,
7
- "global_step": 47000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8374,11 +8374,189 @@
8374
  "eval_steps_per_second": 24.428,
8375
  "num_input_tokens_seen": 12320763456,
8376
  "step": 47000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8377
  }
8378
  ],
8379
  "logging_steps": 50,
8380
  "max_steps": 70000,
8381
- "num_input_tokens_seen": 12320763456,
8382
  "num_train_epochs": 1,
8383
  "save_steps": 1000,
8384
  "stateful_callbacks": {
@@ -8393,7 +8571,7 @@
8393
  "attributes": {}
8394
  }
8395
  },
8396
- "total_flos": 3.2959244751313306e+18,
8397
  "train_batch_size": 64,
8398
  "trial_name": null,
8399
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.228961207770371,
6
  "eval_steps": 500,
7
+ "global_step": 48000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8374
  "eval_steps_per_second": 24.428,
8375
  "num_input_tokens_seen": 12320763456,
8376
  "step": 47000
8377
+ },
8378
+ {
8379
+ "epoch": 0.2244296838665824,
8380
+ "grad_norm": 0.18894509971141815,
8381
+ "learning_rate": 0.001,
8382
+ "loss": 2.6069,
8383
+ "num_input_tokens_seen": 12333870656,
8384
+ "step": 47050
8385
+ },
8386
+ {
8387
+ "epoch": 0.22466818512467654,
8388
+ "grad_norm": 0.23441652953624725,
8389
+ "learning_rate": 0.001,
8390
+ "loss": 2.6092,
8391
+ "num_input_tokens_seen": 12346977856,
8392
+ "step": 47100
8393
+ },
8394
+ {
8395
+ "epoch": 0.22490668638277067,
8396
+ "grad_norm": 0.20195326209068298,
8397
+ "learning_rate": 0.001,
8398
+ "loss": 2.6135,
8399
+ "num_input_tokens_seen": 12360085056,
8400
+ "step": 47150
8401
+ },
8402
+ {
8403
+ "epoch": 0.2251451876408648,
8404
+ "grad_norm": 0.22025838494300842,
8405
+ "learning_rate": 0.001,
8406
+ "loss": 2.6034,
8407
+ "num_input_tokens_seen": 12373192256,
8408
+ "step": 47200
8409
+ },
8410
+ {
8411
+ "epoch": 0.22538368889895893,
8412
+ "grad_norm": 0.19111979007720947,
8413
+ "learning_rate": 0.001,
8414
+ "loss": 2.6151,
8415
+ "num_input_tokens_seen": 12386299456,
8416
+ "step": 47250
8417
+ },
8418
+ {
8419
+ "epoch": 0.22562219015705307,
8420
+ "grad_norm": 0.2010103464126587,
8421
+ "learning_rate": 0.001,
8422
+ "loss": 2.6031,
8423
+ "num_input_tokens_seen": 12399406656,
8424
+ "step": 47300
8425
+ },
8426
+ {
8427
+ "epoch": 0.22586069141514722,
8428
+ "grad_norm": 0.21569807827472687,
8429
+ "learning_rate": 0.001,
8430
+ "loss": 2.6012,
8431
+ "num_input_tokens_seen": 12412513856,
8432
+ "step": 47350
8433
+ },
8434
+ {
8435
+ "epoch": 0.22609919267324136,
8436
+ "grad_norm": 0.18600653111934662,
8437
+ "learning_rate": 0.001,
8438
+ "loss": 2.6087,
8439
+ "num_input_tokens_seen": 12425621056,
8440
+ "step": 47400
8441
+ },
8442
+ {
8443
+ "epoch": 0.2263376939313355,
8444
+ "grad_norm": 0.19476164877414703,
8445
+ "learning_rate": 0.001,
8446
+ "loss": 2.6179,
8447
+ "num_input_tokens_seen": 12438728256,
8448
+ "step": 47450
8449
+ },
8450
+ {
8451
+ "epoch": 0.22657619518942962,
8452
+ "grad_norm": 0.19705821573734283,
8453
+ "learning_rate": 0.001,
8454
+ "loss": 2.5983,
8455
+ "num_input_tokens_seen": 12451835456,
8456
+ "step": 47500
8457
+ },
8458
+ {
8459
+ "epoch": 0.22657619518942962,
8460
+ "eval_loss": 2.495936393737793,
8461
+ "eval_runtime": 51.8116,
8462
+ "eval_samples_per_second": 96.504,
8463
+ "eval_steps_per_second": 24.126,
8464
+ "num_input_tokens_seen": 12451835456,
8465
+ "step": 47500
8466
+ },
8467
+ {
8468
+ "epoch": 0.22681469644752375,
8469
+ "grad_norm": 0.23161695897579193,
8470
+ "learning_rate": 0.001,
8471
+ "loss": 2.5974,
8472
+ "num_input_tokens_seen": 12464942656,
8473
+ "step": 47550
8474
+ },
8475
+ {
8476
+ "epoch": 0.2270531977056179,
8477
+ "grad_norm": 0.2022540420293808,
8478
+ "learning_rate": 0.001,
8479
+ "loss": 2.6251,
8480
+ "num_input_tokens_seen": 12478049856,
8481
+ "step": 47600
8482
+ },
8483
+ {
8484
+ "epoch": 0.22729169896371204,
8485
+ "grad_norm": 1.0341856479644775,
8486
+ "learning_rate": 0.001,
8487
+ "loss": 2.5831,
8488
+ "num_input_tokens_seen": 12491157056,
8489
+ "step": 47650
8490
+ },
8491
+ {
8492
+ "epoch": 0.22753020022180617,
8493
+ "grad_norm": 0.3812394440174103,
8494
+ "learning_rate": 0.001,
8495
+ "loss": 2.6407,
8496
+ "num_input_tokens_seen": 12504264256,
8497
+ "step": 47700
8498
+ },
8499
+ {
8500
+ "epoch": 0.2277687014799003,
8501
+ "grad_norm": 0.27030590176582336,
8502
+ "learning_rate": 0.001,
8503
+ "loss": 2.6327,
8504
+ "num_input_tokens_seen": 12517371456,
8505
+ "step": 47750
8506
+ },
8507
+ {
8508
+ "epoch": 0.22800720273799444,
8509
+ "grad_norm": 1.3918724060058594,
8510
+ "learning_rate": 0.001,
8511
+ "loss": 2.6344,
8512
+ "num_input_tokens_seen": 12530478656,
8513
+ "step": 47800
8514
+ },
8515
+ {
8516
+ "epoch": 0.22824570399608857,
8517
+ "grad_norm": 0.22610582411289215,
8518
+ "learning_rate": 0.001,
8519
+ "loss": 2.6444,
8520
+ "num_input_tokens_seen": 12543585856,
8521
+ "step": 47850
8522
+ },
8523
+ {
8524
+ "epoch": 0.22848420525418273,
8525
+ "grad_norm": 0.21421480178833008,
8526
+ "learning_rate": 0.001,
8527
+ "loss": 2.6169,
8528
+ "num_input_tokens_seen": 12556693056,
8529
+ "step": 47900
8530
+ },
8531
+ {
8532
+ "epoch": 0.22872270651227686,
8533
+ "grad_norm": 0.20389467477798462,
8534
+ "learning_rate": 0.001,
8535
+ "loss": 2.6158,
8536
+ "num_input_tokens_seen": 12569800256,
8537
+ "step": 47950
8538
+ },
8539
+ {
8540
+ "epoch": 0.228961207770371,
8541
+ "grad_norm": 0.2265746295452118,
8542
+ "learning_rate": 0.001,
8543
+ "loss": 2.6101,
8544
+ "num_input_tokens_seen": 12582907456,
8545
+ "step": 48000
8546
+ },
8547
+ {
8548
+ "epoch": 0.228961207770371,
8549
+ "eval_loss": 2.4971351623535156,
8550
+ "eval_runtime": 54.0453,
8551
+ "eval_samples_per_second": 92.515,
8552
+ "eval_steps_per_second": 23.129,
8553
+ "num_input_tokens_seen": 12582907456,
8554
+ "step": 48000
8555
  }
8556
  ],
8557
  "logging_steps": 50,
8558
  "max_steps": 70000,
8559
+ "num_input_tokens_seen": 12582907456,
8560
  "num_train_epochs": 1,
8561
  "save_steps": 1000,
8562
  "stateful_callbacks": {
 
8571
  "attributes": {}
8572
  }
8573
  },
8574
+ "total_flos": 3.3660505536567706e+18,
8575
  "train_batch_size": 64,
8576
  "trial_name": null,
8577
  "trial_params": null