Azrail commited on
Commit
7283aff
·
verified ·
1 Parent(s): 580b4a0

Training in progress, step 20000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50692c69fe3ea90614dc625956890e6dd059a4900ffb733cb441c9d9b0be1ed6
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0634cd3b48faa896331e649d644ee85a0e0af72246ab7393a66a3c2518bb02e
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f34ad85e7a64410399bc0984c1c1c25765a6659574c5d382b0c132a27be2f0f8
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:611dbdaa20f4f869458e449fe2e70d417e2df56bd8ff59602f5187369567bda1
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09184de0af072dcf6f15e331e61deb81a6900d407b5c7ebcb519d56082f36e97
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd42aefaf8cffc05ebd908742fc863dc5486d9c9296568766959af6a5b7610ad
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02c3d80aaacee80212417a329afbc88c74b35bad8004900a2301b44b629b4ab7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6662ae68d38995d5846f13e724946a2acb1395046b7d08977dde3dab733945c0
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4173548216151357,
6
  "eval_steps": 500,
7
- "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3390,11 +3390,189 @@
3390
  "eval_steps_per_second": 19.01,
3391
  "num_input_tokens_seen": 19922944000,
3392
  "step": 19000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3393
  }
3394
  ],
3395
  "logging_steps": 50,
3396
  "max_steps": 200000,
3397
- "num_input_tokens_seen": 19922944000,
3398
  "num_train_epochs": 5,
3399
  "save_steps": 1000,
3400
  "stateful_callbacks": {
@@ -3409,7 +3587,7 @@
3409
  "attributes": {}
3410
  }
3411
  },
3412
- "total_flos": 1.1346262603333632e+19,
3413
  "train_batch_size": 64,
3414
  "trial_name": null,
3415
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.43932086485803756,
6
  "eval_steps": 500,
7
+ "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3390
  "eval_steps_per_second": 19.01,
3391
  "num_input_tokens_seen": 19922944000,
3392
  "step": 19000
3393
+ },
3394
+ {
3395
+ "epoch": 0.4184531237772808,
3396
+ "grad_norm": 0.12389284372329712,
3397
+ "learning_rate": 0.001,
3398
+ "loss": 2.7222,
3399
+ "num_input_tokens_seen": 19975372800,
3400
+ "step": 19050
3401
+ },
3402
+ {
3403
+ "epoch": 0.4195514259394259,
3404
+ "grad_norm": 0.14157339930534363,
3405
+ "learning_rate": 0.001,
3406
+ "loss": 2.7178,
3407
+ "num_input_tokens_seen": 20027801600,
3408
+ "step": 19100
3409
+ },
3410
+ {
3411
+ "epoch": 0.420649728101571,
3412
+ "grad_norm": 0.1490466445684433,
3413
+ "learning_rate": 0.001,
3414
+ "loss": 2.7185,
3415
+ "num_input_tokens_seen": 20080230400,
3416
+ "step": 19150
3417
+ },
3418
+ {
3419
+ "epoch": 0.4217480302637161,
3420
+ "grad_norm": 0.14112494885921478,
3421
+ "learning_rate": 0.001,
3422
+ "loss": 2.7166,
3423
+ "num_input_tokens_seen": 20132659200,
3424
+ "step": 19200
3425
+ },
3426
+ {
3427
+ "epoch": 0.42284633242586117,
3428
+ "grad_norm": 0.13986504077911377,
3429
+ "learning_rate": 0.001,
3430
+ "loss": 2.7201,
3431
+ "num_input_tokens_seen": 20185088000,
3432
+ "step": 19250
3433
+ },
3434
+ {
3435
+ "epoch": 0.42394463458800624,
3436
+ "grad_norm": 0.14087803661823273,
3437
+ "learning_rate": 0.001,
3438
+ "loss": 2.7175,
3439
+ "num_input_tokens_seen": 20237516800,
3440
+ "step": 19300
3441
+ },
3442
+ {
3443
+ "epoch": 0.42504293675015137,
3444
+ "grad_norm": 0.165438711643219,
3445
+ "learning_rate": 0.001,
3446
+ "loss": 2.7155,
3447
+ "num_input_tokens_seen": 20289945600,
3448
+ "step": 19350
3449
+ },
3450
+ {
3451
+ "epoch": 0.42614123891229644,
3452
+ "grad_norm": 0.132109135389328,
3453
+ "learning_rate": 0.001,
3454
+ "loss": 2.7116,
3455
+ "num_input_tokens_seen": 20342374400,
3456
+ "step": 19400
3457
+ },
3458
+ {
3459
+ "epoch": 0.42723954107444156,
3460
+ "grad_norm": 0.1372772753238678,
3461
+ "learning_rate": 0.001,
3462
+ "loss": 2.7137,
3463
+ "num_input_tokens_seen": 20394803200,
3464
+ "step": 19450
3465
+ },
3466
+ {
3467
+ "epoch": 0.42833784323658664,
3468
+ "grad_norm": 0.1470147669315338,
3469
+ "learning_rate": 0.001,
3470
+ "loss": 2.7081,
3471
+ "num_input_tokens_seen": 20447232000,
3472
+ "step": 19500
3473
+ },
3474
+ {
3475
+ "epoch": 0.42833784323658664,
3476
+ "eval_loss": 2.615947961807251,
3477
+ "eval_runtime": 65.588,
3478
+ "eval_samples_per_second": 76.233,
3479
+ "eval_steps_per_second": 19.058,
3480
+ "num_input_tokens_seen": 20447232000,
3481
+ "step": 19500
3482
+ },
3483
+ {
3484
+ "epoch": 0.42943614539873176,
3485
+ "grad_norm": 0.15671676397323608,
3486
+ "learning_rate": 0.001,
3487
+ "loss": 2.7176,
3488
+ "num_input_tokens_seen": 20499660800,
3489
+ "step": 19550
3490
+ },
3491
+ {
3492
+ "epoch": 0.43053444756087683,
3493
+ "grad_norm": 0.13104794919490814,
3494
+ "learning_rate": 0.001,
3495
+ "loss": 2.7108,
3496
+ "num_input_tokens_seen": 20552089600,
3497
+ "step": 19600
3498
+ },
3499
+ {
3500
+ "epoch": 0.4316327497230219,
3501
+ "grad_norm": 0.14532406628131866,
3502
+ "learning_rate": 0.001,
3503
+ "loss": 2.7087,
3504
+ "num_input_tokens_seen": 20604518400,
3505
+ "step": 19650
3506
+ },
3507
+ {
3508
+ "epoch": 0.43273105188516703,
3509
+ "grad_norm": 0.16199354827404022,
3510
+ "learning_rate": 0.001,
3511
+ "loss": 2.7178,
3512
+ "num_input_tokens_seen": 20656947200,
3513
+ "step": 19700
3514
+ },
3515
+ {
3516
+ "epoch": 0.4338293540473121,
3517
+ "grad_norm": 0.13537316024303436,
3518
+ "learning_rate": 0.001,
3519
+ "loss": 2.7124,
3520
+ "num_input_tokens_seen": 20709376000,
3521
+ "step": 19750
3522
+ },
3523
+ {
3524
+ "epoch": 0.4349276562094572,
3525
+ "grad_norm": 0.15098537504673004,
3526
+ "learning_rate": 0.001,
3527
+ "loss": 2.7119,
3528
+ "num_input_tokens_seen": 20761804800,
3529
+ "step": 19800
3530
+ },
3531
+ {
3532
+ "epoch": 0.4360259583716023,
3533
+ "grad_norm": 0.21563659608364105,
3534
+ "learning_rate": 0.001,
3535
+ "loss": 2.7118,
3536
+ "num_input_tokens_seen": 20814233600,
3537
+ "step": 19850
3538
+ },
3539
+ {
3540
+ "epoch": 0.43712426053374737,
3541
+ "grad_norm": 0.15981121361255646,
3542
+ "learning_rate": 0.001,
3543
+ "loss": 2.7043,
3544
+ "num_input_tokens_seen": 20866662400,
3545
+ "step": 19900
3546
+ },
3547
+ {
3548
+ "epoch": 0.4382225626958925,
3549
+ "grad_norm": 0.15192069113254547,
3550
+ "learning_rate": 0.001,
3551
+ "loss": 2.7137,
3552
+ "num_input_tokens_seen": 20919091200,
3553
+ "step": 19950
3554
+ },
3555
+ {
3556
+ "epoch": 0.43932086485803756,
3557
+ "grad_norm": 0.14211437106132507,
3558
+ "learning_rate": 0.001,
3559
+ "loss": 2.7128,
3560
+ "num_input_tokens_seen": 20971520000,
3561
+ "step": 20000
3562
+ },
3563
+ {
3564
+ "epoch": 0.43932086485803756,
3565
+ "eval_loss": 2.611689567565918,
3566
+ "eval_runtime": 66.3456,
3567
+ "eval_samples_per_second": 75.363,
3568
+ "eval_steps_per_second": 18.841,
3569
+ "num_input_tokens_seen": 20971520000,
3570
+ "step": 20000
3571
  }
3572
  ],
3573
  "logging_steps": 50,
3574
  "max_steps": 200000,
3575
+ "num_input_tokens_seen": 20971520000,
3576
  "num_train_epochs": 5,
3577
  "save_steps": 1000,
3578
  "stateful_callbacks": {
 
3587
  "attributes": {}
3588
  }
3589
  },
3590
+ "total_flos": 1.194343431929856e+19,
3591
  "train_batch_size": 64,
3592
  "trial_name": null,
3593
  "trial_params": null