AiAF commited on
Commit
3928ed9
·
verified ·
1 Parent(s): b7c51fa

Training in progress, step 700, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f324031f7a5f64d996914a53f616440d3542f7bfd1d1bd047c6bf8351b781971
3
  size 102264160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59badcbc1d668a371853f284e39f9ca33e2fe2af68b773148163044bb0f70bdd
3
  size 102264160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67210f76ed9c029d6ef5061227adf74372a001ffef8daca3ef3d136da719dbb9
3
  size 52162827
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49665f71e34f2a3db3bbae94d41e9706d6e4267d7bf49d604935f42728af0512
3
  size 52162827
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a1d3c69b35f53b118782dd94c78466c7746e86131456f7cde73b319b454bd68
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2376b84b97294d583dff60749feb13d6533baf27b96b9a245af922803baac53
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5592f46c154d334eaa5a16d750cd9060bfacc1786394b3ad334927a4f8e7542
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fccf8d05f51ee90d9abfa90ec4fa092bb34ce369846454436f6371151204846
3
  size 1465
last-checkpoint/tokens_state.json CHANGED
@@ -1 +1 @@
1
- {"total": 9076224, "trainable": 3746515}
 
1
+ {"total": 9769472, "trainable": 4042644}
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.31703450798683086,
6
  "eval_steps": 50,
7
- "global_step": 650,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9276,6 +9276,718 @@
9276
  "memory/max_active (GiB)": 11.76,
9277
  "memory/max_allocated (GiB)": 11.76,
9278
  "step": 650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9279
  }
9280
  ],
9281
  "logging_steps": 1,
@@ -9295,7 +10007,7 @@
9295
  "attributes": {}
9296
  }
9297
  },
9298
- "total_flos": 1.1164174449455923e+17,
9299
  "train_batch_size": 2,
9300
  "trial_name": null,
9301
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3414217778319717,
6
  "eval_steps": 50,
7
+ "global_step": 700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9276
  "memory/max_active (GiB)": 11.76,
9277
  "memory/max_allocated (GiB)": 11.76,
9278
  "step": 650
9279
+ },
9280
+ {
9281
+ "epoch": 0.3175222533837337,
9282
+ "grad_norm": 0.1710362732410431,
9283
+ "learning_rate": 5.765665457425102e-05,
9284
+ "loss": 2.4334278106689453,
9285
+ "memory/device_reserved (GiB)": 25.88,
9286
+ "memory/max_active (GiB)": 15.19,
9287
+ "memory/max_allocated (GiB)": 15.19,
9288
+ "ppl": 11.39788,
9289
+ "step": 651,
9290
+ "tokens/total": 9089664,
9291
+ "tokens/train_per_sec_per_gpu": 899.25,
9292
+ "tokens/trainable": 3750762
9293
+ },
9294
+ {
9295
+ "epoch": 0.3180099987806365,
9296
+ "grad_norm": 0.14897273480892181,
9297
+ "learning_rate": 5.736346951157544e-05,
9298
+ "loss": 2.455512523651123,
9299
+ "memory/device_reserved (GiB)": 37.6,
9300
+ "memory/max_active (GiB)": 16.51,
9301
+ "memory/max_allocated (GiB)": 16.51,
9302
+ "ppl": 11.6524,
9303
+ "step": 652,
9304
+ "tokens/total": 9104128,
9305
+ "tokens/train_per_sec_per_gpu": 3318.27,
9306
+ "tokens/trainable": 3756193
9307
+ },
9308
+ {
9309
+ "epoch": 0.31849774417753934,
9310
+ "grad_norm": 0.13437563180923462,
9311
+ "learning_rate": 5.707073168592942e-05,
9312
+ "loss": 2.4900941848754883,
9313
+ "memory/device_reserved (GiB)": 37.6,
9314
+ "memory/max_active (GiB)": 15.19,
9315
+ "memory/max_allocated (GiB)": 15.19,
9316
+ "ppl": 12.06241,
9317
+ "step": 653,
9318
+ "tokens/total": 9118592,
9319
+ "tokens/train_per_sec_per_gpu": 1940.99,
9320
+ "tokens/trainable": 3763520
9321
+ },
9322
+ {
9323
+ "epoch": 0.3189854895744421,
9324
+ "grad_norm": 0.153215691447258,
9325
+ "learning_rate": 5.677844416799424e-05,
9326
+ "loss": 2.5800952911376953,
9327
+ "memory/device_reserved (GiB)": 37.6,
9328
+ "memory/max_active (GiB)": 16.07,
9329
+ "memory/max_allocated (GiB)": 16.07,
9330
+ "ppl": 13.1984,
9331
+ "step": 654,
9332
+ "tokens/total": 9133824,
9333
+ "tokens/train_per_sec_per_gpu": 2126.06,
9334
+ "tokens/trainable": 3769074
9335
+ },
9336
+ {
9337
+ "epoch": 0.31947323497134494,
9338
+ "grad_norm": 0.14280448853969574,
9339
+ "learning_rate": 5.648661002372768e-05,
9340
+ "loss": 2.5871028900146484,
9341
+ "memory/device_reserved (GiB)": 37.6,
9342
+ "memory/max_active (GiB)": 16.51,
9343
+ "memory/max_allocated (GiB)": 16.51,
9344
+ "ppl": 13.29121,
9345
+ "step": 655,
9346
+ "tokens/total": 9147904,
9347
+ "tokens/train_per_sec_per_gpu": 3428.97,
9348
+ "tokens/trainable": 3776266
9349
+ },
9350
+ {
9351
+ "epoch": 0.31996098036824777,
9352
+ "grad_norm": 0.1566459834575653,
9353
+ "learning_rate": 5.6195232314331766e-05,
9354
+ "loss": 2.5909602642059326,
9355
+ "memory/device_reserved (GiB)": 37.6,
9356
+ "memory/max_active (GiB)": 15.63,
9357
+ "memory/max_allocated (GiB)": 15.63,
9358
+ "ppl": 13.34258,
9359
+ "step": 656,
9360
+ "tokens/total": 9161344,
9361
+ "tokens/train_per_sec_per_gpu": 1993.38,
9362
+ "tokens/trainable": 3782089
9363
+ },
9364
+ {
9365
+ "epoch": 0.3204487257651506,
9366
+ "grad_norm": 0.16187436878681183,
9367
+ "learning_rate": 5.590431409622081e-05,
9368
+ "loss": 2.4071998596191406,
9369
+ "memory/device_reserved (GiB)": 37.6,
9370
+ "memory/max_active (GiB)": 15.63,
9371
+ "memory/max_allocated (GiB)": 15.63,
9372
+ "ppl": 11.10283,
9373
+ "step": 657,
9374
+ "tokens/total": 9175040,
9375
+ "tokens/train_per_sec_per_gpu": 390.92,
9376
+ "tokens/trainable": 3786753
9377
+ },
9378
+ {
9379
+ "epoch": 0.3209364711620534,
9380
+ "grad_norm": 0.17244231700897217,
9381
+ "learning_rate": 5.56138584209893e-05,
9382
+ "loss": 2.428713083267212,
9383
+ "memory/device_reserved (GiB)": 37.6,
9384
+ "memory/max_active (GiB)": 15.53,
9385
+ "memory/max_allocated (GiB)": 15.53,
9386
+ "ppl": 11.34427,
9387
+ "step": 658,
9388
+ "tokens/total": 9187328,
9389
+ "tokens/train_per_sec_per_gpu": 2066.3,
9390
+ "tokens/trainable": 3791029
9391
+ },
9392
+ {
9393
+ "epoch": 0.3214242165589562,
9394
+ "grad_norm": 0.14595621824264526,
9395
+ "learning_rate": 5.532386833537977e-05,
9396
+ "loss": 2.5427656173706055,
9397
+ "memory/device_reserved (GiB)": 37.6,
9398
+ "memory/max_active (GiB)": 14.74,
9399
+ "memory/max_allocated (GiB)": 14.74,
9400
+ "ppl": 12.71479,
9401
+ "step": 659,
9402
+ "tokens/total": 9199872,
9403
+ "tokens/train_per_sec_per_gpu": 3074.28,
9404
+ "tokens/trainable": 3797470
9405
+ },
9406
+ {
9407
+ "epoch": 0.321911961955859,
9408
+ "grad_norm": 0.1848934441804886,
9409
+ "learning_rate": 5.503434688125104e-05,
9410
+ "loss": 2.55776309967041,
9411
+ "memory/device_reserved (GiB)": 37.6,
9412
+ "memory/max_active (GiB)": 15.09,
9413
+ "memory/max_allocated (GiB)": 15.09,
9414
+ "ppl": 12.90691,
9415
+ "step": 660,
9416
+ "tokens/total": 9213184,
9417
+ "tokens/train_per_sec_per_gpu": 236.15,
9418
+ "tokens/trainable": 3801347
9419
+ },
9420
+ {
9421
+ "epoch": 0.32239970735276186,
9422
+ "grad_norm": 0.2647537291049957,
9423
+ "learning_rate": 5.474529709554612e-05,
9424
+ "loss": 2.4955523014068604,
9425
+ "memory/device_reserved (GiB)": 37.6,
9426
+ "memory/max_active (GiB)": 14.3,
9427
+ "memory/max_allocated (GiB)": 14.3,
9428
+ "ppl": 12.12843,
9429
+ "step": 661,
9430
+ "tokens/total": 9225984,
9431
+ "tokens/train_per_sec_per_gpu": 1517.0,
9432
+ "tokens/trainable": 3807345
9433
+ },
9434
+ {
9435
+ "epoch": 0.3228874527496647,
9436
+ "grad_norm": 0.16561807692050934,
9437
+ "learning_rate": 5.445672201026054e-05,
9438
+ "loss": 2.59391450881958,
9439
+ "memory/device_reserved (GiB)": 37.6,
9440
+ "memory/max_active (GiB)": 12.97,
9441
+ "memory/max_allocated (GiB)": 12.97,
9442
+ "ppl": 13.38205,
9443
+ "step": 662,
9444
+ "tokens/total": 9237504,
9445
+ "tokens/train_per_sec_per_gpu": 3075.71,
9446
+ "tokens/trainable": 3812751
9447
+ },
9448
+ {
9449
+ "epoch": 0.3233751981465675,
9450
+ "grad_norm": 0.12832888960838318,
9451
+ "learning_rate": 5.416862465241033e-05,
9452
+ "loss": 2.464712619781494,
9453
+ "memory/device_reserved (GiB)": 37.6,
9454
+ "memory/max_active (GiB)": 16.07,
9455
+ "memory/max_allocated (GiB)": 16.07,
9456
+ "ppl": 11.7601,
9457
+ "step": 663,
9458
+ "tokens/total": 9251968,
9459
+ "tokens/train_per_sec_per_gpu": 1501.25,
9460
+ "tokens/trainable": 3820648
9461
+ },
9462
+ {
9463
+ "epoch": 0.3238629435434703,
9464
+ "grad_norm": 0.11801256984472275,
9465
+ "learning_rate": 5.388100804400049e-05,
9466
+ "loss": 2.523484230041504,
9467
+ "memory/device_reserved (GiB)": 37.6,
9468
+ "memory/max_active (GiB)": 16.51,
9469
+ "memory/max_allocated (GiB)": 16.51,
9470
+ "ppl": 12.47198,
9471
+ "step": 664,
9472
+ "tokens/total": 9267200,
9473
+ "tokens/train_per_sec_per_gpu": 3130.58,
9474
+ "tokens/trainable": 3830437
9475
+ },
9476
+ {
9477
+ "epoch": 0.3243506889403731,
9478
+ "grad_norm": 0.12580764293670654,
9479
+ "learning_rate": 5.3593875201993174e-05,
9480
+ "loss": 2.391364336013794,
9481
+ "memory/device_reserved (GiB)": 37.6,
9482
+ "memory/max_active (GiB)": 16.51,
9483
+ "memory/max_allocated (GiB)": 16.51,
9484
+ "ppl": 10.92839,
9485
+ "step": 665,
9486
+ "tokens/total": 9282944,
9487
+ "tokens/train_per_sec_per_gpu": 1146.11,
9488
+ "tokens/trainable": 3838657
9489
+ },
9490
+ {
9491
+ "epoch": 0.32483843433727594,
9492
+ "grad_norm": 0.13414451479911804,
9493
+ "learning_rate": 5.3307229138275936e-05,
9494
+ "loss": 2.372819662094116,
9495
+ "memory/device_reserved (GiB)": 37.6,
9496
+ "memory/max_active (GiB)": 16.51,
9497
+ "memory/max_allocated (GiB)": 16.51,
9498
+ "ppl": 10.7276,
9499
+ "step": 666,
9500
+ "tokens/total": 9297920,
9501
+ "tokens/train_per_sec_per_gpu": 1654.91,
9502
+ "tokens/trainable": 3845393
9503
+ },
9504
+ {
9505
+ "epoch": 0.32532617973417877,
9506
+ "grad_norm": 0.13741862773895264,
9507
+ "learning_rate": 5.302107285963045e-05,
9508
+ "loss": 2.618802309036255,
9509
+ "memory/device_reserved (GiB)": 37.6,
9510
+ "memory/max_active (GiB)": 15.19,
9511
+ "memory/max_allocated (GiB)": 15.19,
9512
+ "ppl": 13.71928,
9513
+ "step": 667,
9514
+ "tokens/total": 9311360,
9515
+ "tokens/train_per_sec_per_gpu": 2520.66,
9516
+ "tokens/trainable": 3852665
9517
+ },
9518
+ {
9519
+ "epoch": 0.3258139251310816,
9520
+ "grad_norm": 0.12451744079589844,
9521
+ "learning_rate": 5.273540936770058e-05,
9522
+ "loss": 2.497060775756836,
9523
+ "memory/device_reserved (GiB)": 37.6,
9524
+ "memory/max_active (GiB)": 16.51,
9525
+ "memory/max_allocated (GiB)": 16.51,
9526
+ "ppl": 12.14674,
9527
+ "step": 668,
9528
+ "tokens/total": 9325952,
9529
+ "tokens/train_per_sec_per_gpu": 2434.46,
9530
+ "tokens/trainable": 3860517
9531
+ },
9532
+ {
9533
+ "epoch": 0.32630167052798437,
9534
+ "grad_norm": 0.14122170209884644,
9535
+ "learning_rate": 5.245024165896126e-05,
9536
+ "loss": 2.5780842304229736,
9537
+ "memory/device_reserved (GiB)": 37.6,
9538
+ "memory/max_active (GiB)": 16.51,
9539
+ "memory/max_allocated (GiB)": 16.51,
9540
+ "ppl": 13.17188,
9541
+ "step": 669,
9542
+ "tokens/total": 9340928,
9543
+ "tokens/train_per_sec_per_gpu": 2394.1,
9544
+ "tokens/trainable": 3867023
9545
+ },
9546
+ {
9547
+ "epoch": 0.3267894159248872,
9548
+ "grad_norm": 0.1293308287858963,
9549
+ "learning_rate": 5.2165572724686754e-05,
9550
+ "loss": 2.517449140548706,
9551
+ "memory/device_reserved (GiB)": 37.6,
9552
+ "memory/max_active (GiB)": 16.51,
9553
+ "memory/max_allocated (GiB)": 16.51,
9554
+ "ppl": 12.39693,
9555
+ "step": 670,
9556
+ "tokens/total": 9355392,
9557
+ "tokens/train_per_sec_per_gpu": 2504.15,
9558
+ "tokens/trainable": 3874779
9559
+ },
9560
+ {
9561
+ "epoch": 0.32727716132179,
9562
+ "grad_norm": 0.1419108510017395,
9563
+ "learning_rate": 5.1881405550919493e-05,
9564
+ "loss": 2.5625345706939697,
9565
+ "memory/device_reserved (GiB)": 37.6,
9566
+ "memory/max_active (GiB)": 15.19,
9567
+ "memory/max_allocated (GiB)": 15.19,
9568
+ "ppl": 12.96865,
9569
+ "step": 671,
9570
+ "tokens/total": 9369600,
9571
+ "tokens/train_per_sec_per_gpu": 2788.16,
9572
+ "tokens/trainable": 3881901
9573
+ },
9574
+ {
9575
+ "epoch": 0.32776490671869285,
9576
+ "grad_norm": 0.15044128894805908,
9577
+ "learning_rate": 5.1597743118438726e-05,
9578
+ "loss": 2.6445508003234863,
9579
+ "memory/device_reserved (GiB)": 37.6,
9580
+ "memory/max_active (GiB)": 15.63,
9581
+ "memory/max_allocated (GiB)": 15.63,
9582
+ "ppl": 14.07712,
9583
+ "step": 672,
9584
+ "tokens/total": 9383808,
9585
+ "tokens/train_per_sec_per_gpu": 2152.92,
9586
+ "tokens/trainable": 3887683
9587
+ },
9588
+ {
9589
+ "epoch": 0.3282526521155957,
9590
+ "grad_norm": 0.13477809727191925,
9591
+ "learning_rate": 5.1314588402729044e-05,
9592
+ "loss": 2.459366798400879,
9593
+ "memory/device_reserved (GiB)": 37.6,
9594
+ "memory/max_active (GiB)": 15.53,
9595
+ "memory/max_allocated (GiB)": 15.53,
9596
+ "ppl": 11.6974,
9597
+ "step": 673,
9598
+ "tokens/total": 9397376,
9599
+ "tokens/train_per_sec_per_gpu": 1205.32,
9600
+ "tokens/trainable": 3894517
9601
+ },
9602
+ {
9603
+ "epoch": 0.32874039751249845,
9604
+ "grad_norm": 0.16951484978199005,
9605
+ "learning_rate": 5.103194437394952e-05,
9606
+ "loss": 2.6503396034240723,
9607
+ "memory/device_reserved (GiB)": 37.6,
9608
+ "memory/max_active (GiB)": 14.74,
9609
+ "memory/max_allocated (GiB)": 14.74,
9610
+ "ppl": 14.15885,
9611
+ "step": 674,
9612
+ "tokens/total": 9410176,
9613
+ "tokens/train_per_sec_per_gpu": 402.56,
9614
+ "tokens/trainable": 3898680
9615
+ },
9616
+ {
9617
+ "epoch": 0.3292281429094013,
9618
+ "grad_norm": 0.14310821890830994,
9619
+ "learning_rate": 5.074981399690218e-05,
9620
+ "loss": 2.5750184059143066,
9621
+ "memory/device_reserved (GiB)": 37.6,
9622
+ "memory/max_active (GiB)": 15.09,
9623
+ "memory/max_allocated (GiB)": 15.09,
9624
+ "ppl": 13.13156,
9625
+ "step": 675,
9626
+ "tokens/total": 9423360,
9627
+ "tokens/train_per_sec_per_gpu": 1274.61,
9628
+ "tokens/trainable": 3904449
9629
+ },
9630
+ {
9631
+ "epoch": 0.3297158883063041,
9632
+ "grad_norm": 0.14187775552272797,
9633
+ "learning_rate": 5.0468200231001286e-05,
9634
+ "loss": 2.58148455619812,
9635
+ "memory/device_reserved (GiB)": 37.6,
9636
+ "memory/max_active (GiB)": 16.07,
9637
+ "memory/max_allocated (GiB)": 16.07,
9638
+ "ppl": 13.21674,
9639
+ "step": 676,
9640
+ "tokens/total": 9438080,
9641
+ "tokens/train_per_sec_per_gpu": 808.62,
9642
+ "tokens/trainable": 3910987
9643
+ },
9644
+ {
9645
+ "epoch": 0.33020363370320693,
9646
+ "grad_norm": 0.16191494464874268,
9647
+ "learning_rate": 5.018710603024187e-05,
9648
+ "loss": 2.709486484527588,
9649
+ "memory/device_reserved (GiB)": 37.6,
9650
+ "memory/max_active (GiB)": 15.98,
9651
+ "memory/max_allocated (GiB)": 15.98,
9652
+ "ppl": 15.02156,
9653
+ "step": 677,
9654
+ "tokens/total": 9452672,
9655
+ "tokens/train_per_sec_per_gpu": 2706.9,
9656
+ "tokens/trainable": 3917526
9657
+ },
9658
+ {
9659
+ "epoch": 0.33069137910010976,
9660
+ "grad_norm": 0.1644907146692276,
9661
+ "learning_rate": 4.9906534343169144e-05,
9662
+ "loss": 2.374467372894287,
9663
+ "memory/device_reserved (GiB)": 37.6,
9664
+ "memory/max_active (GiB)": 16.51,
9665
+ "memory/max_allocated (GiB)": 16.51,
9666
+ "ppl": 10.74529,
9667
+ "step": 678,
9668
+ "tokens/total": 9466624,
9669
+ "tokens/train_per_sec_per_gpu": 2207.07,
9670
+ "tokens/trainable": 3922866
9671
+ },
9672
+ {
9673
+ "epoch": 0.33117912449701253,
9674
+ "grad_norm": 0.14618027210235596,
9675
+ "learning_rate": 4.962648811284738e-05,
9676
+ "loss": 2.3652446269989014,
9677
+ "memory/device_reserved (GiB)": 37.6,
9678
+ "memory/max_active (GiB)": 16.51,
9679
+ "memory/max_allocated (GiB)": 16.51,
9680
+ "ppl": 10.64664,
9681
+ "step": 679,
9682
+ "tokens/total": 9481216,
9683
+ "tokens/train_per_sec_per_gpu": 2138.53,
9684
+ "tokens/trainable": 3929184
9685
+ },
9686
+ {
9687
+ "epoch": 0.33166686989391536,
9688
+ "grad_norm": 0.13205529749393463,
9689
+ "learning_rate": 4.934697027682894e-05,
9690
+ "loss": 2.431748867034912,
9691
+ "memory/device_reserved (GiB)": 37.6,
9692
+ "memory/max_active (GiB)": 16.07,
9693
+ "memory/max_allocated (GiB)": 16.07,
9694
+ "ppl": 11.37876,
9695
+ "step": 680,
9696
+ "tokens/total": 9496064,
9697
+ "tokens/train_per_sec_per_gpu": 2242.79,
9698
+ "tokens/trainable": 3937050
9699
+ },
9700
+ {
9701
+ "epoch": 0.3321546152908182,
9702
+ "grad_norm": 0.15257883071899414,
9703
+ "learning_rate": 4.9067983767123736e-05,
9704
+ "loss": 2.757232666015625,
9705
+ "memory/device_reserved (GiB)": 37.6,
9706
+ "memory/max_active (GiB)": 15.63,
9707
+ "memory/max_allocated (GiB)": 15.63,
9708
+ "ppl": 15.75618,
9709
+ "step": 681,
9710
+ "tokens/total": 9509504,
9711
+ "tokens/train_per_sec_per_gpu": 1140.87,
9712
+ "tokens/trainable": 3943613
9713
+ },
9714
+ {
9715
+ "epoch": 0.332642360687721,
9716
+ "grad_norm": 0.14146484434604645,
9717
+ "learning_rate": 4.8789531510168163e-05,
9718
+ "loss": 2.426405191421509,
9719
+ "memory/device_reserved (GiB)": 37.6,
9720
+ "memory/max_active (GiB)": 16.42,
9721
+ "memory/max_allocated (GiB)": 16.42,
9722
+ "ppl": 11.31812,
9723
+ "step": 682,
9724
+ "tokens/total": 9523072,
9725
+ "tokens/train_per_sec_per_gpu": 1019.98,
9726
+ "tokens/trainable": 3950129
9727
+ },
9728
+ {
9729
+ "epoch": 0.33313010608462384,
9730
+ "grad_norm": 0.13973264396190643,
9731
+ "learning_rate": 4.851161642679466e-05,
9732
+ "loss": 2.4615488052368164,
9733
+ "memory/device_reserved (GiB)": 37.6,
9734
+ "memory/max_active (GiB)": 15.19,
9735
+ "memory/max_allocated (GiB)": 15.19,
9736
+ "ppl": 11.72295,
9737
+ "step": 683,
9738
+ "tokens/total": 9536768,
9739
+ "tokens/train_per_sec_per_gpu": 2048.1,
9740
+ "tokens/trainable": 3956791
9741
+ },
9742
+ {
9743
+ "epoch": 0.3336178514815266,
9744
+ "grad_norm": 0.1458214819431305,
9745
+ "learning_rate": 4.8234241432200965e-05,
9746
+ "loss": 2.595818519592285,
9747
+ "memory/device_reserved (GiB)": 37.6,
9748
+ "memory/max_active (GiB)": 13.86,
9749
+ "memory/max_allocated (GiB)": 13.86,
9750
+ "ppl": 13.40756,
9751
+ "step": 684,
9752
+ "tokens/total": 9548672,
9753
+ "tokens/train_per_sec_per_gpu": 3464.81,
9754
+ "tokens/trainable": 3963305
9755
+ },
9756
+ {
9757
+ "epoch": 0.33410559687842944,
9758
+ "grad_norm": 0.18916182219982147,
9759
+ "learning_rate": 4.795740943591955e-05,
9760
+ "loss": 2.2587080001831055,
9761
+ "memory/device_reserved (GiB)": 37.6,
9762
+ "memory/max_active (GiB)": 15.19,
9763
+ "memory/max_allocated (GiB)": 15.19,
9764
+ "ppl": 9.57072,
9765
+ "step": 685,
9766
+ "tokens/total": 9562496,
9767
+ "tokens/train_per_sec_per_gpu": 324.45,
9768
+ "tokens/trainable": 3966822
9769
+ },
9770
+ {
9771
+ "epoch": 0.3345933422753323,
9772
+ "grad_norm": 0.14918474853038788,
9773
+ "learning_rate": 4.768112334178699e-05,
9774
+ "loss": 2.419451951980591,
9775
+ "memory/device_reserved (GiB)": 37.6,
9776
+ "memory/max_active (GiB)": 16.42,
9777
+ "memory/max_allocated (GiB)": 16.42,
9778
+ "ppl": 11.2397,
9779
+ "step": 686,
9780
+ "tokens/total": 9577472,
9781
+ "tokens/train_per_sec_per_gpu": 2453.15,
9782
+ "tokens/trainable": 3972755
9783
+ },
9784
+ {
9785
+ "epoch": 0.3350810876722351,
9786
+ "grad_norm": 0.19332581758499146,
9787
+ "learning_rate": 4.74053860479137e-05,
9788
+ "loss": 2.4659688472747803,
9789
+ "memory/device_reserved (GiB)": 37.6,
9790
+ "memory/max_active (GiB)": 16.51,
9791
+ "memory/max_allocated (GiB)": 16.51,
9792
+ "ppl": 11.77488,
9793
+ "step": 687,
9794
+ "tokens/total": 9593344,
9795
+ "tokens/train_per_sec_per_gpu": 1020.5,
9796
+ "tokens/trainable": 3977603
9797
+ },
9798
+ {
9799
+ "epoch": 0.33556883306913793,
9800
+ "grad_norm": 0.23045431077480316,
9801
+ "learning_rate": 4.7130200446653475e-05,
9802
+ "loss": 2.4577431678771973,
9803
+ "memory/device_reserved (GiB)": 37.6,
9804
+ "memory/max_active (GiB)": 15.63,
9805
+ "memory/max_allocated (GiB)": 15.63,
9806
+ "ppl": 11.67843,
9807
+ "step": 688,
9808
+ "tokens/total": 9604992,
9809
+ "tokens/train_per_sec_per_gpu": 354.7,
9810
+ "tokens/trainable": 3980037
9811
+ },
9812
+ {
9813
+ "epoch": 0.3360565784660407,
9814
+ "grad_norm": 0.13135406374931335,
9815
+ "learning_rate": 4.6855569424572955e-05,
9816
+ "loss": 2.274285316467285,
9817
+ "memory/device_reserved (GiB)": 37.6,
9818
+ "memory/max_active (GiB)": 16.42,
9819
+ "memory/max_allocated (GiB)": 16.42,
9820
+ "ppl": 9.72097,
9821
+ "step": 689,
9822
+ "tokens/total": 9620096,
9823
+ "tokens/train_per_sec_per_gpu": 2860.29,
9824
+ "tokens/trainable": 3988505
9825
+ },
9826
+ {
9827
+ "epoch": 0.33654432386294353,
9828
+ "grad_norm": 0.17153258621692657,
9829
+ "learning_rate": 4.65814958624217e-05,
9830
+ "loss": 2.4314942359924316,
9831
+ "memory/device_reserved (GiB)": 37.6,
9832
+ "memory/max_active (GiB)": 15.19,
9833
+ "memory/max_allocated (GiB)": 15.19,
9834
+ "ppl": 11.37587,
9835
+ "step": 690,
9836
+ "tokens/total": 9631488,
9837
+ "tokens/train_per_sec_per_gpu": 1934.44,
9838
+ "tokens/trainable": 3992936
9839
+ },
9840
+ {
9841
+ "epoch": 0.33703206925984636,
9842
+ "grad_norm": 0.14782929420471191,
9843
+ "learning_rate": 4.630798263510162e-05,
9844
+ "loss": 2.453141689300537,
9845
+ "memory/device_reserved (GiB)": 37.6,
9846
+ "memory/max_active (GiB)": 14.74,
9847
+ "memory/max_allocated (GiB)": 14.74,
9848
+ "ppl": 11.62481,
9849
+ "step": 691,
9850
+ "tokens/total": 9644544,
9851
+ "tokens/train_per_sec_per_gpu": 2826.18,
9852
+ "tokens/trainable": 3998788
9853
+ },
9854
+ {
9855
+ "epoch": 0.3375198146567492,
9856
+ "grad_norm": 0.20830675959587097,
9857
+ "learning_rate": 4.6035032611637094e-05,
9858
+ "loss": 2.244356393814087,
9859
+ "memory/device_reserved (GiB)": 37.6,
9860
+ "memory/max_active (GiB)": 16.51,
9861
+ "memory/max_allocated (GiB)": 16.51,
9862
+ "ppl": 9.43434,
9863
+ "step": 692,
9864
+ "tokens/total": 9659776,
9865
+ "tokens/train_per_sec_per_gpu": 553.59,
9866
+ "tokens/trainable": 4001704
9867
+ },
9868
+ {
9869
+ "epoch": 0.338007560053652,
9870
+ "grad_norm": 0.13705019652843475,
9871
+ "learning_rate": 4.5762648655144666e-05,
9872
+ "loss": 2.3326563835144043,
9873
+ "memory/device_reserved (GiB)": 37.6,
9874
+ "memory/max_active (GiB)": 16.42,
9875
+ "memory/max_allocated (GiB)": 16.42,
9876
+ "ppl": 10.30528,
9877
+ "step": 693,
9878
+ "tokens/total": 9675008,
9879
+ "tokens/train_per_sec_per_gpu": 2153.61,
9880
+ "tokens/trainable": 4008399
9881
+ },
9882
+ {
9883
+ "epoch": 0.3384953054505548,
9884
+ "grad_norm": 0.17856715619564056,
9885
+ "learning_rate": 4.549083362280317e-05,
9886
+ "loss": 2.6045002937316895,
9887
+ "memory/device_reserved (GiB)": 37.6,
9888
+ "memory/max_active (GiB)": 16.07,
9889
+ "memory/max_allocated (GiB)": 16.07,
9890
+ "ppl": 13.52447,
9891
+ "step": 694,
9892
+ "tokens/total": 9687424,
9893
+ "tokens/train_per_sec_per_gpu": 1395.96,
9894
+ "tokens/trainable": 4012593
9895
+ },
9896
+ {
9897
+ "epoch": 0.3389830508474576,
9898
+ "grad_norm": 0.17393389344215393,
9899
+ "learning_rate": 4.5219590365823714e-05,
9900
+ "loss": 2.6212544441223145,
9901
+ "memory/device_reserved (GiB)": 37.6,
9902
+ "memory/max_active (GiB)": 16.51,
9903
+ "memory/max_allocated (GiB)": 16.51,
9904
+ "ppl": 13.75297,
9905
+ "step": 695,
9906
+ "tokens/total": 9701120,
9907
+ "tokens/train_per_sec_per_gpu": 621.02,
9908
+ "tokens/trainable": 4017183
9909
+ },
9910
+ {
9911
+ "epoch": 0.33947079624436044,
9912
+ "grad_norm": 0.15557512640953064,
9913
+ "learning_rate": 4.494892172941965e-05,
9914
+ "loss": 2.5806403160095215,
9915
+ "memory/device_reserved (GiB)": 37.6,
9916
+ "memory/max_active (GiB)": 15.19,
9917
+ "memory/max_allocated (GiB)": 15.19,
9918
+ "ppl": 13.20559,
9919
+ "step": 696,
9920
+ "tokens/total": 9714304,
9921
+ "tokens/train_per_sec_per_gpu": 1263.71,
9922
+ "tokens/trainable": 4022987
9923
+ },
9924
+ {
9925
+ "epoch": 0.33995854164126327,
9926
+ "grad_norm": 0.17690803110599518,
9927
+ "learning_rate": 4.467883055277695e-05,
9928
+ "loss": 2.4236552715301514,
9929
+ "memory/device_reserved (GiB)": 37.6,
9930
+ "memory/max_active (GiB)": 15.63,
9931
+ "memory/max_allocated (GiB)": 15.63,
9932
+ "ppl": 11.28704,
9933
+ "step": 697,
9934
+ "tokens/total": 9727104,
9935
+ "tokens/train_per_sec_per_gpu": 2278.1,
9936
+ "tokens/trainable": 4027598
9937
+ },
9938
+ {
9939
+ "epoch": 0.3404462870381661,
9940
+ "grad_norm": 0.16057541966438293,
9941
+ "learning_rate": 4.440931966902418e-05,
9942
+ "loss": 2.4536919593811035,
9943
+ "memory/device_reserved (GiB)": 37.6,
9944
+ "memory/max_active (GiB)": 15.53,
9945
+ "memory/max_allocated (GiB)": 15.53,
9946
+ "ppl": 11.63121,
9947
+ "step": 698,
9948
+ "tokens/total": 9740672,
9949
+ "tokens/train_per_sec_per_gpu": 2386.73,
9950
+ "tokens/trainable": 4032248
9951
+ },
9952
+ {
9953
+ "epoch": 0.34093403243506887,
9954
+ "grad_norm": 0.1556730717420578,
9955
+ "learning_rate": 4.414039190520308e-05,
9956
+ "loss": 2.69823956489563,
9957
+ "memory/device_reserved (GiB)": 37.6,
9958
+ "memory/max_active (GiB)": 16.51,
9959
+ "memory/max_allocated (GiB)": 16.51,
9960
+ "ppl": 14.85356,
9961
+ "step": 699,
9962
+ "tokens/total": 9755904,
9963
+ "tokens/train_per_sec_per_gpu": 1962.29,
9964
+ "tokens/trainable": 4038156
9965
+ },
9966
+ {
9967
+ "epoch": 0.3414217778319717,
9968
+ "grad_norm": 0.17577911913394928,
9969
+ "learning_rate": 4.387205008223854e-05,
9970
+ "loss": 2.6918764114379883,
9971
+ "memory/device_reserved (GiB)": 37.6,
9972
+ "memory/max_active (GiB)": 15.63,
9973
+ "memory/max_allocated (GiB)": 15.63,
9974
+ "ppl": 14.75934,
9975
+ "step": 700,
9976
+ "tokens/total": 9769472,
9977
+ "tokens/train_per_sec_per_gpu": 893.71,
9978
+ "tokens/trainable": 4042644
9979
+ },
9980
+ {
9981
+ "epoch": 0.3414217778319717,
9982
+ "eval_loss": 2.505732774734497,
9983
+ "eval_ppl": 12.25253,
9984
+ "eval_runtime": 6.0458,
9985
+ "eval_samples_per_second": 33.081,
9986
+ "eval_steps_per_second": 16.541,
9987
+ "memory/device_reserved (GiB)": 37.6,
9988
+ "memory/max_active (GiB)": 11.76,
9989
+ "memory/max_allocated (GiB)": 11.76,
9990
+ "step": 700
9991
  }
9992
  ],
9993
  "logging_steps": 1,
 
10007
  "attributes": {}
10008
  }
10009
  },
10010
+ "total_flos": 1.201690148756521e+17,
10011
  "train_batch_size": 2,
10012
  "trial_name": null,
10013
  "trial_params": null