AiAF commited on
Commit
ed45871
·
verified ·
1 Parent(s): 9a07e06

Training in progress, step 850, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd804fe5a6a07ca92c0d9df3ee8901a99a952af466c85b5d67804f3b9b5754fc
3
  size 102264160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad29cff1b863587cbb2ca948354cb20133cf91efb3ab95cc9e09274cb6bcac5b
3
  size 102264160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cc0bed6cff1a4618fb4cd1381e691366f8ad28f8182c56da1f0df2fb19366078
3
  size 52162827
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99e35caf9f22b9501f8794b7071db015d9c5f1cc2081e5e6b308b86d01258be1
3
  size 52162827
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f05bb1ddd76152fd645931407e88adee7bc96ff7799e0d5b2faef63c077f8ed
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e394ddf37d3569e21dd7164d17df1486101a840dc12b8080abbcaca06573e244
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c0f6da37afd2d18fa5e85c27927c29b3e2c21ee39c49983ca41ec400e0b2cd5
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b233ed6e5d634209b3aa9991eded2c9aa4b12fa1b2fb73e19124dd488ff69f21
3
  size 1465
last-checkpoint/tokens_state.json CHANGED
@@ -1 +1 @@
1
- {"total": 11163776, "trainable": 4620168}
 
1
+ {"total": 11857664, "trainable": 4907297}
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.39019631752225337,
6
  "eval_steps": 50,
7
- "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11412,6 +11412,718 @@
11412
  "memory/max_active (GiB)": 11.76,
11413
  "memory/max_allocated (GiB)": 11.76,
11414
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11415
  }
11416
  ],
11417
  "logging_steps": 1,
@@ -11431,7 +12143,7 @@
11431
  "attributes": {}
11432
  }
11433
  },
11434
- "total_flos": 1.3731959764176077e+17,
11435
  "train_batch_size": 2,
11436
  "trial_name": null,
11437
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4145835873673942,
6
  "eval_steps": 50,
7
+ "global_step": 850,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11412
  "memory/max_active (GiB)": 11.76,
11413
  "memory/max_allocated (GiB)": 11.76,
11414
  "step": 800
11415
+ },
11416
+ {
11417
+ "epoch": 0.3906840629191562,
11418
+ "grad_norm": 0.17632552981376648,
11419
+ "learning_rate": 2.025571894372794e-05,
11420
+ "loss": 2.419203281402588,
11421
+ "memory/device_reserved (GiB)": 27.47,
11422
+ "memory/max_active (GiB)": 16.51,
11423
+ "memory/max_allocated (GiB)": 16.51,
11424
+ "ppl": 11.2369,
11425
+ "step": 801,
11426
+ "tokens/total": 11177856,
11427
+ "tokens/train_per_sec_per_gpu": 2251.92,
11428
+ "tokens/trainable": 4624735
11429
+ },
11430
+ {
11431
+ "epoch": 0.391171808316059,
11432
+ "grad_norm": 0.1677611917257309,
11433
+ "learning_rate": 2.0060712799926408e-05,
11434
+ "loss": 2.390589475631714,
11435
+ "memory/device_reserved (GiB)": 27.47,
11436
+ "memory/max_active (GiB)": 16.07,
11437
+ "memory/max_allocated (GiB)": 16.07,
11438
+ "ppl": 10.91993,
11439
+ "step": 802,
11440
+ "tokens/total": 11191808,
11441
+ "tokens/train_per_sec_per_gpu": 1610.5,
11442
+ "tokens/trainable": 4629319
11443
+ },
11444
+ {
11445
+ "epoch": 0.39165955371296185,
11446
+ "grad_norm": 0.14507685601711273,
11447
+ "learning_rate": 1.9866545181421013e-05,
11448
+ "loss": 2.7266347408294678,
11449
+ "memory/device_reserved (GiB)": 27.47,
11450
+ "memory/max_active (GiB)": 16.07,
11451
+ "memory/max_allocated (GiB)": 16.07,
11452
+ "ppl": 15.28137,
11453
+ "step": 803,
11454
+ "tokens/total": 11205120,
11455
+ "tokens/train_per_sec_per_gpu": 201.89,
11456
+ "tokens/trainable": 4636143
11457
+ },
11458
+ {
11459
+ "epoch": 0.3921472991098647,
11460
+ "grad_norm": 0.14650315046310425,
11461
+ "learning_rate": 1.967321812493813e-05,
11462
+ "loss": 2.496601104736328,
11463
+ "memory/device_reserved (GiB)": 27.47,
11464
+ "memory/max_active (GiB)": 15.63,
11465
+ "memory/max_allocated (GiB)": 15.63,
11466
+ "ppl": 12.14116,
11467
+ "step": 804,
11468
+ "tokens/total": 11217920,
11469
+ "tokens/train_per_sec_per_gpu": 1516.0,
11470
+ "tokens/trainable": 4641943
11471
+ },
11472
+ {
11473
+ "epoch": 0.39263504450676745,
11474
+ "grad_norm": 0.2653907835483551,
11475
+ "learning_rate": 1.9480733658387175e-05,
11476
+ "loss": 2.582747220993042,
11477
+ "memory/device_reserved (GiB)": 27.47,
11478
+ "memory/max_active (GiB)": 15.53,
11479
+ "memory/max_allocated (GiB)": 15.53,
11480
+ "ppl": 13.23344,
11481
+ "step": 805,
11482
+ "tokens/total": 11231872,
11483
+ "tokens/train_per_sec_per_gpu": 701.31,
11484
+ "tokens/trainable": 4643824
11485
+ },
11486
+ {
11487
+ "epoch": 0.3931227899036703,
11488
+ "grad_norm": 0.1440833956003189,
11489
+ "learning_rate": 1.9289093800839066e-05,
11490
+ "loss": 2.470148801803589,
11491
+ "memory/device_reserved (GiB)": 27.47,
11492
+ "memory/max_active (GiB)": 15.63,
11493
+ "memory/max_allocated (GiB)": 15.63,
11494
+ "ppl": 11.82421,
11495
+ "step": 806,
11496
+ "tokens/total": 11245824,
11497
+ "tokens/train_per_sec_per_gpu": 3396.95,
11498
+ "tokens/trainable": 4650548
11499
+ },
11500
+ {
11501
+ "epoch": 0.3936105353005731,
11502
+ "grad_norm": 0.1409664899110794,
11503
+ "learning_rate": 1.9098300562505266e-05,
11504
+ "loss": 2.7668747901916504,
11505
+ "memory/device_reserved (GiB)": 27.47,
11506
+ "memory/max_active (GiB)": 15.98,
11507
+ "memory/max_allocated (GiB)": 15.98,
11508
+ "ppl": 15.90884,
11509
+ "step": 807,
11510
+ "tokens/total": 11259648,
11511
+ "tokens/train_per_sec_per_gpu": 1139.14,
11512
+ "tokens/trainable": 4657579
11513
+ },
11514
+ {
11515
+ "epoch": 0.39409828069747593,
11516
+ "grad_norm": 0.1468340903520584,
11517
+ "learning_rate": 1.8908355944716517e-05,
11518
+ "loss": 2.6180667877197266,
11519
+ "memory/device_reserved (GiB)": 27.47,
11520
+ "memory/max_active (GiB)": 15.98,
11521
+ "memory/max_allocated (GiB)": 15.98,
11522
+ "ppl": 13.7092,
11523
+ "step": 808,
11524
+ "tokens/total": 11274240,
11525
+ "tokens/train_per_sec_per_gpu": 1312.1,
11526
+ "tokens/trainable": 4664365
11527
+ },
11528
+ {
11529
+ "epoch": 0.39458602609437876,
11530
+ "grad_norm": 0.1398187279701233,
11531
+ "learning_rate": 1.871926193990202e-05,
11532
+ "loss": 2.5571842193603516,
11533
+ "memory/device_reserved (GiB)": 27.47,
11534
+ "memory/max_active (GiB)": 16.51,
11535
+ "memory/max_allocated (GiB)": 16.51,
11536
+ "ppl": 12.89944,
11537
+ "step": 809,
11538
+ "tokens/total": 11287296,
11539
+ "tokens/train_per_sec_per_gpu": 1438.48,
11540
+ "tokens/trainable": 4671448
11541
+ },
11542
+ {
11543
+ "epoch": 0.39507377149128153,
11544
+ "grad_norm": 0.13157154619693756,
11545
+ "learning_rate": 1.8531020531568378e-05,
11546
+ "loss": 2.4374163150787354,
11547
+ "memory/device_reserved (GiB)": 27.47,
11548
+ "memory/max_active (GiB)": 16.51,
11549
+ "memory/max_allocated (GiB)": 16.51,
11550
+ "ppl": 11.44344,
11551
+ "step": 810,
11552
+ "tokens/total": 11303296,
11553
+ "tokens/train_per_sec_per_gpu": 1925.09,
11554
+ "tokens/trainable": 4679027
11555
+ },
11556
+ {
11557
+ "epoch": 0.39556151688818436,
11558
+ "grad_norm": 0.1602177768945694,
11559
+ "learning_rate": 1.8343633694278895e-05,
11560
+ "loss": 2.5065877437591553,
11561
+ "memory/device_reserved (GiB)": 27.47,
11562
+ "memory/max_active (GiB)": 16.51,
11563
+ "memory/max_allocated (GiB)": 16.51,
11564
+ "ppl": 12.26301,
11565
+ "step": 811,
11566
+ "tokens/total": 11317120,
11567
+ "tokens/train_per_sec_per_gpu": 2074.92,
11568
+ "tokens/trainable": 4684195
11569
+ },
11570
+ {
11571
+ "epoch": 0.3960492622850872,
11572
+ "grad_norm": 0.17014168202877045,
11573
+ "learning_rate": 1.8157103393632868e-05,
11574
+ "loss": 2.4969608783721924,
11575
+ "memory/device_reserved (GiB)": 27.47,
11576
+ "memory/max_active (GiB)": 16.51,
11577
+ "memory/max_allocated (GiB)": 16.51,
11578
+ "ppl": 12.14553,
11579
+ "step": 812,
11580
+ "tokens/total": 11331712,
11581
+ "tokens/train_per_sec_per_gpu": 1899.17,
11582
+ "tokens/trainable": 4688512
11583
+ },
11584
+ {
11585
+ "epoch": 0.39653700768199,
11586
+ "grad_norm": 0.15981672704219818,
11587
+ "learning_rate": 1.7971431586244815e-05,
11588
+ "loss": 2.3524038791656494,
11589
+ "memory/device_reserved (GiB)": 27.47,
11590
+ "memory/max_active (GiB)": 13.86,
11591
+ "memory/max_allocated (GiB)": 13.86,
11592
+ "ppl": 10.51081,
11593
+ "step": 813,
11594
+ "tokens/total": 11344256,
11595
+ "tokens/train_per_sec_per_gpu": 2253.56,
11596
+ "tokens/trainable": 4693239
11597
+ },
11598
+ {
11599
+ "epoch": 0.39702475307889284,
11600
+ "grad_norm": 0.1451166570186615,
11601
+ "learning_rate": 1.7786620219724204e-05,
11602
+ "loss": 2.3406598567962646,
11603
+ "memory/device_reserved (GiB)": 27.47,
11604
+ "memory/max_active (GiB)": 16.42,
11605
+ "memory/max_allocated (GiB)": 16.42,
11606
+ "ppl": 10.38809,
11607
+ "step": 814,
11608
+ "tokens/total": 11359104,
11609
+ "tokens/train_per_sec_per_gpu": 797.01,
11610
+ "tokens/trainable": 4699549
11611
+ },
11612
+ {
11613
+ "epoch": 0.3975124984757956,
11614
+ "grad_norm": 0.184647798538208,
11615
+ "learning_rate": 1.7602671232654754e-05,
11616
+ "loss": 2.687480926513672,
11617
+ "memory/device_reserved (GiB)": 27.47,
11618
+ "memory/max_active (GiB)": 16.51,
11619
+ "memory/max_allocated (GiB)": 16.51,
11620
+ "ppl": 14.69461,
11621
+ "step": 815,
11622
+ "tokens/total": 11373568,
11623
+ "tokens/train_per_sec_per_gpu": 1141.17,
11624
+ "tokens/trainable": 4703613
11625
+ },
11626
+ {
11627
+ "epoch": 0.39800024387269844,
11628
+ "grad_norm": 0.1620160937309265,
11629
+ "learning_rate": 1.741958655457436e-05,
11630
+ "loss": 2.4154233932495117,
11631
+ "memory/device_reserved (GiB)": 27.47,
11632
+ "memory/max_active (GiB)": 16.07,
11633
+ "memory/max_allocated (GiB)": 16.07,
11634
+ "ppl": 11.19451,
11635
+ "step": 816,
11636
+ "tokens/total": 11385600,
11637
+ "tokens/train_per_sec_per_gpu": 87.94,
11638
+ "tokens/trainable": 4708168
11639
+ },
11640
+ {
11641
+ "epoch": 0.3984879892696013,
11642
+ "grad_norm": 0.15860387682914734,
11643
+ "learning_rate": 1.723736810595461e-05,
11644
+ "loss": 2.539144992828369,
11645
+ "memory/device_reserved (GiB)": 27.47,
11646
+ "memory/max_active (GiB)": 16.51,
11647
+ "memory/max_allocated (GiB)": 16.51,
11648
+ "ppl": 12.66883,
11649
+ "step": 817,
11650
+ "tokens/total": 11399680,
11651
+ "tokens/train_per_sec_per_gpu": 1662.93,
11652
+ "tokens/trainable": 4713327
11653
+ },
11654
+ {
11655
+ "epoch": 0.3989757346665041,
11656
+ "grad_norm": 0.14269250631332397,
11657
+ "learning_rate": 1.7056017798180824e-05,
11658
+ "loss": 2.400291919708252,
11659
+ "memory/device_reserved (GiB)": 27.47,
11660
+ "memory/max_active (GiB)": 16.42,
11661
+ "memory/max_allocated (GiB)": 16.42,
11662
+ "ppl": 11.02639,
11663
+ "step": 818,
11664
+ "tokens/total": 11415040,
11665
+ "tokens/train_per_sec_per_gpu": 1938.98,
11666
+ "tokens/trainable": 4720448
11667
+ },
11668
+ {
11669
+ "epoch": 0.39946348006340693,
11670
+ "grad_norm": 0.182223379611969,
11671
+ "learning_rate": 1.6875537533531948e-05,
11672
+ "loss": 2.5135679244995117,
11673
+ "memory/device_reserved (GiB)": 27.47,
11674
+ "memory/max_active (GiB)": 16.51,
11675
+ "memory/max_allocated (GiB)": 16.51,
11676
+ "ppl": 12.34891,
11677
+ "step": 819,
11678
+ "tokens/total": 11428480,
11679
+ "tokens/train_per_sec_per_gpu": 2430.12,
11680
+ "tokens/trainable": 4724427
11681
+ },
11682
+ {
11683
+ "epoch": 0.3999512254603097,
11684
+ "grad_norm": 0.15434423089027405,
11685
+ "learning_rate": 1.6695929205160487e-05,
11686
+ "loss": 2.6116271018981934,
11687
+ "memory/device_reserved (GiB)": 27.47,
11688
+ "memory/max_active (GiB)": 16.51,
11689
+ "memory/max_allocated (GiB)": 16.51,
11690
+ "ppl": 13.6212,
11691
+ "step": 820,
11692
+ "tokens/total": 11443200,
11693
+ "tokens/train_per_sec_per_gpu": 2846.47,
11694
+ "tokens/trainable": 4730440
11695
+ },
11696
+ {
11697
+ "epoch": 0.40043897085721253,
11698
+ "grad_norm": 0.14820340275764465,
11699
+ "learning_rate": 1.65171946970729e-05,
11700
+ "loss": 2.5509421825408936,
11701
+ "memory/device_reserved (GiB)": 27.47,
11702
+ "memory/max_active (GiB)": 16.51,
11703
+ "memory/max_allocated (GiB)": 16.51,
11704
+ "ppl": 12.81918,
11705
+ "step": 821,
11706
+ "tokens/total": 11458048,
11707
+ "tokens/train_per_sec_per_gpu": 2417.91,
11708
+ "tokens/trainable": 4737120
11709
+ },
11710
+ {
11711
+ "epoch": 0.40092671625411536,
11712
+ "grad_norm": 0.17228034138679504,
11713
+ "learning_rate": 1.6339335884109518e-05,
11714
+ "loss": 2.514219284057617,
11715
+ "memory/device_reserved (GiB)": 27.47,
11716
+ "memory/max_active (GiB)": 15.63,
11717
+ "memory/max_allocated (GiB)": 15.63,
11718
+ "ppl": 12.35696,
11719
+ "step": 822,
11720
+ "tokens/total": 11470848,
11721
+ "tokens/train_per_sec_per_gpu": 1171.68,
11722
+ "tokens/trainable": 4743584
11723
+ },
11724
+ {
11725
+ "epoch": 0.4014144616510182,
11726
+ "grad_norm": 0.14200446009635925,
11727
+ "learning_rate": 1.6162354631925204e-05,
11728
+ "loss": 2.7033231258392334,
11729
+ "memory/device_reserved (GiB)": 27.47,
11730
+ "memory/max_active (GiB)": 15.53,
11731
+ "memory/max_allocated (GiB)": 15.53,
11732
+ "ppl": 14.92926,
11733
+ "step": 823,
11734
+ "tokens/total": 11484160,
11735
+ "tokens/train_per_sec_per_gpu": 1981.38,
11736
+ "tokens/trainable": 4750111
11737
+ },
11738
+ {
11739
+ "epoch": 0.401902207047921,
11740
+ "grad_norm": 0.1785208135843277,
11741
+ "learning_rate": 1.598625279696948e-05,
11742
+ "loss": 2.621516704559326,
11743
+ "memory/device_reserved (GiB)": 27.47,
11744
+ "memory/max_active (GiB)": 16.07,
11745
+ "memory/max_allocated (GiB)": 16.07,
11746
+ "ppl": 13.75657,
11747
+ "step": 824,
11748
+ "tokens/total": 11497728,
11749
+ "tokens/train_per_sec_per_gpu": 2868.11,
11750
+ "tokens/trainable": 4754852
11751
+ },
11752
+ {
11753
+ "epoch": 0.4023899524448238,
11754
+ "grad_norm": 0.15656448900699615,
11755
+ "learning_rate": 1.5811032226467305e-05,
11756
+ "loss": 2.681117534637451,
11757
+ "memory/device_reserved (GiB)": 27.47,
11758
+ "memory/max_active (GiB)": 16.42,
11759
+ "memory/max_allocated (GiB)": 16.42,
11760
+ "ppl": 14.6014,
11761
+ "step": 825,
11762
+ "tokens/total": 11511808,
11763
+ "tokens/train_per_sec_per_gpu": 1996.95,
11764
+ "tokens/trainable": 4761473
11765
+ },
11766
+ {
11767
+ "epoch": 0.4028776978417266,
11768
+ "grad_norm": 0.13972437381744385,
11769
+ "learning_rate": 1.563669475839956e-05,
11770
+ "loss": 2.4459619522094727,
11771
+ "memory/device_reserved (GiB)": 27.47,
11772
+ "memory/max_active (GiB)": 16.51,
11773
+ "memory/max_allocated (GiB)": 16.51,
11774
+ "ppl": 11.54165,
11775
+ "step": 826,
11776
+ "tokens/total": 11525376,
11777
+ "tokens/train_per_sec_per_gpu": 1750.28,
11778
+ "tokens/trainable": 4768003
11779
+ },
11780
+ {
11781
+ "epoch": 0.40336544323862944,
11782
+ "grad_norm": 0.1425899863243103,
11783
+ "learning_rate": 1.5463242221483743e-05,
11784
+ "loss": 2.4396560192108154,
11785
+ "memory/device_reserved (GiB)": 27.47,
11786
+ "memory/max_active (GiB)": 16.51,
11787
+ "memory/max_allocated (GiB)": 16.51,
11788
+ "ppl": 11.46909,
11789
+ "step": 827,
11790
+ "tokens/total": 11540352,
11791
+ "tokens/train_per_sec_per_gpu": 3054.08,
11792
+ "tokens/trainable": 4774058
11793
+ },
11794
+ {
11795
+ "epoch": 0.40385318863553227,
11796
+ "grad_norm": 0.1668749898672104,
11797
+ "learning_rate": 1.529067643515495e-05,
11798
+ "loss": 2.672379493713379,
11799
+ "memory/device_reserved (GiB)": 27.47,
11800
+ "memory/max_active (GiB)": 16.51,
11801
+ "memory/max_allocated (GiB)": 16.51,
11802
+ "ppl": 14.47437,
11803
+ "step": 828,
11804
+ "tokens/total": 11554304,
11805
+ "tokens/train_per_sec_per_gpu": 1615.7,
11806
+ "tokens/trainable": 4779147
11807
+ },
11808
+ {
11809
+ "epoch": 0.4043409340324351,
11810
+ "grad_norm": 0.1699647754430771,
11811
+ "learning_rate": 1.5118999209546559e-05,
11812
+ "loss": 2.7491025924682617,
11813
+ "memory/device_reserved (GiB)": 27.47,
11814
+ "memory/max_active (GiB)": 16.51,
11815
+ "memory/max_allocated (GiB)": 16.51,
11816
+ "ppl": 15.6286,
11817
+ "step": 829,
11818
+ "tokens/total": 11568000,
11819
+ "tokens/train_per_sec_per_gpu": 2331.87,
11820
+ "tokens/trainable": 4785120
11821
+ },
11822
+ {
11823
+ "epoch": 0.40482867942933787,
11824
+ "grad_norm": 0.1574130356311798,
11825
+ "learning_rate": 1.4948212345471491e-05,
11826
+ "loss": 2.519521713256836,
11827
+ "memory/device_reserved (GiB)": 27.47,
11828
+ "memory/max_active (GiB)": 16.07,
11829
+ "memory/max_allocated (GiB)": 16.07,
11830
+ "ppl": 12.42265,
11831
+ "step": 830,
11832
+ "tokens/total": 11583360,
11833
+ "tokens/train_per_sec_per_gpu": 1494.79,
11834
+ "tokens/trainable": 4790591
11835
+ },
11836
+ {
11837
+ "epoch": 0.4053164248262407,
11838
+ "grad_norm": 0.12484201788902283,
11839
+ "learning_rate": 1.4778317634403083e-05,
11840
+ "loss": 2.391390800476074,
11841
+ "memory/device_reserved (GiB)": 27.47,
11842
+ "memory/max_active (GiB)": 16.51,
11843
+ "memory/max_allocated (GiB)": 16.51,
11844
+ "ppl": 10.92868,
11845
+ "step": 831,
11846
+ "tokens/total": 11598080,
11847
+ "tokens/train_per_sec_per_gpu": 4005.3,
11848
+ "tokens/trainable": 4799487
11849
+ },
11850
+ {
11851
+ "epoch": 0.4058041702231435,
11852
+ "grad_norm": 0.21536649763584137,
11853
+ "learning_rate": 1.460931685845649e-05,
11854
+ "loss": 2.217477560043335,
11855
+ "memory/device_reserved (GiB)": 27.47,
11856
+ "memory/max_active (GiB)": 15.98,
11857
+ "memory/max_allocated (GiB)": 15.98,
11858
+ "ppl": 9.18414,
11859
+ "step": 832,
11860
+ "tokens/total": 11610496,
11861
+ "tokens/train_per_sec_per_gpu": 2295.79,
11862
+ "tokens/trainable": 4802257
11863
+ },
11864
+ {
11865
+ "epoch": 0.40629191562004635,
11866
+ "grad_norm": 0.1689203679561615,
11867
+ "learning_rate": 1.444121179036989e-05,
11868
+ "loss": 2.681854724884033,
11869
+ "memory/device_reserved (GiB)": 27.47,
11870
+ "memory/max_active (GiB)": 15.53,
11871
+ "memory/max_allocated (GiB)": 15.53,
11872
+ "ppl": 14.61217,
11873
+ "step": 833,
11874
+ "tokens/total": 11622784,
11875
+ "tokens/train_per_sec_per_gpu": 1075.61,
11876
+ "tokens/trainable": 4807534
11877
+ },
11878
+ {
11879
+ "epoch": 0.4067796610169492,
11880
+ "grad_norm": 0.16477236151695251,
11881
+ "learning_rate": 1.427400419348588e-05,
11882
+ "loss": 2.518036127090454,
11883
+ "memory/device_reserved (GiB)": 27.47,
11884
+ "memory/max_active (GiB)": 15.63,
11885
+ "memory/max_allocated (GiB)": 15.63,
11886
+ "ppl": 12.40421,
11887
+ "step": 834,
11888
+ "tokens/total": 11637248,
11889
+ "tokens/train_per_sec_per_gpu": 2330.01,
11890
+ "tokens/trainable": 4812618
11891
+ },
11892
+ {
11893
+ "epoch": 0.40726740641385195,
11894
+ "grad_norm": 0.15567028522491455,
11895
+ "learning_rate": 1.4107695821733025e-05,
11896
+ "loss": 2.579047203063965,
11897
+ "memory/device_reserved (GiB)": 27.47,
11898
+ "memory/max_active (GiB)": 16.07,
11899
+ "memory/max_allocated (GiB)": 16.07,
11900
+ "ppl": 13.18457,
11901
+ "step": 835,
11902
+ "tokens/total": 11651072,
11903
+ "tokens/train_per_sec_per_gpu": 2918.59,
11904
+ "tokens/trainable": 4818469
11905
+ },
11906
+ {
11907
+ "epoch": 0.4077551518107548,
11908
+ "grad_norm": 0.15685085952281952,
11909
+ "learning_rate": 1.3942288419607475e-05,
11910
+ "loss": 2.431553840637207,
11911
+ "memory/device_reserved (GiB)": 27.47,
11912
+ "memory/max_active (GiB)": 16.51,
11913
+ "memory/max_allocated (GiB)": 16.51,
11914
+ "ppl": 11.37655,
11915
+ "step": 836,
11916
+ "tokens/total": 11665408,
11917
+ "tokens/train_per_sec_per_gpu": 2168.92,
11918
+ "tokens/trainable": 4823738
11919
+ },
11920
+ {
11921
+ "epoch": 0.4082428972076576,
11922
+ "grad_norm": 0.19555719196796417,
11923
+ "learning_rate": 1.3777783722154603e-05,
11924
+ "loss": 2.263695478439331,
11925
+ "memory/device_reserved (GiB)": 27.47,
11926
+ "memory/max_active (GiB)": 15.63,
11927
+ "memory/max_allocated (GiB)": 15.63,
11928
+ "ppl": 9.61857,
11929
+ "step": 837,
11930
+ "tokens/total": 11679744,
11931
+ "tokens/train_per_sec_per_gpu": 1541.14,
11932
+ "tokens/trainable": 4826962
11933
+ },
11934
+ {
11935
+ "epoch": 0.40873064260456043,
11936
+ "grad_norm": 0.13367053866386414,
11937
+ "learning_rate": 1.3614183454950824e-05,
11938
+ "loss": 2.4465866088867188,
11939
+ "memory/device_reserved (GiB)": 27.47,
11940
+ "memory/max_active (GiB)": 16.51,
11941
+ "memory/max_allocated (GiB)": 16.51,
11942
+ "ppl": 11.54886,
11943
+ "step": 838,
11944
+ "tokens/total": 11693952,
11945
+ "tokens/train_per_sec_per_gpu": 1674.24,
11946
+ "tokens/trainable": 4834337
11947
+ },
11948
+ {
11949
+ "epoch": 0.40921838800146326,
11950
+ "grad_norm": 0.14650499820709229,
11951
+ "learning_rate": 1.3451489334085554e-05,
11952
+ "loss": 2.3801074028015137,
11953
+ "memory/device_reserved (GiB)": 27.47,
11954
+ "memory/max_active (GiB)": 14.74,
11955
+ "memory/max_allocated (GiB)": 14.74,
11956
+ "ppl": 10.80606,
11957
+ "step": 839,
11958
+ "tokens/total": 11707136,
11959
+ "tokens/train_per_sec_per_gpu": 2774.02,
11960
+ "tokens/trainable": 4840420
11961
+ },
11962
+ {
11963
+ "epoch": 0.40970613339836603,
11964
+ "grad_norm": 0.18212567269802094,
11965
+ "learning_rate": 1.3289703066143111e-05,
11966
+ "loss": 2.615509510040283,
11967
+ "memory/device_reserved (GiB)": 27.47,
11968
+ "memory/max_active (GiB)": 16.07,
11969
+ "memory/max_allocated (GiB)": 16.07,
11970
+ "ppl": 13.67418,
11971
+ "step": 840,
11972
+ "tokens/total": 11721600,
11973
+ "tokens/train_per_sec_per_gpu": 2407.64,
11974
+ "tokens/trainable": 4845453
11975
+ },
11976
+ {
11977
+ "epoch": 0.41019387879526886,
11978
+ "grad_norm": 0.1324673295021057,
11979
+ "learning_rate": 1.3128826348184887e-05,
11980
+ "loss": 2.3340201377868652,
11981
+ "memory/device_reserved (GiB)": 27.47,
11982
+ "memory/max_active (GiB)": 16.51,
11983
+ "memory/max_allocated (GiB)": 16.51,
11984
+ "ppl": 10.31934,
11985
+ "step": 841,
11986
+ "tokens/total": 11736832,
11987
+ "tokens/train_per_sec_per_gpu": 1690.93,
11988
+ "tokens/trainable": 4852363
11989
+ },
11990
+ {
11991
+ "epoch": 0.4106816241921717,
11992
+ "grad_norm": 0.1828589141368866,
11993
+ "learning_rate": 1.2968860867731569e-05,
11994
+ "loss": 2.5910964012145996,
11995
+ "memory/device_reserved (GiB)": 27.47,
11996
+ "memory/max_active (GiB)": 15.18,
11997
+ "memory/max_allocated (GiB)": 15.18,
11998
+ "ppl": 13.34439,
11999
+ "step": 842,
12000
+ "tokens/total": 11749760,
12001
+ "tokens/train_per_sec_per_gpu": 2010.81,
12002
+ "tokens/trainable": 4860314
12003
+ },
12004
+ {
12005
+ "epoch": 0.4111693695890745,
12006
+ "grad_norm": 0.17424485087394714,
12007
+ "learning_rate": 1.2809808302745297e-05,
12008
+ "loss": 2.2547149658203125,
12009
+ "memory/device_reserved (GiB)": 27.47,
12010
+ "memory/max_active (GiB)": 14.3,
12011
+ "memory/max_allocated (GiB)": 14.3,
12012
+ "ppl": 9.53258,
12013
+ "step": 843,
12014
+ "tokens/total": 11761536,
12015
+ "tokens/train_per_sec_per_gpu": 1108.13,
12016
+ "tokens/trainable": 4864065
12017
+ },
12018
+ {
12019
+ "epoch": 0.41165711498597735,
12020
+ "grad_norm": 0.1569896787405014,
12021
+ "learning_rate": 1.2651670321612263e-05,
12022
+ "loss": 2.6433074474334717,
12023
+ "memory/device_reserved (GiB)": 27.47,
12024
+ "memory/max_active (GiB)": 16.07,
12025
+ "memory/max_allocated (GiB)": 16.07,
12026
+ "ppl": 14.05963,
12027
+ "step": 844,
12028
+ "tokens/total": 11775104,
12029
+ "tokens/train_per_sec_per_gpu": 1862.26,
12030
+ "tokens/trainable": 4869648
12031
+ },
12032
+ {
12033
+ "epoch": 0.4121448603828801,
12034
+ "grad_norm": 0.1626082807779312,
12035
+ "learning_rate": 1.2494448583125018e-05,
12036
+ "loss": 2.5848560333251953,
12037
+ "memory/device_reserved (GiB)": 27.47,
12038
+ "memory/max_active (GiB)": 14.74,
12039
+ "memory/max_allocated (GiB)": 14.74,
12040
+ "ppl": 13.26138,
12041
+ "step": 845,
12042
+ "tokens/total": 11787136,
12043
+ "tokens/train_per_sec_per_gpu": 3456.79,
12044
+ "tokens/trainable": 4874679
12045
+ },
12046
+ {
12047
+ "epoch": 0.41263260577978295,
12048
+ "grad_norm": 0.13333570957183838,
12049
+ "learning_rate": 1.233814473646524e-05,
12050
+ "loss": 2.450986385345459,
12051
+ "memory/device_reserved (GiB)": 27.47,
12052
+ "memory/max_active (GiB)": 16.42,
12053
+ "memory/max_allocated (GiB)": 16.42,
12054
+ "ppl": 11.59978,
12055
+ "step": 846,
12056
+ "tokens/total": 11802112,
12057
+ "tokens/train_per_sec_per_gpu": 3875.89,
12058
+ "tokens/trainable": 4882554
12059
+ },
12060
+ {
12061
+ "epoch": 0.4131203511766858,
12062
+ "grad_norm": 0.15294967591762543,
12063
+ "learning_rate": 1.218276042118629e-05,
12064
+ "loss": 2.352025032043457,
12065
+ "memory/device_reserved (GiB)": 27.47,
12066
+ "memory/max_active (GiB)": 15.18,
12067
+ "memory/max_allocated (GiB)": 15.18,
12068
+ "ppl": 10.50682,
12069
+ "step": 847,
12070
+ "tokens/total": 11815936,
12071
+ "tokens/train_per_sec_per_gpu": 2607.66,
12072
+ "tokens/trainable": 4888862
12073
+ },
12074
+ {
12075
+ "epoch": 0.4136080965735886,
12076
+ "grad_norm": 0.13894210755825043,
12077
+ "learning_rate": 1.202829726719611e-05,
12078
+ "loss": 2.7392213344573975,
12079
+ "memory/device_reserved (GiB)": 27.47,
12080
+ "memory/max_active (GiB)": 15.18,
12081
+ "memory/max_allocated (GiB)": 15.18,
12082
+ "ppl": 15.47493,
12083
+ "step": 848,
12084
+ "tokens/total": 11828352,
12085
+ "tokens/train_per_sec_per_gpu": 1599.17,
12086
+ "tokens/trainable": 4896381
12087
+ },
12088
+ {
12089
+ "epoch": 0.41409584197049143,
12090
+ "grad_norm": 0.16302239894866943,
12091
+ "learning_rate": 1.1874756894740135e-05,
12092
+ "loss": 2.470451831817627,
12093
+ "memory/device_reserved (GiB)": 27.47,
12094
+ "memory/max_active (GiB)": 16.51,
12095
+ "memory/max_allocated (GiB)": 16.51,
12096
+ "ppl": 11.82779,
12097
+ "step": 849,
12098
+ "tokens/total": 11843712,
12099
+ "tokens/train_per_sec_per_gpu": 1074.34,
12100
+ "tokens/trainable": 4901622
12101
+ },
12102
+ {
12103
+ "epoch": 0.4145835873673942,
12104
+ "grad_norm": 0.149576798081398,
12105
+ "learning_rate": 1.172214091438416e-05,
12106
+ "loss": 2.4945759773254395,
12107
+ "memory/device_reserved (GiB)": 27.47,
12108
+ "memory/max_active (GiB)": 16.51,
12109
+ "memory/max_allocated (GiB)": 16.51,
12110
+ "ppl": 12.11659,
12111
+ "step": 850,
12112
+ "tokens/total": 11857664,
12113
+ "tokens/train_per_sec_per_gpu": 2578.75,
12114
+ "tokens/trainable": 4907297
12115
+ },
12116
+ {
12117
+ "epoch": 0.4145835873673942,
12118
+ "eval_loss": 2.491702079772949,
12119
+ "eval_ppl": 12.08182,
12120
+ "eval_runtime": 6.0732,
12121
+ "eval_samples_per_second": 32.932,
12122
+ "eval_steps_per_second": 16.466,
12123
+ "memory/device_reserved (GiB)": 27.47,
12124
+ "memory/max_active (GiB)": 11.76,
12125
+ "memory/max_allocated (GiB)": 11.76,
12126
+ "step": 850
12127
  }
12128
  ],
12129
  "logging_steps": 1,
 
12143
  "attributes": {}
12144
  }
12145
  },
12146
+ "total_flos": 1.4585474031825715e+17,
12147
  "train_batch_size": 2,
12148
  "trial_name": null,
12149
  "trial_params": null