AiAF commited on
Commit
c86dc89
·
verified ·
1 Parent(s): fadd720

Training in progress, step 900, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad29cff1b863587cbb2ca948354cb20133cf91efb3ab95cc9e09274cb6bcac5b
3
  size 102264160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92825f995b10c587276dcbb59bd1d6ee8e64825522d2dd1211f3d32eb56271e0
3
  size 102264160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99e35caf9f22b9501f8794b7071db015d9c5f1cc2081e5e6b308b86d01258be1
3
  size 52162827
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:310b6425692196f794c4a8c4e6a433c67cf109b9958b71fd97fd7d7987695364
3
  size 52162827
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e394ddf37d3569e21dd7164d17df1486101a840dc12b8080abbcaca06573e244
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e94609f49fe622efb94028eb554b792c6f84218319e3570798403eabc10e0789
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b233ed6e5d634209b3aa9991eded2c9aa4b12fa1b2fb73e19124dd488ff69f21
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fd147b564a8c5ec603af92237fa27a3ed62221eb04924da2d44364eed74d116
3
  size 1465
last-checkpoint/tokens_state.json CHANGED
@@ -1 +1 @@
1
- {"total": 11857664, "trainable": 4907297}
 
1
+ {"total": 12555520, "trainable": 5198878}
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4145835873673942,
6
  "eval_steps": 50,
7
- "global_step": 850,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -12124,6 +12124,718 @@
12124
  "memory/max_active (GiB)": 11.76,
12125
  "memory/max_allocated (GiB)": 11.76,
12126
  "step": 850
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12127
  }
12128
  ],
12129
  "logging_steps": 1,
@@ -12143,7 +12855,7 @@
12143
  "attributes": {}
12144
  }
12145
  },
12146
- "total_flos": 1.4585474031825715e+17,
12147
  "train_batch_size": 2,
12148
  "trial_name": null,
12149
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.43897085721253504,
6
  "eval_steps": 50,
7
+ "global_step": 900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
12124
  "memory/max_active (GiB)": 11.76,
12125
  "memory/max_allocated (GiB)": 11.76,
12126
  "step": 850
12127
+ },
12128
+ {
12129
+ "epoch": 0.41507133276429703,
12130
+ "grad_norm": 0.1435479074716568,
12131
+ "learning_rate": 1.1570450926997655e-05,
12132
+ "loss": 2.4883711338043213,
12133
+ "memory/device_reserved (GiB)": 25.22,
12134
+ "memory/max_active (GiB)": 15.63,
12135
+ "memory/max_allocated (GiB)": 15.63,
12136
+ "ppl": 12.04165,
12137
+ "step": 851,
12138
+ "tokens/total": 11870720,
12139
+ "tokens/train_per_sec_per_gpu": 3333.56,
12140
+ "tokens/trainable": 4914049
12141
+ },
12142
+ {
12143
+ "epoch": 0.41555907816119986,
12144
+ "grad_norm": 0.1966681033372879,
12145
+ "learning_rate": 1.141968852373676e-05,
12146
+ "loss": 2.431966781616211,
12147
+ "memory/device_reserved (GiB)": 25.23,
12148
+ "memory/max_active (GiB)": 15.63,
12149
+ "memory/max_allocated (GiB)": 15.63,
12150
+ "ppl": 11.38124,
12151
+ "step": 852,
12152
+ "tokens/total": 11884800,
12153
+ "tokens/train_per_sec_per_gpu": 2765.34,
12154
+ "tokens/trainable": 4917836
12155
+ },
12156
+ {
12157
+ "epoch": 0.4160468235581027,
12158
+ "grad_norm": 0.1497463434934616,
12159
+ "learning_rate": 1.1269855286027797e-05,
12160
+ "loss": 2.442220687866211,
12161
+ "memory/device_reserved (GiB)": 36.95,
12162
+ "memory/max_active (GiB)": 16.51,
12163
+ "memory/max_allocated (GiB)": 16.51,
12164
+ "ppl": 11.49855,
12165
+ "step": 853,
12166
+ "tokens/total": 11898240,
12167
+ "tokens/train_per_sec_per_gpu": 653.89,
12168
+ "tokens/trainable": 4923467
12169
+ },
12170
+ {
12171
+ "epoch": 0.4165345689550055,
12172
+ "grad_norm": 0.1307368278503418,
12173
+ "learning_rate": 1.1120952785550476e-05,
12174
+ "loss": 2.2968218326568604,
12175
+ "memory/device_reserved (GiB)": 36.95,
12176
+ "memory/max_active (GiB)": 16.51,
12177
+ "memory/max_allocated (GiB)": 16.51,
12178
+ "ppl": 9.94253,
12179
+ "step": 854,
12180
+ "tokens/total": 11913984,
12181
+ "tokens/train_per_sec_per_gpu": 2660.99,
12182
+ "tokens/trainable": 4931210
12183
+ },
12184
+ {
12185
+ "epoch": 0.4170223143519083,
12186
+ "grad_norm": 0.1639799326658249,
12187
+ "learning_rate": 1.0972982584221592e-05,
12188
+ "loss": 2.3355393409729004,
12189
+ "memory/device_reserved (GiB)": 36.95,
12190
+ "memory/max_active (GiB)": 16.42,
12191
+ "memory/max_allocated (GiB)": 16.42,
12192
+ "ppl": 10.33503,
12193
+ "step": 855,
12194
+ "tokens/total": 11929088,
12195
+ "tokens/train_per_sec_per_gpu": 1326.43,
12196
+ "tokens/trainable": 4937919
12197
+ },
12198
+ {
12199
+ "epoch": 0.4175100597488111,
12200
+ "grad_norm": 0.14141331613063812,
12201
+ "learning_rate": 1.0825946234178574e-05,
12202
+ "loss": 2.449113607406616,
12203
+ "memory/device_reserved (GiB)": 36.95,
12204
+ "memory/max_active (GiB)": 16.07,
12205
+ "memory/max_allocated (GiB)": 16.07,
12206
+ "ppl": 11.57808,
12207
+ "step": 856,
12208
+ "tokens/total": 11944320,
12209
+ "tokens/train_per_sec_per_gpu": 1926.65,
12210
+ "tokens/trainable": 4944515
12211
+ },
12212
+ {
12213
+ "epoch": 0.41799780514571394,
12214
+ "grad_norm": 0.17068256437778473,
12215
+ "learning_rate": 1.067984527776309e-05,
12216
+ "loss": 2.702491521835327,
12217
+ "memory/device_reserved (GiB)": 36.95,
12218
+ "memory/max_active (GiB)": 15.63,
12219
+ "memory/max_allocated (GiB)": 15.63,
12220
+ "ppl": 14.91685,
12221
+ "step": 857,
12222
+ "tokens/total": 11958016,
12223
+ "tokens/train_per_sec_per_gpu": 1277.78,
12224
+ "tokens/trainable": 4949764
12225
+ },
12226
+ {
12227
+ "epoch": 0.41848555054261677,
12228
+ "grad_norm": 0.199791818857193,
12229
+ "learning_rate": 1.0534681247505106e-05,
12230
+ "loss": 2.603161573410034,
12231
+ "memory/device_reserved (GiB)": 36.95,
12232
+ "memory/max_active (GiB)": 16.07,
12233
+ "memory/max_allocated (GiB)": 16.07,
12234
+ "ppl": 13.50637,
12235
+ "step": 858,
12236
+ "tokens/total": 11970304,
12237
+ "tokens/train_per_sec_per_gpu": 1420.03,
12238
+ "tokens/trainable": 4953574
12239
+ },
12240
+ {
12241
+ "epoch": 0.4189732959395196,
12242
+ "grad_norm": 0.1493324339389801,
12243
+ "learning_rate": 1.0390455666106547e-05,
12244
+ "loss": 2.5382094383239746,
12245
+ "memory/device_reserved (GiB)": 36.95,
12246
+ "memory/max_active (GiB)": 16.51,
12247
+ "memory/max_allocated (GiB)": 16.51,
12248
+ "ppl": 12.65699,
12249
+ "step": 859,
12250
+ "tokens/total": 11985920,
12251
+ "tokens/train_per_sec_per_gpu": 2890.82,
12252
+ "tokens/trainable": 4960025
12253
+ },
12254
+ {
12255
+ "epoch": 0.41946104133642237,
12256
+ "grad_norm": 0.1596572995185852,
12257
+ "learning_rate": 1.024717004642557e-05,
12258
+ "loss": 2.5015199184417725,
12259
+ "memory/device_reserved (GiB)": 36.95,
12260
+ "memory/max_active (GiB)": 15.09,
12261
+ "memory/max_allocated (GiB)": 15.09,
12262
+ "ppl": 12.20102,
12263
+ "step": 860,
12264
+ "tokens/total": 12000256,
12265
+ "tokens/train_per_sec_per_gpu": 1781.45,
12266
+ "tokens/trainable": 4965371
12267
+ },
12268
+ {
12269
+ "epoch": 0.4199487867333252,
12270
+ "grad_norm": 0.16531601548194885,
12271
+ "learning_rate": 1.010482589146048e-05,
12272
+ "loss": 2.519444704055786,
12273
+ "memory/device_reserved (GiB)": 36.95,
12274
+ "memory/max_active (GiB)": 16.51,
12275
+ "memory/max_allocated (GiB)": 16.51,
12276
+ "ppl": 12.4217,
12277
+ "step": 861,
12278
+ "tokens/total": 12014592,
12279
+ "tokens/train_per_sec_per_gpu": 2384.76,
12280
+ "tokens/trainable": 4970292
12281
+ },
12282
+ {
12283
+ "epoch": 0.420436532130228,
12284
+ "grad_norm": 0.15156078338623047,
12285
+ "learning_rate": 9.963424694334122e-06,
12286
+ "loss": 2.636232376098633,
12287
+ "memory/device_reserved (GiB)": 36.95,
12288
+ "memory/max_active (GiB)": 16.51,
12289
+ "memory/max_allocated (GiB)": 16.51,
12290
+ "ppl": 13.96051,
12291
+ "step": 862,
12292
+ "tokens/total": 12027520,
12293
+ "tokens/train_per_sec_per_gpu": 1668.48,
12294
+ "tokens/trainable": 4977083
12295
+ },
12296
+ {
12297
+ "epoch": 0.42092427752713085,
12298
+ "grad_norm": 0.15729570388793945,
12299
+ "learning_rate": 9.822967938278171e-06,
12300
+ "loss": 2.6120800971984863,
12301
+ "memory/device_reserved (GiB)": 36.95,
12302
+ "memory/max_active (GiB)": 16.07,
12303
+ "memory/max_allocated (GiB)": 16.07,
12304
+ "ppl": 13.62737,
12305
+ "step": 863,
12306
+ "tokens/total": 12040960,
12307
+ "tokens/train_per_sec_per_gpu": 1881.0,
12308
+ "tokens/trainable": 4982959
12309
+ },
12310
+ {
12311
+ "epoch": 0.4214120229240337,
12312
+ "grad_norm": 0.16363853216171265,
12313
+ "learning_rate": 9.683457096617488e-06,
12314
+ "loss": 2.4688925743103027,
12315
+ "memory/device_reserved (GiB)": 36.95,
12316
+ "memory/max_active (GiB)": 16.51,
12317
+ "memory/max_allocated (GiB)": 16.51,
12318
+ "ppl": 11.80936,
12319
+ "step": 864,
12320
+ "tokens/total": 12053760,
12321
+ "tokens/train_per_sec_per_gpu": 1967.45,
12322
+ "tokens/trainable": 4987799
12323
+ },
12324
+ {
12325
+ "epoch": 0.42189976832093645,
12326
+ "grad_norm": 0.14668720960617065,
12327
+ "learning_rate": 9.544893632754814e-06,
12328
+ "loss": 2.505845546722412,
12329
+ "memory/device_reserved (GiB)": 36.95,
12330
+ "memory/max_active (GiB)": 15.63,
12331
+ "memory/max_allocated (GiB)": 15.63,
12332
+ "ppl": 12.25392,
12333
+ "step": 865,
12334
+ "tokens/total": 12067072,
12335
+ "tokens/train_per_sec_per_gpu": 1906.6,
12336
+ "tokens/trainable": 4994507
12337
+ },
12338
+ {
12339
+ "epoch": 0.4223875137178393,
12340
+ "grad_norm": 0.16504798829555511,
12341
+ "learning_rate": 9.407279000155312e-06,
12342
+ "loss": 2.658405303955078,
12343
+ "memory/device_reserved (GiB)": 36.95,
12344
+ "memory/max_active (GiB)": 16.51,
12345
+ "memory/max_allocated (GiB)": 16.51,
12346
+ "ppl": 14.27351,
12347
+ "step": 866,
12348
+ "tokens/total": 12080384,
12349
+ "tokens/train_per_sec_per_gpu": 341.69,
12350
+ "tokens/trainable": 4999747
12351
+ },
12352
+ {
12353
+ "epoch": 0.4228752591147421,
12354
+ "grad_norm": 0.1413465142250061,
12355
+ "learning_rate": 9.270614642331376e-06,
12356
+ "loss": 2.5570576190948486,
12357
+ "memory/device_reserved (GiB)": 36.95,
12358
+ "memory/max_active (GiB)": 14.74,
12359
+ "memory/max_allocated (GiB)": 14.74,
12360
+ "ppl": 12.89781,
12361
+ "step": 867,
12362
+ "tokens/total": 12093056,
12363
+ "tokens/train_per_sec_per_gpu": 1759.22,
12364
+ "tokens/trainable": 5006280
12365
+ },
12366
+ {
12367
+ "epoch": 0.42336300451164494,
12368
+ "grad_norm": 0.16530846059322357,
12369
+ "learning_rate": 9.134901992827427e-06,
12370
+ "loss": 2.3816144466400146,
12371
+ "memory/device_reserved (GiB)": 36.95,
12372
+ "memory/max_active (GiB)": 15.18,
12373
+ "memory/max_allocated (GiB)": 15.18,
12374
+ "ppl": 10.82236,
12375
+ "step": 868,
12376
+ "tokens/total": 12106880,
12377
+ "tokens/train_per_sec_per_gpu": 230.38,
12378
+ "tokens/trainable": 5011197
12379
+ },
12380
+ {
12381
+ "epoch": 0.42385074990854776,
12382
+ "grad_norm": 0.15594670176506042,
12383
+ "learning_rate": 9.000142475204964e-06,
12384
+ "loss": 2.468984603881836,
12385
+ "memory/device_reserved (GiB)": 36.95,
12386
+ "memory/max_active (GiB)": 16.07,
12387
+ "memory/max_allocated (GiB)": 16.07,
12388
+ "ppl": 11.81045,
12389
+ "step": 869,
12390
+ "tokens/total": 12120704,
12391
+ "tokens/train_per_sec_per_gpu": 2162.37,
12392
+ "tokens/trainable": 5016737
12393
+ },
12394
+ {
12395
+ "epoch": 0.42433849530545054,
12396
+ "grad_norm": 0.16707849502563477,
12397
+ "learning_rate": 8.866337503027522e-06,
12398
+ "loss": 2.6235711574554443,
12399
+ "memory/device_reserved (GiB)": 36.95,
12400
+ "memory/max_active (GiB)": 15.63,
12401
+ "memory/max_allocated (GiB)": 15.63,
12402
+ "ppl": 13.78486,
12403
+ "step": 870,
12404
+ "tokens/total": 12133504,
12405
+ "tokens/train_per_sec_per_gpu": 1585.54,
12406
+ "tokens/trainable": 5021777
12407
+ },
12408
+ {
12409
+ "epoch": 0.42482624070235336,
12410
+ "grad_norm": 0.14890620112419128,
12411
+ "learning_rate": 8.733488479845997e-06,
12412
+ "loss": 2.49348783493042,
12413
+ "memory/device_reserved (GiB)": 36.95,
12414
+ "memory/max_active (GiB)": 16.51,
12415
+ "memory/max_allocated (GiB)": 16.51,
12416
+ "ppl": 12.10342,
12417
+ "step": 871,
12418
+ "tokens/total": 12149120,
12419
+ "tokens/train_per_sec_per_gpu": 2713.5,
12420
+ "tokens/trainable": 5028442
12421
+ },
12422
+ {
12423
+ "epoch": 0.4253139860992562,
12424
+ "grad_norm": 0.18396082520484924,
12425
+ "learning_rate": 8.60159679918372e-06,
12426
+ "loss": 2.4837136268615723,
12427
+ "memory/device_reserved (GiB)": 36.95,
12428
+ "memory/max_active (GiB)": 14.3,
12429
+ "memory/max_allocated (GiB)": 14.3,
12430
+ "ppl": 11.98569,
12431
+ "step": 872,
12432
+ "tokens/total": 12162432,
12433
+ "tokens/train_per_sec_per_gpu": 1947.57,
12434
+ "tokens/trainable": 5033319
12435
+ },
12436
+ {
12437
+ "epoch": 0.425801731496159,
12438
+ "grad_norm": 0.18063481152057648,
12439
+ "learning_rate": 8.470663844522052e-06,
12440
+ "loss": 2.8627002239227295,
12441
+ "memory/device_reserved (GiB)": 36.95,
12442
+ "memory/max_active (GiB)": 16.07,
12443
+ "memory/max_allocated (GiB)": 16.07,
12444
+ "ppl": 17.50874,
12445
+ "step": 873,
12446
+ "tokens/total": 12177664,
12447
+ "tokens/train_per_sec_per_gpu": 620.63,
12448
+ "tokens/trainable": 5038529
12449
+ },
12450
+ {
12451
+ "epoch": 0.42628947689306185,
12452
+ "grad_norm": 0.19020754098892212,
12453
+ "learning_rate": 8.340690989285726e-06,
12454
+ "loss": 2.350053071975708,
12455
+ "memory/device_reserved (GiB)": 36.95,
12456
+ "memory/max_active (GiB)": 15.98,
12457
+ "memory/max_allocated (GiB)": 15.98,
12458
+ "ppl": 10.48613,
12459
+ "step": 874,
12460
+ "tokens/total": 12192256,
12461
+ "tokens/train_per_sec_per_gpu": 254.37,
12462
+ "tokens/trainable": 5041985
12463
+ },
12464
+ {
12465
+ "epoch": 0.4267772222899646,
12466
+ "grad_norm": 0.13190138339996338,
12467
+ "learning_rate": 8.21167959682848e-06,
12468
+ "loss": 2.2473227977752686,
12469
+ "memory/device_reserved (GiB)": 36.95,
12470
+ "memory/max_active (GiB)": 15.63,
12471
+ "memory/max_allocated (GiB)": 15.63,
12472
+ "ppl": 9.46237,
12473
+ "step": 875,
12474
+ "tokens/total": 12205440,
12475
+ "tokens/train_per_sec_per_gpu": 3375.17,
12476
+ "tokens/trainable": 5049048
12477
+ },
12478
+ {
12479
+ "epoch": 0.42726496768686745,
12480
+ "grad_norm": 0.16024993360042572,
12481
+ "learning_rate": 8.083631020418791e-06,
12482
+ "loss": 2.3730978965759277,
12483
+ "memory/device_reserved (GiB)": 36.95,
12484
+ "memory/max_active (GiB)": 16.07,
12485
+ "memory/max_allocated (GiB)": 16.07,
12486
+ "ppl": 10.73058,
12487
+ "step": 876,
12488
+ "tokens/total": 12220160,
12489
+ "tokens/train_per_sec_per_gpu": 2040.44,
12490
+ "tokens/trainable": 5054947
12491
+ },
12492
+ {
12493
+ "epoch": 0.4277527130837703,
12494
+ "grad_norm": 0.1305093914270401,
12495
+ "learning_rate": 7.956546603225601e-06,
12496
+ "loss": 2.5211825370788574,
12497
+ "memory/device_reserved (GiB)": 36.95,
12498
+ "memory/max_active (GiB)": 16.51,
12499
+ "memory/max_allocated (GiB)": 16.51,
12500
+ "ppl": 12.4433,
12501
+ "step": 877,
12502
+ "tokens/total": 12235520,
12503
+ "tokens/train_per_sec_per_gpu": 2041.36,
12504
+ "tokens/trainable": 5063371
12505
+ },
12506
+ {
12507
+ "epoch": 0.4282404584806731,
12508
+ "grad_norm": 0.13371102511882782,
12509
+ "learning_rate": 7.830427678304353e-06,
12510
+ "loss": 2.4531219005584717,
12511
+ "memory/device_reserved (GiB)": 36.95,
12512
+ "memory/max_active (GiB)": 16.07,
12513
+ "memory/max_allocated (GiB)": 16.07,
12514
+ "ppl": 11.62458,
12515
+ "step": 878,
12516
+ "tokens/total": 12250368,
12517
+ "tokens/train_per_sec_per_gpu": 3181.56,
12518
+ "tokens/trainable": 5071429
12519
+ },
12520
+ {
12521
+ "epoch": 0.42872820387757593,
12522
+ "grad_norm": 0.1576300710439682,
12523
+ "learning_rate": 7.705275568582848e-06,
12524
+ "loss": 2.5344905853271484,
12525
+ "memory/device_reserved (GiB)": 36.95,
12526
+ "memory/max_active (GiB)": 16.51,
12527
+ "memory/max_allocated (GiB)": 16.51,
12528
+ "ppl": 12.61001,
12529
+ "step": 879,
12530
+ "tokens/total": 12265344,
12531
+ "tokens/train_per_sec_per_gpu": 3537.63,
12532
+ "tokens/trainable": 5076986
12533
+ },
12534
+ {
12535
+ "epoch": 0.4292159492744787,
12536
+ "grad_norm": 0.13250482082366943,
12537
+ "learning_rate": 7.581091586847522e-06,
12538
+ "loss": 2.433558464050293,
12539
+ "memory/device_reserved (GiB)": 36.95,
12540
+ "memory/max_active (GiB)": 16.51,
12541
+ "memory/max_allocated (GiB)": 16.51,
12542
+ "ppl": 11.39937,
12543
+ "step": 880,
12544
+ "tokens/total": 12280320,
12545
+ "tokens/train_per_sec_per_gpu": 2714.44,
12546
+ "tokens/trainable": 5084389
12547
+ },
12548
+ {
12549
+ "epoch": 0.42970369467138153,
12550
+ "grad_norm": 0.12432190030813217,
12551
+ "learning_rate": 7.457877035729588e-06,
12552
+ "loss": 2.548605442047119,
12553
+ "memory/device_reserved (GiB)": 36.95,
12554
+ "memory/max_active (GiB)": 15.63,
12555
+ "memory/max_allocated (GiB)": 15.63,
12556
+ "ppl": 12.78926,
12557
+ "step": 881,
12558
+ "tokens/total": 12293376,
12559
+ "tokens/train_per_sec_per_gpu": 3840.17,
12560
+ "tokens/trainable": 5093340
12561
+ },
12562
+ {
12563
+ "epoch": 0.43019144006828436,
12564
+ "grad_norm": 0.15672548115253448,
12565
+ "learning_rate": 7.335633207691361e-06,
12566
+ "loss": 2.5341434478759766,
12567
+ "memory/device_reserved (GiB)": 36.95,
12568
+ "memory/max_active (GiB)": 14.3,
12569
+ "memory/max_allocated (GiB)": 14.3,
12570
+ "ppl": 12.60563,
12571
+ "step": 882,
12572
+ "tokens/total": 12305792,
12573
+ "tokens/train_per_sec_per_gpu": 1558.83,
12574
+ "tokens/trainable": 5098803
12575
+ },
12576
+ {
12577
+ "epoch": 0.4306791854651872,
12578
+ "grad_norm": 0.17533928155899048,
12579
+ "learning_rate": 7.21436138501278e-06,
12580
+ "loss": 2.480517864227295,
12581
+ "memory/device_reserved (GiB)": 36.95,
12582
+ "memory/max_active (GiB)": 15.09,
12583
+ "memory/max_allocated (GiB)": 15.09,
12584
+ "ppl": 11.94745,
12585
+ "step": 883,
12586
+ "tokens/total": 12318592,
12587
+ "tokens/train_per_sec_per_gpu": 1472.63,
12588
+ "tokens/trainable": 5103041
12589
+ },
12590
+ {
12591
+ "epoch": 0.43116693086209,
12592
+ "grad_norm": 0.14066585898399353,
12593
+ "learning_rate": 7.094062839777837e-06,
12594
+ "loss": 2.64518404006958,
12595
+ "memory/device_reserved (GiB)": 36.95,
12596
+ "memory/max_active (GiB)": 15.18,
12597
+ "memory/max_allocated (GiB)": 15.18,
12598
+ "ppl": 14.08604,
12599
+ "step": 884,
12600
+ "tokens/total": 12331776,
12601
+ "tokens/train_per_sec_per_gpu": 846.43,
12602
+ "tokens/trainable": 5109866
12603
+ },
12604
+ {
12605
+ "epoch": 0.4316546762589928,
12606
+ "grad_norm": 0.1537938266992569,
12607
+ "learning_rate": 6.974738833861383e-06,
12608
+ "loss": 2.3791351318359375,
12609
+ "memory/device_reserved (GiB)": 36.95,
12610
+ "memory/max_active (GiB)": 15.18,
12611
+ "memory/max_allocated (GiB)": 15.18,
12612
+ "ppl": 10.79556,
12613
+ "step": 885,
12614
+ "tokens/total": 12345856,
12615
+ "tokens/train_per_sec_per_gpu": 1987.51,
12616
+ "tokens/trainable": 5115305
12617
+ },
12618
+ {
12619
+ "epoch": 0.4321424216558956,
12620
+ "grad_norm": 0.13529153168201447,
12621
+ "learning_rate": 6.856390618915775e-06,
12622
+ "loss": 2.581418037414551,
12623
+ "memory/device_reserved (GiB)": 36.95,
12624
+ "memory/max_active (GiB)": 16.07,
12625
+ "memory/max_allocated (GiB)": 16.07,
12626
+ "ppl": 13.21587,
12627
+ "step": 886,
12628
+ "tokens/total": 12359936,
12629
+ "tokens/train_per_sec_per_gpu": 3477.99,
12630
+ "tokens/trainable": 5122721
12631
+ },
12632
+ {
12633
+ "epoch": 0.43263016705279844,
12634
+ "grad_norm": 0.151899516582489,
12635
+ "learning_rate": 6.739019436357774e-06,
12636
+ "loss": 2.509517192840576,
12637
+ "memory/device_reserved (GiB)": 36.95,
12638
+ "memory/max_active (GiB)": 16.07,
12639
+ "memory/max_allocated (GiB)": 16.07,
12640
+ "ppl": 12.29899,
12641
+ "step": 887,
12642
+ "tokens/total": 12374272,
12643
+ "tokens/train_per_sec_per_gpu": 3268.25,
12644
+ "tokens/trainable": 5129148
12645
+ },
12646
+ {
12647
+ "epoch": 0.43311791244970127,
12648
+ "grad_norm": 0.13883507251739502,
12649
+ "learning_rate": 6.622626517355557e-06,
12650
+ "loss": 2.570188522338867,
12651
+ "memory/device_reserved (GiB)": 36.95,
12652
+ "memory/max_active (GiB)": 16.42,
12653
+ "memory/max_allocated (GiB)": 16.42,
12654
+ "ppl": 13.06829,
12655
+ "step": 888,
12656
+ "tokens/total": 12387712,
12657
+ "tokens/train_per_sec_per_gpu": 2985.04,
12658
+ "tokens/trainable": 5136366
12659
+ },
12660
+ {
12661
+ "epoch": 0.4336056578466041,
12662
+ "grad_norm": 0.17564928531646729,
12663
+ "learning_rate": 6.507213082815744e-06,
12664
+ "loss": 2.4525790214538574,
12665
+ "memory/device_reserved (GiB)": 36.95,
12666
+ "memory/max_active (GiB)": 15.63,
12667
+ "memory/max_allocated (GiB)": 15.63,
12668
+ "ppl": 11.61827,
12669
+ "step": 889,
12670
+ "tokens/total": 12401280,
12671
+ "tokens/train_per_sec_per_gpu": 776.96,
12672
+ "tokens/trainable": 5140656
12673
+ },
12674
+ {
12675
+ "epoch": 0.43409340324350687,
12676
+ "grad_norm": 0.14762923121452332,
12677
+ "learning_rate": 6.392780343370686e-06,
12678
+ "loss": 2.6221022605895996,
12679
+ "memory/device_reserved (GiB)": 36.95,
12680
+ "memory/max_active (GiB)": 16.07,
12681
+ "memory/max_allocated (GiB)": 16.07,
12682
+ "ppl": 13.76463,
12683
+ "step": 890,
12684
+ "tokens/total": 12414080,
12685
+ "tokens/train_per_sec_per_gpu": 2521.43,
12686
+ "tokens/trainable": 5147622
12687
+ },
12688
+ {
12689
+ "epoch": 0.4345811486404097,
12690
+ "grad_norm": 0.14871041476726532,
12691
+ "learning_rate": 6.2793294993656494e-06,
12692
+ "loss": 2.450669527053833,
12693
+ "memory/device_reserved (GiB)": 36.95,
12694
+ "memory/max_active (GiB)": 16.51,
12695
+ "memory/max_allocated (GiB)": 16.51,
12696
+ "ppl": 11.59611,
12697
+ "step": 891,
12698
+ "tokens/total": 12429184,
12699
+ "tokens/train_per_sec_per_gpu": 983.15,
12700
+ "tokens/trainable": 5153346
12701
+ },
12702
+ {
12703
+ "epoch": 0.4350688940373125,
12704
+ "grad_norm": 0.2055574506521225,
12705
+ "learning_rate": 6.166861740846297e-06,
12706
+ "loss": 2.7320761680603027,
12707
+ "memory/device_reserved (GiB)": 36.95,
12708
+ "memory/max_active (GiB)": 15.63,
12709
+ "memory/max_allocated (GiB)": 15.63,
12710
+ "ppl": 15.36475,
12711
+ "step": 892,
12712
+ "tokens/total": 12443136,
12713
+ "tokens/train_per_sec_per_gpu": 615.37,
12714
+ "tokens/trainable": 5156569
12715
+ },
12716
+ {
12717
+ "epoch": 0.43555663943421535,
12718
+ "grad_norm": 0.16989806294441223,
12719
+ "learning_rate": 6.055378247546218e-06,
12720
+ "loss": 2.3722715377807617,
12721
+ "memory/device_reserved (GiB)": 36.95,
12722
+ "memory/max_active (GiB)": 16.51,
12723
+ "memory/max_allocated (GiB)": 16.51,
12724
+ "ppl": 10.72172,
12725
+ "step": 893,
12726
+ "tokens/total": 12456576,
12727
+ "tokens/train_per_sec_per_gpu": 289.52,
12728
+ "tokens/trainable": 5161189
12729
+ },
12730
+ {
12731
+ "epoch": 0.4360443848311182,
12732
+ "grad_norm": 0.158726304769516,
12733
+ "learning_rate": 5.9448801888744795e-06,
12734
+ "loss": 2.1923623085021973,
12735
+ "memory/device_reserved (GiB)": 36.95,
12736
+ "memory/max_active (GiB)": 15.63,
12737
+ "memory/max_allocated (GiB)": 15.63,
12738
+ "ppl": 8.95635,
12739
+ "step": 894,
12740
+ "tokens/total": 12469888,
12741
+ "tokens/train_per_sec_per_gpu": 3135.48,
12742
+ "tokens/trainable": 5165648
12743
+ },
12744
+ {
12745
+ "epoch": 0.43653213022802095,
12746
+ "grad_norm": 0.12554942071437836,
12747
+ "learning_rate": 5.835368723903456e-06,
12748
+ "loss": 2.471595287322998,
12749
+ "memory/device_reserved (GiB)": 36.95,
12750
+ "memory/max_active (GiB)": 15.98,
12751
+ "memory/max_allocated (GiB)": 15.98,
12752
+ "ppl": 11.84132,
12753
+ "step": 895,
12754
+ "tokens/total": 12484992,
12755
+ "tokens/train_per_sec_per_gpu": 2601.89,
12756
+ "tokens/trainable": 5174167
12757
+ },
12758
+ {
12759
+ "epoch": 0.4370198756249238,
12760
+ "grad_norm": 0.17701223492622375,
12761
+ "learning_rate": 5.726845001356573e-06,
12762
+ "loss": 2.3380227088928223,
12763
+ "memory/device_reserved (GiB)": 36.95,
12764
+ "memory/max_active (GiB)": 15.53,
12765
+ "memory/max_allocated (GiB)": 15.53,
12766
+ "ppl": 10.36073,
12767
+ "step": 896,
12768
+ "tokens/total": 12498688,
12769
+ "tokens/train_per_sec_per_gpu": 1107.32,
12770
+ "tokens/trainable": 5177916
12771
+ },
12772
+ {
12773
+ "epoch": 0.4375076210218266,
12774
+ "grad_norm": 0.15862098336219788,
12775
+ "learning_rate": 5.6193101595963585e-06,
12776
+ "loss": 2.281069755554199,
12777
+ "memory/device_reserved (GiB)": 36.95,
12778
+ "memory/max_active (GiB)": 16.42,
12779
+ "memory/max_allocated (GiB)": 16.42,
12780
+ "ppl": 9.78714,
12781
+ "step": 897,
12782
+ "tokens/total": 12512640,
12783
+ "tokens/train_per_sec_per_gpu": 1239.77,
12784
+ "tokens/trainable": 5182570
12785
+ },
12786
+ {
12787
+ "epoch": 0.43799536641872944,
12788
+ "grad_norm": 0.16245107352733612,
12789
+ "learning_rate": 5.512765326612379e-06,
12790
+ "loss": 2.2768242359161377,
12791
+ "memory/device_reserved (GiB)": 36.95,
12792
+ "memory/max_active (GiB)": 16.51,
12793
+ "memory/max_allocated (GiB)": 16.51,
12794
+ "ppl": 9.74568,
12795
+ "step": 898,
12796
+ "tokens/total": 12527872,
12797
+ "tokens/train_per_sec_per_gpu": 742.37,
12798
+ "tokens/trainable": 5187760
12799
+ },
12800
+ {
12801
+ "epoch": 0.43848311181563226,
12802
+ "grad_norm": 0.15560710430145264,
12803
+ "learning_rate": 5.407211620009544e-06,
12804
+ "loss": 2.7127208709716797,
12805
+ "memory/device_reserved (GiB)": 36.95,
12806
+ "memory/max_active (GiB)": 15.53,
12807
+ "memory/max_allocated (GiB)": 15.53,
12808
+ "ppl": 15.07022,
12809
+ "step": 899,
12810
+ "tokens/total": 12541056,
12811
+ "tokens/train_per_sec_per_gpu": 1990.04,
12812
+ "tokens/trainable": 5193948
12813
+ },
12814
+ {
12815
+ "epoch": 0.43897085721253504,
12816
+ "grad_norm": 0.1649906188249588,
12817
+ "learning_rate": 5.30265014699628e-06,
12818
+ "loss": 2.6698319911956787,
12819
+ "memory/device_reserved (GiB)": 36.95,
12820
+ "memory/max_active (GiB)": 16.07,
12821
+ "memory/max_allocated (GiB)": 16.07,
12822
+ "ppl": 14.43754,
12823
+ "step": 900,
12824
+ "tokens/total": 12555520,
12825
+ "tokens/train_per_sec_per_gpu": 2105.77,
12826
+ "tokens/trainable": 5198878
12827
+ },
12828
+ {
12829
+ "epoch": 0.43897085721253504,
12830
+ "eval_loss": 2.4894537925720215,
12831
+ "eval_ppl": 12.05469,
12832
+ "eval_runtime": 6.0203,
12833
+ "eval_samples_per_second": 33.221,
12834
+ "eval_steps_per_second": 16.61,
12835
+ "memory/device_reserved (GiB)": 36.95,
12836
+ "memory/max_active (GiB)": 11.76,
12837
+ "memory/max_allocated (GiB)": 11.76,
12838
+ "step": 900
12839
  }
12840
  ],
12841
  "logging_steps": 1,
 
12855
  "attributes": {}
12856
  }
12857
  },
12858
+ "total_flos": 1.5443869122625536e+17,
12859
  "train_batch_size": 2,
12860
  "trial_name": null,
12861
  "trial_params": null