aghatage commited on
Commit
009a094
·
verified ·
1 Parent(s): 9eed882

Training in progress, step 13500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db7100a45db008b406f2052c3128c88105424250504770688fd4dc9c99873aaa
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1311c9a69e5604b2001ceda10c832e98119547c0e33d82afe5989665de514c3e
3
  size 12017472
last-checkpoint/global_step13500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e5b7fd370b88c57ef6538390266dc426bccc73daf55376f38bfe8614c792f79
3
+ size 71982309
last-checkpoint/global_step13500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2751dde639a5d12f58ec51183d6aef63115a33b7c76078f4a229de16b57b14e
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step13000
 
1
+ global_step13500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f08b6a541827b05fd5c665552fdb001f91a00f8dfca00dd95e706aac683d501
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76d48473cf121167cd401e1842d406e7e5686b60208f0336b7552832934ccc04
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 13000,
3
- "best_metric": 0.540317952632904,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-13000",
5
- "epoch": 9.44791856026177,
6
  "eval_steps": 250,
7
- "global_step": 13000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5217,6 +5217,206 @@
5217
  "eval_samples_per_second": 42.989,
5218
  "eval_steps_per_second": 5.38,
5219
  "step": 13000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5220
  }
5221
  ],
5222
  "logging_steps": 25,
@@ -5236,7 +5436,7 @@
5236
  "attributes": {}
5237
  }
5238
  },
5239
- "total_flos": 7.220988366123172e+17,
5240
  "train_batch_size": 4,
5241
  "trial_name": null,
5242
  "trial_params": null
 
1
  {
2
+ "best_global_step": 13500,
3
+ "best_metric": 0.5390045046806335,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-13500",
5
+ "epoch": 9.811488820214507,
6
  "eval_steps": 250,
7
+ "global_step": 13500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5217
  "eval_samples_per_second": 42.989,
5218
  "eval_steps_per_second": 5.38,
5219
  "step": 13000
5220
+ },
5221
+ {
5222
+ "epoch": 9.466097073259407,
5223
+ "grad_norm": 0.8422950506210327,
5224
+ "learning_rate": 8.584715779344832e-06,
5225
+ "loss": 0.5187,
5226
+ "mean_token_accuracy": 0.8383197170495987,
5227
+ "num_tokens": 286863014.0,
5228
+ "step": 13025
5229
+ },
5230
+ {
5231
+ "epoch": 9.484275586257045,
5232
+ "grad_norm": 0.7978519797325134,
5233
+ "learning_rate": 8.466584529700203e-06,
5234
+ "loss": 0.515,
5235
+ "mean_token_accuracy": 0.839700258076191,
5236
+ "num_tokens": 287406511.0,
5237
+ "step": 13050
5238
+ },
5239
+ {
5240
+ "epoch": 9.502454099254681,
5241
+ "grad_norm": 0.8645240664482117,
5242
+ "learning_rate": 8.349175429248554e-06,
5243
+ "loss": 0.5238,
5244
+ "mean_token_accuracy": 0.8366273155808449,
5245
+ "num_tokens": 287962024.0,
5246
+ "step": 13075
5247
+ },
5248
+ {
5249
+ "epoch": 9.520632612252317,
5250
+ "grad_norm": 0.8597573041915894,
5251
+ "learning_rate": 8.232491166784782e-06,
5252
+ "loss": 0.5159,
5253
+ "mean_token_accuracy": 0.8379004463553429,
5254
+ "num_tokens": 288527560.0,
5255
+ "step": 13100
5256
+ },
5257
+ {
5258
+ "epoch": 9.538811125249955,
5259
+ "grad_norm": 0.8828545808792114,
5260
+ "learning_rate": 8.116534414504232e-06,
5261
+ "loss": 0.5118,
5262
+ "mean_token_accuracy": 0.8406583109498024,
5263
+ "num_tokens": 289060843.0,
5264
+ "step": 13125
5265
+ },
5266
+ {
5267
+ "epoch": 9.556989638247591,
5268
+ "grad_norm": 0.8724490404129028,
5269
+ "learning_rate": 8.00130782794148e-06,
5270
+ "loss": 0.5239,
5271
+ "mean_token_accuracy": 0.8369137379527092,
5272
+ "num_tokens": 289603965.0,
5273
+ "step": 13150
5274
+ },
5275
+ {
5276
+ "epoch": 9.575168151245228,
5277
+ "grad_norm": 0.8818336129188538,
5278
+ "learning_rate": 7.886814045909515e-06,
5279
+ "loss": 0.5244,
5280
+ "mean_token_accuracy": 0.8372589892148972,
5281
+ "num_tokens": 290146905.0,
5282
+ "step": 13175
5283
+ },
5284
+ {
5285
+ "epoch": 9.593346664242866,
5286
+ "grad_norm": 0.9488387703895569,
5287
+ "learning_rate": 7.773055690439326e-06,
5288
+ "loss": 0.5131,
5289
+ "mean_token_accuracy": 0.8400958624482154,
5290
+ "num_tokens": 290702107.0,
5291
+ "step": 13200
5292
+ },
5293
+ {
5294
+ "epoch": 9.611525177240502,
5295
+ "grad_norm": 0.8438289165496826,
5296
+ "learning_rate": 7.66003536671982e-06,
5297
+ "loss": 0.5131,
5298
+ "mean_token_accuracy": 0.8400224041938782,
5299
+ "num_tokens": 291241779.0,
5300
+ "step": 13225
5301
+ },
5302
+ {
5303
+ "epoch": 9.629703690238138,
5304
+ "grad_norm": 0.8664806485176086,
5305
+ "learning_rate": 7.547755663038212e-06,
5306
+ "loss": 0.5107,
5307
+ "mean_token_accuracy": 0.8407774633169174,
5308
+ "num_tokens": 291796633.0,
5309
+ "step": 13250
5310
+ },
5311
+ {
5312
+ "epoch": 9.629703690238138,
5313
+ "eval_loss": 0.5401590466499329,
5314
+ "eval_mean_token_accuracy": 0.8320043968414169,
5315
+ "eval_num_tokens": 291796633.0,
5316
+ "eval_runtime": 112.5867,
5317
+ "eval_samples_per_second": 43.433,
5318
+ "eval_steps_per_second": 5.436,
5319
+ "step": 13250
5320
+ },
5321
+ {
5322
+ "epoch": 9.647882203235776,
5323
+ "grad_norm": 0.8282386064529419,
5324
+ "learning_rate": 7.436219150720698e-06,
5325
+ "loss": 0.5155,
5326
+ "mean_token_accuracy": 0.84046880453825,
5327
+ "num_tokens": 292340922.0,
5328
+ "step": 13275
5329
+ },
5330
+ {
5331
+ "epoch": 9.666060716233412,
5332
+ "grad_norm": 0.872983455657959,
5333
+ "learning_rate": 7.325428384073592e-06,
5334
+ "loss": 0.5231,
5335
+ "mean_token_accuracy": 0.8363588589429856,
5336
+ "num_tokens": 292895625.0,
5337
+ "step": 13300
5338
+ },
5339
+ {
5340
+ "epoch": 9.684239229231048,
5341
+ "grad_norm": 0.8708329200744629,
5342
+ "learning_rate": 7.215385900324832e-06,
5343
+ "loss": 0.5144,
5344
+ "mean_token_accuracy": 0.8397229793667793,
5345
+ "num_tokens": 293448542.0,
5346
+ "step": 13325
5347
+ },
5348
+ {
5349
+ "epoch": 9.702417742228686,
5350
+ "grad_norm": 0.8467702269554138,
5351
+ "learning_rate": 7.106094219565869e-06,
5352
+ "loss": 0.5171,
5353
+ "mean_token_accuracy": 0.8385615301132202,
5354
+ "num_tokens": 294000478.0,
5355
+ "step": 13350
5356
+ },
5357
+ {
5358
+ "epoch": 9.720596255226322,
5359
+ "grad_norm": 0.8231089115142822,
5360
+ "learning_rate": 6.9975558446939665e-06,
5361
+ "loss": 0.5132,
5362
+ "mean_token_accuracy": 0.8399266812205315,
5363
+ "num_tokens": 294557047.0,
5364
+ "step": 13375
5365
+ },
5366
+ {
5367
+ "epoch": 9.738774768223958,
5368
+ "grad_norm": 0.9206160306930542,
5369
+ "learning_rate": 6.8897732613548526e-06,
5370
+ "loss": 0.5096,
5371
+ "mean_token_accuracy": 0.8407321670651435,
5372
+ "num_tokens": 295104353.0,
5373
+ "step": 13400
5374
+ },
5375
+ {
5376
+ "epoch": 9.756953281221596,
5377
+ "grad_norm": 0.8946228623390198,
5378
+ "learning_rate": 6.782748937885842e-06,
5379
+ "loss": 0.5157,
5380
+ "mean_token_accuracy": 0.8397801405191422,
5381
+ "num_tokens": 295655574.0,
5382
+ "step": 13425
5383
+ },
5384
+ {
5385
+ "epoch": 9.775131794219233,
5386
+ "grad_norm": 0.7474434971809387,
5387
+ "learning_rate": 6.6764853252592585e-06,
5388
+ "loss": 0.5217,
5389
+ "mean_token_accuracy": 0.8362213695049285,
5390
+ "num_tokens": 296223611.0,
5391
+ "step": 13450
5392
+ },
5393
+ {
5394
+ "epoch": 9.79331030721687,
5395
+ "grad_norm": 0.8649734258651733,
5396
+ "learning_rate": 6.5709848570263324e-06,
5397
+ "loss": 0.5151,
5398
+ "mean_token_accuracy": 0.838211068212986,
5399
+ "num_tokens": 296787088.0,
5400
+ "step": 13475
5401
+ },
5402
+ {
5403
+ "epoch": 9.811488820214507,
5404
+ "grad_norm": 0.7948579788208008,
5405
+ "learning_rate": 6.466249949261474e-06,
5406
+ "loss": 0.5165,
5407
+ "mean_token_accuracy": 0.8387623742222786,
5408
+ "num_tokens": 297344033.0,
5409
+ "step": 13500
5410
+ },
5411
+ {
5412
+ "epoch": 9.811488820214507,
5413
+ "eval_loss": 0.5390045046806335,
5414
+ "eval_mean_token_accuracy": 0.8321733054966708,
5415
+ "eval_num_tokens": 297344033.0,
5416
+ "eval_runtime": 113.1601,
5417
+ "eval_samples_per_second": 43.213,
5418
+ "eval_steps_per_second": 5.408,
5419
+ "step": 13500
5420
  }
5421
  ],
5422
  "logging_steps": 25,
 
5436
  "attributes": {}
5437
  }
5438
  },
5439
+ "total_flos": 7.499312044798116e+17,
5440
  "train_batch_size": 4,
5441
  "trial_name": null,
5442
  "trial_params": null