aghatage commited on
Commit
696b514
·
verified ·
1 Parent(s): e6472bb

Training in progress, step 8500, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:07328eeab3354e923721dcf5dc81f90b0d35763e3d2da61af29191c7a2e7c269
3
  size 12017472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ebf61e4d96852d448117769c201ad05ba342a330472e0c4f7a17e11064f0353
3
  size 12017472
last-checkpoint/global_step8500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9faf9cbf90034bc7c6932f8e7206842c828fa7da1b8607e63186a879d9056a1
3
+ size 71982309
last-checkpoint/global_step8500/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6b067642615c61b221d7f18c7372bb6f436f7c58261bf5c3784fb34aee8621a
3
+ size 146356645
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step8000
 
1
+ global_step8500
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:63794b157bba20e542419c9f34c95871186573432daaa310cf18fc7cd73ac609
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b568051b69d3c00915acebc4a453fb6368ef5a43d2d24e5d733c17ec637f2069
3
  size 14709
last-checkpoint/trainer_state.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "best_global_step": 8000,
3
- "best_metric": 0.5604261755943298,
4
- "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-8000",
5
- "epoch": 5.814397382294128,
6
  "eval_steps": 250,
7
- "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3217,6 +3217,206 @@
3217
  "eval_samples_per_second": 43.235,
3218
  "eval_steps_per_second": 5.411,
3219
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3220
  }
3221
  ],
3222
  "logging_steps": 25,
@@ -3236,7 +3436,7 @@
3236
  "attributes": {}
3237
  }
3238
  },
3239
- "total_flos": 4.4438800024417075e+17,
3240
  "train_batch_size": 4,
3241
  "trial_name": null,
3242
  "trial_params": null
 
1
  {
2
+ "best_global_step": 8500,
3
+ "best_metric": 0.5578206181526184,
4
+ "best_model_checkpoint": "/root/leap-finetune/outputs/sft/lfm2_350m_marathi_optimized_12ep/checkpoint-8500",
5
+ "epoch": 6.1774222868569355,
6
  "eval_steps": 250,
7
+ "global_step": 8500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3217
  "eval_samples_per_second": 43.235,
3218
  "eval_steps_per_second": 5.411,
3219
  "step": 8000
3220
+ },
3221
+ {
3222
+ "epoch": 5.8325758952917655,
3223
+ "grad_norm": 0.774117648601532,
3224
+ "learning_rate": 4.215052865315844e-05,
3225
+ "loss": 0.5424,
3226
+ "mean_token_accuracy": 0.8316737848520279,
3227
+ "num_tokens": 176748107.0,
3228
+ "step": 8025
3229
+ },
3230
+ {
3231
+ "epoch": 5.850754408289402,
3232
+ "grad_norm": 0.7777197360992432,
3233
+ "learning_rate": 4.1959361143150435e-05,
3234
+ "loss": 0.5522,
3235
+ "mean_token_accuracy": 0.8278136014938354,
3236
+ "num_tokens": 177303487.0,
3237
+ "step": 8050
3238
+ },
3239
+ {
3240
+ "epoch": 5.868932921287039,
3241
+ "grad_norm": 0.7735779285430908,
3242
+ "learning_rate": 4.176814876166096e-05,
3243
+ "loss": 0.5499,
3244
+ "mean_token_accuracy": 0.8293804588913918,
3245
+ "num_tokens": 177849190.0,
3246
+ "step": 8075
3247
+ },
3248
+ {
3249
+ "epoch": 5.887111434284676,
3250
+ "grad_norm": 0.8152751922607422,
3251
+ "learning_rate": 4.157689588765956e-05,
3252
+ "loss": 0.5504,
3253
+ "mean_token_accuracy": 0.8286561304330826,
3254
+ "num_tokens": 178400389.0,
3255
+ "step": 8100
3256
+ },
3257
+ {
3258
+ "epoch": 5.905289947282312,
3259
+ "grad_norm": 0.7434157729148865,
3260
+ "learning_rate": 4.138560690104317e-05,
3261
+ "loss": 0.5603,
3262
+ "mean_token_accuracy": 0.8263833120465278,
3263
+ "num_tokens": 178953283.0,
3264
+ "step": 8125
3265
+ },
3266
+ {
3267
+ "epoch": 5.923468460279949,
3268
+ "grad_norm": 0.7802156805992126,
3269
+ "learning_rate": 4.119428618253569e-05,
3270
+ "loss": 0.5476,
3271
+ "mean_token_accuracy": 0.8301771306991577,
3272
+ "num_tokens": 179508280.0,
3273
+ "step": 8150
3274
+ },
3275
+ {
3276
+ "epoch": 5.941646973277586,
3277
+ "grad_norm": 0.7939499020576477,
3278
+ "learning_rate": 4.100293811358773e-05,
3279
+ "loss": 0.5466,
3280
+ "mean_token_accuracy": 0.8302853351831436,
3281
+ "num_tokens": 180043519.0,
3282
+ "step": 8175
3283
+ },
3284
+ {
3285
+ "epoch": 5.959825486275223,
3286
+ "grad_norm": 0.840653121471405,
3287
+ "learning_rate": 4.081156707627624e-05,
3288
+ "loss": 0.5432,
3289
+ "mean_token_accuracy": 0.8307925063371658,
3290
+ "num_tokens": 180593050.0,
3291
+ "step": 8200
3292
+ },
3293
+ {
3294
+ "epoch": 5.978003999272859,
3295
+ "grad_norm": 0.8524306416511536,
3296
+ "learning_rate": 4.0620177453204224e-05,
3297
+ "loss": 0.5482,
3298
+ "mean_token_accuracy": 0.8293575036525727,
3299
+ "num_tokens": 181133882.0,
3300
+ "step": 8225
3301
+ },
3302
+ {
3303
+ "epoch": 5.996182512270496,
3304
+ "grad_norm": 0.7460827827453613,
3305
+ "learning_rate": 4.042877362740026e-05,
3306
+ "loss": 0.5553,
3307
+ "mean_token_accuracy": 0.8276761430501938,
3308
+ "num_tokens": 181691715.0,
3309
+ "step": 8250
3310
+ },
3311
+ {
3312
+ "epoch": 5.996182512270496,
3313
+ "eval_loss": 0.5585607886314392,
3314
+ "eval_mean_token_accuracy": 0.8256761994626787,
3315
+ "eval_num_tokens": 181691715.0,
3316
+ "eval_runtime": 112.4662,
3317
+ "eval_samples_per_second": 43.48,
3318
+ "eval_steps_per_second": 5.442,
3319
+ "step": 8250
3320
+ },
3321
+ {
3322
+ "epoch": 6.013815669878204,
3323
+ "grad_norm": 0.8060737252235413,
3324
+ "learning_rate": 4.02373599822182e-05,
3325
+ "loss": 0.5458,
3326
+ "mean_token_accuracy": 0.8301435733578869,
3327
+ "num_tokens": 182221571.0,
3328
+ "step": 8275
3329
+ },
3330
+ {
3331
+ "epoch": 6.031994182875841,
3332
+ "grad_norm": 0.7671223878860474,
3333
+ "learning_rate": 4.004594090123678e-05,
3334
+ "loss": 0.5435,
3335
+ "mean_token_accuracy": 0.8307790219783783,
3336
+ "num_tokens": 182762751.0,
3337
+ "step": 8300
3338
+ },
3339
+ {
3340
+ "epoch": 6.050172695873478,
3341
+ "grad_norm": 0.7601523995399475,
3342
+ "learning_rate": 3.985452076815922e-05,
3343
+ "loss": 0.5353,
3344
+ "mean_token_accuracy": 0.83306546241045,
3345
+ "num_tokens": 183305848.0,
3346
+ "step": 8325
3347
+ },
3348
+ {
3349
+ "epoch": 6.068351208871114,
3350
+ "grad_norm": 0.848628044128418,
3351
+ "learning_rate": 3.966310396671283e-05,
3352
+ "loss": 0.5394,
3353
+ "mean_token_accuracy": 0.8321545705199241,
3354
+ "num_tokens": 183850351.0,
3355
+ "step": 8350
3356
+ },
3357
+ {
3358
+ "epoch": 6.086529721868751,
3359
+ "grad_norm": 0.7746974229812622,
3360
+ "learning_rate": 3.9471694880548625e-05,
3361
+ "loss": 0.5387,
3362
+ "mean_token_accuracy": 0.8319272243976593,
3363
+ "num_tokens": 184403683.0,
3364
+ "step": 8375
3365
+ },
3366
+ {
3367
+ "epoch": 6.104708234866388,
3368
+ "grad_norm": 0.8478145599365234,
3369
+ "learning_rate": 3.9280297893140924e-05,
3370
+ "loss": 0.5316,
3371
+ "mean_token_accuracy": 0.8347961682081223,
3372
+ "num_tokens": 184948375.0,
3373
+ "step": 8400
3374
+ },
3375
+ {
3376
+ "epoch": 6.122886747864024,
3377
+ "grad_norm": 0.7637465000152588,
3378
+ "learning_rate": 3.9088917387686984e-05,
3379
+ "loss": 0.545,
3380
+ "mean_token_accuracy": 0.8306651490926743,
3381
+ "num_tokens": 185499651.0,
3382
+ "step": 8425
3383
+ },
3384
+ {
3385
+ "epoch": 6.141065260861661,
3386
+ "grad_norm": 0.8355098366737366,
3387
+ "learning_rate": 3.8897557747006604e-05,
3388
+ "loss": 0.537,
3389
+ "mean_token_accuracy": 0.8326031097769737,
3390
+ "num_tokens": 186059243.0,
3391
+ "step": 8450
3392
+ },
3393
+ {
3394
+ "epoch": 6.159243773859298,
3395
+ "grad_norm": 0.8073794841766357,
3396
+ "learning_rate": 3.870622335344174e-05,
3397
+ "loss": 0.5374,
3398
+ "mean_token_accuracy": 0.8339161434769631,
3399
+ "num_tokens": 186615115.0,
3400
+ "step": 8475
3401
+ },
3402
+ {
3403
+ "epoch": 6.1774222868569355,
3404
+ "grad_norm": 0.7971922159194946,
3405
+ "learning_rate": 3.851491858875619e-05,
3406
+ "loss": 0.5328,
3407
+ "mean_token_accuracy": 0.8337607860565186,
3408
+ "num_tokens": 187158233.0,
3409
+ "step": 8500
3410
+ },
3411
+ {
3412
+ "epoch": 6.1774222868569355,
3413
+ "eval_loss": 0.5578206181526184,
3414
+ "eval_mean_token_accuracy": 0.8258760601671693,
3415
+ "eval_num_tokens": 187158233.0,
3416
+ "eval_runtime": 112.9076,
3417
+ "eval_samples_per_second": 43.31,
3418
+ "eval_steps_per_second": 5.42,
3419
+ "step": 8500
3420
  }
3421
  ],
3422
  "logging_steps": 25,
 
3436
  "attributes": {}
3437
  }
3438
  },
3439
+ "total_flos": 4.7210511534863155e+17,
3440
  "train_batch_size": 4,
3441
  "trial_name": null,
3442
  "trial_params": null