ToastyPigeon commited on
Commit
0b96f6c
·
verified ·
1 Parent(s): 6d087d5

Training in progress, step 504, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a9ecc8f22f2d134fb829455d62e5eda965f38301e3ee95766f8156c52d05093
3
  size 1824599104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38d0011b7e613a29d5e131744cf9551b13d670173b611c938455f075b5670162
3
  size 1824599104
last-checkpoint/optimizer.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86d8193f3a86823f5c4bfa0eacdb2b01a1e2d4833a8667b609b4e2bca7217d19
3
  size 3649546931
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e16932099408d52ff2c49f7820fe2ff8dfa04000b793ddda91e43c81562d745e
3
  size 3649546931
last-checkpoint/pytorch_model_fsdp.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f208840ef7b8768cb1d5d41027f6f54749723d1944c14ce8b8bc0e95cc382606
3
  size 1824732017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f84a51fc1320d3e9e50e59e4a5b035e5730a7972076835884e11a7038f65480
3
  size 1824732017
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:885e639a7848dfb73b3873d4a966790b7dec9c1825488c160bcd315387591bc1
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb2ac806a466383eb93df8ea80a99b3712cfe8151e695c0b1f758cdeecaffe3f
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20f4c60ba7d22e6ff36124bc1136d96ba5f590431a7bbb775918806c0c101306
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1792063418a35fb366dea84068243d35d9402e3b9a3f5eb21e797819c64aabaf
3
  size 14917
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5316c25576f6827cfc59e409ab7ee8cf3345edb0500b89ef3e46d8b194b79a11
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cbf2b75e2a57c11b63da0a0bf35923f452238d3e3de2a4a72ee75dbf9a6674a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.8086642599277978,
6
  "eval_steps": 500,
7
- "global_step": 448,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3144,6 +3144,398 @@
3144
  "learning_rate": 7.10455764075067e-06,
3145
  "loss": 2.3279,
3146
  "step": 448
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3147
  }
3148
  ],
3149
  "logging_steps": 1,
@@ -3163,7 +3555,7 @@
3163
  "attributes": {}
3164
  }
3165
  },
3166
- "total_flos": 3.95833078219119e+18,
3167
  "train_batch_size": 1,
3168
  "trial_name": null,
3169
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.9097472924187726,
6
  "eval_steps": 500,
7
+ "global_step": 504,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3144
  "learning_rate": 7.10455764075067e-06,
3145
  "loss": 2.3279,
3146
  "step": 448
3147
+ },
3148
+ {
3149
+ "epoch": 0.8104693140794224,
3150
+ "grad_norm": 0.47747042775154114,
3151
+ "learning_rate": 7.080242751180042e-06,
3152
+ "loss": 2.268,
3153
+ "step": 449
3154
+ },
3155
+ {
3156
+ "epoch": 0.8122743682310469,
3157
+ "grad_norm": 0.4877883195877075,
3158
+ "learning_rate": 7.055630936227952e-06,
3159
+ "loss": 2.1942,
3160
+ "step": 450
3161
+ },
3162
+ {
3163
+ "epoch": 0.8140794223826715,
3164
+ "grad_norm": 0.5198360681533813,
3165
+ "learning_rate": 7.030716723549489e-06,
3166
+ "loss": 2.5021,
3167
+ "step": 451
3168
+ },
3169
+ {
3170
+ "epoch": 0.8158844765342961,
3171
+ "grad_norm": 0.642713725566864,
3172
+ "learning_rate": 7.005494505494505e-06,
3173
+ "loss": 2.4711,
3174
+ "step": 452
3175
+ },
3176
+ {
3177
+ "epoch": 0.8176895306859205,
3178
+ "grad_norm": 0.6020970344543457,
3179
+ "learning_rate": 6.979958534899792e-06,
3180
+ "loss": 2.6217,
3181
+ "step": 453
3182
+ },
3183
+ {
3184
+ "epoch": 0.8194945848375451,
3185
+ "grad_norm": 1.4466404914855957,
3186
+ "learning_rate": 6.954102920723228e-06,
3187
+ "loss": 2.52,
3188
+ "step": 454
3189
+ },
3190
+ {
3191
+ "epoch": 0.8212996389891697,
3192
+ "grad_norm": 0.4950203001499176,
3193
+ "learning_rate": 6.927921623512947e-06,
3194
+ "loss": 2.2925,
3195
+ "step": 455
3196
+ },
3197
+ {
3198
+ "epoch": 0.8231046931407943,
3199
+ "grad_norm": 0.5081182718276978,
3200
+ "learning_rate": 6.901408450704225e-06,
3201
+ "loss": 2.229,
3202
+ "step": 456
3203
+ },
3204
+ {
3205
+ "epoch": 0.8249097472924187,
3206
+ "grad_norm": 0.48568499088287354,
3207
+ "learning_rate": 6.874557051736357e-06,
3208
+ "loss": 2.2602,
3209
+ "step": 457
3210
+ },
3211
+ {
3212
+ "epoch": 0.8267148014440433,
3213
+ "grad_norm": 0.659635603427887,
3214
+ "learning_rate": 6.847360912981457e-06,
3215
+ "loss": 2.1938,
3216
+ "step": 458
3217
+ },
3218
+ {
3219
+ "epoch": 0.8285198555956679,
3220
+ "grad_norm": 0.5137075781822205,
3221
+ "learning_rate": 6.81981335247667e-06,
3222
+ "loss": 2.3663,
3223
+ "step": 459
3224
+ },
3225
+ {
3226
+ "epoch": 0.8303249097472925,
3227
+ "grad_norm": 0.5395336151123047,
3228
+ "learning_rate": 6.791907514450867e-06,
3229
+ "loss": 2.3545,
3230
+ "step": 460
3231
+ },
3232
+ {
3233
+ "epoch": 0.8321299638989169,
3234
+ "grad_norm": 1.247609257698059,
3235
+ "learning_rate": 6.763636363636363e-06,
3236
+ "loss": 2.3752,
3237
+ "step": 461
3238
+ },
3239
+ {
3240
+ "epoch": 0.8339350180505415,
3241
+ "grad_norm": 0.4546639919281006,
3242
+ "learning_rate": 6.734992679355783e-06,
3243
+ "loss": 2.3414,
3244
+ "step": 462
3245
+ },
3246
+ {
3247
+ "epoch": 0.8357400722021661,
3248
+ "grad_norm": 1.2336573600769043,
3249
+ "learning_rate": 6.70596904937362e-06,
3250
+ "loss": 2.4396,
3251
+ "step": 463
3252
+ },
3253
+ {
3254
+ "epoch": 0.8375451263537906,
3255
+ "grad_norm": 0.4393303692340851,
3256
+ "learning_rate": 6.676557863501484e-06,
3257
+ "loss": 2.2201,
3258
+ "step": 464
3259
+ },
3260
+ {
3261
+ "epoch": 0.8393501805054152,
3262
+ "grad_norm": 0.7220252752304077,
3263
+ "learning_rate": 6.646751306945482e-06,
3264
+ "loss": 2.2517,
3265
+ "step": 465
3266
+ },
3267
+ {
3268
+ "epoch": 0.8411552346570397,
3269
+ "grad_norm": 0.5886508226394653,
3270
+ "learning_rate": 6.616541353383459e-06,
3271
+ "loss": 2.5818,
3272
+ "step": 466
3273
+ },
3274
+ {
3275
+ "epoch": 0.8429602888086642,
3276
+ "grad_norm": 0.43092313408851624,
3277
+ "learning_rate": 6.585919757759274e-06,
3278
+ "loss": 2.3287,
3279
+ "step": 467
3280
+ },
3281
+ {
3282
+ "epoch": 0.8447653429602888,
3283
+ "grad_norm": 0.5468039512634277,
3284
+ "learning_rate": 6.554878048780487e-06,
3285
+ "loss": 2.3269,
3286
+ "step": 468
3287
+ },
3288
+ {
3289
+ "epoch": 0.8465703971119134,
3290
+ "grad_norm": 0.47726455330848694,
3291
+ "learning_rate": 6.523407521105143e-06,
3292
+ "loss": 2.4202,
3293
+ "step": 469
3294
+ },
3295
+ {
3296
+ "epoch": 0.8483754512635379,
3297
+ "grad_norm": 0.5380074381828308,
3298
+ "learning_rate": 6.491499227202473e-06,
3299
+ "loss": 2.4387,
3300
+ "step": 470
3301
+ },
3302
+ {
3303
+ "epoch": 0.8501805054151624,
3304
+ "grad_norm": 0.521969735622406,
3305
+ "learning_rate": 6.459143968871595e-06,
3306
+ "loss": 2.2393,
3307
+ "step": 471
3308
+ },
3309
+ {
3310
+ "epoch": 0.851985559566787,
3311
+ "grad_norm": 1.7683531045913696,
3312
+ "learning_rate": 6.426332288401255e-06,
3313
+ "loss": 2.0042,
3314
+ "step": 472
3315
+ },
3316
+ {
3317
+ "epoch": 0.8537906137184116,
3318
+ "grad_norm": 0.5432044863700867,
3319
+ "learning_rate": 6.393054459352802e-06,
3320
+ "loss": 2.613,
3321
+ "step": 473
3322
+ },
3323
+ {
3324
+ "epoch": 0.855595667870036,
3325
+ "grad_norm": 1.0683743953704834,
3326
+ "learning_rate": 6.359300476947537e-06,
3327
+ "loss": 2.6203,
3328
+ "step": 474
3329
+ },
3330
+ {
3331
+ "epoch": 0.8574007220216606,
3332
+ "grad_norm": 0.7491397857666016,
3333
+ "learning_rate": 6.3250600480384304e-06,
3334
+ "loss": 2.2519,
3335
+ "step": 475
3336
+ },
3337
+ {
3338
+ "epoch": 0.8592057761732852,
3339
+ "grad_norm": 0.5388492345809937,
3340
+ "learning_rate": 6.290322580645162e-06,
3341
+ "loss": 2.6697,
3342
+ "step": 476
3343
+ },
3344
+ {
3345
+ "epoch": 0.8610108303249098,
3346
+ "grad_norm": 0.6041284799575806,
3347
+ "learning_rate": 6.2550771730300575e-06,
3348
+ "loss": 2.3559,
3349
+ "step": 477
3350
+ },
3351
+ {
3352
+ "epoch": 0.8628158844765343,
3353
+ "grad_norm": 2.10063099861145,
3354
+ "learning_rate": 6.2193126022913265e-06,
3355
+ "loss": 2.2704,
3356
+ "step": 478
3357
+ },
3358
+ {
3359
+ "epoch": 0.8646209386281588,
3360
+ "grad_norm": 0.5261618494987488,
3361
+ "learning_rate": 6.183017312448475e-06,
3362
+ "loss": 2.1879,
3363
+ "step": 479
3364
+ },
3365
+ {
3366
+ "epoch": 0.8664259927797834,
3367
+ "grad_norm": 0.7307838201522827,
3368
+ "learning_rate": 6.146179401993355e-06,
3369
+ "loss": 2.205,
3370
+ "step": 480
3371
+ },
3372
+ {
3373
+ "epoch": 0.868231046931408,
3374
+ "grad_norm": 0.4849015474319458,
3375
+ "learning_rate": 6.1087866108786605e-06,
3376
+ "loss": 2.4929,
3377
+ "step": 481
3378
+ },
3379
+ {
3380
+ "epoch": 0.8700361010830325,
3381
+ "grad_norm": 0.6549043655395508,
3382
+ "learning_rate": 6.0708263069139976e-06,
3383
+ "loss": 2.0747,
3384
+ "step": 482
3385
+ },
3386
+ {
3387
+ "epoch": 0.871841155234657,
3388
+ "grad_norm": 0.4754807651042938,
3389
+ "learning_rate": 6.0322854715378085e-06,
3390
+ "loss": 2.1042,
3391
+ "step": 483
3392
+ },
3393
+ {
3394
+ "epoch": 0.8736462093862816,
3395
+ "grad_norm": 0.5533527135848999,
3396
+ "learning_rate": 5.993150684931507e-06,
3397
+ "loss": 2.5076,
3398
+ "step": 484
3399
+ },
3400
+ {
3401
+ "epoch": 0.8754512635379061,
3402
+ "grad_norm": 0.5143368244171143,
3403
+ "learning_rate": 5.953408110440034e-06,
3404
+ "loss": 2.2678,
3405
+ "step": 485
3406
+ },
3407
+ {
3408
+ "epoch": 0.8772563176895307,
3409
+ "grad_norm": 0.4785096347332001,
3410
+ "learning_rate": 5.9130434782608696e-06,
3411
+ "loss": 2.4127,
3412
+ "step": 486
3413
+ },
3414
+ {
3415
+ "epoch": 0.8790613718411552,
3416
+ "grad_norm": 0.9804636240005493,
3417
+ "learning_rate": 5.872042068361087e-06,
3418
+ "loss": 2.567,
3419
+ "step": 487
3420
+ },
3421
+ {
3422
+ "epoch": 0.8808664259927798,
3423
+ "grad_norm": 0.6786078810691833,
3424
+ "learning_rate": 5.830388692579505e-06,
3425
+ "loss": 2.8093,
3426
+ "step": 488
3427
+ },
3428
+ {
3429
+ "epoch": 0.8826714801444043,
3430
+ "grad_norm": 0.5135942697525024,
3431
+ "learning_rate": 5.78806767586821e-06,
3432
+ "loss": 2.2935,
3433
+ "step": 489
3434
+ },
3435
+ {
3436
+ "epoch": 0.8844765342960289,
3437
+ "grad_norm": 0.5666589140892029,
3438
+ "learning_rate": 5.7450628366247755e-06,
3439
+ "loss": 2.4249,
3440
+ "step": 490
3441
+ },
3442
+ {
3443
+ "epoch": 0.8862815884476535,
3444
+ "grad_norm": 2.5704774856567383,
3445
+ "learning_rate": 5.7013574660633486e-06,
3446
+ "loss": 2.3688,
3447
+ "step": 491
3448
+ },
3449
+ {
3450
+ "epoch": 0.8880866425992779,
3451
+ "grad_norm": 0.4560219943523407,
3452
+ "learning_rate": 5.656934306569343e-06,
3453
+ "loss": 2.1992,
3454
+ "step": 492
3455
+ },
3456
+ {
3457
+ "epoch": 0.8898916967509025,
3458
+ "grad_norm": 0.4998960494995117,
3459
+ "learning_rate": 5.611775528978841e-06,
3460
+ "loss": 2.2708,
3461
+ "step": 493
3462
+ },
3463
+ {
3464
+ "epoch": 0.8916967509025271,
3465
+ "grad_norm": 0.5405519604682922,
3466
+ "learning_rate": 5.565862708719851e-06,
3467
+ "loss": 2.0934,
3468
+ "step": 494
3469
+ },
3470
+ {
3471
+ "epoch": 0.8935018050541517,
3472
+ "grad_norm": 0.42325398325920105,
3473
+ "learning_rate": 5.519176800748362e-06,
3474
+ "loss": 2.3872,
3475
+ "step": 495
3476
+ },
3477
+ {
3478
+ "epoch": 0.8953068592057761,
3479
+ "grad_norm": 0.8842243552207947,
3480
+ "learning_rate": 5.4716981132075475e-06,
3481
+ "loss": 2.3562,
3482
+ "step": 496
3483
+ },
3484
+ {
3485
+ "epoch": 0.8971119133574007,
3486
+ "grad_norm": 0.8491457104682922,
3487
+ "learning_rate": 5.423406279733587e-06,
3488
+ "loss": 2.2841,
3489
+ "step": 497
3490
+ },
3491
+ {
3492
+ "epoch": 0.8989169675090253,
3493
+ "grad_norm": 0.6096642017364502,
3494
+ "learning_rate": 5.374280230326295e-06,
3495
+ "loss": 2.6791,
3496
+ "step": 498
3497
+ },
3498
+ {
3499
+ "epoch": 0.9007220216606499,
3500
+ "grad_norm": 0.6584725975990295,
3501
+ "learning_rate": 5.324298160696999e-06,
3502
+ "loss": 2.2986,
3503
+ "step": 499
3504
+ },
3505
+ {
3506
+ "epoch": 0.9025270758122743,
3507
+ "grad_norm": 0.5640181303024292,
3508
+ "learning_rate": 5.2734375e-06,
3509
+ "loss": 2.1596,
3510
+ "step": 500
3511
+ },
3512
+ {
3513
+ "epoch": 0.9043321299638989,
3514
+ "grad_norm": 0.5127259492874146,
3515
+ "learning_rate": 5.22167487684729e-06,
3516
+ "loss": 2.7908,
3517
+ "step": 501
3518
+ },
3519
+ {
3520
+ "epoch": 0.9061371841155235,
3521
+ "grad_norm": 0.6018031239509583,
3522
+ "learning_rate": 5.168986083499006e-06,
3523
+ "loss": 2.5961,
3524
+ "step": 502
3525
+ },
3526
+ {
3527
+ "epoch": 0.907942238267148,
3528
+ "grad_norm": 0.9906480312347412,
3529
+ "learning_rate": 5.115346038114343e-06,
3530
+ "loss": 2.7193,
3531
+ "step": 503
3532
+ },
3533
+ {
3534
+ "epoch": 0.9097472924187726,
3535
+ "grad_norm": 0.39638206362724304,
3536
+ "learning_rate": 5.060728744939272e-06,
3537
+ "loss": 2.1839,
3538
+ "step": 504
3539
  }
3540
  ],
3541
  "logging_steps": 1,
 
3555
  "attributes": {}
3556
  }
3557
  },
3558
+ "total_flos": 4.4632785742536376e+18,
3559
  "train_batch_size": 1,
3560
  "trial_name": null,
3561
  "trial_params": null