irishprancer commited on
Commit
8d3e9a4
·
verified ·
1 Parent(s): fa875c0

Training in progress, step 3750, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32f20e5a113f0ce12c0b08ff61de538680ca692bc68665ee0332272cc0d8f53c
3
  size 527048968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0f2359775ec058105a768a27e0aec2fd7b09c0fef450becc3fea6a2140d5551
3
  size 527048968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:080d08b596e97f811e550148cdd9224bf440ad30a23acba52af3d08c0021f9aa
3
  size 1054135994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22b0e07d88b69f37af8463bb1ac2f6ff8e912db26c18c2ee123c3a1948596d38
3
  size 1054135994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c26e3773f4be8664a2594f025c73a5f9434f857a45f46fc072657f1fdefb7000
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da0e93581e91c352d5ee493f505f8757c94a31fb5b16f71a9d85577535431525
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:070547fc77391e346b90917e21c08178811df2dccd6cf65dcc04961ee24e1903
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30b7c8be324c8b4289d82c59d6cbd2a46df58415895691106518590654dd09ba
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.7166205048561096,
3
  "best_model_checkpoint": "./output/checkpoint-450",
4
- "epoch": 143.47826086956522,
5
  "eval_steps": 150,
6
- "global_step": 3300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3197,6 +3197,441 @@
3197
  "EMA_steps_per_second": 25.834,
3198
  "epoch": 143.47826086956522,
3199
  "step": 3300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3200
  }
3201
  ],
3202
  "logging_steps": 10,
@@ -3216,7 +3651,7 @@
3216
  "attributes": {}
3217
  }
3218
  },
3219
- "total_flos": 8.51203574828974e+16,
3220
  "train_batch_size": 4,
3221
  "trial_name": null,
3222
  "trial_params": null
 
1
  {
2
  "best_metric": 0.7166205048561096,
3
  "best_model_checkpoint": "./output/checkpoint-450",
4
+ "epoch": 163.04347826086956,
5
  "eval_steps": 150,
6
+ "global_step": 3750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3197
  "EMA_steps_per_second": 25.834,
3198
  "epoch": 143.47826086956522,
3199
  "step": 3300
3200
+ },
3201
+ {
3202
+ "epoch": 143.91304347826087,
3203
+ "grad_norm": 1.9049878120422363,
3204
+ "learning_rate": 3.9382995689756636e-06,
3205
+ "loss": 0.2537,
3206
+ "step": 3310
3207
+ },
3208
+ {
3209
+ "epoch": 144.34782608695653,
3210
+ "grad_norm": 1.5125168561935425,
3211
+ "learning_rate": 3.9382593694923146e-06,
3212
+ "loss": 0.2142,
3213
+ "step": 3320
3214
+ },
3215
+ {
3216
+ "epoch": 144.7826086956522,
3217
+ "grad_norm": 1.737127661705017,
3218
+ "learning_rate": 3.938218389718042e-06,
3219
+ "loss": 0.2706,
3220
+ "step": 3330
3221
+ },
3222
+ {
3223
+ "epoch": 145.2173913043478,
3224
+ "grad_norm": 2.886361837387085,
3225
+ "learning_rate": 3.938176629669088e-06,
3226
+ "loss": 0.2079,
3227
+ "step": 3340
3228
+ },
3229
+ {
3230
+ "epoch": 145.65217391304347,
3231
+ "grad_norm": 1.8378046751022339,
3232
+ "learning_rate": 3.938134089362005e-06,
3233
+ "loss": 0.2378,
3234
+ "step": 3350
3235
+ },
3236
+ {
3237
+ "epoch": 146.08695652173913,
3238
+ "grad_norm": 1.9865158796310425,
3239
+ "learning_rate": 3.938090768813655e-06,
3240
+ "loss": 0.2649,
3241
+ "step": 3360
3242
+ },
3243
+ {
3244
+ "epoch": 146.52173913043478,
3245
+ "grad_norm": 1.9572851657867432,
3246
+ "learning_rate": 3.938046668041207e-06,
3247
+ "loss": 0.2268,
3248
+ "step": 3370
3249
+ },
3250
+ {
3251
+ "epoch": 146.95652173913044,
3252
+ "grad_norm": 1.6120030879974365,
3253
+ "learning_rate": 3.9380017870621435e-06,
3254
+ "loss": 0.2154,
3255
+ "step": 3380
3256
+ },
3257
+ {
3258
+ "epoch": 147.3913043478261,
3259
+ "grad_norm": 1.6852221488952637,
3260
+ "learning_rate": 3.9379561258942536e-06,
3261
+ "loss": 0.2284,
3262
+ "step": 3390
3263
+ },
3264
+ {
3265
+ "epoch": 147.82608695652175,
3266
+ "grad_norm": 1.568108081817627,
3267
+ "learning_rate": 3.937909684555634e-06,
3268
+ "loss": 0.2535,
3269
+ "step": 3400
3270
+ },
3271
+ {
3272
+ "epoch": 148.2608695652174,
3273
+ "grad_norm": 1.4495244026184082,
3274
+ "learning_rate": 3.937862463064695e-06,
3275
+ "loss": 0.2152,
3276
+ "step": 3410
3277
+ },
3278
+ {
3279
+ "epoch": 148.69565217391303,
3280
+ "grad_norm": 1.8378851413726807,
3281
+ "learning_rate": 3.937814461440151e-06,
3282
+ "loss": 0.2494,
3283
+ "step": 3420
3284
+ },
3285
+ {
3286
+ "epoch": 149.1304347826087,
3287
+ "grad_norm": 1.866101622581482,
3288
+ "learning_rate": 3.937765679701031e-06,
3289
+ "loss": 0.2711,
3290
+ "step": 3430
3291
+ },
3292
+ {
3293
+ "epoch": 149.56521739130434,
3294
+ "grad_norm": 2.2176806926727295,
3295
+ "learning_rate": 3.937716117866669e-06,
3296
+ "loss": 0.2648,
3297
+ "step": 3440
3298
+ },
3299
+ {
3300
+ "epoch": 150.0,
3301
+ "grad_norm": 3.262206792831421,
3302
+ "learning_rate": 3.93766577595671e-06,
3303
+ "loss": 0.2203,
3304
+ "step": 3450
3305
+ },
3306
+ {
3307
+ "epoch": 150.0,
3308
+ "eval_loss": 0.9511697888374329,
3309
+ "eval_runtime": 0.4583,
3310
+ "eval_samples_per_second": 21.821,
3311
+ "eval_steps_per_second": 21.821,
3312
+ "step": 3450
3313
+ },
3314
+ {
3315
+ "Start_State_loss": 0.8609819412231445,
3316
+ "Start_State_runtime": 0.5187,
3317
+ "Start_State_samples_per_second": 19.279,
3318
+ "Start_State_steps_per_second": 19.279,
3319
+ "epoch": 150.0,
3320
+ "step": 3450
3321
+ },
3322
+ {
3323
+ "Raw_Model_loss": 0.9511697888374329,
3324
+ "Raw_Model_runtime": 0.5075,
3325
+ "Raw_Model_samples_per_second": 19.705,
3326
+ "Raw_Model_steps_per_second": 19.705,
3327
+ "epoch": 150.0,
3328
+ "step": 3450
3329
+ },
3330
+ {
3331
+ "SWA_loss": 0.7842515707015991,
3332
+ "SWA_runtime": 0.4974,
3333
+ "SWA_samples_per_second": 20.104,
3334
+ "SWA_steps_per_second": 20.104,
3335
+ "epoch": 150.0,
3336
+ "step": 3450
3337
+ },
3338
+ {
3339
+ "EMA_loss": 0.8596795201301575,
3340
+ "EMA_runtime": 0.437,
3341
+ "EMA_samples_per_second": 22.881,
3342
+ "EMA_steps_per_second": 22.881,
3343
+ "epoch": 150.0,
3344
+ "step": 3450
3345
+ },
3346
+ {
3347
+ "epoch": 150.43478260869566,
3348
+ "grad_norm": 1.7477214336395264,
3349
+ "learning_rate": 2.5260336320414934e-07,
3350
+ "loss": 0.2137,
3351
+ "step": 3460
3352
+ },
3353
+ {
3354
+ "epoch": 150.8695652173913,
3355
+ "grad_norm": 1.9981499910354614,
3356
+ "learning_rate": 5.052067264082987e-07,
3357
+ "loss": 0.262,
3358
+ "step": 3470
3359
+ },
3360
+ {
3361
+ "epoch": 151.30434782608697,
3362
+ "grad_norm": 1.6229016780853271,
3363
+ "learning_rate": 7.57810089612448e-07,
3364
+ "loss": 0.1996,
3365
+ "step": 3480
3366
+ },
3367
+ {
3368
+ "epoch": 151.7391304347826,
3369
+ "grad_norm": 2.360182046890259,
3370
+ "learning_rate": 1.0104134528165973e-06,
3371
+ "loss": 0.2474,
3372
+ "step": 3490
3373
+ },
3374
+ {
3375
+ "epoch": 152.17391304347825,
3376
+ "grad_norm": 2.097730875015259,
3377
+ "learning_rate": 1.2630168160207466e-06,
3378
+ "loss": 0.2421,
3379
+ "step": 3500
3380
+ },
3381
+ {
3382
+ "epoch": 152.6086956521739,
3383
+ "grad_norm": 1.616011381149292,
3384
+ "learning_rate": 1.515620179224896e-06,
3385
+ "loss": 0.2398,
3386
+ "step": 3510
3387
+ },
3388
+ {
3389
+ "epoch": 153.04347826086956,
3390
+ "grad_norm": 1.5673476457595825,
3391
+ "learning_rate": 1.7682235424290452e-06,
3392
+ "loss": 0.2065,
3393
+ "step": 3520
3394
+ },
3395
+ {
3396
+ "epoch": 153.47826086956522,
3397
+ "grad_norm": 2.3053834438323975,
3398
+ "learning_rate": 2.0208269056331947e-06,
3399
+ "loss": 0.2502,
3400
+ "step": 3530
3401
+ },
3402
+ {
3403
+ "epoch": 153.91304347826087,
3404
+ "grad_norm": 2.665015697479248,
3405
+ "learning_rate": 2.273430268837344e-06,
3406
+ "loss": 0.2317,
3407
+ "step": 3540
3408
+ },
3409
+ {
3410
+ "epoch": 154.34782608695653,
3411
+ "grad_norm": 2.2935352325439453,
3412
+ "learning_rate": 2.5260336320414932e-06,
3413
+ "loss": 0.2402,
3414
+ "step": 3550
3415
+ },
3416
+ {
3417
+ "epoch": 154.7826086956522,
3418
+ "grad_norm": 2.005519151687622,
3419
+ "learning_rate": 2.5260333817317373e-06,
3420
+ "loss": 0.2341,
3421
+ "step": 3560
3422
+ },
3423
+ {
3424
+ "epoch": 155.2173913043478,
3425
+ "grad_norm": 1.6518237590789795,
3426
+ "learning_rate": 2.5260326308025684e-06,
3427
+ "loss": 0.1959,
3428
+ "step": 3570
3429
+ },
3430
+ {
3431
+ "epoch": 155.65217391304347,
3432
+ "grad_norm": 2.093646287918091,
3433
+ "learning_rate": 2.526031379254284e-06,
3434
+ "loss": 0.2666,
3435
+ "step": 3580
3436
+ },
3437
+ {
3438
+ "epoch": 156.08695652173913,
3439
+ "grad_norm": 1.6480534076690674,
3440
+ "learning_rate": 2.5260296270873804e-06,
3441
+ "loss": 0.203,
3442
+ "step": 3590
3443
+ },
3444
+ {
3445
+ "epoch": 156.52173913043478,
3446
+ "grad_norm": 2.494234323501587,
3447
+ "learning_rate": 2.5260273743025526e-06,
3448
+ "loss": 0.2677,
3449
+ "step": 3600
3450
+ },
3451
+ {
3452
+ "epoch": 156.52173913043478,
3453
+ "eval_loss": 0.9589301347732544,
3454
+ "eval_runtime": 0.5211,
3455
+ "eval_samples_per_second": 19.192,
3456
+ "eval_steps_per_second": 19.192,
3457
+ "step": 3600
3458
+ },
3459
+ {
3460
+ "Start_State_loss": 0.8609819412231445,
3461
+ "Start_State_runtime": 0.5189,
3462
+ "Start_State_samples_per_second": 19.273,
3463
+ "Start_State_steps_per_second": 19.273,
3464
+ "epoch": 156.52173913043478,
3465
+ "step": 3600
3466
+ },
3467
+ {
3468
+ "Raw_Model_loss": 0.9589301347732544,
3469
+ "Raw_Model_runtime": 0.4778,
3470
+ "Raw_Model_samples_per_second": 20.928,
3471
+ "Raw_Model_steps_per_second": 20.928,
3472
+ "epoch": 156.52173913043478,
3473
+ "step": 3600
3474
+ },
3475
+ {
3476
+ "SWA_loss": 0.7895848155021667,
3477
+ "SWA_runtime": 0.4495,
3478
+ "SWA_samples_per_second": 22.249,
3479
+ "SWA_steps_per_second": 22.249,
3480
+ "epoch": 156.52173913043478,
3481
+ "step": 3600
3482
+ },
3483
+ {
3484
+ "EMA_loss": 0.8595443964004517,
3485
+ "EMA_runtime": 0.4523,
3486
+ "EMA_samples_per_second": 22.111,
3487
+ "EMA_steps_per_second": 22.111,
3488
+ "epoch": 156.52173913043478,
3489
+ "step": 3600
3490
+ },
3491
+ {
3492
+ "epoch": 156.95652173913044,
3493
+ "grad_norm": 2.5807197093963623,
3494
+ "learning_rate": 2.526024620900692e-06,
3495
+ "loss": 0.1972,
3496
+ "step": 3610
3497
+ },
3498
+ {
3499
+ "epoch": 157.3913043478261,
3500
+ "grad_norm": 1.868238091468811,
3501
+ "learning_rate": 2.526021366882892e-06,
3502
+ "loss": 0.2551,
3503
+ "step": 3620
3504
+ },
3505
+ {
3506
+ "epoch": 157.82608695652175,
3507
+ "grad_norm": 1.9588141441345215,
3508
+ "learning_rate": 2.526017612250441e-06,
3509
+ "loss": 0.2346,
3510
+ "step": 3630
3511
+ },
3512
+ {
3513
+ "epoch": 158.2608695652174,
3514
+ "grad_norm": 2.2502245903015137,
3515
+ "learning_rate": 2.5260133570048273e-06,
3516
+ "loss": 0.2246,
3517
+ "step": 3640
3518
+ },
3519
+ {
3520
+ "epoch": 158.69565217391303,
3521
+ "grad_norm": 3.502547025680542,
3522
+ "learning_rate": 2.526008601147738e-06,
3523
+ "loss": 0.2274,
3524
+ "step": 3650
3525
+ },
3526
+ {
3527
+ "epoch": 159.1304347826087,
3528
+ "grad_norm": 2.578259229660034,
3529
+ "learning_rate": 2.526003344681058e-06,
3530
+ "loss": 0.2527,
3531
+ "step": 3660
3532
+ },
3533
+ {
3534
+ "epoch": 159.56521739130434,
3535
+ "grad_norm": 1.8786590099334717,
3536
+ "learning_rate": 2.5259975876068714e-06,
3537
+ "loss": 0.2368,
3538
+ "step": 3670
3539
+ },
3540
+ {
3541
+ "epoch": 160.0,
3542
+ "grad_norm": 2.6274077892303467,
3543
+ "learning_rate": 2.525991329927459e-06,
3544
+ "loss": 0.2028,
3545
+ "step": 3680
3546
+ },
3547
+ {
3548
+ "epoch": 160.43478260869566,
3549
+ "grad_norm": 2.002021551132202,
3550
+ "learning_rate": 2.5259845716453015e-06,
3551
+ "loss": 0.2279,
3552
+ "step": 3690
3553
+ },
3554
+ {
3555
+ "epoch": 160.8695652173913,
3556
+ "grad_norm": 1.5384571552276611,
3557
+ "learning_rate": 2.525977312763078e-06,
3558
+ "loss": 0.2436,
3559
+ "step": 3700
3560
+ },
3561
+ {
3562
+ "epoch": 161.30434782608697,
3563
+ "grad_norm": 1.5973771810531616,
3564
+ "learning_rate": 2.5259695532836654e-06,
3565
+ "loss": 0.2762,
3566
+ "step": 3710
3567
+ },
3568
+ {
3569
+ "epoch": 161.7391304347826,
3570
+ "grad_norm": 2.0545079708099365,
3571
+ "learning_rate": 2.52596129321014e-06,
3572
+ "loss": 0.2278,
3573
+ "step": 3720
3574
+ },
3575
+ {
3576
+ "epoch": 162.17391304347825,
3577
+ "grad_norm": 2.2520041465759277,
3578
+ "learning_rate": 2.525952532545775e-06,
3579
+ "loss": 0.2153,
3580
+ "step": 3730
3581
+ },
3582
+ {
3583
+ "epoch": 162.6086956521739,
3584
+ "grad_norm": 2.081439971923828,
3585
+ "learning_rate": 2.5259432712940426e-06,
3586
+ "loss": 0.2231,
3587
+ "step": 3740
3588
+ },
3589
+ {
3590
+ "epoch": 163.04347826086956,
3591
+ "grad_norm": 2.37165904045105,
3592
+ "learning_rate": 2.5259335094586143e-06,
3593
+ "loss": 0.2235,
3594
+ "step": 3750
3595
+ },
3596
+ {
3597
+ "epoch": 163.04347826086956,
3598
+ "eval_loss": 0.9614953994750977,
3599
+ "eval_runtime": 0.407,
3600
+ "eval_samples_per_second": 24.568,
3601
+ "eval_steps_per_second": 24.568,
3602
+ "step": 3750
3603
+ },
3604
+ {
3605
+ "Start_State_loss": 0.8609819412231445,
3606
+ "Start_State_runtime": 0.3898,
3607
+ "Start_State_samples_per_second": 25.657,
3608
+ "Start_State_steps_per_second": 25.657,
3609
+ "epoch": 163.04347826086956,
3610
+ "step": 3750
3611
+ },
3612
+ {
3613
+ "Raw_Model_loss": 0.9614953994750977,
3614
+ "Raw_Model_runtime": 0.386,
3615
+ "Raw_Model_samples_per_second": 25.905,
3616
+ "Raw_Model_steps_per_second": 25.905,
3617
+ "epoch": 163.04347826086956,
3618
+ "step": 3750
3619
+ },
3620
+ {
3621
+ "SWA_loss": 0.7928785681724548,
3622
+ "SWA_runtime": 0.3893,
3623
+ "SWA_samples_per_second": 25.686,
3624
+ "SWA_steps_per_second": 25.686,
3625
+ "epoch": 163.04347826086956,
3626
+ "step": 3750
3627
+ },
3628
+ {
3629
+ "EMA_loss": 0.860231876373291,
3630
+ "EMA_runtime": 0.386,
3631
+ "EMA_samples_per_second": 25.904,
3632
+ "EMA_steps_per_second": 25.904,
3633
+ "epoch": 163.04347826086956,
3634
+ "step": 3750
3635
  }
3636
  ],
3637
  "logging_steps": 10,
 
3651
  "attributes": {}
3652
  }
3653
  },
3654
+ "total_flos": 9.668631592798618e+16,
3655
  "train_batch_size": 4,
3656
  "trial_name": null,
3657
  "trial_params": null