ErrorAI commited on
Commit
9d6fbdd
·
verified ·
1 Parent(s): 722ae8f

Training in progress, step 699, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a4fb523f7244c95da5d44515f16148f790310d022275e74bc8e40e31849b204
3
  size 14293800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b098c267df4058757e013da2353ed0e41ef6d403ed82445ed658dcf91b1ba7d
3
  size 14293800
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea830d65530a748ac8f8cc947fc57b973a9776d368f8ef2b2c54bb1876a1de3e
3
  size 7580068
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e04bd14d7c7ad63b28c7159a22b510d6fc239bb5d11f324e22a1faef3024d006
3
  size 7580068
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41245d7ca8c865be9f4e326efb64a9b95a0073396d253aa02ba350e15d2d8d1f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a9390150a36562e39e016d5aad5d61ae4c54518262bb0bae25d0421c3afecd
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1f99997eb39bd4d882b125dd5964b45544bb34241c145b76b123c83b346e86f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:876db3afeec1a47b9cd23534bb10590b976ccab59f5486942f107d4e7111af82
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5009406073636119,
5
  "eval_steps": 233,
6
- "global_step": 466,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3293,6 +3293,1645 @@
3293
  "eval_samples_per_second": 70.571,
3294
  "eval_steps_per_second": 35.286,
3295
  "step": 466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3296
  }
3297
  ],
3298
  "logging_steps": 1,
@@ -3312,7 +4951,7 @@
3312
  "attributes": {}
3313
  }
3314
  },
3315
- "total_flos": 3514992376676352.0,
3316
  "train_batch_size": 2,
3317
  "trial_name": null,
3318
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.7514109110454179,
5
  "eval_steps": 233,
6
+ "global_step": 699,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3293
  "eval_samples_per_second": 70.571,
3294
  "eval_steps_per_second": 35.286,
3295
  "step": 466
3296
+ },
3297
+ {
3298
+ "epoch": 0.5020155872077399,
3299
+ "grad_norm": 14.241241455078125,
3300
+ "learning_rate": 0.0001011938450687699,
3301
+ "loss": 2.6351,
3302
+ "step": 467
3303
+ },
3304
+ {
3305
+ "epoch": 0.5030905670518678,
3306
+ "grad_norm": 15.222787857055664,
3307
+ "learning_rate": 0.00010085275639984904,
3308
+ "loss": 3.6475,
3309
+ "step": 468
3310
+ },
3311
+ {
3312
+ "epoch": 0.5041655468959957,
3313
+ "grad_norm": 12.280945777893066,
3314
+ "learning_rate": 0.00010051165780879504,
3315
+ "loss": 2.5128,
3316
+ "step": 469
3317
+ },
3318
+ {
3319
+ "epoch": 0.5052405267401237,
3320
+ "grad_norm": 14.948744773864746,
3321
+ "learning_rate": 0.00010017055326441494,
3322
+ "loss": 2.9114,
3323
+ "step": 470
3324
+ },
3325
+ {
3326
+ "epoch": 0.5063155065842515,
3327
+ "grad_norm": 17.295438766479492,
3328
+ "learning_rate": 9.982944673558508e-05,
3329
+ "loss": 3.4193,
3330
+ "step": 471
3331
+ },
3332
+ {
3333
+ "epoch": 0.5073904864283795,
3334
+ "grad_norm": 18.69068145751953,
3335
+ "learning_rate": 9.9488342191205e-05,
3336
+ "loss": 4.0637,
3337
+ "step": 472
3338
+ },
3339
+ {
3340
+ "epoch": 0.5084654662725074,
3341
+ "grad_norm": 17.060588836669922,
3342
+ "learning_rate": 9.914724360015099e-05,
3343
+ "loss": 3.1746,
3344
+ "step": 473
3345
+ },
3346
+ {
3347
+ "epoch": 0.5095404461166353,
3348
+ "grad_norm": 13.83548641204834,
3349
+ "learning_rate": 9.880615493123012e-05,
3350
+ "loss": 2.757,
3351
+ "step": 474
3352
+ },
3353
+ {
3354
+ "epoch": 0.5106154259607633,
3355
+ "grad_norm": 12.663851737976074,
3356
+ "learning_rate": 9.846508015313408e-05,
3357
+ "loss": 2.7217,
3358
+ "step": 475
3359
+ },
3360
+ {
3361
+ "epoch": 0.5116904058048911,
3362
+ "grad_norm": 14.959012985229492,
3363
+ "learning_rate": 9.812402323439284e-05,
3364
+ "loss": 3.3345,
3365
+ "step": 476
3366
+ },
3367
+ {
3368
+ "epoch": 0.5127653856490191,
3369
+ "grad_norm": 14.708747863769531,
3370
+ "learning_rate": 9.778298814332863e-05,
3371
+ "loss": 2.8671,
3372
+ "step": 477
3373
+ },
3374
+ {
3375
+ "epoch": 0.513840365493147,
3376
+ "grad_norm": 10.635260581970215,
3377
+ "learning_rate": 9.744197884800969e-05,
3378
+ "loss": 2.6943,
3379
+ "step": 478
3380
+ },
3381
+ {
3382
+ "epoch": 0.5149153453372749,
3383
+ "grad_norm": 14.44356632232666,
3384
+ "learning_rate": 9.710099931620408e-05,
3385
+ "loss": 2.1394,
3386
+ "step": 479
3387
+ },
3388
+ {
3389
+ "epoch": 0.5159903251814029,
3390
+ "grad_norm": 9.918120384216309,
3391
+ "learning_rate": 9.676005351533366e-05,
3392
+ "loss": 2.2011,
3393
+ "step": 480
3394
+ },
3395
+ {
3396
+ "epoch": 0.5170653050255307,
3397
+ "grad_norm": 17.1123046875,
3398
+ "learning_rate": 9.64191454124277e-05,
3399
+ "loss": 3.2862,
3400
+ "step": 481
3401
+ },
3402
+ {
3403
+ "epoch": 0.5181402848696587,
3404
+ "grad_norm": 13.601349830627441,
3405
+ "learning_rate": 9.60782789740769e-05,
3406
+ "loss": 2.3652,
3407
+ "step": 482
3408
+ },
3409
+ {
3410
+ "epoch": 0.5192152647137867,
3411
+ "grad_norm": 16.888429641723633,
3412
+ "learning_rate": 9.573745816638716e-05,
3413
+ "loss": 2.7698,
3414
+ "step": 483
3415
+ },
3416
+ {
3417
+ "epoch": 0.5202902445579145,
3418
+ "grad_norm": 16.15688705444336,
3419
+ "learning_rate": 9.539668695493344e-05,
3420
+ "loss": 2.6367,
3421
+ "step": 484
3422
+ },
3423
+ {
3424
+ "epoch": 0.5213652244020425,
3425
+ "grad_norm": 16.414520263671875,
3426
+ "learning_rate": 9.505596930471367e-05,
3427
+ "loss": 3.9243,
3428
+ "step": 485
3429
+ },
3430
+ {
3431
+ "epoch": 0.5224402042461704,
3432
+ "grad_norm": 13.967657089233398,
3433
+ "learning_rate": 9.471530918010253e-05,
3434
+ "loss": 3.1243,
3435
+ "step": 486
3436
+ },
3437
+ {
3438
+ "epoch": 0.5235151840902983,
3439
+ "grad_norm": 15.140684127807617,
3440
+ "learning_rate": 9.43747105448054e-05,
3441
+ "loss": 2.659,
3442
+ "step": 487
3443
+ },
3444
+ {
3445
+ "epoch": 0.5245901639344263,
3446
+ "grad_norm": 13.074856758117676,
3447
+ "learning_rate": 9.40341773618122e-05,
3448
+ "loss": 3.3224,
3449
+ "step": 488
3450
+ },
3451
+ {
3452
+ "epoch": 0.5256651437785541,
3453
+ "grad_norm": 15.123608589172363,
3454
+ "learning_rate": 9.369371359335128e-05,
3455
+ "loss": 3.385,
3456
+ "step": 489
3457
+ },
3458
+ {
3459
+ "epoch": 0.5267401236226821,
3460
+ "grad_norm": 15.648529052734375,
3461
+ "learning_rate": 9.335332320084331e-05,
3462
+ "loss": 2.8329,
3463
+ "step": 490
3464
+ },
3465
+ {
3466
+ "epoch": 0.52781510346681,
3467
+ "grad_norm": 15.20040225982666,
3468
+ "learning_rate": 9.301301014485528e-05,
3469
+ "loss": 3.5456,
3470
+ "step": 491
3471
+ },
3472
+ {
3473
+ "epoch": 0.5288900833109379,
3474
+ "grad_norm": 23.638113021850586,
3475
+ "learning_rate": 9.267277838505423e-05,
3476
+ "loss": 4.8434,
3477
+ "step": 492
3478
+ },
3479
+ {
3480
+ "epoch": 0.5299650631550659,
3481
+ "grad_norm": 11.857388496398926,
3482
+ "learning_rate": 9.233263188016138e-05,
3483
+ "loss": 2.2761,
3484
+ "step": 493
3485
+ },
3486
+ {
3487
+ "epoch": 0.5310400429991937,
3488
+ "grad_norm": 12.123178482055664,
3489
+ "learning_rate": 9.199257458790591e-05,
3490
+ "loss": 3.0025,
3491
+ "step": 494
3492
+ },
3493
+ {
3494
+ "epoch": 0.5321150228433217,
3495
+ "grad_norm": 11.024534225463867,
3496
+ "learning_rate": 9.165261046497907e-05,
3497
+ "loss": 2.265,
3498
+ "step": 495
3499
+ },
3500
+ {
3501
+ "epoch": 0.5331900026874496,
3502
+ "grad_norm": 16.32103157043457,
3503
+ "learning_rate": 9.131274346698796e-05,
3504
+ "loss": 3.5393,
3505
+ "step": 496
3506
+ },
3507
+ {
3508
+ "epoch": 0.5342649825315775,
3509
+ "grad_norm": 25.771560668945312,
3510
+ "learning_rate": 9.097297754840962e-05,
3511
+ "loss": 3.8375,
3512
+ "step": 497
3513
+ },
3514
+ {
3515
+ "epoch": 0.5353399623757055,
3516
+ "grad_norm": 12.820847511291504,
3517
+ "learning_rate": 9.063331666254503e-05,
3518
+ "loss": 2.6361,
3519
+ "step": 498
3520
+ },
3521
+ {
3522
+ "epoch": 0.5364149422198333,
3523
+ "grad_norm": 12.816265106201172,
3524
+ "learning_rate": 9.029376476147302e-05,
3525
+ "loss": 2.2486,
3526
+ "step": 499
3527
+ },
3528
+ {
3529
+ "epoch": 0.5374899220639613,
3530
+ "grad_norm": 9.368247032165527,
3531
+ "learning_rate": 8.995432579600439e-05,
3532
+ "loss": 2.5467,
3533
+ "step": 500
3534
+ },
3535
+ {
3536
+ "epoch": 0.5385649019080893,
3537
+ "grad_norm": 16.314271926879883,
3538
+ "learning_rate": 8.961500371563585e-05,
3539
+ "loss": 3.1917,
3540
+ "step": 501
3541
+ },
3542
+ {
3543
+ "epoch": 0.5396398817522171,
3544
+ "grad_norm": 17.877838134765625,
3545
+ "learning_rate": 8.927580246850418e-05,
3546
+ "loss": 3.636,
3547
+ "step": 502
3548
+ },
3549
+ {
3550
+ "epoch": 0.5407148615963451,
3551
+ "grad_norm": 15.371697425842285,
3552
+ "learning_rate": 8.893672600134013e-05,
3553
+ "loss": 4.3843,
3554
+ "step": 503
3555
+ },
3556
+ {
3557
+ "epoch": 0.541789841440473,
3558
+ "grad_norm": 14.944757461547852,
3559
+ "learning_rate": 8.859777825942267e-05,
3560
+ "loss": 2.2189,
3561
+ "step": 504
3562
+ },
3563
+ {
3564
+ "epoch": 0.5428648212846009,
3565
+ "grad_norm": 17.238252639770508,
3566
+ "learning_rate": 8.825896318653293e-05,
3567
+ "loss": 2.525,
3568
+ "step": 505
3569
+ },
3570
+ {
3571
+ "epoch": 0.5439398011287289,
3572
+ "grad_norm": 14.758814811706543,
3573
+ "learning_rate": 8.792028472490844e-05,
3574
+ "loss": 3.1758,
3575
+ "step": 506
3576
+ },
3577
+ {
3578
+ "epoch": 0.5450147809728567,
3579
+ "grad_norm": 9.887633323669434,
3580
+ "learning_rate": 8.758174681519721e-05,
3581
+ "loss": 2.2908,
3582
+ "step": 507
3583
+ },
3584
+ {
3585
+ "epoch": 0.5460897608169847,
3586
+ "grad_norm": 13.68622875213623,
3587
+ "learning_rate": 8.724335339641184e-05,
3588
+ "loss": 2.105,
3589
+ "step": 508
3590
+ },
3591
+ {
3592
+ "epoch": 0.5471647406611126,
3593
+ "grad_norm": 12.679695129394531,
3594
+ "learning_rate": 8.690510840588373e-05,
3595
+ "loss": 2.1756,
3596
+ "step": 509
3597
+ },
3598
+ {
3599
+ "epoch": 0.5482397205052405,
3600
+ "grad_norm": 14.024535179138184,
3601
+ "learning_rate": 8.656701577921732e-05,
3602
+ "loss": 3.0431,
3603
+ "step": 510
3604
+ },
3605
+ {
3606
+ "epoch": 0.5493147003493685,
3607
+ "grad_norm": 12.965935707092285,
3608
+ "learning_rate": 8.622907945024417e-05,
3609
+ "loss": 2.1099,
3610
+ "step": 511
3611
+ },
3612
+ {
3613
+ "epoch": 0.5503896801934963,
3614
+ "grad_norm": 19.419710159301758,
3615
+ "learning_rate": 8.589130335097732e-05,
3616
+ "loss": 3.3639,
3617
+ "step": 512
3618
+ },
3619
+ {
3620
+ "epoch": 0.5514646600376243,
3621
+ "grad_norm": 18.27731704711914,
3622
+ "learning_rate": 8.55536914115654e-05,
3623
+ "loss": 3.5416,
3624
+ "step": 513
3625
+ },
3626
+ {
3627
+ "epoch": 0.5525396398817523,
3628
+ "grad_norm": 16.81820297241211,
3629
+ "learning_rate": 8.521624756024705e-05,
3630
+ "loss": 3.8419,
3631
+ "step": 514
3632
+ },
3633
+ {
3634
+ "epoch": 0.5536146197258801,
3635
+ "grad_norm": 14.465489387512207,
3636
+ "learning_rate": 8.487897572330513e-05,
3637
+ "loss": 2.3487,
3638
+ "step": 515
3639
+ },
3640
+ {
3641
+ "epoch": 0.5546895995700081,
3642
+ "grad_norm": 11.499032974243164,
3643
+ "learning_rate": 8.454187982502101e-05,
3644
+ "loss": 2.6283,
3645
+ "step": 516
3646
+ },
3647
+ {
3648
+ "epoch": 0.555764579414136,
3649
+ "grad_norm": 21.029111862182617,
3650
+ "learning_rate": 8.4204963787629e-05,
3651
+ "loss": 4.7078,
3652
+ "step": 517
3653
+ },
3654
+ {
3655
+ "epoch": 0.5568395592582639,
3656
+ "grad_norm": 18.131210327148438,
3657
+ "learning_rate": 8.386823153127064e-05,
3658
+ "loss": 3.6223,
3659
+ "step": 518
3660
+ },
3661
+ {
3662
+ "epoch": 0.5579145391023919,
3663
+ "grad_norm": 15.802128791809082,
3664
+ "learning_rate": 8.353168697394913e-05,
3665
+ "loss": 2.6126,
3666
+ "step": 519
3667
+ },
3668
+ {
3669
+ "epoch": 0.5589895189465197,
3670
+ "grad_norm": 16.3378849029541,
3671
+ "learning_rate": 8.319533403148367e-05,
3672
+ "loss": 2.8075,
3673
+ "step": 520
3674
+ },
3675
+ {
3676
+ "epoch": 0.5600644987906477,
3677
+ "grad_norm": 13.895936012268066,
3678
+ "learning_rate": 8.285917661746401e-05,
3679
+ "loss": 2.7503,
3680
+ "step": 521
3681
+ },
3682
+ {
3683
+ "epoch": 0.5611394786347756,
3684
+ "grad_norm": 13.693531036376953,
3685
+ "learning_rate": 8.25232186432048e-05,
3686
+ "loss": 2.9142,
3687
+ "step": 522
3688
+ },
3689
+ {
3690
+ "epoch": 0.5622144584789035,
3691
+ "grad_norm": 14.982376098632812,
3692
+ "learning_rate": 8.218746401770022e-05,
3693
+ "loss": 3.0101,
3694
+ "step": 523
3695
+ },
3696
+ {
3697
+ "epoch": 0.5632894383230315,
3698
+ "grad_norm": 18.6697998046875,
3699
+ "learning_rate": 8.185191664757828e-05,
3700
+ "loss": 3.6426,
3701
+ "step": 524
3702
+ },
3703
+ {
3704
+ "epoch": 0.5643644181671593,
3705
+ "grad_norm": 12.057175636291504,
3706
+ "learning_rate": 8.151658043705565e-05,
3707
+ "loss": 2.9482,
3708
+ "step": 525
3709
+ },
3710
+ {
3711
+ "epoch": 0.5654393980112873,
3712
+ "grad_norm": 14.079970359802246,
3713
+ "learning_rate": 8.118145928789199e-05,
3714
+ "loss": 3.1769,
3715
+ "step": 526
3716
+ },
3717
+ {
3718
+ "epoch": 0.5665143778554153,
3719
+ "grad_norm": 12.814187049865723,
3720
+ "learning_rate": 8.084655709934462e-05,
3721
+ "loss": 2.472,
3722
+ "step": 527
3723
+ },
3724
+ {
3725
+ "epoch": 0.5675893576995431,
3726
+ "grad_norm": 10.802642822265625,
3727
+ "learning_rate": 8.051187776812326e-05,
3728
+ "loss": 2.0466,
3729
+ "step": 528
3730
+ },
3731
+ {
3732
+ "epoch": 0.5686643375436711,
3733
+ "grad_norm": 12.31850528717041,
3734
+ "learning_rate": 8.017742518834454e-05,
3735
+ "loss": 2.4457,
3736
+ "step": 529
3737
+ },
3738
+ {
3739
+ "epoch": 0.5697393173877989,
3740
+ "grad_norm": 13.78878116607666,
3741
+ "learning_rate": 7.984320325148675e-05,
3742
+ "loss": 2.6326,
3743
+ "step": 530
3744
+ },
3745
+ {
3746
+ "epoch": 0.5708142972319269,
3747
+ "grad_norm": 9.785225868225098,
3748
+ "learning_rate": 7.950921584634461e-05,
3749
+ "loss": 2.8243,
3750
+ "step": 531
3751
+ },
3752
+ {
3753
+ "epoch": 0.5718892770760549,
3754
+ "grad_norm": 16.127605438232422,
3755
+ "learning_rate": 7.917546685898391e-05,
3756
+ "loss": 3.5011,
3757
+ "step": 532
3758
+ },
3759
+ {
3760
+ "epoch": 0.5729642569201827,
3761
+ "grad_norm": 20.46214485168457,
3762
+ "learning_rate": 7.884196017269648e-05,
3763
+ "loss": 2.5311,
3764
+ "step": 533
3765
+ },
3766
+ {
3767
+ "epoch": 0.5740392367643107,
3768
+ "grad_norm": 13.586955070495605,
3769
+ "learning_rate": 7.850869966795476e-05,
3770
+ "loss": 2.8393,
3771
+ "step": 534
3772
+ },
3773
+ {
3774
+ "epoch": 0.5751142166084386,
3775
+ "grad_norm": 16.564584732055664,
3776
+ "learning_rate": 7.817568922236682e-05,
3777
+ "loss": 2.3696,
3778
+ "step": 535
3779
+ },
3780
+ {
3781
+ "epoch": 0.5761891964525665,
3782
+ "grad_norm": 21.446279525756836,
3783
+ "learning_rate": 7.784293271063124e-05,
3784
+ "loss": 4.4285,
3785
+ "step": 536
3786
+ },
3787
+ {
3788
+ "epoch": 0.5772641762966945,
3789
+ "grad_norm": 18.923242568969727,
3790
+ "learning_rate": 7.751043400449197e-05,
3791
+ "loss": 3.2939,
3792
+ "step": 537
3793
+ },
3794
+ {
3795
+ "epoch": 0.5783391561408223,
3796
+ "grad_norm": 16.000579833984375,
3797
+ "learning_rate": 7.717819697269321e-05,
3798
+ "loss": 3.8915,
3799
+ "step": 538
3800
+ },
3801
+ {
3802
+ "epoch": 0.5794141359849503,
3803
+ "grad_norm": 11.695368766784668,
3804
+ "learning_rate": 7.684622548093461e-05,
3805
+ "loss": 2.5856,
3806
+ "step": 539
3807
+ },
3808
+ {
3809
+ "epoch": 0.5804891158290783,
3810
+ "grad_norm": 15.072840690612793,
3811
+ "learning_rate": 7.651452339182613e-05,
3812
+ "loss": 2.8462,
3813
+ "step": 540
3814
+ },
3815
+ {
3816
+ "epoch": 0.5815640956732061,
3817
+ "grad_norm": 18.407136917114258,
3818
+ "learning_rate": 7.618309456484308e-05,
3819
+ "loss": 2.5811,
3820
+ "step": 541
3821
+ },
3822
+ {
3823
+ "epoch": 0.5826390755173341,
3824
+ "grad_norm": 17.274293899536133,
3825
+ "learning_rate": 7.58519428562813e-05,
3826
+ "loss": 3.2455,
3827
+ "step": 542
3828
+ },
3829
+ {
3830
+ "epoch": 0.5837140553614619,
3831
+ "grad_norm": 15.445805549621582,
3832
+ "learning_rate": 7.552107211921229e-05,
3833
+ "loss": 3.3812,
3834
+ "step": 543
3835
+ },
3836
+ {
3837
+ "epoch": 0.5847890352055899,
3838
+ "grad_norm": 16.473222732543945,
3839
+ "learning_rate": 7.519048620343825e-05,
3840
+ "loss": 2.9544,
3841
+ "step": 544
3842
+ },
3843
+ {
3844
+ "epoch": 0.5858640150497179,
3845
+ "grad_norm": 16.593351364135742,
3846
+ "learning_rate": 7.486018895544748e-05,
3847
+ "loss": 3.8982,
3848
+ "step": 545
3849
+ },
3850
+ {
3851
+ "epoch": 0.5869389948938457,
3852
+ "grad_norm": 18.829208374023438,
3853
+ "learning_rate": 7.453018421836946e-05,
3854
+ "loss": 2.933,
3855
+ "step": 546
3856
+ },
3857
+ {
3858
+ "epoch": 0.5880139747379737,
3859
+ "grad_norm": 21.860137939453125,
3860
+ "learning_rate": 7.420047583193019e-05,
3861
+ "loss": 3.5987,
3862
+ "step": 547
3863
+ },
3864
+ {
3865
+ "epoch": 0.5890889545821015,
3866
+ "grad_norm": 15.280534744262695,
3867
+ "learning_rate": 7.387106763240763e-05,
3868
+ "loss": 2.8143,
3869
+ "step": 548
3870
+ },
3871
+ {
3872
+ "epoch": 0.5901639344262295,
3873
+ "grad_norm": 15.49770736694336,
3874
+ "learning_rate": 7.354196345258683e-05,
3875
+ "loss": 2.4214,
3876
+ "step": 549
3877
+ },
3878
+ {
3879
+ "epoch": 0.5912389142703575,
3880
+ "grad_norm": 15.602472305297852,
3881
+ "learning_rate": 7.32131671217155e-05,
3882
+ "loss": 3.4367,
3883
+ "step": 550
3884
+ },
3885
+ {
3886
+ "epoch": 0.5923138941144853,
3887
+ "grad_norm": 14.388925552368164,
3888
+ "learning_rate": 7.288468246545946e-05,
3889
+ "loss": 3.4037,
3890
+ "step": 551
3891
+ },
3892
+ {
3893
+ "epoch": 0.5933888739586133,
3894
+ "grad_norm": 19.788185119628906,
3895
+ "learning_rate": 7.255651330585797e-05,
3896
+ "loss": 3.3473,
3897
+ "step": 552
3898
+ },
3899
+ {
3900
+ "epoch": 0.5944638538027412,
3901
+ "grad_norm": 13.820460319519043,
3902
+ "learning_rate": 7.222866346127953e-05,
3903
+ "loss": 2.5223,
3904
+ "step": 553
3905
+ },
3906
+ {
3907
+ "epoch": 0.5955388336468691,
3908
+ "grad_norm": 16.7007999420166,
3909
+ "learning_rate": 7.190113674637714e-05,
3910
+ "loss": 2.8172,
3911
+ "step": 554
3912
+ },
3913
+ {
3914
+ "epoch": 0.5966138134909971,
3915
+ "grad_norm": 16.35044288635254,
3916
+ "learning_rate": 7.157393697204416e-05,
3917
+ "loss": 2.9871,
3918
+ "step": 555
3919
+ },
3920
+ {
3921
+ "epoch": 0.5976887933351249,
3922
+ "grad_norm": 13.284764289855957,
3923
+ "learning_rate": 7.124706794536983e-05,
3924
+ "loss": 3.6496,
3925
+ "step": 556
3926
+ },
3927
+ {
3928
+ "epoch": 0.5987637731792529,
3929
+ "grad_norm": 12.027867317199707,
3930
+ "learning_rate": 7.09205334695951e-05,
3931
+ "loss": 2.2565,
3932
+ "step": 557
3933
+ },
3934
+ {
3935
+ "epoch": 0.5998387530233809,
3936
+ "grad_norm": 17.299827575683594,
3937
+ "learning_rate": 7.059433734406818e-05,
3938
+ "loss": 2.7168,
3939
+ "step": 558
3940
+ },
3941
+ {
3942
+ "epoch": 0.6009137328675087,
3943
+ "grad_norm": 17.865158081054688,
3944
+ "learning_rate": 7.026848336420054e-05,
3945
+ "loss": 3.6538,
3946
+ "step": 559
3947
+ },
3948
+ {
3949
+ "epoch": 0.6019887127116367,
3950
+ "grad_norm": 12.184990882873535,
3951
+ "learning_rate": 6.99429753214226e-05,
3952
+ "loss": 1.9531,
3953
+ "step": 560
3954
+ },
3955
+ {
3956
+ "epoch": 0.6030636925557645,
3957
+ "grad_norm": 15.416332244873047,
3958
+ "learning_rate": 6.961781700313972e-05,
3959
+ "loss": 2.8138,
3960
+ "step": 561
3961
+ },
3962
+ {
3963
+ "epoch": 0.6041386723998925,
3964
+ "grad_norm": 16.23126220703125,
3965
+ "learning_rate": 6.929301219268805e-05,
3966
+ "loss": 2.6759,
3967
+ "step": 562
3968
+ },
3969
+ {
3970
+ "epoch": 0.6052136522440205,
3971
+ "grad_norm": 21.727291107177734,
3972
+ "learning_rate": 6.896856466929062e-05,
3973
+ "loss": 3.2578,
3974
+ "step": 563
3975
+ },
3976
+ {
3977
+ "epoch": 0.6062886320881483,
3978
+ "grad_norm": 9.553668975830078,
3979
+ "learning_rate": 6.86444782080132e-05,
3980
+ "loss": 2.1244,
3981
+ "step": 564
3982
+ },
3983
+ {
3984
+ "epoch": 0.6073636119322763,
3985
+ "grad_norm": 21.076509475708008,
3986
+ "learning_rate": 6.832075657972054e-05,
3987
+ "loss": 3.2957,
3988
+ "step": 565
3989
+ },
3990
+ {
3991
+ "epoch": 0.6084385917764042,
3992
+ "grad_norm": 11.683152198791504,
3993
+ "learning_rate": 6.799740355103239e-05,
3994
+ "loss": 2.4247,
3995
+ "step": 566
3996
+ },
3997
+ {
3998
+ "epoch": 0.6095135716205321,
3999
+ "grad_norm": 13.204034805297852,
4000
+ "learning_rate": 6.76744228842797e-05,
4001
+ "loss": 2.9528,
4002
+ "step": 567
4003
+ },
4004
+ {
4005
+ "epoch": 0.6105885514646601,
4006
+ "grad_norm": 14.521734237670898,
4007
+ "learning_rate": 6.735181833746086e-05,
4008
+ "loss": 2.5978,
4009
+ "step": 568
4010
+ },
4011
+ {
4012
+ "epoch": 0.6116635313087879,
4013
+ "grad_norm": 17.30651092529297,
4014
+ "learning_rate": 6.702959366419801e-05,
4015
+ "loss": 3.5166,
4016
+ "step": 569
4017
+ },
4018
+ {
4019
+ "epoch": 0.6127385111529159,
4020
+ "grad_norm": 15.7838716506958,
4021
+ "learning_rate": 6.670775261369325e-05,
4022
+ "loss": 2.6126,
4023
+ "step": 570
4024
+ },
4025
+ {
4026
+ "epoch": 0.6138134909970439,
4027
+ "grad_norm": 13.338095664978027,
4028
+ "learning_rate": 6.638629893068515e-05,
4029
+ "loss": 3.1597,
4030
+ "step": 571
4031
+ },
4032
+ {
4033
+ "epoch": 0.6148884708411717,
4034
+ "grad_norm": 16.482463836669922,
4035
+ "learning_rate": 6.60652363554051e-05,
4036
+ "loss": 2.4673,
4037
+ "step": 572
4038
+ },
4039
+ {
4040
+ "epoch": 0.6159634506852997,
4041
+ "grad_norm": 13.602096557617188,
4042
+ "learning_rate": 6.574456862353377e-05,
4043
+ "loss": 2.7441,
4044
+ "step": 573
4045
+ },
4046
+ {
4047
+ "epoch": 0.6170384305294275,
4048
+ "grad_norm": 15.007840156555176,
4049
+ "learning_rate": 6.542429946615774e-05,
4050
+ "loss": 2.7128,
4051
+ "step": 574
4052
+ },
4053
+ {
4054
+ "epoch": 0.6181134103735555,
4055
+ "grad_norm": 11.352252006530762,
4056
+ "learning_rate": 6.510443260972599e-05,
4057
+ "loss": 3.2629,
4058
+ "step": 575
4059
+ },
4060
+ {
4061
+ "epoch": 0.6191883902176835,
4062
+ "grad_norm": 14.53456974029541,
4063
+ "learning_rate": 6.47849717760066e-05,
4064
+ "loss": 1.8599,
4065
+ "step": 576
4066
+ },
4067
+ {
4068
+ "epoch": 0.6202633700618113,
4069
+ "grad_norm": 12.216171264648438,
4070
+ "learning_rate": 6.446592068204341e-05,
4071
+ "loss": 2.2911,
4072
+ "step": 577
4073
+ },
4074
+ {
4075
+ "epoch": 0.6213383499059393,
4076
+ "grad_norm": 15.579245567321777,
4077
+ "learning_rate": 6.41472830401128e-05,
4078
+ "loss": 2.4855,
4079
+ "step": 578
4080
+ },
4081
+ {
4082
+ "epoch": 0.6224133297500671,
4083
+ "grad_norm": 19.280820846557617,
4084
+ "learning_rate": 6.382906255768051e-05,
4085
+ "loss": 4.8336,
4086
+ "step": 579
4087
+ },
4088
+ {
4089
+ "epoch": 0.6234883095941951,
4090
+ "grad_norm": 19.063934326171875,
4091
+ "learning_rate": 6.351126293735843e-05,
4092
+ "loss": 2.7687,
4093
+ "step": 580
4094
+ },
4095
+ {
4096
+ "epoch": 0.6245632894383231,
4097
+ "grad_norm": 13.213850975036621,
4098
+ "learning_rate": 6.319388787686158e-05,
4099
+ "loss": 3.1479,
4100
+ "step": 581
4101
+ },
4102
+ {
4103
+ "epoch": 0.6256382692824509,
4104
+ "grad_norm": 19.5723934173584,
4105
+ "learning_rate": 6.287694106896509e-05,
4106
+ "loss": 4.7255,
4107
+ "step": 582
4108
+ },
4109
+ {
4110
+ "epoch": 0.6267132491265789,
4111
+ "grad_norm": 12.059754371643066,
4112
+ "learning_rate": 6.256042620146119e-05,
4113
+ "loss": 2.5616,
4114
+ "step": 583
4115
+ },
4116
+ {
4117
+ "epoch": 0.6277882289707069,
4118
+ "grad_norm": 10.33979320526123,
4119
+ "learning_rate": 6.224434695711631e-05,
4120
+ "loss": 1.6791,
4121
+ "step": 584
4122
+ },
4123
+ {
4124
+ "epoch": 0.6288632088148347,
4125
+ "grad_norm": 12.472517967224121,
4126
+ "learning_rate": 6.19287070136283e-05,
4127
+ "loss": 2.8273,
4128
+ "step": 585
4129
+ },
4130
+ {
4131
+ "epoch": 0.6299381886589627,
4132
+ "grad_norm": 13.145999908447266,
4133
+ "learning_rate": 6.16135100435836e-05,
4134
+ "loss": 2.4934,
4135
+ "step": 586
4136
+ },
4137
+ {
4138
+ "epoch": 0.6310131685030905,
4139
+ "grad_norm": 12.09467887878418,
4140
+ "learning_rate": 6.129875971441434e-05,
4141
+ "loss": 2.5874,
4142
+ "step": 587
4143
+ },
4144
+ {
4145
+ "epoch": 0.6320881483472185,
4146
+ "grad_norm": 12.754231452941895,
4147
+ "learning_rate": 6.0984459688356e-05,
4148
+ "loss": 3.2328,
4149
+ "step": 588
4150
+ },
4151
+ {
4152
+ "epoch": 0.6331631281913465,
4153
+ "grad_norm": 20.645830154418945,
4154
+ "learning_rate": 6.0670613622404496e-05,
4155
+ "loss": 3.202,
4156
+ "step": 589
4157
+ },
4158
+ {
4159
+ "epoch": 0.6342381080354743,
4160
+ "grad_norm": 13.244977951049805,
4161
+ "learning_rate": 6.035722516827382e-05,
4162
+ "loss": 2.1665,
4163
+ "step": 590
4164
+ },
4165
+ {
4166
+ "epoch": 0.6353130878796023,
4167
+ "grad_norm": 17.669931411743164,
4168
+ "learning_rate": 6.004429797235349e-05,
4169
+ "loss": 2.6613,
4170
+ "step": 591
4171
+ },
4172
+ {
4173
+ "epoch": 0.6363880677237301,
4174
+ "grad_norm": 10.736635208129883,
4175
+ "learning_rate": 5.973183567566605e-05,
4176
+ "loss": 2.4063,
4177
+ "step": 592
4178
+ },
4179
+ {
4180
+ "epoch": 0.6374630475678581,
4181
+ "grad_norm": 18.680782318115234,
4182
+ "learning_rate": 5.9419841913824824e-05,
4183
+ "loss": 2.6796,
4184
+ "step": 593
4185
+ },
4186
+ {
4187
+ "epoch": 0.6385380274119861,
4188
+ "grad_norm": 19.364147186279297,
4189
+ "learning_rate": 5.9108320316991536e-05,
4190
+ "loss": 3.8844,
4191
+ "step": 594
4192
+ },
4193
+ {
4194
+ "epoch": 0.6396130072561139,
4195
+ "grad_norm": 13.238618850708008,
4196
+ "learning_rate": 5.879727450983412e-05,
4197
+ "loss": 3.3821,
4198
+ "step": 595
4199
+ },
4200
+ {
4201
+ "epoch": 0.6406879871002419,
4202
+ "grad_norm": 21.127750396728516,
4203
+ "learning_rate": 5.848670811148451e-05,
4204
+ "loss": 3.8302,
4205
+ "step": 596
4206
+ },
4207
+ {
4208
+ "epoch": 0.6417629669443697,
4209
+ "grad_norm": 19.388427734375,
4210
+ "learning_rate": 5.817662473549651e-05,
4211
+ "loss": 2.4551,
4212
+ "step": 597
4213
+ },
4214
+ {
4215
+ "epoch": 0.6428379467884977,
4216
+ "grad_norm": 13.296785354614258,
4217
+ "learning_rate": 5.786702798980388e-05,
4218
+ "loss": 2.7313,
4219
+ "step": 598
4220
+ },
4221
+ {
4222
+ "epoch": 0.6439129266326257,
4223
+ "grad_norm": 13.282078742980957,
4224
+ "learning_rate": 5.755792147667811e-05,
4225
+ "loss": 2.6865,
4226
+ "step": 599
4227
+ },
4228
+ {
4229
+ "epoch": 0.6449879064767535,
4230
+ "grad_norm": 16.131546020507812,
4231
+ "learning_rate": 5.7249308792686815e-05,
4232
+ "loss": 2.9787,
4233
+ "step": 600
4234
+ },
4235
+ {
4236
+ "epoch": 0.6460628863208815,
4237
+ "grad_norm": 13.292814254760742,
4238
+ "learning_rate": 5.6941193528651596e-05,
4239
+ "loss": 2.7872,
4240
+ "step": 601
4241
+ },
4242
+ {
4243
+ "epoch": 0.6471378661650095,
4244
+ "grad_norm": 13.973624229431152,
4245
+ "learning_rate": 5.663357926960644e-05,
4246
+ "loss": 2.6566,
4247
+ "step": 602
4248
+ },
4249
+ {
4250
+ "epoch": 0.6482128460091373,
4251
+ "grad_norm": 14.040596961975098,
4252
+ "learning_rate": 5.6326469594756034e-05,
4253
+ "loss": 3.0928,
4254
+ "step": 603
4255
+ },
4256
+ {
4257
+ "epoch": 0.6492878258532653,
4258
+ "grad_norm": 17.149208068847656,
4259
+ "learning_rate": 5.6019868077433876e-05,
4260
+ "loss": 3.2098,
4261
+ "step": 604
4262
+ },
4263
+ {
4264
+ "epoch": 0.6503628056973931,
4265
+ "grad_norm": 14.205854415893555,
4266
+ "learning_rate": 5.5713778285061046e-05,
4267
+ "loss": 2.1934,
4268
+ "step": 605
4269
+ },
4270
+ {
4271
+ "epoch": 0.6514377855415211,
4272
+ "grad_norm": 20.856319427490234,
4273
+ "learning_rate": 5.540820377910435e-05,
4274
+ "loss": 4.2625,
4275
+ "step": 606
4276
+ },
4277
+ {
4278
+ "epoch": 0.6525127653856491,
4279
+ "grad_norm": 19.353553771972656,
4280
+ "learning_rate": 5.5103148115035195e-05,
4281
+ "loss": 3.3242,
4282
+ "step": 607
4283
+ },
4284
+ {
4285
+ "epoch": 0.6535877452297769,
4286
+ "grad_norm": 13.190366744995117,
4287
+ "learning_rate": 5.479861484228794e-05,
4288
+ "loss": 2.3837,
4289
+ "step": 608
4290
+ },
4291
+ {
4292
+ "epoch": 0.6546627250739049,
4293
+ "grad_norm": 16.302879333496094,
4294
+ "learning_rate": 5.449460750421883e-05,
4295
+ "loss": 3.2505,
4296
+ "step": 609
4297
+ },
4298
+ {
4299
+ "epoch": 0.6557377049180327,
4300
+ "grad_norm": 13.280342102050781,
4301
+ "learning_rate": 5.419112963806468e-05,
4302
+ "loss": 1.8674,
4303
+ "step": 610
4304
+ },
4305
+ {
4306
+ "epoch": 0.6568126847621607,
4307
+ "grad_norm": 21.006906509399414,
4308
+ "learning_rate": 5.388818477490154e-05,
4309
+ "loss": 3.6557,
4310
+ "step": 611
4311
+ },
4312
+ {
4313
+ "epoch": 0.6578876646062887,
4314
+ "grad_norm": 19.307729721069336,
4315
+ "learning_rate": 5.358577643960403e-05,
4316
+ "loss": 2.2382,
4317
+ "step": 612
4318
+ },
4319
+ {
4320
+ "epoch": 0.6589626444504165,
4321
+ "grad_norm": 11.849048614501953,
4322
+ "learning_rate": 5.328390815080381e-05,
4323
+ "loss": 2.0229,
4324
+ "step": 613
4325
+ },
4326
+ {
4327
+ "epoch": 0.6600376242945445,
4328
+ "grad_norm": 11.787457466125488,
4329
+ "learning_rate": 5.2982583420849116e-05,
4330
+ "loss": 2.6637,
4331
+ "step": 614
4332
+ },
4333
+ {
4334
+ "epoch": 0.6611126041386725,
4335
+ "grad_norm": 15.777995109558105,
4336
+ "learning_rate": 5.268180575576352e-05,
4337
+ "loss": 2.893,
4338
+ "step": 615
4339
+ },
4340
+ {
4341
+ "epoch": 0.6621875839828003,
4342
+ "grad_norm": 19.96381378173828,
4343
+ "learning_rate": 5.238157865520539e-05,
4344
+ "loss": 3.2706,
4345
+ "step": 616
4346
+ },
4347
+ {
4348
+ "epoch": 0.6632625638269283,
4349
+ "grad_norm": 20.699359893798828,
4350
+ "learning_rate": 5.208190561242708e-05,
4351
+ "loss": 2.6676,
4352
+ "step": 617
4353
+ },
4354
+ {
4355
+ "epoch": 0.6643375436710561,
4356
+ "grad_norm": 14.418145179748535,
4357
+ "learning_rate": 5.178279011423417e-05,
4358
+ "loss": 2.6929,
4359
+ "step": 618
4360
+ },
4361
+ {
4362
+ "epoch": 0.6654125235151841,
4363
+ "grad_norm": 12.969032287597656,
4364
+ "learning_rate": 5.148423564094517e-05,
4365
+ "loss": 2.5543,
4366
+ "step": 619
4367
+ },
4368
+ {
4369
+ "epoch": 0.6664875033593121,
4370
+ "grad_norm": 25.956907272338867,
4371
+ "learning_rate": 5.118624566635066e-05,
4372
+ "loss": 3.7616,
4373
+ "step": 620
4374
+ },
4375
+ {
4376
+ "epoch": 0.6675624832034399,
4377
+ "grad_norm": 16.259069442749023,
4378
+ "learning_rate": 5.0888823657673266e-05,
4379
+ "loss": 2.6596,
4380
+ "step": 621
4381
+ },
4382
+ {
4383
+ "epoch": 0.6686374630475679,
4384
+ "grad_norm": 15.737533569335938,
4385
+ "learning_rate": 5.059197307552698e-05,
4386
+ "loss": 3.4037,
4387
+ "step": 622
4388
+ },
4389
+ {
4390
+ "epoch": 0.6697124428916957,
4391
+ "grad_norm": 15.57932186126709,
4392
+ "learning_rate": 5.0295697373877096e-05,
4393
+ "loss": 3.3037,
4394
+ "step": 623
4395
+ },
4396
+ {
4397
+ "epoch": 0.6707874227358237,
4398
+ "grad_norm": 16.854917526245117,
4399
+ "learning_rate": 5.000000000000002e-05,
4400
+ "loss": 2.7998,
4401
+ "step": 624
4402
+ },
4403
+ {
4404
+ "epoch": 0.6718624025799517,
4405
+ "grad_norm": 20.23788070678711,
4406
+ "learning_rate": 4.9704884394442964e-05,
4407
+ "loss": 2.6882,
4408
+ "step": 625
4409
+ },
4410
+ {
4411
+ "epoch": 0.6729373824240795,
4412
+ "grad_norm": 13.206673622131348,
4413
+ "learning_rate": 4.941035399098418e-05,
4414
+ "loss": 2.8392,
4415
+ "step": 626
4416
+ },
4417
+ {
4418
+ "epoch": 0.6740123622682075,
4419
+ "grad_norm": 11.313912391662598,
4420
+ "learning_rate": 4.911641221659279e-05,
4421
+ "loss": 2.6687,
4422
+ "step": 627
4423
+ },
4424
+ {
4425
+ "epoch": 0.6750873421123353,
4426
+ "grad_norm": 17.33076286315918,
4427
+ "learning_rate": 4.8823062491389094e-05,
4428
+ "loss": 2.5485,
4429
+ "step": 628
4430
+ },
4431
+ {
4432
+ "epoch": 0.6761623219564633,
4433
+ "grad_norm": 17.54669952392578,
4434
+ "learning_rate": 4.853030822860455e-05,
4435
+ "loss": 3.4131,
4436
+ "step": 629
4437
+ },
4438
+ {
4439
+ "epoch": 0.6772373018005913,
4440
+ "grad_norm": 14.568754196166992,
4441
+ "learning_rate": 4.823815283454235e-05,
4442
+ "loss": 3.2074,
4443
+ "step": 630
4444
+ },
4445
+ {
4446
+ "epoch": 0.6783122816447191,
4447
+ "grad_norm": 14.021873474121094,
4448
+ "learning_rate": 4.794659970853749e-05,
4449
+ "loss": 2.3931,
4450
+ "step": 631
4451
+ },
4452
+ {
4453
+ "epoch": 0.6793872614888471,
4454
+ "grad_norm": 15.752306938171387,
4455
+ "learning_rate": 4.765565224291743e-05,
4456
+ "loss": 2.4038,
4457
+ "step": 632
4458
+ },
4459
+ {
4460
+ "epoch": 0.6804622413329751,
4461
+ "grad_norm": 17.143024444580078,
4462
+ "learning_rate": 4.7365313822962576e-05,
4463
+ "loss": 2.8179,
4464
+ "step": 633
4465
+ },
4466
+ {
4467
+ "epoch": 0.6815372211771029,
4468
+ "grad_norm": 18.36078643798828,
4469
+ "learning_rate": 4.707558782686677e-05,
4470
+ "loss": 2.4392,
4471
+ "step": 634
4472
+ },
4473
+ {
4474
+ "epoch": 0.6826122010212309,
4475
+ "grad_norm": 17.252010345458984,
4476
+ "learning_rate": 4.67864776256982e-05,
4477
+ "loss": 2.6834,
4478
+ "step": 635
4479
+ },
4480
+ {
4481
+ "epoch": 0.6836871808653587,
4482
+ "grad_norm": 25.259824752807617,
4483
+ "learning_rate": 4.64979865833599e-05,
4484
+ "loss": 3.477,
4485
+ "step": 636
4486
+ },
4487
+ {
4488
+ "epoch": 0.6847621607094867,
4489
+ "grad_norm": 14.85805606842041,
4490
+ "learning_rate": 4.621011805655093e-05,
4491
+ "loss": 2.3259,
4492
+ "step": 637
4493
+ },
4494
+ {
4495
+ "epoch": 0.6858371405536147,
4496
+ "grad_norm": 14.022159576416016,
4497
+ "learning_rate": 4.592287539472701e-05,
4498
+ "loss": 2.6373,
4499
+ "step": 638
4500
+ },
4501
+ {
4502
+ "epoch": 0.6869121203977425,
4503
+ "grad_norm": 13.847736358642578,
4504
+ "learning_rate": 4.563626194006178e-05,
4505
+ "loss": 2.5128,
4506
+ "step": 639
4507
+ },
4508
+ {
4509
+ "epoch": 0.6879871002418705,
4510
+ "grad_norm": 12.617048263549805,
4511
+ "learning_rate": 4.535028102740785e-05,
4512
+ "loss": 2.6718,
4513
+ "step": 640
4514
+ },
4515
+ {
4516
+ "epoch": 0.6890620800859983,
4517
+ "grad_norm": 15.006109237670898,
4518
+ "learning_rate": 4.5064935984257826e-05,
4519
+ "loss": 3.0097,
4520
+ "step": 641
4521
+ },
4522
+ {
4523
+ "epoch": 0.6901370599301263,
4524
+ "grad_norm": 18.60867691040039,
4525
+ "learning_rate": 4.478023013070595e-05,
4526
+ "loss": 3.4391,
4527
+ "step": 642
4528
+ },
4529
+ {
4530
+ "epoch": 0.6912120397742543,
4531
+ "grad_norm": 18.005126953125,
4532
+ "learning_rate": 4.449616677940903e-05,
4533
+ "loss": 2.8679,
4534
+ "step": 643
4535
+ },
4536
+ {
4537
+ "epoch": 0.6922870196183821,
4538
+ "grad_norm": 15.206576347351074,
4539
+ "learning_rate": 4.421274923554835e-05,
4540
+ "loss": 2.5725,
4541
+ "step": 644
4542
+ },
4543
+ {
4544
+ "epoch": 0.6933619994625101,
4545
+ "grad_norm": 10.55386734008789,
4546
+ "learning_rate": 4.392998079679076e-05,
4547
+ "loss": 2.5634,
4548
+ "step": 645
4549
+ },
4550
+ {
4551
+ "epoch": 0.694436979306638,
4552
+ "grad_norm": 17.997459411621094,
4553
+ "learning_rate": 4.364786475325072e-05,
4554
+ "loss": 4.0269,
4555
+ "step": 646
4556
+ },
4557
+ {
4558
+ "epoch": 0.6955119591507659,
4559
+ "grad_norm": 17.598691940307617,
4560
+ "learning_rate": 4.33664043874518e-05,
4561
+ "loss": 3.0651,
4562
+ "step": 647
4563
+ },
4564
+ {
4565
+ "epoch": 0.6965869389948939,
4566
+ "grad_norm": 19.84808349609375,
4567
+ "learning_rate": 4.30856029742884e-05,
4568
+ "loss": 3.4776,
4569
+ "step": 648
4570
+ },
4571
+ {
4572
+ "epoch": 0.6976619188390217,
4573
+ "grad_norm": 19.29751968383789,
4574
+ "learning_rate": 4.280546378098792e-05,
4575
+ "loss": 3.4422,
4576
+ "step": 649
4577
+ },
4578
+ {
4579
+ "epoch": 0.6987368986831497,
4580
+ "grad_norm": 19.02079963684082,
4581
+ "learning_rate": 4.252599006707245e-05,
4582
+ "loss": 2.9825,
4583
+ "step": 650
4584
+ },
4585
+ {
4586
+ "epoch": 0.6998118785272777,
4587
+ "grad_norm": 17.91227912902832,
4588
+ "learning_rate": 4.224718508432113e-05,
4589
+ "loss": 2.7378,
4590
+ "step": 651
4591
+ },
4592
+ {
4593
+ "epoch": 0.7008868583714055,
4594
+ "grad_norm": 20.459753036499023,
4595
+ "learning_rate": 4.196905207673201e-05,
4596
+ "loss": 3.7669,
4597
+ "step": 652
4598
+ },
4599
+ {
4600
+ "epoch": 0.7019618382155335,
4601
+ "grad_norm": 12.434891700744629,
4602
+ "learning_rate": 4.16915942804846e-05,
4603
+ "loss": 2.6119,
4604
+ "step": 653
4605
+ },
4606
+ {
4607
+ "epoch": 0.7030368180596613,
4608
+ "grad_norm": 17.53223419189453,
4609
+ "learning_rate": 4.141481492390197e-05,
4610
+ "loss": 2.7561,
4611
+ "step": 654
4612
+ },
4613
+ {
4614
+ "epoch": 0.7041117979037893,
4615
+ "grad_norm": 15.005716323852539,
4616
+ "learning_rate": 4.113871722741337e-05,
4617
+ "loss": 2.5018,
4618
+ "step": 655
4619
+ },
4620
+ {
4621
+ "epoch": 0.7051867777479173,
4622
+ "grad_norm": 19.277578353881836,
4623
+ "learning_rate": 4.08633044035167e-05,
4624
+ "loss": 3.2063,
4625
+ "step": 656
4626
+ },
4627
+ {
4628
+ "epoch": 0.7062617575920451,
4629
+ "grad_norm": 18.722848892211914,
4630
+ "learning_rate": 4.058857965674101e-05,
4631
+ "loss": 2.4138,
4632
+ "step": 657
4633
+ },
4634
+ {
4635
+ "epoch": 0.7073367374361731,
4636
+ "grad_norm": 12.36088752746582,
4637
+ "learning_rate": 4.031454618360945e-05,
4638
+ "loss": 2.4916,
4639
+ "step": 658
4640
+ },
4641
+ {
4642
+ "epoch": 0.708411717280301,
4643
+ "grad_norm": 18.981199264526367,
4644
+ "learning_rate": 4.0041207172601826e-05,
4645
+ "loss": 3.578,
4646
+ "step": 659
4647
+ },
4648
+ {
4649
+ "epoch": 0.7094866971244289,
4650
+ "grad_norm": 15.9765625,
4651
+ "learning_rate": 3.976856580411774e-05,
4652
+ "loss": 2.896,
4653
+ "step": 660
4654
+ },
4655
+ {
4656
+ "epoch": 0.7105616769685569,
4657
+ "grad_norm": 10.438156127929688,
4658
+ "learning_rate": 3.9496625250439344e-05,
4659
+ "loss": 2.5757,
4660
+ "step": 661
4661
+ },
4662
+ {
4663
+ "epoch": 0.7116366568126847,
4664
+ "grad_norm": 15.564522743225098,
4665
+ "learning_rate": 3.922538867569466e-05,
4666
+ "loss": 2.7255,
4667
+ "step": 662
4668
+ },
4669
+ {
4670
+ "epoch": 0.7127116366568127,
4671
+ "grad_norm": 10.465269088745117,
4672
+ "learning_rate": 3.8954859235820664e-05,
4673
+ "loss": 1.888,
4674
+ "step": 663
4675
+ },
4676
+ {
4677
+ "epoch": 0.7137866165009407,
4678
+ "grad_norm": 10.99638843536377,
4679
+ "learning_rate": 3.8685040078526415e-05,
4680
+ "loss": 2.6292,
4681
+ "step": 664
4682
+ },
4683
+ {
4684
+ "epoch": 0.7148615963450685,
4685
+ "grad_norm": 14.457728385925293,
4686
+ "learning_rate": 3.841593434325675e-05,
4687
+ "loss": 2.3995,
4688
+ "step": 665
4689
+ },
4690
+ {
4691
+ "epoch": 0.7159365761891965,
4692
+ "grad_norm": 17.016172409057617,
4693
+ "learning_rate": 3.814754516115544e-05,
4694
+ "loss": 2.8181,
4695
+ "step": 666
4696
+ },
4697
+ {
4698
+ "epoch": 0.7170115560333243,
4699
+ "grad_norm": 18.437854766845703,
4700
+ "learning_rate": 3.787987565502902e-05,
4701
+ "loss": 2.1409,
4702
+ "step": 667
4703
+ },
4704
+ {
4705
+ "epoch": 0.7180865358774523,
4706
+ "grad_norm": 19.192140579223633,
4707
+ "learning_rate": 3.761292893931019e-05,
4708
+ "loss": 3.0362,
4709
+ "step": 668
4710
+ },
4711
+ {
4712
+ "epoch": 0.7191615157215803,
4713
+ "grad_norm": 18.337635040283203,
4714
+ "learning_rate": 3.734670812002183e-05,
4715
+ "loss": 3.1711,
4716
+ "step": 669
4717
+ },
4718
+ {
4719
+ "epoch": 0.7202364955657081,
4720
+ "grad_norm": 12.289979934692383,
4721
+ "learning_rate": 3.708121629474077e-05,
4722
+ "loss": 2.4776,
4723
+ "step": 670
4724
+ },
4725
+ {
4726
+ "epoch": 0.7213114754098361,
4727
+ "grad_norm": 16.519603729248047,
4728
+ "learning_rate": 3.681645655256159e-05,
4729
+ "loss": 2.5979,
4730
+ "step": 671
4731
+ },
4732
+ {
4733
+ "epoch": 0.7223864552539639,
4734
+ "grad_norm": 16.15471076965332,
4735
+ "learning_rate": 3.655243197406097e-05,
4736
+ "loss": 3.1705,
4737
+ "step": 672
4738
+ },
4739
+ {
4740
+ "epoch": 0.7234614350980919,
4741
+ "grad_norm": 14.030595779418945,
4742
+ "learning_rate": 3.628914563126156e-05,
4743
+ "loss": 2.5392,
4744
+ "step": 673
4745
+ },
4746
+ {
4747
+ "epoch": 0.7245364149422199,
4748
+ "grad_norm": 14.324786186218262,
4749
+ "learning_rate": 3.6026600587596484e-05,
4750
+ "loss": 3.4405,
4751
+ "step": 674
4752
+ },
4753
+ {
4754
+ "epoch": 0.7256113947863477,
4755
+ "grad_norm": 14.356512069702148,
4756
+ "learning_rate": 3.576479989787345e-05,
4757
+ "loss": 2.7351,
4758
+ "step": 675
4759
+ },
4760
+ {
4761
+ "epoch": 0.7266863746304757,
4762
+ "grad_norm": 22.63331413269043,
4763
+ "learning_rate": 3.550374660823949e-05,
4764
+ "loss": 4.4579,
4765
+ "step": 676
4766
+ },
4767
+ {
4768
+ "epoch": 0.7277613544746036,
4769
+ "grad_norm": 16.112449645996094,
4770
+ "learning_rate": 3.52434437561452e-05,
4771
+ "loss": 3.2883,
4772
+ "step": 677
4773
+ },
4774
+ {
4775
+ "epoch": 0.7288363343187315,
4776
+ "grad_norm": 15.933297157287598,
4777
+ "learning_rate": 3.4983894370309665e-05,
4778
+ "loss": 3.1271,
4779
+ "step": 678
4780
+ },
4781
+ {
4782
+ "epoch": 0.7299113141628595,
4783
+ "grad_norm": 18.30646514892578,
4784
+ "learning_rate": 3.472510147068515e-05,
4785
+ "loss": 3.1462,
4786
+ "step": 679
4787
+ },
4788
+ {
4789
+ "epoch": 0.7309862940069873,
4790
+ "grad_norm": 14.52500057220459,
4791
+ "learning_rate": 3.446706806842177e-05,
4792
+ "loss": 3.5975,
4793
+ "step": 680
4794
+ },
4795
+ {
4796
+ "epoch": 0.7320612738511153,
4797
+ "grad_norm": 19.05596160888672,
4798
+ "learning_rate": 3.420979716583279e-05,
4799
+ "loss": 3.3156,
4800
+ "step": 681
4801
+ },
4802
+ {
4803
+ "epoch": 0.7331362536952433,
4804
+ "grad_norm": 17.002887725830078,
4805
+ "learning_rate": 3.395329175635935e-05,
4806
+ "loss": 2.8059,
4807
+ "step": 682
4808
+ },
4809
+ {
4810
+ "epoch": 0.7342112335393711,
4811
+ "grad_norm": 13.947765350341797,
4812
+ "learning_rate": 3.369755482453594e-05,
4813
+ "loss": 2.3958,
4814
+ "step": 683
4815
+ },
4816
+ {
4817
+ "epoch": 0.7352862133834991,
4818
+ "grad_norm": 14.15211009979248,
4819
+ "learning_rate": 3.344258934595539e-05,
4820
+ "loss": 2.7893,
4821
+ "step": 684
4822
+ },
4823
+ {
4824
+ "epoch": 0.7363611932276269,
4825
+ "grad_norm": 16.130531311035156,
4826
+ "learning_rate": 3.31883982872345e-05,
4827
+ "loss": 2.4694,
4828
+ "step": 685
4829
+ },
4830
+ {
4831
+ "epoch": 0.7374361730717549,
4832
+ "grad_norm": 19.640533447265625,
4833
+ "learning_rate": 3.2934984605979424e-05,
4834
+ "loss": 3.6272,
4835
+ "step": 686
4836
+ },
4837
+ {
4838
+ "epoch": 0.7385111529158829,
4839
+ "grad_norm": 16.92746925354004,
4840
+ "learning_rate": 3.268235125075111e-05,
4841
+ "loss": 3.1489,
4842
+ "step": 687
4843
+ },
4844
+ {
4845
+ "epoch": 0.7395861327600107,
4846
+ "grad_norm": 13.478626251220703,
4847
+ "learning_rate": 3.243050116103128e-05,
4848
+ "loss": 4.0042,
4849
+ "step": 688
4850
+ },
4851
+ {
4852
+ "epoch": 0.7406611126041387,
4853
+ "grad_norm": 16.014270782470703,
4854
+ "learning_rate": 3.217943726718795e-05,
4855
+ "loss": 2.8463,
4856
+ "step": 689
4857
+ },
4858
+ {
4859
+ "epoch": 0.7417360924482665,
4860
+ "grad_norm": 10.819585800170898,
4861
+ "learning_rate": 3.1929162490441565e-05,
4862
+ "loss": 2.3828,
4863
+ "step": 690
4864
+ },
4865
+ {
4866
+ "epoch": 0.7428110722923945,
4867
+ "grad_norm": 13.190820693969727,
4868
+ "learning_rate": 3.16796797428308e-05,
4869
+ "loss": 2.5904,
4870
+ "step": 691
4871
+ },
4872
+ {
4873
+ "epoch": 0.7438860521365225,
4874
+ "grad_norm": 16.281742095947266,
4875
+ "learning_rate": 3.1430991927178866e-05,
4876
+ "loss": 2.2573,
4877
+ "step": 692
4878
+ },
4879
+ {
4880
+ "epoch": 0.7449610319806503,
4881
+ "grad_norm": 17.6481990814209,
4882
+ "learning_rate": 3.1183101937059647e-05,
4883
+ "loss": 2.8613,
4884
+ "step": 693
4885
+ },
4886
+ {
4887
+ "epoch": 0.7460360118247783,
4888
+ "grad_norm": 15.284998893737793,
4889
+ "learning_rate": 3.093601265676393e-05,
4890
+ "loss": 3.2382,
4891
+ "step": 694
4892
+ },
4893
+ {
4894
+ "epoch": 0.7471109916689062,
4895
+ "grad_norm": 15.303094863891602,
4896
+ "learning_rate": 3.068972696126611e-05,
4897
+ "loss": 2.2713,
4898
+ "step": 695
4899
+ },
4900
+ {
4901
+ "epoch": 0.7481859715130341,
4902
+ "grad_norm": 17.59407615661621,
4903
+ "learning_rate": 3.044424771619041e-05,
4904
+ "loss": 2.976,
4905
+ "step": 696
4906
+ },
4907
+ {
4908
+ "epoch": 0.7492609513571621,
4909
+ "grad_norm": 15.894051551818848,
4910
+ "learning_rate": 3.0199577777777875e-05,
4911
+ "loss": 3.3442,
4912
+ "step": 697
4913
+ },
4914
+ {
4915
+ "epoch": 0.7503359312012899,
4916
+ "grad_norm": 13.673702239990234,
4917
+ "learning_rate": 2.9955719992852804e-05,
4918
+ "loss": 2.5122,
4919
+ "step": 698
4920
+ },
4921
+ {
4922
+ "epoch": 0.7514109110454179,
4923
+ "grad_norm": 16.84321403503418,
4924
+ "learning_rate": 2.9712677198789916e-05,
4925
+ "loss": 2.9834,
4926
+ "step": 699
4927
+ },
4928
+ {
4929
+ "epoch": 0.7514109110454179,
4930
+ "eval_loss": 0.6793206930160522,
4931
+ "eval_runtime": 5.5979,
4932
+ "eval_samples_per_second": 70.026,
4933
+ "eval_steps_per_second": 35.013,
4934
+ "step": 699
4935
  }
4936
  ],
4937
  "logging_steps": 1,
 
4951
  "attributes": {}
4952
  }
4953
  },
4954
+ "total_flos": 5272488565014528.0,
4955
  "train_batch_size": 2,
4956
  "trial_name": null,
4957
  "trial_params": null