Azrail committed on
Commit
fa90fe3
·
verified ·
1 Parent(s): 59b7b86

Training in progress, step 16000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:15f637ff72e852c00df336464cba31267a78c2fec942618a4cf3dbc081150cb8
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeeda4b1ee34efcb8cc9870b2c843e6649254ccabfeca77e86d631e6043a12c7
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11255a9366d03d2ecf115313602ca401e81860858d4e1ecad341feef41b0e95b
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad619e41a755fe598f7951c93afa49e976ac527f447e6a0b8b599bd265a02a63
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea828a56e17bf773dc8e4fa2c22d13619b805c6b9321028dd494ff57e5daf8e6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce8b7e7bf463c8aae6a90e1a089b32f412ca31160cb21f8d8c8b0ee26b9ca28d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1819c72414dd202fc7a5b387187559436ac1d66f4c4de3f13c18065ffbdf0216
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab47855ede8e2538e1a83a739503d4b0055ccb1a5bd772cbc798bb06eb73f571
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.620667803310349,
6
  "eval_steps": 500,
7
- "global_step": 15000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3278,11 +3278,229 @@
3278
  "eval_steps_per_second": 20.492,
3279
  "num_input_tokens_seen": 7245951473,
3280
  "step": 15000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3281
  }
3282
  ],
3283
  "logging_steps": 50,
3284
  "max_steps": 16568,
3285
- "num_input_tokens_seen": 7245951473,
3286
  "num_train_epochs": 4,
3287
  "save_steps": 1000,
3288
  "stateful_callbacks": {
@@ -3297,7 +3515,7 @@
3297
  "attributes": {}
3298
  }
3299
  },
3300
- "total_flos": 1.9383627395138765e+18,
3301
  "train_batch_size": 16,
3302
  "trial_name": null,
3303
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.8620788508834134,
6
  "eval_steps": 500,
7
+ "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3278
  "eval_steps_per_second": 20.492,
3279
  "num_input_tokens_seen": 7245951473,
3280
  "step": 15000
3281
+ },
3282
+ {
3283
+ "epoch": 3.632738355689002,
3284
+ "grad_norm": 0.248046875,
3285
+ "learning_rate": 5.726573110004527e-06,
3286
+ "loss": 2.0923,
3287
+ "mean_token_accuracy": 0.554327048882842,
3288
+ "num_input_tokens_seen": 7269914417,
3289
+ "num_tokens": 3063722766.0,
3290
+ "step": 15050
3291
+ },
3292
+ {
3293
+ "epoch": 3.6448089080676556,
3294
+ "grad_norm": 0.26953125,
3295
+ "learning_rate": 5.5379508073034565e-06,
3296
+ "loss": 2.0861,
3297
+ "mean_token_accuracy": 0.5561711810901762,
3298
+ "num_input_tokens_seen": 7293956449,
3299
+ "num_tokens": 3073837103.0,
3300
+ "step": 15100
3301
+ },
3302
+ {
3303
+ "epoch": 3.6568794604463086,
3304
+ "grad_norm": 0.255859375,
3305
+ "learning_rate": 5.349328504602384e-06,
3306
+ "loss": 2.0949,
3307
+ "mean_token_accuracy": 0.5547518468275666,
3308
+ "num_input_tokens_seen": 7318063713,
3309
+ "num_tokens": 3083990662.0,
3310
+ "step": 15150
3311
+ },
3312
+ {
3313
+ "epoch": 3.668950012824962,
3314
+ "grad_norm": 0.26171875,
3315
+ "learning_rate": 5.160706201901313e-06,
3316
+ "loss": 2.0874,
3317
+ "mean_token_accuracy": 0.555564073510468,
3318
+ "num_input_tokens_seen": 7342169889,
3319
+ "num_tokens": 3094140237.0,
3320
+ "step": 15200
3321
+ },
3322
+ {
3323
+ "epoch": 3.681020565203615,
3324
+ "grad_norm": 0.298828125,
3325
+ "learning_rate": 4.9720838992002415e-06,
3326
+ "loss": 2.1014,
3327
+ "mean_token_accuracy": 0.5540463343262673,
3328
+ "num_input_tokens_seen": 7366510321,
3329
+ "num_tokens": 3104349542.0,
3330
+ "step": 15250
3331
+ },
3332
+ {
3333
+ "epoch": 3.6930911175822683,
3334
+ "grad_norm": 0.349609375,
3335
+ "learning_rate": 4.78346159649917e-06,
3336
+ "loss": 2.1128,
3337
+ "mean_token_accuracy": 0.5523605942726135,
3338
+ "num_input_tokens_seen": 7390741489,
3339
+ "num_tokens": 3114615388.0,
3340
+ "step": 15300
3341
+ },
3342
+ {
3343
+ "epoch": 3.7051616699609218,
3344
+ "grad_norm": 0.265625,
3345
+ "learning_rate": 4.594839293798099e-06,
3346
+ "loss": 2.0836,
3347
+ "mean_token_accuracy": 0.5556057692691684,
3348
+ "num_input_tokens_seen": 7414960897,
3349
+ "num_tokens": 3124852831.0,
3350
+ "step": 15350
3351
+ },
3352
+ {
3353
+ "epoch": 3.717232222339575,
3354
+ "grad_norm": 0.234375,
3355
+ "learning_rate": 4.406216991097027e-06,
3356
+ "loss": 2.0832,
3357
+ "mean_token_accuracy": 0.557135313116014,
3358
+ "num_input_tokens_seen": 7439141473,
3359
+ "num_tokens": 3135118289.0,
3360
+ "step": 15400
3361
+ },
3362
+ {
3363
+ "epoch": 3.729302774718228,
3364
+ "grad_norm": 0.3671875,
3365
+ "learning_rate": 4.217594688395956e-06,
3366
+ "loss": 2.0993,
3367
+ "mean_token_accuracy": 0.554307484254241,
3368
+ "num_input_tokens_seen": 7463114529,
3369
+ "num_tokens": 3145345547.0,
3370
+ "step": 15450
3371
+ },
3372
+ {
3373
+ "epoch": 3.7413733270968814,
3374
+ "grad_norm": 0.27734375,
3375
+ "learning_rate": 4.028972385694885e-06,
3376
+ "loss": 2.0916,
3377
+ "num_input_tokens_seen": 7487502401,
3378
+ "step": 15500
3379
+ },
3380
+ {
3381
+ "epoch": 3.7413733270968814,
3382
+ "eval_loss": 1.9680771827697754,
3383
+ "eval_mean_token_accuracy": 0.5784905481975056,
3384
+ "eval_num_tokens": 3155655029.0,
3385
+ "eval_runtime": 131.7036,
3386
+ "eval_samples_per_second": 81.334,
3387
+ "eval_steps_per_second": 20.334,
3388
+ "num_input_tokens_seen": 7487502401,
3389
+ "step": 15500
3390
+ },
3391
+ {
3392
+ "epoch": 3.7534438794755345,
3393
+ "grad_norm": 0.28125,
3394
+ "learning_rate": 3.840350082993813e-06,
3395
+ "loss": 2.0919,
3396
+ "mean_token_accuracy": 0.5555992320179939,
3397
+ "num_input_tokens_seen": 7511609537,
3398
+ "num_tokens": 3165956280.0,
3399
+ "step": 15550
3400
+ },
3401
+ {
3402
+ "epoch": 3.7655144318541875,
3403
+ "grad_norm": 0.265625,
3404
+ "learning_rate": 3.6517277802927423e-06,
3405
+ "loss": 2.096,
3406
+ "mean_token_accuracy": 0.5547948920354248,
3407
+ "num_input_tokens_seen": 7535873665,
3408
+ "num_tokens": 3176126007.0,
3409
+ "step": 15600
3410
+ },
3411
+ {
3412
+ "epoch": 3.777584984232841,
3413
+ "grad_norm": 0.25390625,
3414
+ "learning_rate": 3.463105477591671e-06,
3415
+ "loss": 2.0963,
3416
+ "mean_token_accuracy": 0.5549294283241033,
3417
+ "num_input_tokens_seen": 7560110225,
3418
+ "num_tokens": 3186367919.0,
3419
+ "step": 15650
3420
+ },
3421
+ {
3422
+ "epoch": 3.789655536611494,
3423
+ "grad_norm": 0.28125,
3424
+ "learning_rate": 3.274483174890599e-06,
3425
+ "loss": 2.0986,
3426
+ "mean_token_accuracy": 0.5547161266207695,
3427
+ "num_input_tokens_seen": 7584157249,
3428
+ "num_tokens": 3196441494.0,
3429
+ "step": 15700
3430
+ },
3431
+ {
3432
+ "epoch": 3.801726088990147,
3433
+ "grad_norm": 0.2470703125,
3434
+ "learning_rate": 3.0858608721895278e-06,
3435
+ "loss": 2.0952,
3436
+ "mean_token_accuracy": 0.554626210257411,
3437
+ "num_input_tokens_seen": 7608282737,
3438
+ "num_tokens": 3206560665.0,
3439
+ "step": 15750
3440
+ },
3441
+ {
3442
+ "epoch": 3.8137966413688007,
3443
+ "grad_norm": 0.265625,
3444
+ "learning_rate": 2.8972385694884564e-06,
3445
+ "loss": 2.0874,
3446
+ "mean_token_accuracy": 0.5568051477894187,
3447
+ "num_input_tokens_seen": 7632420385,
3448
+ "num_tokens": 3216638197.0,
3449
+ "step": 15800
3450
+ },
3451
+ {
3452
+ "epoch": 3.8258671937474538,
3453
+ "grad_norm": 0.357421875,
3454
+ "learning_rate": 2.708616266787385e-06,
3455
+ "loss": 2.0935,
3456
+ "mean_token_accuracy": 0.5554833044111729,
3457
+ "num_input_tokens_seen": 7656546049,
3458
+ "num_tokens": 3226721685.0,
3459
+ "step": 15850
3460
+ },
3461
+ {
3462
+ "epoch": 3.837937746126107,
3463
+ "grad_norm": 0.216796875,
3464
+ "learning_rate": 2.5199939640863136e-06,
3465
+ "loss": 2.1087,
3466
+ "mean_token_accuracy": 0.5525853624939918,
3467
+ "num_input_tokens_seen": 7680650689,
3468
+ "num_tokens": 3236963270.0,
3469
+ "step": 15900
3470
+ },
3471
+ {
3472
+ "epoch": 3.8500082985047603,
3473
+ "grad_norm": 0.267578125,
3474
+ "learning_rate": 2.3313716613852423e-06,
3475
+ "loss": 2.0952,
3476
+ "mean_token_accuracy": 0.5550886183232069,
3477
+ "num_input_tokens_seen": 7704888257,
3478
+ "num_tokens": 3247161827.0,
3479
+ "step": 15950
3480
+ },
3481
+ {
3482
+ "epoch": 3.8620788508834134,
3483
+ "grad_norm": 0.2333984375,
3484
+ "learning_rate": 2.142749358684171e-06,
3485
+ "loss": 2.0914,
3486
+ "num_input_tokens_seen": 7729128049,
3487
+ "step": 16000
3488
+ },
3489
+ {
3490
+ "epoch": 3.8620788508834134,
3491
+ "eval_loss": 1.9680593013763428,
3492
+ "eval_mean_token_accuracy": 0.5785026788667034,
3493
+ "eval_num_tokens": 3257367290.0,
3494
+ "eval_runtime": 130.8705,
3495
+ "eval_samples_per_second": 81.852,
3496
+ "eval_steps_per_second": 20.463,
3497
+ "num_input_tokens_seen": 7729128049,
3498
+ "step": 16000
3499
  }
3500
  ],
3501
  "logging_steps": 50,
3502
  "max_steps": 16568,
3503
+ "num_input_tokens_seen": 7729128049,
3504
  "num_train_epochs": 4,
3505
  "save_steps": 1000,
3506
  "stateful_callbacks": {
 
3515
  "attributes": {}
3516
  }
3517
  },
3518
+ "total_flos": 2.0676171893972582e+18,
3519
  "train_batch_size": 16,
3520
  "trial_name": null,
3521
  "trial_params": null