Azrail commited on
Commit
424a6be
·
verified ·
1 Parent(s): 20e3ee6

Training in progress, step 115000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b778ecb426d78f0896855e8fb4aad5b0ed64f4bb1e53aede2d8069fdd044f83f
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4052b3f6dee6acc6e8461ad996dfa79e27245712edf2d1f3321a44a85660ffc
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e40a86136eefe7a52f906d32b10df1f61bc2559012b7bd8d21fd2f6358ab1422
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0922c14e94c809f8792d25d931657f0739836f8872958cc36e36c78337b7886b
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f9d7695201cafd8e529bbb705c4e86352c97146b7f2c1d17b903edf259b2912
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db41ee9f728a0f615e34c377aa1f203a61ceeaf873404658f962d92e3c5c6285
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a3df12db58d0a78ce660a6cf049d113e8861e8aa8611c9714bf603dc61fb3a9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e98ce821f7f40a728bc6b049ace38a924402d0d066809b7215e9faa83ce3c45c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0875681219218432,
6
  "eval_steps": 500,
7
- "global_step": 114000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -20300,11 +20300,189 @@
20300
  "eval_steps_per_second": 15.134,
20301
  "num_input_tokens_seen": 59759249088,
20302
  "step": 114000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20303
  }
20304
  ],
20305
  "logging_steps": 50,
20306
  "max_steps": 140000,
20307
- "num_input_tokens_seen": 59759249088,
20308
  "num_train_epochs": 2,
20309
  "save_steps": 1000,
20310
  "stateful_callbacks": {
@@ -20319,7 +20497,7 @@
20319
  "attributes": {}
20320
  }
20321
  },
20322
- "total_flos": 1.0576297079872635e+20,
20323
  "train_batch_size": 32,
20324
  "trial_name": null,
20325
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0971081722456086,
6
  "eval_steps": 500,
7
+ "global_step": 115000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
20300
  "eval_steps_per_second": 15.134,
20301
  "num_input_tokens_seen": 59759249088,
20302
  "step": 114000
20303
+ },
20304
+ {
20305
+ "epoch": 1.0880451244380314,
20306
+ "grad_norm": 0.1358513981103897,
20307
+ "learning_rate": 0.0009868321189035196,
20308
+ "loss": 2.1057,
20309
+ "num_input_tokens_seen": 59785457920,
20310
+ "step": 114050
20311
+ },
20312
+ {
20313
+ "epoch": 1.0885221269542198,
20314
+ "grad_norm": 0.14738275110721588,
20315
+ "learning_rate": 0.0009861849601988384,
20316
+ "loss": 2.099,
20317
+ "num_input_tokens_seen": 59811672288,
20318
+ "step": 114100
20319
+ },
20320
+ {
20321
+ "epoch": 1.088999129470408,
20322
+ "grad_norm": 0.16324234008789062,
20323
+ "learning_rate": 0.0009855225003441628,
20324
+ "loss": 2.0952,
20325
+ "num_input_tokens_seen": 59837885600,
20326
+ "step": 114150
20327
+ },
20328
+ {
20329
+ "epoch": 1.0894761319865962,
20330
+ "grad_norm": 0.15156808495521545,
20331
+ "learning_rate": 0.0009848447601883434,
20332
+ "loss": 2.1014,
20333
+ "num_input_tokens_seen": 59864099392,
20334
+ "step": 114200
20335
+ },
20336
+ {
20337
+ "epoch": 1.0899531345027844,
20338
+ "grad_norm": 0.14273667335510254,
20339
+ "learning_rate": 0.0009841517610611307,
20340
+ "loss": 2.0898,
20341
+ "num_input_tokens_seen": 59890311072,
20342
+ "step": 114250
20343
+ },
20344
+ {
20345
+ "epoch": 1.0904301370189728,
20346
+ "grad_norm": 0.1409289538860321,
20347
+ "learning_rate": 0.0009834435247725033,
20348
+ "loss": 2.0798,
20349
+ "num_input_tokens_seen": 59916523776,
20350
+ "step": 114300
20351
+ },
20352
+ {
20353
+ "epoch": 1.090907139535161,
20354
+ "grad_norm": 0.13659177720546722,
20355
+ "learning_rate": 0.0009827200736119814,
20356
+ "loss": 2.084,
20357
+ "num_input_tokens_seen": 59942727744,
20358
+ "step": 114350
20359
+ },
20360
+ {
20361
+ "epoch": 1.0913841420513493,
20362
+ "grad_norm": 0.14861910045146942,
20363
+ "learning_rate": 0.0009819814303479266,
20364
+ "loss": 2.1021,
20365
+ "num_input_tokens_seen": 59968942144,
20366
+ "step": 114400
20367
+ },
20368
+ {
20369
+ "epoch": 1.0918611445675377,
20370
+ "grad_norm": 0.13872170448303223,
20371
+ "learning_rate": 0.0009812276182268236,
20372
+ "loss": 2.1001,
20373
+ "num_input_tokens_seen": 59995154848,
20374
+ "step": 114450
20375
+ },
20376
+ {
20377
+ "epoch": 1.092338147083726,
20378
+ "grad_norm": 0.14306657016277313,
20379
+ "learning_rate": 0.00098045866097255,
20380
+ "loss": 2.0837,
20381
+ "num_input_tokens_seen": 60021363392,
20382
+ "step": 114500
20383
+ },
20384
+ {
20385
+ "epoch": 1.092338147083726,
20386
+ "eval_loss": 2.0082569122314453,
20387
+ "eval_runtime": 82.8417,
20388
+ "eval_samples_per_second": 60.356,
20389
+ "eval_steps_per_second": 15.089,
20390
+ "num_input_tokens_seen": 60021363392,
20391
+ "step": 114500
20392
+ },
20393
+ {
20394
+ "epoch": 1.092815149599914,
20395
+ "grad_norm": 0.1300678551197052,
20396
+ "learning_rate": 0.000979674582785628,
20397
+ "loss": 2.0904,
20398
+ "num_input_tokens_seen": 60047570880,
20399
+ "step": 114550
20400
+ },
20401
+ {
20402
+ "epoch": 1.0932921521161023,
20403
+ "grad_norm": 0.1488349586725235,
20404
+ "learning_rate": 0.0009788754083424652,
20405
+ "loss": 2.0969,
20406
+ "num_input_tokens_seen": 60073778944,
20407
+ "step": 114600
20408
+ },
20409
+ {
20410
+ "epoch": 1.0937691546322907,
20411
+ "grad_norm": 0.14389395713806152,
20412
+ "learning_rate": 0.000978061162794576,
20413
+ "loss": 2.0956,
20414
+ "num_input_tokens_seen": 60099993344,
20415
+ "step": 114650
20416
+ },
20417
+ {
20418
+ "epoch": 1.094246157148479,
20419
+ "grad_norm": 0.13556672632694244,
20420
+ "learning_rate": 0.0009772318717677904,
20421
+ "loss": 2.0856,
20422
+ "num_input_tokens_seen": 60126204832,
20423
+ "step": 114700
20424
+ },
20425
+ {
20426
+ "epoch": 1.0947231596646672,
20427
+ "grad_norm": 0.14573290944099426,
20428
+ "learning_rate": 0.0009763875613614481,
20429
+ "loss": 2.083,
20430
+ "num_input_tokens_seen": 60152411456,
20431
+ "step": 114750
20432
+ },
20433
+ {
20434
+ "epoch": 1.0952001621808556,
20435
+ "grad_norm": 0.14349648356437683,
20436
+ "learning_rate": 0.0009755282581475768,
20437
+ "loss": 2.099,
20438
+ "num_input_tokens_seen": 60178616832,
20439
+ "step": 114800
20440
+ },
20441
+ {
20442
+ "epoch": 1.0956771646970438,
20443
+ "grad_norm": 0.1363336592912674,
20444
+ "learning_rate": 0.0009746539891700557,
20445
+ "loss": 2.0941,
20446
+ "num_input_tokens_seen": 60204821568,
20447
+ "step": 114850
20448
+ },
20449
+ {
20450
+ "epoch": 1.096154167213232,
20451
+ "grad_norm": 0.14463187754154205,
20452
+ "learning_rate": 0.0009737647819437645,
20453
+ "loss": 2.0987,
20454
+ "num_input_tokens_seen": 60231035968,
20455
+ "step": 114900
20456
+ },
20457
+ {
20458
+ "epoch": 1.0966311697294202,
20459
+ "grad_norm": 0.14132525026798248,
20460
+ "learning_rate": 0.0009728606644537177,
20461
+ "loss": 2.0954,
20462
+ "num_input_tokens_seen": 60257250368,
20463
+ "step": 114950
20464
+ },
20465
+ {
20466
+ "epoch": 1.0971081722456086,
20467
+ "grad_norm": 0.14640025794506073,
20468
+ "learning_rate": 0.0009719416651541838,
20469
+ "loss": 2.0992,
20470
+ "num_input_tokens_seen": 60283464768,
20471
+ "step": 115000
20472
+ },
20473
+ {
20474
+ "epoch": 1.0971081722456086,
20475
+ "eval_loss": 2.007655620574951,
20476
+ "eval_runtime": 82.4937,
20477
+ "eval_samples_per_second": 60.611,
20478
+ "eval_steps_per_second": 15.153,
20479
+ "num_input_tokens_seen": 60283464768,
20480
+ "step": 115000
20481
  }
20482
  ],
20483
  "logging_steps": 50,
20484
  "max_steps": 140000,
20485
+ "num_input_tokens_seen": 60283464768,
20486
  "num_train_epochs": 2,
20487
  "save_steps": 1000,
20488
  "stateful_callbacks": {
 
20497
  "attributes": {}
20498
  }
20499
  },
20500
+ "total_flos": 1.0669073693538632e+20,
20501
  "train_batch_size": 32,
20502
  "trial_name": null,
20503
  "trial_params": null