Azrail commited on
Commit
7d249d7
·
verified ·
1 Parent(s): 0e7c402

Training in progress, step 53000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3592942d50fd128f616a1b607af53de041def2895dde8221a2068841bbfc75f
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36284509c9073dd2ec02f720b116d0aa77896518dd97e6cb1b15bf18c5f1971e
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c28c8ab74c2ab24140a66eba7b08b4da3f0a1c0487aa3d24a61f15278b3cefdb
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d392d61f01d27253443cebf734042b0281783df28f92f8dae1e7a4619df1a45
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:746267b8ba996549a033d105e363328c635034a7afa0e3070ea8447957aaca5a
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5f53c01b35d1753a4f571c1ddd2b16976530a7b71c320877f1fbd74ce1de4ed
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24b3fcbecd3d55078c913506015bb6e1182f04ee52bf4c0845fc043823a61161
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3771019f4815646a43bbc09acce698c65d4ba61e6cbb0516a172314f7fbb077
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2480413084179019,
6
  "eval_steps": 500,
7
- "global_step": 52000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9264,11 +9264,189 @@
9264
  "eval_steps_per_second": 23.286,
9265
  "num_input_tokens_seen": 13631483456,
9266
  "step": 52000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9267
  }
9268
  ],
9269
  "logging_steps": 50,
9270
  "max_steps": 70000,
9271
- "num_input_tokens_seen": 13631483456,
9272
  "num_train_epochs": 1,
9273
  "save_steps": 1000,
9274
  "stateful_callbacks": {
@@ -9283,7 +9461,7 @@
9283
  "attributes": {}
9284
  }
9285
  },
9286
- "total_flos": 3.6465548677585306e+18,
9287
  "train_batch_size": 64,
9288
  "trial_name": null,
9289
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.25281133357978464,
6
  "eval_steps": 500,
7
+ "global_step": 53000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9264
  "eval_steps_per_second": 23.286,
9265
  "num_input_tokens_seen": 13631483456,
9266
  "step": 52000
9267
+ },
9268
+ {
9269
+ "epoch": 0.24827980967599603,
9270
+ "grad_norm": 0.22135989367961884,
9271
+ "learning_rate": 0.001,
9272
+ "loss": 2.5947,
9273
+ "num_input_tokens_seen": 13644590656,
9274
+ "step": 52050
9275
+ },
9276
+ {
9277
+ "epoch": 0.2485183109340902,
9278
+ "grad_norm": 0.3656958341598511,
9279
+ "learning_rate": 0.001,
9280
+ "loss": 2.6263,
9281
+ "num_input_tokens_seen": 13657697856,
9282
+ "step": 52100
9283
+ },
9284
+ {
9285
+ "epoch": 0.24875681219218432,
9286
+ "grad_norm": 0.2960817813873291,
9287
+ "learning_rate": 0.001,
9288
+ "loss": 2.6086,
9289
+ "num_input_tokens_seen": 13670805056,
9290
+ "step": 52150
9291
+ },
9292
+ {
9293
+ "epoch": 0.24899531345027845,
9294
+ "grad_norm": 0.2150612622499466,
9295
+ "learning_rate": 0.001,
9296
+ "loss": 2.6314,
9297
+ "num_input_tokens_seen": 13683912256,
9298
+ "step": 52200
9299
+ },
9300
+ {
9301
+ "epoch": 0.24923381470837258,
9302
+ "grad_norm": 0.23089592158794403,
9303
+ "learning_rate": 0.001,
9304
+ "loss": 2.6072,
9305
+ "num_input_tokens_seen": 13697019456,
9306
+ "step": 52250
9307
+ },
9308
+ {
9309
+ "epoch": 0.2494723159664667,
9310
+ "grad_norm": 0.19151148200035095,
9311
+ "learning_rate": 0.001,
9312
+ "loss": 2.6177,
9313
+ "num_input_tokens_seen": 13710126656,
9314
+ "step": 52300
9315
+ },
9316
+ {
9317
+ "epoch": 0.24971081722456087,
9318
+ "grad_norm": 0.47803962230682373,
9319
+ "learning_rate": 0.001,
9320
+ "loss": 2.6018,
9321
+ "num_input_tokens_seen": 13723233856,
9322
+ "step": 52350
9323
+ },
9324
+ {
9325
+ "epoch": 0.249949318482655,
9326
+ "grad_norm": 0.2346401810646057,
9327
+ "learning_rate": 0.001,
9328
+ "loss": 2.6068,
9329
+ "num_input_tokens_seen": 13736341056,
9330
+ "step": 52400
9331
+ },
9332
+ {
9333
+ "epoch": 0.2501878197407491,
9334
+ "grad_norm": 0.21514126658439636,
9335
+ "learning_rate": 0.001,
9336
+ "loss": 2.6186,
9337
+ "num_input_tokens_seen": 13749448256,
9338
+ "step": 52450
9339
+ },
9340
+ {
9341
+ "epoch": 0.25042632099884327,
9342
+ "grad_norm": 0.20311090350151062,
9343
+ "learning_rate": 0.001,
9344
+ "loss": 2.595,
9345
+ "num_input_tokens_seen": 13762555456,
9346
+ "step": 52500
9347
+ },
9348
+ {
9349
+ "epoch": 0.25042632099884327,
9350
+ "eval_loss": 2.490104913711548,
9351
+ "eval_runtime": 53.8709,
9352
+ "eval_samples_per_second": 92.814,
9353
+ "eval_steps_per_second": 23.204,
9354
+ "num_input_tokens_seen": 13762555456,
9355
+ "step": 52500
9356
+ },
9357
+ {
9358
+ "epoch": 0.2506648222569374,
9359
+ "grad_norm": 0.2120152711868286,
9360
+ "learning_rate": 0.001,
9361
+ "loss": 2.6027,
9362
+ "num_input_tokens_seen": 13775662656,
9363
+ "step": 52550
9364
+ },
9365
+ {
9366
+ "epoch": 0.25090332351503153,
9367
+ "grad_norm": 0.3172776401042938,
9368
+ "learning_rate": 0.001,
9369
+ "loss": 2.6089,
9370
+ "num_input_tokens_seen": 13788769856,
9371
+ "step": 52600
9372
+ },
9373
+ {
9374
+ "epoch": 0.2511418247731257,
9375
+ "grad_norm": 0.24425551295280457,
9376
+ "learning_rate": 0.001,
9377
+ "loss": 2.611,
9378
+ "num_input_tokens_seen": 13801877056,
9379
+ "step": 52650
9380
+ },
9381
+ {
9382
+ "epoch": 0.2513803260312198,
9383
+ "grad_norm": 0.24523352086544037,
9384
+ "learning_rate": 0.001,
9385
+ "loss": 2.6066,
9386
+ "num_input_tokens_seen": 13814984256,
9387
+ "step": 52700
9388
+ },
9389
+ {
9390
+ "epoch": 0.25161882728931395,
9391
+ "grad_norm": 0.21642154455184937,
9392
+ "learning_rate": 0.001,
9393
+ "loss": 2.6069,
9394
+ "num_input_tokens_seen": 13828091456,
9395
+ "step": 52750
9396
+ },
9397
+ {
9398
+ "epoch": 0.2518573285474081,
9399
+ "grad_norm": 0.21867206692695618,
9400
+ "learning_rate": 0.001,
9401
+ "loss": 2.6163,
9402
+ "num_input_tokens_seen": 13841198656,
9403
+ "step": 52800
9404
+ },
9405
+ {
9406
+ "epoch": 0.2520958298055022,
9407
+ "grad_norm": 0.2124466449022293,
9408
+ "learning_rate": 0.001,
9409
+ "loss": 2.6045,
9410
+ "num_input_tokens_seen": 13854305856,
9411
+ "step": 52850
9412
+ },
9413
+ {
9414
+ "epoch": 0.2523343310635964,
9415
+ "grad_norm": 0.20598042011260986,
9416
+ "learning_rate": 0.001,
9417
+ "loss": 2.5881,
9418
+ "num_input_tokens_seen": 13867413056,
9419
+ "step": 52900
9420
+ },
9421
+ {
9422
+ "epoch": 0.2525728323216905,
9423
+ "grad_norm": 0.1949404776096344,
9424
+ "learning_rate": 0.001,
9425
+ "loss": 2.6051,
9426
+ "num_input_tokens_seen": 13880520256,
9427
+ "step": 52950
9428
+ },
9429
+ {
9430
+ "epoch": 0.25281133357978464,
9431
+ "grad_norm": 0.18877142667770386,
9432
+ "learning_rate": 0.001,
9433
+ "loss": 2.608,
9434
+ "num_input_tokens_seen": 13893627456,
9435
+ "step": 53000
9436
+ },
9437
+ {
9438
+ "epoch": 0.25281133357978464,
9439
+ "eval_loss": 2.485513210296631,
9440
+ "eval_runtime": 53.7202,
9441
+ "eval_samples_per_second": 93.075,
9442
+ "eval_steps_per_second": 23.269,
9443
+ "num_input_tokens_seen": 13893627456,
9444
+ "step": 53000
9445
  }
9446
  ],
9447
  "logging_steps": 50,
9448
  "max_steps": 70000,
9449
+ "num_input_tokens_seen": 13893627456,
9450
  "num_train_epochs": 1,
9451
  "save_steps": 1000,
9452
  "stateful_callbacks": {
 
9461
  "attributes": {}
9462
  }
9463
  },
9464
+ "total_flos": 3.7166809462839706e+18,
9465
  "train_batch_size": 64,
9466
  "trial_name": null,
9467
  "trial_params": null