Azrail commited on
Commit
619538a
·
verified ·
1 Parent(s): 95e8f8a

Training in progress, step 25000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:54c2b1dc0ce252890792fa50a7ced2b1884b184496f8709b1df62b942e4f6173
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca1305d807a0d62209066bee9cbe48b75438f197b4d11307eb4ba5e592a11386
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6caaef1143ab01dc77c2601e1c5bde16b77c55e497c5f13366c2442c28ab6fac
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed4d9687ffe945b21f6759ab92e79d3a46252bbf5731184d996dc881364e21e9
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4eb9e5f9b752984653e9c2f4587df901a2cc5f64a95a0121fadf8e7c7c268621
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a59157d1ca64ffae44fbe8134d666bfe8e12822f27ca50fb6e1f0b29f58d3b64
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06f0f3181677433703f6860ec173100c1f71e33282413595313e7174a82f6998
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8c565830d05eccabcd7df396792d29e3638ccbd6988e240ff15902ef690b7e6
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.5271850378296451,
6
  "eval_steps": 500,
7
- "global_step": 24000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4280,11 +4280,189 @@
4280
  "eval_steps_per_second": 18.684,
4281
  "num_input_tokens_seen": 25165820160,
4282
  "step": 24000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4283
  }
4284
  ],
4285
  "logging_steps": 50,
4286
  "max_steps": 200000,
4287
- "num_input_tokens_seen": 25165820160,
4288
  "num_train_epochs": 5,
4289
  "save_steps": 1000,
4290
  "stateful_callbacks": {
@@ -4299,7 +4477,7 @@
4299
  "attributes": {}
4300
  }
4301
  },
4302
- "total_flos": 1.4332118996250132e+19,
4303
  "train_batch_size": 64,
4304
  "trial_name": null,
4305
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.549151081072547,
6
  "eval_steps": 500,
7
+ "global_step": 25000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4280
  "eval_steps_per_second": 18.684,
4281
  "num_input_tokens_seen": 25165820160,
4282
  "step": 24000
4283
+ },
4284
+ {
4285
+ "epoch": 0.5282833399917902,
4286
+ "grad_norm": 0.14083649218082428,
4287
+ "learning_rate": 0.001,
4288
+ "loss": 2.6851,
4289
+ "num_input_tokens_seen": 25218248960,
4290
+ "step": 24050
4291
+ },
4292
+ {
4293
+ "epoch": 0.5293816421539352,
4294
+ "grad_norm": 0.13934968411922455,
4295
+ "learning_rate": 0.001,
4296
+ "loss": 2.6863,
4297
+ "num_input_tokens_seen": 25270677760,
4298
+ "step": 24100
4299
+ },
4300
+ {
4301
+ "epoch": 0.5304799443160804,
4302
+ "grad_norm": 0.15416787564754486,
4303
+ "learning_rate": 0.001,
4304
+ "loss": 2.6894,
4305
+ "num_input_tokens_seen": 25323106560,
4306
+ "step": 24150
4307
+ },
4308
+ {
4309
+ "epoch": 0.5315782464782255,
4310
+ "grad_norm": 0.17290246486663818,
4311
+ "learning_rate": 0.001,
4312
+ "loss": 2.6907,
4313
+ "num_input_tokens_seen": 25375535360,
4314
+ "step": 24200
4315
+ },
4316
+ {
4317
+ "epoch": 0.5326765486403706,
4318
+ "grad_norm": 0.14260552823543549,
4319
+ "learning_rate": 0.001,
4320
+ "loss": 2.6832,
4321
+ "num_input_tokens_seen": 25427964160,
4322
+ "step": 24250
4323
+ },
4324
+ {
4325
+ "epoch": 0.5337748508025156,
4326
+ "grad_norm": 0.14795690774917603,
4327
+ "learning_rate": 0.001,
4328
+ "loss": 2.6895,
4329
+ "num_input_tokens_seen": 25480392960,
4330
+ "step": 24300
4331
+ },
4332
+ {
4333
+ "epoch": 0.5348731529646608,
4334
+ "grad_norm": 0.15009699761867523,
4335
+ "learning_rate": 0.001,
4336
+ "loss": 2.6819,
4337
+ "num_input_tokens_seen": 25532821760,
4338
+ "step": 24350
4339
+ },
4340
+ {
4341
+ "epoch": 0.5359714551268059,
4342
+ "grad_norm": 0.15425953269004822,
4343
+ "learning_rate": 0.001,
4344
+ "loss": 2.6874,
4345
+ "num_input_tokens_seen": 25585250560,
4346
+ "step": 24400
4347
+ },
4348
+ {
4349
+ "epoch": 0.5370697572889509,
4350
+ "grad_norm": 0.14639410376548767,
4351
+ "learning_rate": 0.001,
4352
+ "loss": 2.6878,
4353
+ "num_input_tokens_seen": 25637679360,
4354
+ "step": 24450
4355
+ },
4356
+ {
4357
+ "epoch": 0.538168059451096,
4358
+ "grad_norm": 0.14785613119602203,
4359
+ "learning_rate": 0.001,
4360
+ "loss": 2.6841,
4361
+ "num_input_tokens_seen": 25690108160,
4362
+ "step": 24500
4363
+ },
4364
+ {
4365
+ "epoch": 0.538168059451096,
4366
+ "eval_loss": 2.5875706672668457,
4367
+ "eval_runtime": 66.9296,
4368
+ "eval_samples_per_second": 74.705,
4369
+ "eval_steps_per_second": 18.676,
4370
+ "num_input_tokens_seen": 25690108160,
4371
+ "step": 24500
4372
+ },
4373
+ {
4374
+ "epoch": 0.5392663616132412,
4375
+ "grad_norm": 0.14224180579185486,
4376
+ "learning_rate": 0.001,
4377
+ "loss": 2.6876,
4378
+ "num_input_tokens_seen": 25742536960,
4379
+ "step": 24550
4380
+ },
4381
+ {
4382
+ "epoch": 0.5403646637753863,
4383
+ "grad_norm": 0.14881493151187897,
4384
+ "learning_rate": 0.001,
4385
+ "loss": 2.6827,
4386
+ "num_input_tokens_seen": 25794965760,
4387
+ "step": 24600
4388
+ },
4389
+ {
4390
+ "epoch": 0.5414629659375313,
4391
+ "grad_norm": 0.17951786518096924,
4392
+ "learning_rate": 0.001,
4393
+ "loss": 2.688,
4394
+ "num_input_tokens_seen": 25847394560,
4395
+ "step": 24650
4396
+ },
4397
+ {
4398
+ "epoch": 0.5425612680996764,
4399
+ "grad_norm": 0.1400926560163498,
4400
+ "learning_rate": 0.001,
4401
+ "loss": 2.6945,
4402
+ "num_input_tokens_seen": 25899823360,
4403
+ "step": 24700
4404
+ },
4405
+ {
4406
+ "epoch": 0.5436595702618215,
4407
+ "grad_norm": 0.1421627402305603,
4408
+ "learning_rate": 0.001,
4409
+ "loss": 2.6852,
4410
+ "num_input_tokens_seen": 25952252160,
4411
+ "step": 24750
4412
+ },
4413
+ {
4414
+ "epoch": 0.5447578724239666,
4415
+ "grad_norm": 0.1617737114429474,
4416
+ "learning_rate": 0.001,
4417
+ "loss": 2.686,
4418
+ "num_input_tokens_seen": 26004680960,
4419
+ "step": 24800
4420
+ },
4421
+ {
4422
+ "epoch": 0.5458561745861117,
4423
+ "grad_norm": 0.1523471176624298,
4424
+ "learning_rate": 0.001,
4425
+ "loss": 2.6945,
4426
+ "num_input_tokens_seen": 26057109760,
4427
+ "step": 24850
4428
+ },
4429
+ {
4430
+ "epoch": 0.5469544767482568,
4431
+ "grad_norm": 0.13078247010707855,
4432
+ "learning_rate": 0.001,
4433
+ "loss": 2.6829,
4434
+ "num_input_tokens_seen": 26109538560,
4435
+ "step": 24900
4436
+ },
4437
+ {
4438
+ "epoch": 0.5480527789104018,
4439
+ "grad_norm": 0.14831651747226715,
4440
+ "learning_rate": 0.001,
4441
+ "loss": 2.6898,
4442
+ "num_input_tokens_seen": 26161967360,
4443
+ "step": 24950
4444
+ },
4445
+ {
4446
+ "epoch": 0.549151081072547,
4447
+ "grad_norm": 0.1782410740852356,
4448
+ "learning_rate": 0.001,
4449
+ "loss": 2.6871,
4450
+ "num_input_tokens_seen": 26214396160,
4451
+ "step": 25000
4452
+ },
4453
+ {
4454
+ "epoch": 0.549151081072547,
4455
+ "eval_loss": 2.5877788066864014,
4456
+ "eval_runtime": 67.2223,
4457
+ "eval_samples_per_second": 74.38,
4458
+ "eval_steps_per_second": 18.595,
4459
+ "num_input_tokens_seen": 26214396160,
4460
+ "step": 25000
4461
  }
4462
  ],
4463
  "logging_steps": 50,
4464
  "max_steps": 200000,
4465
+ "num_input_tokens_seen": 26214396160,
4466
  "num_train_epochs": 5,
4467
  "save_steps": 1000,
4468
  "stateful_callbacks": {
 
4477
  "attributes": {}
4478
  }
4479
  },
4480
+ "total_flos": 1.492929071221506e+19,
4481
  "train_batch_size": 64,
4482
  "trial_name": null,
4483
  "trial_params": null