Azrail commited on
Commit
4078c83
·
verified ·
1 Parent(s): f392983

Training in progress, step 42000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fb18d4c27c64f6607996dc76ab059b3274f96bf50194e20861ca91446bac906
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b793c31018c10b83151888a761e5fecf881d8cfcf10fe82ad108fb7a30b9cb35
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4a71156c2d2f2da1c265821c7ca99486fbc72cc466c418215c7150c425f5836
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c753061fb3a47402b7408e67c6f3761fca04d13fb94ac46b9adfdfc16d0184d4
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:028c63076d3d8e5d0c73e4da1b6fc8793d1c56810af68c19f7f253b3016ce7ac
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9aaf95bbf390f32ec661a712de605a0c816388cfa815f81914058fe6bdabdcd9
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9bafdd2692f3ffed299379761090a99347b59a938d0713ea16130141db6dd54e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a94a7467707318fda39e274661a096a9de559314c283be40d75a871d8d1d3d18
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.1955710316371919,
6
  "eval_steps": 500,
7
- "global_step": 41000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7306,11 +7306,189 @@
7306
  "eval_steps_per_second": 24.179,
7307
  "num_input_tokens_seen": 10747899456,
7308
  "step": 41000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7309
  }
7310
  ],
7311
  "logging_steps": 50,
7312
  "max_steps": 70000,
7313
- "num_input_tokens_seen": 10747899456,
7314
  "num_train_epochs": 1,
7315
  "save_steps": 1000,
7316
  "stateful_callbacks": {
@@ -7325,7 +7503,7 @@
7325
  "attributes": {}
7326
  }
7327
  },
7328
- "total_flos": 2.8751680039786906e+18,
7329
  "train_batch_size": 64,
7330
  "trial_name": null,
7331
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2003410567990746,
6
  "eval_steps": 500,
7
+ "global_step": 42000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7306
  "eval_steps_per_second": 24.179,
7307
  "num_input_tokens_seen": 10747899456,
7308
  "step": 41000
7309
+ },
7310
+ {
7311
+ "epoch": 0.19580953289528602,
7312
+ "grad_norm": 0.20298945903778076,
7313
+ "learning_rate": 0.001,
7314
+ "loss": 2.6233,
7315
+ "num_input_tokens_seen": 10761006656,
7316
+ "step": 41050
7317
+ },
7318
+ {
7319
+ "epoch": 0.19604803415338015,
7320
+ "grad_norm": 0.2280716896057129,
7321
+ "learning_rate": 0.001,
7322
+ "loss": 2.6427,
7323
+ "num_input_tokens_seen": 10774113856,
7324
+ "step": 41100
7325
+ },
7326
+ {
7327
+ "epoch": 0.19628653541147428,
7328
+ "grad_norm": 0.19223643839359283,
7329
+ "learning_rate": 0.001,
7330
+ "loss": 2.6263,
7331
+ "num_input_tokens_seen": 10787221056,
7332
+ "step": 41150
7333
+ },
7334
+ {
7335
+ "epoch": 0.19652503666956844,
7336
+ "grad_norm": 0.19221842288970947,
7337
+ "learning_rate": 0.001,
7338
+ "loss": 2.6401,
7339
+ "num_input_tokens_seen": 10800328256,
7340
+ "step": 41200
7341
+ },
7342
+ {
7343
+ "epoch": 0.19676353792766257,
7344
+ "grad_norm": 0.19479979574680328,
7345
+ "learning_rate": 0.001,
7346
+ "loss": 2.6269,
7347
+ "num_input_tokens_seen": 10813435456,
7348
+ "step": 41250
7349
+ },
7350
+ {
7351
+ "epoch": 0.1970020391857567,
7352
+ "grad_norm": 0.24501195549964905,
7353
+ "learning_rate": 0.001,
7354
+ "loss": 2.618,
7355
+ "num_input_tokens_seen": 10826542656,
7356
+ "step": 41300
7357
+ },
7358
+ {
7359
+ "epoch": 0.19724054044385084,
7360
+ "grad_norm": 0.1994044929742813,
7361
+ "learning_rate": 0.001,
7362
+ "loss": 2.64,
7363
+ "num_input_tokens_seen": 10839649856,
7364
+ "step": 41350
7365
+ },
7366
+ {
7367
+ "epoch": 0.19747904170194497,
7368
+ "grad_norm": 0.20831650495529175,
7369
+ "learning_rate": 0.001,
7370
+ "loss": 2.6513,
7371
+ "num_input_tokens_seen": 10852757056,
7372
+ "step": 41400
7373
+ },
7374
+ {
7375
+ "epoch": 0.19771754296003913,
7376
+ "grad_norm": 0.21919438242912292,
7377
+ "learning_rate": 0.001,
7378
+ "loss": 2.6379,
7379
+ "num_input_tokens_seen": 10865864256,
7380
+ "step": 41450
7381
+ },
7382
+ {
7383
+ "epoch": 0.19795604421813326,
7384
+ "grad_norm": 0.23088768124580383,
7385
+ "learning_rate": 0.001,
7386
+ "loss": 2.6449,
7387
+ "num_input_tokens_seen": 10878971456,
7388
+ "step": 41500
7389
+ },
7390
+ {
7391
+ "epoch": 0.19795604421813326,
7392
+ "eval_loss": 2.5156567096710205,
7393
+ "eval_runtime": 51.6776,
7394
+ "eval_samples_per_second": 96.754,
7395
+ "eval_steps_per_second": 24.188,
7396
+ "num_input_tokens_seen": 10878971456,
7397
+ "step": 41500
7398
+ },
7399
+ {
7400
+ "epoch": 0.1981945454762274,
7401
+ "grad_norm": 0.1982518881559372,
7402
+ "learning_rate": 0.001,
7403
+ "loss": 2.6304,
7404
+ "num_input_tokens_seen": 10892078656,
7405
+ "step": 41550
7406
+ },
7407
+ {
7408
+ "epoch": 0.19843304673432152,
7409
+ "grad_norm": 0.2099853903055191,
7410
+ "learning_rate": 0.001,
7411
+ "loss": 2.6305,
7412
+ "num_input_tokens_seen": 10905185856,
7413
+ "step": 41600
7414
+ },
7415
+ {
7416
+ "epoch": 0.19867154799241565,
7417
+ "grad_norm": 0.19403131306171417,
7418
+ "learning_rate": 0.001,
7419
+ "loss": 2.6419,
7420
+ "num_input_tokens_seen": 10918293056,
7421
+ "step": 41650
7422
+ },
7423
+ {
7424
+ "epoch": 0.19891004925050979,
7425
+ "grad_norm": 0.20865993201732635,
7426
+ "learning_rate": 0.001,
7427
+ "loss": 2.6116,
7428
+ "num_input_tokens_seen": 10931400256,
7429
+ "step": 41700
7430
+ },
7431
+ {
7432
+ "epoch": 0.19914855050860394,
7433
+ "grad_norm": 0.19042626023292542,
7434
+ "learning_rate": 0.001,
7435
+ "loss": 2.6271,
7436
+ "num_input_tokens_seen": 10944507456,
7437
+ "step": 41750
7438
+ },
7439
+ {
7440
+ "epoch": 0.19938705176669808,
7441
+ "grad_norm": 0.20514579117298126,
7442
+ "learning_rate": 0.001,
7443
+ "loss": 2.6348,
7444
+ "num_input_tokens_seen": 10957614656,
7445
+ "step": 41800
7446
+ },
7447
+ {
7448
+ "epoch": 0.1996255530247922,
7449
+ "grad_norm": 0.21224668622016907,
7450
+ "learning_rate": 0.001,
7451
+ "loss": 2.6314,
7452
+ "num_input_tokens_seen": 10970721856,
7453
+ "step": 41850
7454
+ },
7455
+ {
7456
+ "epoch": 0.19986405428288634,
7457
+ "grad_norm": 0.18857082724571228,
7458
+ "learning_rate": 0.001,
7459
+ "loss": 2.6217,
7460
+ "num_input_tokens_seen": 10983829056,
7461
+ "step": 41900
7462
+ },
7463
+ {
7464
+ "epoch": 0.20010255554098047,
7465
+ "grad_norm": 0.18431074917316437,
7466
+ "learning_rate": 0.001,
7467
+ "loss": 2.6267,
7468
+ "num_input_tokens_seen": 10996936256,
7469
+ "step": 41950
7470
+ },
7471
+ {
7472
+ "epoch": 0.2003410567990746,
7473
+ "grad_norm": 0.20570099353790283,
7474
+ "learning_rate": 0.001,
7475
+ "loss": 2.6016,
7476
+ "num_input_tokens_seen": 11010043456,
7477
+ "step": 42000
7478
+ },
7479
+ {
7480
+ "epoch": 0.2003410567990746,
7481
+ "eval_loss": 2.506241798400879,
7482
+ "eval_runtime": 51.5548,
7483
+ "eval_samples_per_second": 96.984,
7484
+ "eval_steps_per_second": 24.246,
7485
+ "num_input_tokens_seen": 11010043456,
7486
+ "step": 42000
7487
  }
7488
  ],
7489
  "logging_steps": 50,
7490
  "max_steps": 70000,
7491
+ "num_input_tokens_seen": 11010043456,
7492
  "num_train_epochs": 1,
7493
  "save_steps": 1000,
7494
  "stateful_callbacks": {
 
7503
  "attributes": {}
7504
  }
7505
  },
7506
+ "total_flos": 2.9452940825041306e+18,
7507
  "train_batch_size": 64,
7508
  "trial_name": null,
7509
  "trial_params": null