Azrail commited on
Commit
1880391
·
verified ·
1 Parent(s): 2b89912

Training in progress, step 59000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24ebb1df57ac2ee9b586e62f321c007518f59293b5104f6e4c9cd4556be49e20
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb24e6d8f2ac2f9fba055776f81932cc139a95f7fc40aa55fb0ec1c2a4f8255a
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:284d00e91b8ed248cc64cf350da118b741fc38fb51627a69c88a312c68a088a3
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20de72116e7f03e1795ea16116920f2218782186eb0cf45bda609f4712918191
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec1bfb0db1c21e8b4cd52af95928aa8366b624cdfe8a7ae4baa053e84325dfb8
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfe4fcebd5141fdf7604535ed8dc60cda464d7e4d084d78ec5c9b7105325f9b5
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:546d8e8727a1368f14dcaccf9c4cddd7ddc8e71b1cf1d15c1ef9e8250409d1c7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c63af946e84034ef27ffe1d1d59b07405d72b5713d1851e086bcc930b39f47b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2766614593891983,
6
  "eval_steps": 500,
7
- "global_step": 58000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10332,11 +10332,189 @@
10332
  "eval_steps_per_second": 23.331,
10333
  "num_input_tokens_seen": 15204347456,
10334
  "step": 58000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10335
  }
10336
  ],
10337
  "logging_steps": 50,
10338
  "max_steps": 70000,
10339
- "num_input_tokens_seen": 15204347456,
10340
  "num_train_epochs": 1,
10341
  "save_steps": 1000,
10342
  "stateful_callbacks": {
@@ -10351,7 +10529,7 @@
10351
  "attributes": {}
10352
  }
10353
  },
10354
- "total_flos": 4.0673113389111706e+18,
10355
  "train_batch_size": 64,
10356
  "trial_name": null,
10357
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.281431484551081,
6
  "eval_steps": 500,
7
+ "global_step": 59000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10332
  "eval_steps_per_second": 23.331,
10333
  "num_input_tokens_seen": 15204347456,
10334
  "step": 58000
10335
+ },
10336
+ {
10337
+ "epoch": 0.2768999606472924,
10338
+ "grad_norm": 0.2281644344329834,
10339
+ "learning_rate": 0.0009480220479843627,
10340
+ "loss": 2.6212,
10341
+ "num_input_tokens_seen": 15217454656,
10342
+ "step": 58050
10343
+ },
10344
+ {
10345
+ "epoch": 0.27713846190538655,
10346
+ "grad_norm": 0.2181713730096817,
10347
+ "learning_rate": 0.0009455032620941839,
10348
+ "loss": 2.5927,
10349
+ "num_input_tokens_seen": 15230561856,
10350
+ "step": 58100
10351
+ },
10352
+ {
10353
+ "epoch": 0.2773769631634807,
10354
+ "grad_norm": 0.21573083102703094,
10355
+ "learning_rate": 0.00094292839336179,
10356
+ "loss": 2.6112,
10357
+ "num_input_tokens_seen": 15243669056,
10358
+ "step": 58150
10359
+ },
10360
+ {
10361
+ "epoch": 0.2776154644215748,
10362
+ "grad_norm": 0.2686486840248108,
10363
+ "learning_rate": 0.000940297765928369,
10364
+ "loss": 2.6133,
10365
+ "num_input_tokens_seen": 15256776256,
10366
+ "step": 58200
10367
+ },
10368
+ {
10369
+ "epoch": 0.27785396567966897,
10370
+ "grad_norm": 0.2320137470960617,
10371
+ "learning_rate": 0.0009376117109543769,
10372
+ "loss": 2.6094,
10373
+ "num_input_tokens_seen": 15269883456,
10374
+ "step": 58250
10375
+ },
10376
+ {
10377
+ "epoch": 0.27809246693776307,
10378
+ "grad_norm": 0.22277672588825226,
10379
+ "learning_rate": 0.0009348705665778478,
10380
+ "loss": 2.5885,
10381
+ "num_input_tokens_seen": 15282990656,
10382
+ "step": 58300
10383
+ },
10384
+ {
10385
+ "epoch": 0.27833096819585723,
10386
+ "grad_norm": 0.22681231796741486,
10387
+ "learning_rate": 0.0009320746778718274,
10388
+ "loss": 2.6005,
10389
+ "num_input_tokens_seen": 15296097856,
10390
+ "step": 58350
10391
+ },
10392
+ {
10393
+ "epoch": 0.2785694694539514,
10394
+ "grad_norm": 0.25187453627586365,
10395
+ "learning_rate": 0.000929224396800933,
10396
+ "loss": 2.5944,
10397
+ "num_input_tokens_seen": 15309205056,
10398
+ "step": 58400
10399
+ },
10400
+ {
10401
+ "epoch": 0.2788079707120455,
10402
+ "grad_norm": 0.24962358176708221,
10403
+ "learning_rate": 0.0009263200821770461,
10404
+ "loss": 2.5888,
10405
+ "num_input_tokens_seen": 15322312256,
10406
+ "step": 58450
10407
+ },
10408
+ {
10409
+ "epoch": 0.27904647197013965,
10410
+ "grad_norm": 0.18929679691791534,
10411
+ "learning_rate": 0.0009233620996141421,
10412
+ "loss": 2.5927,
10413
+ "num_input_tokens_seen": 15335419456,
10414
+ "step": 58500
10415
+ },
10416
+ {
10417
+ "epoch": 0.27904647197013965,
10418
+ "eval_loss": 2.4754066467285156,
10419
+ "eval_runtime": 53.7558,
10420
+ "eval_samples_per_second": 93.013,
10421
+ "eval_steps_per_second": 23.253,
10422
+ "num_input_tokens_seen": 15335419456,
10423
+ "step": 58500
10424
+ },
10425
+ {
10426
+ "epoch": 0.27928497322823376,
10427
+ "grad_norm": 0.22240912914276123,
10428
+ "learning_rate": 0.0009203508214822651,
10429
+ "loss": 2.5944,
10430
+ "num_input_tokens_seen": 15348526656,
10431
+ "step": 58550
10432
+ },
10433
+ {
10434
+ "epoch": 0.2795234744863279,
10435
+ "grad_norm": 0.2096235305070877,
10436
+ "learning_rate": 0.0009172866268606513,
10437
+ "loss": 2.5964,
10438
+ "num_input_tokens_seen": 15361633856,
10439
+ "step": 58600
10440
+ },
10441
+ {
10442
+ "epoch": 0.2797619757444221,
10443
+ "grad_norm": 0.2913396954536438,
10444
+ "learning_rate": 0.0009141699014900082,
10445
+ "loss": 2.5975,
10446
+ "num_input_tokens_seen": 15374741056,
10447
+ "step": 58650
10448
+ },
10449
+ {
10450
+ "epoch": 0.2800004770025162,
10451
+ "grad_norm": 0.21000444889068604,
10452
+ "learning_rate": 0.0009110010377239551,
10453
+ "loss": 2.5987,
10454
+ "num_input_tokens_seen": 15387848256,
10455
+ "step": 58700
10456
+ },
10457
+ {
10458
+ "epoch": 0.28023897826061034,
10459
+ "grad_norm": 0.18561489880084991,
10460
+ "learning_rate": 0.0009077804344796301,
10461
+ "loss": 2.5955,
10462
+ "num_input_tokens_seen": 15400955456,
10463
+ "step": 58750
10464
+ },
10465
+ {
10466
+ "epoch": 0.28047747951870444,
10467
+ "grad_norm": 0.330816388130188,
10468
+ "learning_rate": 0.0009045084971874737,
10469
+ "loss": 2.5837,
10470
+ "num_input_tokens_seen": 15414062656,
10471
+ "step": 58800
10472
+ },
10473
+ {
10474
+ "epoch": 0.2807159807767986,
10475
+ "grad_norm": 0.21823953092098236,
10476
+ "learning_rate": 0.000901185637740189,
10477
+ "loss": 2.5921,
10478
+ "num_input_tokens_seen": 15427169856,
10479
+ "step": 58850
10480
+ },
10481
+ {
10482
+ "epoch": 0.28095448203489276,
10483
+ "grad_norm": 0.28721505403518677,
10484
+ "learning_rate": 0.0008978122744408905,
10485
+ "loss": 2.5893,
10486
+ "num_input_tokens_seen": 15440277056,
10487
+ "step": 58900
10488
+ },
10489
+ {
10490
+ "epoch": 0.28119298329298686,
10491
+ "grad_norm": 0.2468225359916687,
10492
+ "learning_rate": 0.0008943888319504456,
10493
+ "loss": 2.5999,
10494
+ "num_input_tokens_seen": 15453384256,
10495
+ "step": 58950
10496
+ },
10497
+ {
10498
+ "epoch": 0.281431484551081,
10499
+ "grad_norm": 0.20486761629581451,
10500
+ "learning_rate": 0.000890915741234015,
10501
+ "loss": 2.6026,
10502
+ "num_input_tokens_seen": 15466491456,
10503
+ "step": 59000
10504
+ },
10505
+ {
10506
+ "epoch": 0.281431484551081,
10507
+ "eval_loss": 2.4756667613983154,
10508
+ "eval_runtime": 53.3408,
10509
+ "eval_samples_per_second": 93.737,
10510
+ "eval_steps_per_second": 23.434,
10511
+ "num_input_tokens_seen": 15466491456,
10512
+ "step": 59000
10513
  }
10514
  ],
10515
  "logging_steps": 50,
10516
  "max_steps": 70000,
10517
+ "num_input_tokens_seen": 15466491456,
10518
  "num_train_epochs": 1,
10519
  "save_steps": 1000,
10520
  "stateful_callbacks": {
 
10529
  "attributes": {}
10530
  }
10531
  },
10532
+ "total_flos": 4.1374374174366106e+18,
10533
  "train_batch_size": 64,
10534
  "trial_name": null,
10535
  "trial_params": null