mohammadmahdinouri commited on
Commit
4337070
·
verified ·
1 Parent(s): c8d80af

Training in progress, step 73000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fbefc43fcc2f8bf8bb8522016041f2a9a7a1389e937a0c7f9efe740c9281e923
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82e96b382e85cf4f91a0957df390eab642f1a5b90594b054112e585987e922fb
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e10ee0c90a6cc09cdc24b1085749ee192ca52841ac52349ee023c635a106f71
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86831135ab2a33d7609f755ab5e685a1ac6602cf0ed6e3f717ff3cd6a64064f2
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0a2f1706dfc950df47249e8d65d6df596c2f98887c24dba54cde743e4804d2cf
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dbf5a8e94cdeb9d71543994044a1496c0b99dc653812727d1f2b5879319264c4
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:488c74f8a1dc2a7148ae3d9f18c7e9fcbb141512e2f149cd1d29674d054be2f3
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e585ae00a418f8315b98a87df365e3f31023ec6747db05d48bdc24ed26af3666
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:77448ddbc0e5f35d8ef3a4b1063eb25209d701957cc23b3671796af1520e431c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65df296955d0ea7a8b7df67d30426101d0bc72ddcf4935d0366aeb81991dd30
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3acb48030fde17938d59bf929c695a9b6dbd4fe2687e2cce76096a6e14351d6
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75f3a690a6b3c19beeba0982e2eceaedb3e05582e018ecc3f8710afa643876ad
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:514d743b09cdf67b5f7ccba0c67283da3d20aa73a759bcf5ebfccf66234e08c8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1971585f96833288fec52d3fdc773fe9f57b50e9c45dc3d75ed2e10f5ab3dca7
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.10665465814219437,
6
  "eval_steps": 500,
7
- "global_step": 72000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -25208,6 +25208,356 @@
25208
  "learning_rate": 0.00048234355429470035,
25209
  "loss": 16.5261,
25210
  "step": 72000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25211
  }
25212
  ],
25213
  "logging_steps": 20,
@@ -25227,7 +25577,7 @@
25227
  "attributes": {}
25228
  }
25229
  },
25230
- "total_flos": 5.293707639198528e+19,
25231
  "train_batch_size": 48,
25232
  "trial_name": null,
25233
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.10813597283861373,
6
  "eval_steps": 500,
7
+ "global_step": 73000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
25208
  "learning_rate": 0.00048234355429470035,
25209
  "loss": 16.5261,
25210
  "step": 72000
25211
+ },
25212
+ {
25213
+ "epoch": 0.10668428443612275,
25214
+ "grad_norm": 6.65625,
25215
+ "learning_rate": 0.00048233861535969274,
25216
+ "loss": 16.4916,
25217
+ "step": 72020
25218
+ },
25219
+ {
25220
+ "epoch": 0.10671391073005114,
25221
+ "grad_norm": 6.6875,
25222
+ "learning_rate": 0.0004823336764246852,
25223
+ "loss": 16.4841,
25224
+ "step": 72040
25225
+ },
25226
+ {
25227
+ "epoch": 0.10674353702397953,
25228
+ "grad_norm": 6.09375,
25229
+ "learning_rate": 0.00048232873748967764,
25230
+ "loss": 16.485,
25231
+ "step": 72060
25232
+ },
25233
+ {
25234
+ "epoch": 0.10677316331790791,
25235
+ "grad_norm": 6.375,
25236
+ "learning_rate": 0.0004823237985546701,
25237
+ "loss": 16.4513,
25238
+ "step": 72080
25239
+ },
25240
+ {
25241
+ "epoch": 0.1068027896118363,
25242
+ "grad_norm": 7.78125,
25243
+ "learning_rate": 0.0004823188596196625,
25244
+ "loss": 16.4513,
25245
+ "step": 72100
25246
+ },
25247
+ {
25248
+ "epoch": 0.10683241590576469,
25249
+ "grad_norm": 7.0,
25250
+ "learning_rate": 0.000482313920684655,
25251
+ "loss": 16.4768,
25252
+ "step": 72120
25253
+ },
25254
+ {
25255
+ "epoch": 0.10686204219969307,
25256
+ "grad_norm": 6.5625,
25257
+ "learning_rate": 0.0004823089817496474,
25258
+ "loss": 16.4426,
25259
+ "step": 72140
25260
+ },
25261
+ {
25262
+ "epoch": 0.10689166849362146,
25263
+ "grad_norm": 7.46875,
25264
+ "learning_rate": 0.0004823040428146398,
25265
+ "loss": 16.4733,
25266
+ "step": 72160
25267
+ },
25268
+ {
25269
+ "epoch": 0.10692129478754984,
25270
+ "grad_norm": 7.625,
25271
+ "learning_rate": 0.0004822991038796322,
25272
+ "loss": 16.459,
25273
+ "step": 72180
25274
+ },
25275
+ {
25276
+ "epoch": 0.10695092108147823,
25277
+ "grad_norm": 6.0,
25278
+ "learning_rate": 0.0004822941649446247,
25279
+ "loss": 16.4675,
25280
+ "step": 72200
25281
+ },
25282
+ {
25283
+ "epoch": 0.10698054737540662,
25284
+ "grad_norm": 6.28125,
25285
+ "learning_rate": 0.0004822892260096171,
25286
+ "loss": 16.517,
25287
+ "step": 72220
25288
+ },
25289
+ {
25290
+ "epoch": 0.107010173669335,
25291
+ "grad_norm": 6.375,
25292
+ "learning_rate": 0.0004822842870746095,
25293
+ "loss": 16.4864,
25294
+ "step": 72240
25295
+ },
25296
+ {
25297
+ "epoch": 0.10703979996326339,
25298
+ "grad_norm": 6.4375,
25299
+ "learning_rate": 0.00048227934813960195,
25300
+ "loss": 16.4141,
25301
+ "step": 72260
25302
+ },
25303
+ {
25304
+ "epoch": 0.10706942625719178,
25305
+ "grad_norm": 6.875,
25306
+ "learning_rate": 0.0004822744092045944,
25307
+ "loss": 16.5031,
25308
+ "step": 72280
25309
+ },
25310
+ {
25311
+ "epoch": 0.10709905255112016,
25312
+ "grad_norm": 6.4375,
25313
+ "learning_rate": 0.00048226947026958685,
25314
+ "loss": 16.4786,
25315
+ "step": 72300
25316
+ },
25317
+ {
25318
+ "epoch": 0.10712867884504856,
25319
+ "grad_norm": 6.4375,
25320
+ "learning_rate": 0.00048226453133457924,
25321
+ "loss": 16.4936,
25322
+ "step": 72320
25323
+ },
25324
+ {
25325
+ "epoch": 0.10715830513897695,
25326
+ "grad_norm": 7.3125,
25327
+ "learning_rate": 0.0004822595923995717,
25328
+ "loss": 16.4618,
25329
+ "step": 72340
25330
+ },
25331
+ {
25332
+ "epoch": 0.10718793143290534,
25333
+ "grad_norm": 6.28125,
25334
+ "learning_rate": 0.00048225465346456414,
25335
+ "loss": 16.4663,
25336
+ "step": 72360
25337
+ },
25338
+ {
25339
+ "epoch": 0.10721755772683372,
25340
+ "grad_norm": 6.25,
25341
+ "learning_rate": 0.0004822497145295566,
25342
+ "loss": 16.4996,
25343
+ "step": 72380
25344
+ },
25345
+ {
25346
+ "epoch": 0.10724718402076211,
25347
+ "grad_norm": 6.25,
25348
+ "learning_rate": 0.000482244775594549,
25349
+ "loss": 16.4604,
25350
+ "step": 72400
25351
+ },
25352
+ {
25353
+ "epoch": 0.1072768103146905,
25354
+ "grad_norm": 6.6875,
25355
+ "learning_rate": 0.0004822398366595415,
25356
+ "loss": 16.4964,
25357
+ "step": 72420
25358
+ },
25359
+ {
25360
+ "epoch": 0.10730643660861888,
25361
+ "grad_norm": 6.8125,
25362
+ "learning_rate": 0.0004822348977245339,
25363
+ "loss": 16.4576,
25364
+ "step": 72440
25365
+ },
25366
+ {
25367
+ "epoch": 0.10733606290254727,
25368
+ "grad_norm": 6.65625,
25369
+ "learning_rate": 0.0004822299587895263,
25370
+ "loss": 16.4561,
25371
+ "step": 72460
25372
+ },
25373
+ {
25374
+ "epoch": 0.10736568919647566,
25375
+ "grad_norm": 6.125,
25376
+ "learning_rate": 0.0004822250198545187,
25377
+ "loss": 16.4686,
25378
+ "step": 72480
25379
+ },
25380
+ {
25381
+ "epoch": 0.10739531549040404,
25382
+ "grad_norm": 6.4375,
25383
+ "learning_rate": 0.0004822200809195112,
25384
+ "loss": 16.477,
25385
+ "step": 72500
25386
+ },
25387
+ {
25388
+ "epoch": 0.10742494178433243,
25389
+ "grad_norm": 7.0,
25390
+ "learning_rate": 0.0004822151419845036,
25391
+ "loss": 16.4681,
25392
+ "step": 72520
25393
+ },
25394
+ {
25395
+ "epoch": 0.10745456807826081,
25396
+ "grad_norm": 6.59375,
25397
+ "learning_rate": 0.000482210203049496,
25398
+ "loss": 16.4467,
25399
+ "step": 72540
25400
+ },
25401
+ {
25402
+ "epoch": 0.1074841943721892,
25403
+ "grad_norm": 6.40625,
25404
+ "learning_rate": 0.00048220526411448845,
25405
+ "loss": 16.4432,
25406
+ "step": 72560
25407
+ },
25408
+ {
25409
+ "epoch": 0.10751382066611759,
25410
+ "grad_norm": 6.46875,
25411
+ "learning_rate": 0.0004822003251794809,
25412
+ "loss": 16.4936,
25413
+ "step": 72580
25414
+ },
25415
+ {
25416
+ "epoch": 0.10754344696004597,
25417
+ "grad_norm": 6.46875,
25418
+ "learning_rate": 0.00048219538624447335,
25419
+ "loss": 16.4488,
25420
+ "step": 72600
25421
+ },
25422
+ {
25423
+ "epoch": 0.10757307325397436,
25424
+ "grad_norm": 6.0,
25425
+ "learning_rate": 0.00048219044730946574,
25426
+ "loss": 16.4461,
25427
+ "step": 72620
25428
+ },
25429
+ {
25430
+ "epoch": 0.10760269954790276,
25431
+ "grad_norm": 6.625,
25432
+ "learning_rate": 0.0004821855083744582,
25433
+ "loss": 16.5009,
25434
+ "step": 72640
25435
+ },
25436
+ {
25437
+ "epoch": 0.10763232584183115,
25438
+ "grad_norm": 7.21875,
25439
+ "learning_rate": 0.00048218056943945064,
25440
+ "loss": 16.4108,
25441
+ "step": 72660
25442
+ },
25443
+ {
25444
+ "epoch": 0.10766195213575953,
25445
+ "grad_norm": 6.40625,
25446
+ "learning_rate": 0.0004821756305044431,
25447
+ "loss": 16.4139,
25448
+ "step": 72680
25449
+ },
25450
+ {
25451
+ "epoch": 0.10769157842968792,
25452
+ "grad_norm": 6.90625,
25453
+ "learning_rate": 0.0004821706915694355,
25454
+ "loss": 16.4898,
25455
+ "step": 72700
25456
+ },
25457
+ {
25458
+ "epoch": 0.1077212047236163,
25459
+ "grad_norm": 7.40625,
25460
+ "learning_rate": 0.000482165752634428,
25461
+ "loss": 16.5149,
25462
+ "step": 72720
25463
+ },
25464
+ {
25465
+ "epoch": 0.1077508310175447,
25466
+ "grad_norm": 6.5,
25467
+ "learning_rate": 0.0004821608136994204,
25468
+ "loss": 16.4915,
25469
+ "step": 72740
25470
+ },
25471
+ {
25472
+ "epoch": 0.10778045731147308,
25473
+ "grad_norm": 7.5,
25474
+ "learning_rate": 0.0004821558747644128,
25475
+ "loss": 16.428,
25476
+ "step": 72760
25477
+ },
25478
+ {
25479
+ "epoch": 0.10781008360540147,
25480
+ "grad_norm": 7.21875,
25481
+ "learning_rate": 0.0004821509358294052,
25482
+ "loss": 16.4355,
25483
+ "step": 72780
25484
+ },
25485
+ {
25486
+ "epoch": 0.10783970989932985,
25487
+ "grad_norm": 6.34375,
25488
+ "learning_rate": 0.0004821459968943977,
25489
+ "loss": 16.4379,
25490
+ "step": 72800
25491
+ },
25492
+ {
25493
+ "epoch": 0.10786933619325824,
25494
+ "grad_norm": 6.53125,
25495
+ "learning_rate": 0.0004821410579593901,
25496
+ "loss": 16.4117,
25497
+ "step": 72820
25498
+ },
25499
+ {
25500
+ "epoch": 0.10789896248718663,
25501
+ "grad_norm": 6.59375,
25502
+ "learning_rate": 0.00048213611902438256,
25503
+ "loss": 16.4863,
25504
+ "step": 72840
25505
+ },
25506
+ {
25507
+ "epoch": 0.10792858878111501,
25508
+ "grad_norm": 6.875,
25509
+ "learning_rate": 0.00048213118008937495,
25510
+ "loss": 16.4081,
25511
+ "step": 72860
25512
+ },
25513
+ {
25514
+ "epoch": 0.1079582150750434,
25515
+ "grad_norm": 6.03125,
25516
+ "learning_rate": 0.0004821262411543674,
25517
+ "loss": 16.4631,
25518
+ "step": 72880
25519
+ },
25520
+ {
25521
+ "epoch": 0.10798784136897178,
25522
+ "grad_norm": 6.34375,
25523
+ "learning_rate": 0.00048212130221935985,
25524
+ "loss": 16.4837,
25525
+ "step": 72900
25526
+ },
25527
+ {
25528
+ "epoch": 0.10801746766290017,
25529
+ "grad_norm": 6.53125,
25530
+ "learning_rate": 0.00048211636328435224,
25531
+ "loss": 16.4245,
25532
+ "step": 72920
25533
+ },
25534
+ {
25535
+ "epoch": 0.10804709395682856,
25536
+ "grad_norm": 6.90625,
25537
+ "learning_rate": 0.0004821114243493447,
25538
+ "loss": 16.4369,
25539
+ "step": 72940
25540
+ },
25541
+ {
25542
+ "epoch": 0.10807672025075696,
25543
+ "grad_norm": 6.8125,
25544
+ "learning_rate": 0.00048210648541433714,
25545
+ "loss": 16.4274,
25546
+ "step": 72960
25547
+ },
25548
+ {
25549
+ "epoch": 0.10810634654468534,
25550
+ "grad_norm": 5.90625,
25551
+ "learning_rate": 0.0004821015464793296,
25552
+ "loss": 16.4141,
25553
+ "step": 72980
25554
+ },
25555
+ {
25556
+ "epoch": 0.10813597283861373,
25557
+ "grad_norm": 6.6875,
25558
+ "learning_rate": 0.000482096607544322,
25559
+ "loss": 16.4235,
25560
+ "step": 73000
25561
  }
25562
  ],
25563
  "logging_steps": 20,
 
25577
  "attributes": {}
25578
  }
25579
  },
25580
+ "total_flos": 5.367243712484711e+19,
25581
  "train_batch_size": 48,
25582
  "trial_name": null,
25583
  "trial_params": null