mohammadmahdinouri commited on
Commit
6fdfdf4
·
verified ·
1 Parent(s): 086ff9c

Training in progress, step 19000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13fbe4723123a9c016392f22f5c5a607f137024e3a3211fa73da181d0f6cd1aa
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01e4827000f30108c5db6d9ab6168d6e7dfecf37eef3edc1465363ee9ea8e490
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3aa6efd41ace1816d77bf0b60c121855a1169e94c3066ee2c4a8939be056cb68
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:795bb5905ce658a665a647e1035b68562ea8227998cfd6cdd93e835459408e5d
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98e45d3c16114f00517a9e754366d6be11045def442e0374684988d3ee13c529
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6872023f654a65ebb855f875663f2550ec7c7270f37183aedc09afdf3151f71c
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:773184c6d03f9fc1dff724dd2ebc3487575db231883b47dc4663fdc68f33bddb
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d749134b574c8d566f1f7b1e5e174cfc46c406c32210d882ffb530c2f402814
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9a97caacfd2ffecaa53d612d1aaec198c719ff4db983e8469e19a70730a6af9
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b2317285b7aac6485bde8423b9bd42301b29e0cd0b6a3f299d06ddf3270099
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee307f509a475bceeb88f57a12c9dbe31c5cc43a16b915e7c00fca8b909b56f5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a512863aeac154eb9ea09654b5c57fb002e6788836adf8be9c2844cb710adf1
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:add33ce1c647f1ad24436fdd2c7095ade5081fad618777000690c7e187278b49
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b73090e5ff4d77e40aae33305c58d2deda13e4f4510f1c076acf40a9f8a97bef
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.035099521769015894,
6
  "eval_steps": 500,
7
- "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6308,6 +6308,356 @@
6308
  "learning_rate": 0.0004943110480558603,
6309
  "loss": 19.8528,
6310
  "step": 18000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6311
  }
6312
  ],
6313
  "logging_steps": 20,
@@ -6327,7 +6677,7 @@
6327
  "attributes": {}
6328
  }
6329
  },
6330
- "total_flos": 1.3232995623550058e+19,
6331
  "train_batch_size": 48,
6332
  "trial_name": null,
6333
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.03704949520062789,
6
  "eval_steps": 500,
7
+ "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6308
  "learning_rate": 0.0004943110480558603,
6309
  "loss": 19.8528,
6310
  "step": 18000
6311
+ },
6312
+ {
6313
+ "epoch": 0.035138521237648136,
6314
+ "grad_norm": 12.25,
6315
+ "learning_rate": 0.0004943045460249509,
6316
+ "loss": 19.9118,
6317
+ "step": 18020
6318
+ },
6319
+ {
6320
+ "epoch": 0.03517752070628038,
6321
+ "grad_norm": 12.0625,
6322
+ "learning_rate": 0.0004942980439940416,
6323
+ "loss": 19.7998,
6324
+ "step": 18040
6325
+ },
6326
+ {
6327
+ "epoch": 0.03521652017491262,
6328
+ "grad_norm": 10.5625,
6329
+ "learning_rate": 0.0004942915419631321,
6330
+ "loss": 19.8966,
6331
+ "step": 18060
6332
+ },
6333
+ {
6334
+ "epoch": 0.035255519643544855,
6335
+ "grad_norm": 10.375,
6336
+ "learning_rate": 0.0004942850399322228,
6337
+ "loss": 19.8631,
6338
+ "step": 18080
6339
+ },
6340
+ {
6341
+ "epoch": 0.0352945191121771,
6342
+ "grad_norm": 10.3125,
6343
+ "learning_rate": 0.0004942785379013135,
6344
+ "loss": 19.8226,
6345
+ "step": 18100
6346
+ },
6347
+ {
6348
+ "epoch": 0.03533351858080934,
6349
+ "grad_norm": 9.9375,
6350
+ "learning_rate": 0.0004942720358704041,
6351
+ "loss": 19.8466,
6352
+ "step": 18120
6353
+ },
6354
+ {
6355
+ "epoch": 0.035372518049441574,
6356
+ "grad_norm": 10.875,
6357
+ "learning_rate": 0.0004942655338394948,
6358
+ "loss": 19.8142,
6359
+ "step": 18140
6360
+ },
6361
+ {
6362
+ "epoch": 0.035411517518073816,
6363
+ "grad_norm": 10.0,
6364
+ "learning_rate": 0.0004942590318085854,
6365
+ "loss": 19.8257,
6366
+ "step": 18160
6367
+ },
6368
+ {
6369
+ "epoch": 0.03545051698670606,
6370
+ "grad_norm": 10.125,
6371
+ "learning_rate": 0.0004942525297776761,
6372
+ "loss": 19.7939,
6373
+ "step": 18180
6374
+ },
6375
+ {
6376
+ "epoch": 0.035489516455338294,
6377
+ "grad_norm": 9.5,
6378
+ "learning_rate": 0.0004942460277467667,
6379
+ "loss": 19.8764,
6380
+ "step": 18200
6381
+ },
6382
+ {
6383
+ "epoch": 0.035528515923970536,
6384
+ "grad_norm": 10.375,
6385
+ "learning_rate": 0.0004942395257158573,
6386
+ "loss": 19.8394,
6387
+ "step": 18220
6388
+ },
6389
+ {
6390
+ "epoch": 0.03556751539260278,
6391
+ "grad_norm": 10.375,
6392
+ "learning_rate": 0.000494233023684948,
6393
+ "loss": 19.7666,
6394
+ "step": 18240
6395
+ },
6396
+ {
6397
+ "epoch": 0.03560651486123501,
6398
+ "grad_norm": 10.625,
6399
+ "learning_rate": 0.0004942265216540386,
6400
+ "loss": 19.8165,
6401
+ "step": 18260
6402
+ },
6403
+ {
6404
+ "epoch": 0.035645514329867255,
6405
+ "grad_norm": 9.8125,
6406
+ "learning_rate": 0.0004942200196231293,
6407
+ "loss": 19.9201,
6408
+ "step": 18280
6409
+ },
6410
+ {
6411
+ "epoch": 0.0356845137984995,
6412
+ "grad_norm": 11.0625,
6413
+ "learning_rate": 0.0004942135175922199,
6414
+ "loss": 19.8705,
6415
+ "step": 18300
6416
+ },
6417
+ {
6418
+ "epoch": 0.03572351326713174,
6419
+ "grad_norm": 10.9375,
6420
+ "learning_rate": 0.0004942070155613106,
6421
+ "loss": 19.7906,
6422
+ "step": 18320
6423
+ },
6424
+ {
6425
+ "epoch": 0.035762512735763974,
6426
+ "grad_norm": 10.375,
6427
+ "learning_rate": 0.0004942005135304012,
6428
+ "loss": 19.7983,
6429
+ "step": 18340
6430
+ },
6431
+ {
6432
+ "epoch": 0.035801512204396216,
6433
+ "grad_norm": 10.5,
6434
+ "learning_rate": 0.0004941940114994919,
6435
+ "loss": 19.7921,
6436
+ "step": 18360
6437
+ },
6438
+ {
6439
+ "epoch": 0.03584051167302846,
6440
+ "grad_norm": 11.1875,
6441
+ "learning_rate": 0.0004941875094685825,
6442
+ "loss": 19.6906,
6443
+ "step": 18380
6444
+ },
6445
+ {
6446
+ "epoch": 0.03587951114166069,
6447
+ "grad_norm": 10.375,
6448
+ "learning_rate": 0.0004941810074376732,
6449
+ "loss": 19.8046,
6450
+ "step": 18400
6451
+ },
6452
+ {
6453
+ "epoch": 0.035918510610292935,
6454
+ "grad_norm": 10.5625,
6455
+ "learning_rate": 0.0004941745054067638,
6456
+ "loss": 19.8274,
6457
+ "step": 18420
6458
+ },
6459
+ {
6460
+ "epoch": 0.03595751007892518,
6461
+ "grad_norm": 11.25,
6462
+ "learning_rate": 0.0004941680033758544,
6463
+ "loss": 19.7977,
6464
+ "step": 18440
6465
+ },
6466
+ {
6467
+ "epoch": 0.03599650954755741,
6468
+ "grad_norm": 11.1875,
6469
+ "learning_rate": 0.0004941615013449451,
6470
+ "loss": 19.7892,
6471
+ "step": 18460
6472
+ },
6473
+ {
6474
+ "epoch": 0.036035509016189654,
6475
+ "grad_norm": 10.625,
6476
+ "learning_rate": 0.0004941549993140357,
6477
+ "loss": 19.6819,
6478
+ "step": 18480
6479
+ },
6480
+ {
6481
+ "epoch": 0.036074508484821896,
6482
+ "grad_norm": 12.3125,
6483
+ "learning_rate": 0.0004941484972831264,
6484
+ "loss": 19.7501,
6485
+ "step": 18500
6486
+ },
6487
+ {
6488
+ "epoch": 0.03611350795345413,
6489
+ "grad_norm": 10.0625,
6490
+ "learning_rate": 0.000494141995252217,
6491
+ "loss": 19.7792,
6492
+ "step": 18520
6493
+ },
6494
+ {
6495
+ "epoch": 0.03615250742208637,
6496
+ "grad_norm": 9.3125,
6497
+ "learning_rate": 0.0004941354932213077,
6498
+ "loss": 19.7753,
6499
+ "step": 18540
6500
+ },
6501
+ {
6502
+ "epoch": 0.036191506890718615,
6503
+ "grad_norm": 10.4375,
6504
+ "learning_rate": 0.0004941289911903983,
6505
+ "loss": 19.8244,
6506
+ "step": 18560
6507
+ },
6508
+ {
6509
+ "epoch": 0.03623050635935086,
6510
+ "grad_norm": 10.75,
6511
+ "learning_rate": 0.000494122489159489,
6512
+ "loss": 19.7036,
6513
+ "step": 18580
6514
+ },
6515
+ {
6516
+ "epoch": 0.03626950582798309,
6517
+ "grad_norm": 8.5625,
6518
+ "learning_rate": 0.0004941159871285797,
6519
+ "loss": 19.7776,
6520
+ "step": 18600
6521
+ },
6522
+ {
6523
+ "epoch": 0.036308505296615334,
6524
+ "grad_norm": 10.8125,
6525
+ "learning_rate": 0.0004941094850976703,
6526
+ "loss": 19.8757,
6527
+ "step": 18620
6528
+ },
6529
+ {
6530
+ "epoch": 0.036347504765247576,
6531
+ "grad_norm": 9.625,
6532
+ "learning_rate": 0.000494102983066761,
6533
+ "loss": 19.8473,
6534
+ "step": 18640
6535
+ },
6536
+ {
6537
+ "epoch": 0.03638650423387981,
6538
+ "grad_norm": 14.3125,
6539
+ "learning_rate": 0.0004940964810358516,
6540
+ "loss": 19.6959,
6541
+ "step": 18660
6542
+ },
6543
+ {
6544
+ "epoch": 0.03642550370251205,
6545
+ "grad_norm": 12.25,
6546
+ "learning_rate": 0.0004940899790049422,
6547
+ "loss": 19.7052,
6548
+ "step": 18680
6549
+ },
6550
+ {
6551
+ "epoch": 0.036464503171144295,
6552
+ "grad_norm": 10.625,
6553
+ "learning_rate": 0.0004940834769740328,
6554
+ "loss": 19.7221,
6555
+ "step": 18700
6556
+ },
6557
+ {
6558
+ "epoch": 0.03650350263977653,
6559
+ "grad_norm": 9.125,
6560
+ "learning_rate": 0.0004940769749431235,
6561
+ "loss": 19.636,
6562
+ "step": 18720
6563
+ },
6564
+ {
6565
+ "epoch": 0.03654250210840877,
6566
+ "grad_norm": 9.8125,
6567
+ "learning_rate": 0.0004940704729122142,
6568
+ "loss": 19.7428,
6569
+ "step": 18740
6570
+ },
6571
+ {
6572
+ "epoch": 0.036581501577041015,
6573
+ "grad_norm": 10.5,
6574
+ "learning_rate": 0.0004940639708813048,
6575
+ "loss": 19.7076,
6576
+ "step": 18760
6577
+ },
6578
+ {
6579
+ "epoch": 0.03662050104567325,
6580
+ "grad_norm": 10.125,
6581
+ "learning_rate": 0.0004940574688503955,
6582
+ "loss": 19.6721,
6583
+ "step": 18780
6584
+ },
6585
+ {
6586
+ "epoch": 0.03665950051430549,
6587
+ "grad_norm": 12.4375,
6588
+ "learning_rate": 0.0004940509668194861,
6589
+ "loss": 19.7135,
6590
+ "step": 18800
6591
+ },
6592
+ {
6593
+ "epoch": 0.036698499982937734,
6594
+ "grad_norm": 9.5,
6595
+ "learning_rate": 0.0004940444647885768,
6596
+ "loss": 19.591,
6597
+ "step": 18820
6598
+ },
6599
+ {
6600
+ "epoch": 0.036737499451569976,
6601
+ "grad_norm": 9.3125,
6602
+ "learning_rate": 0.0004940379627576673,
6603
+ "loss": 19.595,
6604
+ "step": 18840
6605
+ },
6606
+ {
6607
+ "epoch": 0.03677649892020221,
6608
+ "grad_norm": 10.375,
6609
+ "learning_rate": 0.000494031460726758,
6610
+ "loss": 19.7223,
6611
+ "step": 18860
6612
+ },
6613
+ {
6614
+ "epoch": 0.03681549838883445,
6615
+ "grad_norm": 12.4375,
6616
+ "learning_rate": 0.0004940249586958486,
6617
+ "loss": 19.6684,
6618
+ "step": 18880
6619
+ },
6620
+ {
6621
+ "epoch": 0.036854497857466695,
6622
+ "grad_norm": 10.0625,
6623
+ "learning_rate": 0.0004940184566649393,
6624
+ "loss": 19.6797,
6625
+ "step": 18900
6626
+ },
6627
+ {
6628
+ "epoch": 0.03689349732609893,
6629
+ "grad_norm": 10.0,
6630
+ "learning_rate": 0.00049401195463403,
6631
+ "loss": 19.712,
6632
+ "step": 18920
6633
+ },
6634
+ {
6635
+ "epoch": 0.03693249679473117,
6636
+ "grad_norm": 10.25,
6637
+ "learning_rate": 0.0004940054526031206,
6638
+ "loss": 19.5788,
6639
+ "step": 18940
6640
+ },
6641
+ {
6642
+ "epoch": 0.036971496263363414,
6643
+ "grad_norm": 11.8125,
6644
+ "learning_rate": 0.0004939989505722113,
6645
+ "loss": 19.6803,
6646
+ "step": 18960
6647
+ },
6648
+ {
6649
+ "epoch": 0.03701049573199565,
6650
+ "grad_norm": 10.5,
6651
+ "learning_rate": 0.0004939924485413019,
6652
+ "loss": 19.705,
6653
+ "step": 18980
6654
+ },
6655
+ {
6656
+ "epoch": 0.03704949520062789,
6657
+ "grad_norm": 9.25,
6658
+ "learning_rate": 0.0004939859465103925,
6659
+ "loss": 19.6594,
6660
+ "step": 19000
6661
  }
6662
  ],
6663
  "logging_steps": 20,
 
6677
  "attributes": {}
6678
  }
6679
  },
6680
+ "total_flos": 1.3968203395446604e+19,
6681
  "train_batch_size": 48,
6682
  "trial_name": null,
6683
  "trial_params": null