shulijia commited on
Commit
cc27913
·
verified ·
1 Parent(s): 3816f73

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f17a33da6a6be08a3dfaa7e6c5e60a20306cc6e557fc8b7a0963fd3e0a820f66
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fc17cf4778de00f56eb3118cc881d33a3737ff10e3eb03e214d86942e058e70
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56d8d3a670435b5f5f1d8c0dbeb4c4bef041897b3d26af31b6008d89bbb0f5da
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aa3a4015c1121204d1e50a3f1402c43eca6ceea15098443b41437862247a64e
3
  size 4768663315
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de69a2834426ff9ef8199d077e00892579278af31d8969d77f98235b5cfc010a
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2534e434cd5abbb8f7668d3eab0549db0ef95d6a797a3efa86b712e8e32266a7
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea1916e5e1d99532b0ae3780fa8b68c23b1117b4ecb0a0bdf06a7d5d71cbf5f2
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6db8d4f24a9b059deca696b72055b5814e66617e31dc4227844e631fef5e5cd
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.9824760151520515,
6
  "eval_steps": 100,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6308,6 +6308,456 @@
6308
  "mean_token_accuracy": 0.7816780813038349,
6309
  "num_tokens": 57342976.0,
6310
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6311
  }
6312
  ],
6313
  "logging_steps": 10,
@@ -6327,7 +6777,7 @@
6327
  "attributes": {}
6328
  }
6329
  },
6330
- "total_flos": 1.5154628854913434e+17,
6331
  "train_batch_size": 2,
6332
  "trial_name": null,
6333
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.12404857152972,
6
  "eval_steps": 100,
7
+ "global_step": 7500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6308
  "mean_token_accuracy": 0.7816780813038349,
6309
  "num_tokens": 57342976.0,
6310
  "step": 7000
6311
+ },
6312
+ {
6313
+ "epoch": 1.9853081743193968,
6314
+ "grad_norm": 1.2141501903533936,
6315
+ "learning_rate": 3.759572013007448e-06,
6316
+ "loss": 0.1356,
6317
+ "mean_token_accuracy": 0.7747798431664705,
6318
+ "num_tokens": 57424896.0,
6319
+ "step": 7010
6320
+ },
6321
+ {
6322
+ "epoch": 1.9881403334867418,
6323
+ "grad_norm": 1.7239429950714111,
6324
+ "learning_rate": 3.7490821357390124e-06,
6325
+ "loss": 0.1708,
6326
+ "mean_token_accuracy": 0.7312010746449232,
6327
+ "num_tokens": 57506816.0,
6328
+ "step": 7020
6329
+ },
6330
+ {
6331
+ "epoch": 1.990972492654087,
6332
+ "grad_norm": 1.4694973230361938,
6333
+ "learning_rate": 3.7385922584705765e-06,
6334
+ "loss": 0.1418,
6335
+ "mean_token_accuracy": 0.7763209376484156,
6336
+ "num_tokens": 57588736.0,
6337
+ "step": 7030
6338
+ },
6339
+ {
6340
+ "epoch": 1.9938046518214323,
6341
+ "grad_norm": 1.2129154205322266,
6342
+ "learning_rate": 3.72810238120214e-06,
6343
+ "loss": 0.1644,
6344
+ "mean_token_accuracy": 0.7523361068218947,
6345
+ "num_tokens": 57670656.0,
6346
+ "step": 7040
6347
+ },
6348
+ {
6349
+ "epoch": 1.9966368109887775,
6350
+ "grad_norm": 1.0555603504180908,
6351
+ "learning_rate": 3.7176125039337042e-06,
6352
+ "loss": 0.1415,
6353
+ "mean_token_accuracy": 0.7746942289173603,
6354
+ "num_tokens": 57752576.0,
6355
+ "step": 7050
6356
+ },
6357
+ {
6358
+ "epoch": 1.9994689701561228,
6359
+ "grad_norm": 1.5597031116485596,
6360
+ "learning_rate": 3.7071226266652683e-06,
6361
+ "loss": 0.1409,
6362
+ "mean_token_accuracy": 0.7636497039347887,
6363
+ "num_tokens": 57834496.0,
6364
+ "step": 7060
6365
+ },
6366
+ {
6367
+ "epoch": 2.002265727333876,
6368
+ "grad_norm": 1.1237660646438599,
6369
+ "learning_rate": 3.6966327493968324e-06,
6370
+ "loss": 0.1065,
6371
+ "mean_token_accuracy": 0.7782952212080171,
6372
+ "num_tokens": 57915392.0,
6373
+ "step": 7070
6374
+ },
6375
+ {
6376
+ "epoch": 2.0050978865012214,
6377
+ "grad_norm": 1.2385786771774292,
6378
+ "learning_rate": 3.6861428721283965e-06,
6379
+ "loss": 0.11,
6380
+ "mean_token_accuracy": 0.7639187891036272,
6381
+ "num_tokens": 57997312.0,
6382
+ "step": 7080
6383
+ },
6384
+ {
6385
+ "epoch": 2.0079300456685667,
6386
+ "grad_norm": 1.055507779121399,
6387
+ "learning_rate": 3.6756529948599605e-06,
6388
+ "loss": 0.113,
6389
+ "mean_token_accuracy": 0.7746575351804494,
6390
+ "num_tokens": 58079232.0,
6391
+ "step": 7090
6392
+ },
6393
+ {
6394
+ "epoch": 2.010762204835912,
6395
+ "grad_norm": 1.1237515211105347,
6396
+ "learning_rate": 3.665163117591524e-06,
6397
+ "loss": 0.0966,
6398
+ "mean_token_accuracy": 0.781531311571598,
6399
+ "num_tokens": 58161152.0,
6400
+ "step": 7100
6401
+ },
6402
+ {
6403
+ "epoch": 2.013594364003257,
6404
+ "grad_norm": 1.7721480131149292,
6405
+ "learning_rate": 3.6546732403230883e-06,
6406
+ "loss": 0.1252,
6407
+ "mean_token_accuracy": 0.761117908358574,
6408
+ "num_tokens": 58243072.0,
6409
+ "step": 7110
6410
+ },
6411
+ {
6412
+ "epoch": 2.016426523170602,
6413
+ "grad_norm": 1.310492992401123,
6414
+ "learning_rate": 3.6441833630546523e-06,
6415
+ "loss": 0.1261,
6416
+ "mean_token_accuracy": 0.7766267094761133,
6417
+ "num_tokens": 58324992.0,
6418
+ "step": 7120
6419
+ },
6420
+ {
6421
+ "epoch": 2.0192586823379473,
6422
+ "grad_norm": 1.3270450830459595,
6423
+ "learning_rate": 3.6336934857862164e-06,
6424
+ "loss": 0.1702,
6425
+ "mean_token_accuracy": 0.729219663143158,
6426
+ "num_tokens": 58406912.0,
6427
+ "step": 7130
6428
+ },
6429
+ {
6430
+ "epoch": 2.0220908415052925,
6431
+ "grad_norm": 1.4443167448043823,
6432
+ "learning_rate": 3.623203608517781e-06,
6433
+ "loss": 0.1288,
6434
+ "mean_token_accuracy": 0.7712328769266605,
6435
+ "num_tokens": 58488832.0,
6436
+ "step": 7140
6437
+ },
6438
+ {
6439
+ "epoch": 2.0249230006726378,
6440
+ "grad_norm": 1.5470432043075562,
6441
+ "learning_rate": 3.612713731249345e-06,
6442
+ "loss": 0.1346,
6443
+ "mean_token_accuracy": 0.7625000022351742,
6444
+ "num_tokens": 58570752.0,
6445
+ "step": 7150
6446
+ },
6447
+ {
6448
+ "epoch": 2.027755159839983,
6449
+ "grad_norm": 1.260907530784607,
6450
+ "learning_rate": 3.6022238539809086e-06,
6451
+ "loss": 0.104,
6452
+ "mean_token_accuracy": 0.7806873768568039,
6453
+ "num_tokens": 58652672.0,
6454
+ "step": 7160
6455
+ },
6456
+ {
6457
+ "epoch": 2.0305873190073283,
6458
+ "grad_norm": 0.9440592527389526,
6459
+ "learning_rate": 3.5917339767124727e-06,
6460
+ "loss": 0.1294,
6461
+ "mean_token_accuracy": 0.75951565541327,
6462
+ "num_tokens": 58734592.0,
6463
+ "step": 7170
6464
+ },
6465
+ {
6466
+ "epoch": 2.0334194781746735,
6467
+ "grad_norm": 1.4341137409210205,
6468
+ "learning_rate": 3.581244099444037e-06,
6469
+ "loss": 0.1188,
6470
+ "mean_token_accuracy": 0.7584393329918384,
6471
+ "num_tokens": 58816512.0,
6472
+ "step": 7180
6473
+ },
6474
+ {
6475
+ "epoch": 2.0362516373420187,
6476
+ "grad_norm": 1.1970900297164917,
6477
+ "learning_rate": 3.570754222175601e-06,
6478
+ "loss": 0.0963,
6479
+ "mean_token_accuracy": 0.7981164366006851,
6480
+ "num_tokens": 58898432.0,
6481
+ "step": 7190
6482
+ },
6483
+ {
6484
+ "epoch": 2.039083796509364,
6485
+ "grad_norm": 1.7112830877304077,
6486
+ "learning_rate": 3.560264344907165e-06,
6487
+ "loss": 0.1528,
6488
+ "mean_token_accuracy": 0.7553693726658821,
6489
+ "num_tokens": 58980352.0,
6490
+ "step": 7200
6491
+ },
6492
+ {
6493
+ "epoch": 2.0419159556767092,
6494
+ "grad_norm": 1.4228155612945557,
6495
+ "learning_rate": 3.549774467638729e-06,
6496
+ "loss": 0.1225,
6497
+ "mean_token_accuracy": 0.7504280801862478,
6498
+ "num_tokens": 59062272.0,
6499
+ "step": 7210
6500
+ },
6501
+ {
6502
+ "epoch": 2.044748114844054,
6503
+ "grad_norm": 1.5878945589065552,
6504
+ "learning_rate": 3.5392845903702927e-06,
6505
+ "loss": 0.1153,
6506
+ "mean_token_accuracy": 0.7910225056111813,
6507
+ "num_tokens": 59144192.0,
6508
+ "step": 7220
6509
+ },
6510
+ {
6511
+ "epoch": 2.0475802740113993,
6512
+ "grad_norm": 1.1206371784210205,
6513
+ "learning_rate": 3.5287947131018567e-06,
6514
+ "loss": 0.1279,
6515
+ "mean_token_accuracy": 0.7498165342956782,
6516
+ "num_tokens": 59226112.0,
6517
+ "step": 7230
6518
+ },
6519
+ {
6520
+ "epoch": 2.0504124331787446,
6521
+ "grad_norm": 1.6621618270874023,
6522
+ "learning_rate": 3.518304835833421e-06,
6523
+ "loss": 0.1217,
6524
+ "mean_token_accuracy": 0.7606286689639091,
6525
+ "num_tokens": 59308032.0,
6526
+ "step": 7240
6527
+ },
6528
+ {
6529
+ "epoch": 2.05324459234609,
6530
+ "grad_norm": 1.387544870376587,
6531
+ "learning_rate": 3.507814958564985e-06,
6532
+ "loss": 0.1446,
6533
+ "mean_token_accuracy": 0.7522015646100044,
6534
+ "num_tokens": 59389952.0,
6535
+ "step": 7250
6536
+ },
6537
+ {
6538
+ "epoch": 2.056076751513435,
6539
+ "grad_norm": 1.303863525390625,
6540
+ "learning_rate": 3.4973250812965494e-06,
6541
+ "loss": 0.1362,
6542
+ "mean_token_accuracy": 0.7659858129918575,
6543
+ "num_tokens": 59471872.0,
6544
+ "step": 7260
6545
+ },
6546
+ {
6547
+ "epoch": 2.0589089106807803,
6548
+ "grad_norm": 1.3478611707687378,
6549
+ "learning_rate": 3.4868352040281135e-06,
6550
+ "loss": 0.1246,
6551
+ "mean_token_accuracy": 0.7809809193015098,
6552
+ "num_tokens": 59553792.0,
6553
+ "step": 7270
6554
+ },
6555
+ {
6556
+ "epoch": 2.0617410698481256,
6557
+ "grad_norm": 1.1989835500717163,
6558
+ "learning_rate": 3.476345326759677e-06,
6559
+ "loss": 0.1085,
6560
+ "mean_token_accuracy": 0.7756604697555304,
6561
+ "num_tokens": 59635712.0,
6562
+ "step": 7280
6563
+ },
6564
+ {
6565
+ "epoch": 2.064573229015471,
6566
+ "grad_norm": 1.1155551671981812,
6567
+ "learning_rate": 3.465855449491241e-06,
6568
+ "loss": 0.1089,
6569
+ "mean_token_accuracy": 0.7770547963678837,
6570
+ "num_tokens": 59717632.0,
6571
+ "step": 7290
6572
+ },
6573
+ {
6574
+ "epoch": 2.067405388182816,
6575
+ "grad_norm": 1.2903120517730713,
6576
+ "learning_rate": 3.4553655722228053e-06,
6577
+ "loss": 0.1234,
6578
+ "mean_token_accuracy": 0.7721501953899861,
6579
+ "num_tokens": 59799552.0,
6580
+ "step": 7300
6581
+ },
6582
+ {
6583
+ "epoch": 2.0702375473501613,
6584
+ "grad_norm": 1.6666812896728516,
6585
+ "learning_rate": 3.4448756949543694e-06,
6586
+ "loss": 0.1253,
6587
+ "mean_token_accuracy": 0.7631360098719597,
6588
+ "num_tokens": 59881472.0,
6589
+ "step": 7310
6590
+ },
6591
+ {
6592
+ "epoch": 2.073069706517506,
6593
+ "grad_norm": 1.2040691375732422,
6594
+ "learning_rate": 3.4343858176859334e-06,
6595
+ "loss": 0.1359,
6596
+ "mean_token_accuracy": 0.7778008833527565,
6597
+ "num_tokens": 59963392.0,
6598
+ "step": 7320
6599
+ },
6600
+ {
6601
+ "epoch": 2.0759018656848514,
6602
+ "grad_norm": 1.2499768733978271,
6603
+ "learning_rate": 3.4238959404174975e-06,
6604
+ "loss": 0.1113,
6605
+ "mean_token_accuracy": 0.7751345403492451,
6606
+ "num_tokens": 60045312.0,
6607
+ "step": 7330
6608
+ },
6609
+ {
6610
+ "epoch": 2.0787340248521966,
6611
+ "grad_norm": 0.9466302990913391,
6612
+ "learning_rate": 3.413406063149061e-06,
6613
+ "loss": 0.1402,
6614
+ "mean_token_accuracy": 0.7586472611874342,
6615
+ "num_tokens": 60127232.0,
6616
+ "step": 7340
6617
+ },
6618
+ {
6619
+ "epoch": 2.081566184019542,
6620
+ "grad_norm": 1.0079811811447144,
6621
+ "learning_rate": 3.4029161858806252e-06,
6622
+ "loss": 0.1033,
6623
+ "mean_token_accuracy": 0.7875611506402492,
6624
+ "num_tokens": 60209152.0,
6625
+ "step": 7350
6626
+ },
6627
+ {
6628
+ "epoch": 2.084398343186887,
6629
+ "grad_norm": 1.493399977684021,
6630
+ "learning_rate": 3.3924263086121893e-06,
6631
+ "loss": 0.1128,
6632
+ "mean_token_accuracy": 0.7791707415133715,
6633
+ "num_tokens": 60291072.0,
6634
+ "step": 7360
6635
+ },
6636
+ {
6637
+ "epoch": 2.0872305023542324,
6638
+ "grad_norm": 1.5899913311004639,
6639
+ "learning_rate": 3.381936431343754e-06,
6640
+ "loss": 0.154,
6641
+ "mean_token_accuracy": 0.7400317970663309,
6642
+ "num_tokens": 60372992.0,
6643
+ "step": 7370
6644
+ },
6645
+ {
6646
+ "epoch": 2.0900626615215776,
6647
+ "grad_norm": 1.5314220190048218,
6648
+ "learning_rate": 3.371446554075318e-06,
6649
+ "loss": 0.1232,
6650
+ "mean_token_accuracy": 0.7905455000698567,
6651
+ "num_tokens": 60454912.0,
6652
+ "step": 7380
6653
+ },
6654
+ {
6655
+ "epoch": 2.092894820688923,
6656
+ "grad_norm": 1.2721341848373413,
6657
+ "learning_rate": 3.360956676806882e-06,
6658
+ "loss": 0.1085,
6659
+ "mean_token_accuracy": 0.7729696653783321,
6660
+ "num_tokens": 60536832.0,
6661
+ "step": 7390
6662
+ },
6663
+ {
6664
+ "epoch": 2.095726979856268,
6665
+ "grad_norm": 1.1642765998840332,
6666
+ "learning_rate": 3.3504667995384456e-06,
6667
+ "loss": 0.1081,
6668
+ "mean_token_accuracy": 0.7889799427241087,
6669
+ "num_tokens": 60618752.0,
6670
+ "step": 7400
6671
+ },
6672
+ {
6673
+ "epoch": 2.098559139023613,
6674
+ "grad_norm": 1.3702284097671509,
6675
+ "learning_rate": 3.3399769222700097e-06,
6676
+ "loss": 0.1094,
6677
+ "mean_token_accuracy": 0.7912671204656363,
6678
+ "num_tokens": 60700672.0,
6679
+ "step": 7410
6680
+ },
6681
+ {
6682
+ "epoch": 2.101391298190958,
6683
+ "grad_norm": 1.2944626808166504,
6684
+ "learning_rate": 3.3294870450015738e-06,
6685
+ "loss": 0.1313,
6686
+ "mean_token_accuracy": 0.7547211341559887,
6687
+ "num_tokens": 60782592.0,
6688
+ "step": 7420
6689
+ },
6690
+ {
6691
+ "epoch": 2.1042234573583034,
6692
+ "grad_norm": 1.1357483863830566,
6693
+ "learning_rate": 3.318997167733138e-06,
6694
+ "loss": 0.1069,
6695
+ "mean_token_accuracy": 0.7714530322700739,
6696
+ "num_tokens": 60864512.0,
6697
+ "step": 7430
6698
+ },
6699
+ {
6700
+ "epoch": 2.1070556165256487,
6701
+ "grad_norm": 1.0823742151260376,
6702
+ "learning_rate": 3.308507290464702e-06,
6703
+ "loss": 0.1106,
6704
+ "mean_token_accuracy": 0.7675146773457527,
6705
+ "num_tokens": 60946432.0,
6706
+ "step": 7440
6707
+ },
6708
+ {
6709
+ "epoch": 2.109887775692994,
6710
+ "grad_norm": 1.2482222318649292,
6711
+ "learning_rate": 3.298017413196266e-06,
6712
+ "loss": 0.1122,
6713
+ "mean_token_accuracy": 0.7684442289173603,
6714
+ "num_tokens": 61028352.0,
6715
+ "step": 7450
6716
+ },
6717
+ {
6718
+ "epoch": 2.112719934860339,
6719
+ "grad_norm": 1.9791706800460815,
6720
+ "learning_rate": 3.2875275359278296e-06,
6721
+ "loss": 0.1222,
6722
+ "mean_token_accuracy": 0.7773728009313345,
6723
+ "num_tokens": 61110272.0,
6724
+ "step": 7460
6725
+ },
6726
+ {
6727
+ "epoch": 2.1155520940276844,
6728
+ "grad_norm": 1.8661694526672363,
6729
+ "learning_rate": 3.2770376586593937e-06,
6730
+ "loss": 0.1091,
6731
+ "mean_token_accuracy": 0.7745596896857023,
6732
+ "num_tokens": 61192192.0,
6733
+ "step": 7470
6734
+ },
6735
+ {
6736
+ "epoch": 2.1183842531950297,
6737
+ "grad_norm": 0.9961443543434143,
6738
+ "learning_rate": 3.2665477813909578e-06,
6739
+ "loss": 0.1212,
6740
+ "mean_token_accuracy": 0.7665973592549562,
6741
+ "num_tokens": 61274112.0,
6742
+ "step": 7480
6743
+ },
6744
+ {
6745
+ "epoch": 2.121216412362375,
6746
+ "grad_norm": 1.1755738258361816,
6747
+ "learning_rate": 3.2560579041225223e-06,
6748
+ "loss": 0.1216,
6749
+ "mean_token_accuracy": 0.7842465721070766,
6750
+ "num_tokens": 61356032.0,
6751
+ "step": 7490
6752
+ },
6753
+ {
6754
+ "epoch": 2.12404857152972,
6755
+ "grad_norm": 1.227830171585083,
6756
+ "learning_rate": 3.2455680268540864e-06,
6757
+ "loss": 0.1372,
6758
+ "mean_token_accuracy": 0.7647749528288841,
6759
+ "num_tokens": 61437952.0,
6760
+ "step": 7500
6761
  }
6762
  ],
6763
  "logging_steps": 10,
 
6777
  "attributes": {}
6778
  }
6779
  },
6780
+ "total_flos": 1.6236851051574067e+17,
6781
  "train_batch_size": 2,
6782
  "trial_name": null,
6783
  "trial_params": null