{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02202036884117809,
"grad_norm": 8.139582633972168,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.04783125,
"loss_accumulated": 16.7653,
"step": 10
},
{
"epoch": 0.04404073768235618,
"grad_norm": 6.492475509643555,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.2244375,
"loss_accumulated": 19.591,
"step": 20
},
{
"epoch": 0.06606110652353427,
"grad_norm": 8.457890510559082,
"learning_rate": 5.8e-06,
"loss": 1.0909875,
"loss_accumulated": 17.4558,
"step": 30
},
{
"epoch": 0.08808147536471236,
"grad_norm": 5.892795085906982,
"learning_rate": 7.800000000000002e-06,
"loss": 1.07938125,
"loss_accumulated": 17.2701,
"step": 40
},
{
"epoch": 0.11010184420589045,
"grad_norm": 6.56881856918335,
"learning_rate": 9.800000000000001e-06,
"loss": 1.10010625,
"loss_accumulated": 17.6017,
"step": 50
},
{
"epoch": 0.13212221304706853,
"grad_norm": 4.701152801513672,
"learning_rate": 9.895348837209303e-06,
"loss": 1.0741625,
"loss_accumulated": 17.1866,
"step": 60
},
{
"epoch": 0.15414258188824662,
"grad_norm": 9.48551082611084,
"learning_rate": 9.779069767441862e-06,
"loss": 1.04658125,
"loss_accumulated": 16.7453,
"step": 70
},
{
"epoch": 0.17616295072942473,
"grad_norm": 5.215735912322998,
"learning_rate": 9.662790697674419e-06,
"loss": 1.044925,
"loss_accumulated": 16.7188,
"step": 80
},
{
"epoch": 0.1981833195706028,
"grad_norm": 6.0997314453125,
"learning_rate": 9.546511627906978e-06,
"loss": 1.05530625,
"loss_accumulated": 16.8849,
"step": 90
},
{
"epoch": 0.2202036884117809,
"grad_norm": 6.653469085693359,
"learning_rate": 9.430232558139536e-06,
"loss": 1.08026875,
"loss_accumulated": 17.2843,
"step": 100
},
{
"epoch": 0.2202036884117809,
"eval_loss": 1.0868104696273804,
"eval_runtime": 105.087,
"eval_samples_per_second": 7.689,
"eval_steps_per_second": 7.689,
"step": 100
},
{
"epoch": 0.24222405725295898,
"grad_norm": 5.707369804382324,
"learning_rate": 9.313953488372095e-06,
"loss": 1.24023125,
"loss_accumulated": 19.8437,
"step": 110
},
{
"epoch": 0.26424442609413706,
"grad_norm": 8.452932357788086,
"learning_rate": 9.197674418604652e-06,
"loss": 1.02721875,
"loss_accumulated": 16.4355,
"step": 120
},
{
"epoch": 0.28626479493531515,
"grad_norm": 10.253166198730469,
"learning_rate": 9.08139534883721e-06,
"loss": 1.0215,
"loss_accumulated": 16.344,
"step": 130
},
{
"epoch": 0.30828516377649323,
"grad_norm": 7.390908241271973,
"learning_rate": 8.965116279069767e-06,
"loss": 1.0461375,
"loss_accumulated": 16.7382,
"step": 140
},
{
"epoch": 0.33030553261767137,
"grad_norm": 8.768054962158203,
"learning_rate": 8.848837209302326e-06,
"loss": 1.0388375,
"loss_accumulated": 16.6214,
"step": 150
},
{
"epoch": 0.35232590145884946,
"grad_norm": 7.710715293884277,
"learning_rate": 8.732558139534885e-06,
"loss": 1.04136875,
"loss_accumulated": 16.6619,
"step": 160
},
{
"epoch": 0.37434627030002754,
"grad_norm": 7.349565029144287,
"learning_rate": 8.616279069767443e-06,
"loss": 1.0436375,
"loss_accumulated": 16.6982,
"step": 170
},
{
"epoch": 0.3963666391412056,
"grad_norm": 9.514286994934082,
"learning_rate": 8.5e-06,
"loss": 1.03064375,
"loss_accumulated": 16.4903,
"step": 180
},
{
"epoch": 0.4183870079823837,
"grad_norm": 10.636228561401367,
"learning_rate": 8.383720930232559e-06,
"loss": 1.06431875,
"loss_accumulated": 17.0291,
"step": 190
},
{
"epoch": 0.4404073768235618,
"grad_norm": 8.870360374450684,
"learning_rate": 8.267441860465118e-06,
"loss": 1.04586875,
"loss_accumulated": 16.7339,
"step": 200
},
{
"epoch": 0.4404073768235618,
"eval_loss": 1.0789271593093872,
"eval_runtime": 135.2266,
"eval_samples_per_second": 5.975,
"eval_steps_per_second": 5.975,
"step": 200
},
{
"epoch": 0.4624277456647399,
"grad_norm": 12.370259284973145,
"learning_rate": 8.151162790697676e-06,
"loss": 1.0686875,
"loss_accumulated": 17.099,
"step": 210
},
{
"epoch": 0.48444811450591796,
"grad_norm": 9.170878410339355,
"learning_rate": 8.034883720930233e-06,
"loss": 1.07258125,
"loss_accumulated": 17.1613,
"step": 220
},
{
"epoch": 0.506468483347096,
"grad_norm": 14.712733268737793,
"learning_rate": 7.918604651162792e-06,
"loss": 1.09678125,
"loss_accumulated": 17.5485,
"step": 230
},
{
"epoch": 0.5284888521882741,
"grad_norm": 9.565340042114258,
"learning_rate": 7.80232558139535e-06,
"loss": 1.12413125,
"loss_accumulated": 17.9861,
"step": 240
},
{
"epoch": 0.5505092210294522,
"grad_norm": 11.5183744430542,
"learning_rate": 7.686046511627909e-06,
"loss": 1.0627125,
"loss_accumulated": 17.0034,
"step": 250
},
{
"epoch": 0.5725295898706303,
"grad_norm": 8.43002986907959,
"learning_rate": 7.569767441860466e-06,
"loss": 1.043175,
"loss_accumulated": 16.6908,
"step": 260
},
{
"epoch": 0.5945499587118084,
"grad_norm": 8.955143928527832,
"learning_rate": 7.453488372093024e-06,
"loss": 1.020925,
"loss_accumulated": 16.3348,
"step": 270
},
{
"epoch": 0.6165703275529865,
"grad_norm": 9.91588020324707,
"learning_rate": 7.3372093023255816e-06,
"loss": 1.1777,
"loss_accumulated": 18.8432,
"step": 280
},
{
"epoch": 0.6385906963941645,
"grad_norm": 13.177949905395508,
"learning_rate": 7.22093023255814e-06,
"loss": 1.0682625,
"loss_accumulated": 17.0922,
"step": 290
},
{
"epoch": 0.6606110652353427,
"grad_norm": 9.943979263305664,
"learning_rate": 7.104651162790698e-06,
"loss": 1.03318125,
"loss_accumulated": 16.5309,
"step": 300
},
{
"epoch": 0.6606110652353427,
"eval_loss": 1.0708842277526855,
"eval_runtime": 105.1052,
"eval_samples_per_second": 7.688,
"eval_steps_per_second": 7.688,
"step": 300
},
{
"epoch": 0.6826314340765208,
"grad_norm": 14.787008285522461,
"learning_rate": 6.988372093023257e-06,
"loss": 1.2248875,
"loss_accumulated": 19.5982,
"step": 310
},
{
"epoch": 0.7046518029176989,
"grad_norm": 16.776479721069336,
"learning_rate": 6.8720930232558146e-06,
"loss": 1.07100625,
"loss_accumulated": 17.1361,
"step": 320
},
{
"epoch": 0.726672171758877,
"grad_norm": 10.714720726013184,
"learning_rate": 6.755813953488373e-06,
"loss": 1.1591,
"loss_accumulated": 18.5456,
"step": 330
},
{
"epoch": 0.7486925406000551,
"grad_norm": 9.997598648071289,
"learning_rate": 6.63953488372093e-06,
"loss": 1.1428,
"loss_accumulated": 18.2848,
"step": 340
},
{
"epoch": 0.7707129094412332,
"grad_norm": 11.680377006530762,
"learning_rate": 6.5232558139534885e-06,
"loss": 1.0948625,
"loss_accumulated": 17.5178,
"step": 350
},
{
"epoch": 0.7927332782824112,
"grad_norm": 11.191390037536621,
"learning_rate": 6.4069767441860476e-06,
"loss": 1.07081875,
"loss_accumulated": 17.1331,
"step": 360
},
{
"epoch": 0.8147536471235893,
"grad_norm": 13.758176803588867,
"learning_rate": 6.290697674418606e-06,
"loss": 1.04526875,
"loss_accumulated": 16.7243,
"step": 370
},
{
"epoch": 0.8367740159647674,
"grad_norm": 17.639863967895508,
"learning_rate": 6.174418604651163e-06,
"loss": 1.07876875,
"loss_accumulated": 17.2603,
"step": 380
},
{
"epoch": 0.8587943848059455,
"grad_norm": 10.742379188537598,
"learning_rate": 6.0581395348837215e-06,
"loss": 1.12343125,
"loss_accumulated": 17.9749,
"step": 390
},
{
"epoch": 0.8808147536471236,
"grad_norm": 11.99518871307373,
"learning_rate": 5.941860465116279e-06,
"loss": 1.017175,
"loss_accumulated": 16.2748,
"step": 400
},
{
"epoch": 0.8808147536471236,
"eval_loss": 1.0637564659118652,
"eval_runtime": 109.5725,
"eval_samples_per_second": 7.374,
"eval_steps_per_second": 7.374,
"step": 400
},
{
"epoch": 0.9028351224883017,
"grad_norm": 14.046647071838379,
"learning_rate": 5.825581395348837e-06,
"loss": 1.1387875,
"loss_accumulated": 18.2206,
"step": 410
},
{
"epoch": 0.9248554913294798,
"grad_norm": 15.011589050292969,
"learning_rate": 5.709302325581396e-06,
"loss": 1.02514375,
"loss_accumulated": 16.4023,
"step": 420
},
{
"epoch": 0.9468758601706578,
"grad_norm": 11.991171836853027,
"learning_rate": 5.5930232558139544e-06,
"loss": 1.0738,
"loss_accumulated": 17.1808,
"step": 430
},
{
"epoch": 0.9688962290118359,
"grad_norm": 12.477208137512207,
"learning_rate": 5.476744186046512e-06,
"loss": 1.05196875,
"loss_accumulated": 16.8315,
"step": 440
},
{
"epoch": 0.990916597853014,
"grad_norm": 12.482328414916992,
"learning_rate": 5.36046511627907e-06,
"loss": 1.06891875,
"loss_accumulated": 17.1027,
"step": 450
},
{
"epoch": 1.011010184420589,
"grad_norm": 21.27669906616211,
"learning_rate": 5.2441860465116275e-06,
"loss": 0.95168125,
"loss_accumulated": 15.2269,
"step": 460
},
{
"epoch": 1.0330305532617672,
"grad_norm": 13.355972290039062,
"learning_rate": 5.127906976744187e-06,
"loss": 1.06833125,
"loss_accumulated": 17.0933,
"step": 470
},
{
"epoch": 1.0550509221029452,
"grad_norm": 15.472939491271973,
"learning_rate": 5.011627906976745e-06,
"loss": 1.1307125,
"loss_accumulated": 18.0914,
"step": 480
},
{
"epoch": 1.0770712909441233,
"grad_norm": 12.726252555847168,
"learning_rate": 4.895348837209303e-06,
"loss": 1.05006875,
"loss_accumulated": 16.8011,
"step": 490
},
{
"epoch": 1.0990916597853013,
"grad_norm": 14.358366966247559,
"learning_rate": 4.7790697674418605e-06,
"loss": 1.0619125,
"loss_accumulated": 16.9906,
"step": 500
},
{
"epoch": 1.0990916597853013,
"eval_loss": 1.0603028535842896,
"eval_runtime": 112.2463,
"eval_samples_per_second": 7.198,
"eval_steps_per_second": 7.198,
"step": 500
},
{
"epoch": 1.1211120286264795,
"grad_norm": 13.931950569152832,
"learning_rate": 4.66279069767442e-06,
"loss": 1.05581875,
"loss_accumulated": 16.8931,
"step": 510
},
{
"epoch": 1.1431323974676575,
"grad_norm": 12.517531394958496,
"learning_rate": 4.546511627906977e-06,
"loss": 1.05785,
"loss_accumulated": 16.9256,
"step": 520
},
{
"epoch": 1.1651527663088357,
"grad_norm": 17.931734085083008,
"learning_rate": 4.430232558139535e-06,
"loss": 0.9880875,
"loss_accumulated": 15.8094,
"step": 530
},
{
"epoch": 1.1871731351500139,
"grad_norm": 13.656305313110352,
"learning_rate": 4.3139534883720935e-06,
"loss": 1.0042,
"loss_accumulated": 16.0672,
"step": 540
},
{
"epoch": 1.2091935039911919,
"grad_norm": 17.034332275390625,
"learning_rate": 4.197674418604652e-06,
"loss": 1.054775,
"loss_accumulated": 16.8764,
"step": 550
},
{
"epoch": 1.2312138728323698,
"grad_norm": 12.149453163146973,
"learning_rate": 4.08139534883721e-06,
"loss": 1.062975,
"loss_accumulated": 17.0076,
"step": 560
},
{
"epoch": 1.253234241673548,
"grad_norm": 12.923322677612305,
"learning_rate": 3.965116279069768e-06,
"loss": 1.18115,
"loss_accumulated": 18.8984,
"step": 570
},
{
"epoch": 1.2752546105147262,
"grad_norm": 12.064355850219727,
"learning_rate": 3.848837209302326e-06,
"loss": 0.98351875,
"loss_accumulated": 15.7363,
"step": 580
},
{
"epoch": 1.2972749793559042,
"grad_norm": 14.70433521270752,
"learning_rate": 3.7325581395348843e-06,
"loss": 1.0176125,
"loss_accumulated": 16.2818,
"step": 590
},
{
"epoch": 1.3192953481970822,
"grad_norm": 14.562840461730957,
"learning_rate": 3.616279069767442e-06,
"loss": 1.00518125,
"loss_accumulated": 16.0829,
"step": 600
},
{
"epoch": 1.3192953481970822,
"eval_loss": 1.056916356086731,
"eval_runtime": 111.8754,
"eval_samples_per_second": 7.222,
"eval_steps_per_second": 7.222,
"step": 600
},
{
"epoch": 1.3413157170382604,
"grad_norm": 17.57223129272461,
"learning_rate": 3.5e-06,
"loss": 1.0139625,
"loss_accumulated": 16.2234,
"step": 610
},
{
"epoch": 1.3633360858794386,
"grad_norm": 21.535526275634766,
"learning_rate": 3.3837209302325586e-06,
"loss": 1.002475,
"loss_accumulated": 16.0396,
"step": 620
},
{
"epoch": 1.3853564547206165,
"grad_norm": 17.39263916015625,
"learning_rate": 3.2674418604651164e-06,
"loss": 1.035375,
"loss_accumulated": 16.566,
"step": 630
},
{
"epoch": 1.4073768235617947,
"grad_norm": 12.677567481994629,
"learning_rate": 3.151162790697675e-06,
"loss": 1.0291,
"loss_accumulated": 16.4656,
"step": 640
},
{
"epoch": 1.4293971924029727,
"grad_norm": 18.014575958251953,
"learning_rate": 3.034883720930233e-06,
"loss": 1.04011875,
"loss_accumulated": 16.6419,
"step": 650
},
{
"epoch": 1.4514175612441509,
"grad_norm": 13.09684944152832,
"learning_rate": 2.9186046511627908e-06,
"loss": 1.0107125,
"loss_accumulated": 16.1714,
"step": 660
},
{
"epoch": 1.4734379300853289,
"grad_norm": 19.25403594970703,
"learning_rate": 2.8023255813953494e-06,
"loss": 1.05929375,
"loss_accumulated": 16.9487,
"step": 670
},
{
"epoch": 1.495458298926507,
"grad_norm": 14.488405227661133,
"learning_rate": 2.6860465116279073e-06,
"loss": 1.0660875,
"loss_accumulated": 17.0574,
"step": 680
},
{
"epoch": 1.5174786677676853,
"grad_norm": 31.23447036743164,
"learning_rate": 2.569767441860465e-06,
"loss": 1.08340625,
"loss_accumulated": 17.3345,
"step": 690
},
{
"epoch": 1.5394990366088632,
"grad_norm": 14.270586967468262,
"learning_rate": 2.4534883720930233e-06,
"loss": 1.0713625,
"loss_accumulated": 17.1418,
"step": 700
},
{
"epoch": 1.5394990366088632,
"eval_loss": 1.054555892944336,
"eval_runtime": 105.0636,
"eval_samples_per_second": 7.691,
"eval_steps_per_second": 7.691,
"step": 700
},
{
"epoch": 1.5615194054500412,
"grad_norm": 13.536142349243164,
"learning_rate": 2.3372093023255816e-06,
"loss": 1.01050625,
"loss_accumulated": 16.1681,
"step": 710
},
{
"epoch": 1.5835397742912194,
"grad_norm": 12.942073822021484,
"learning_rate": 2.22093023255814e-06,
"loss": 1.0091625,
"loss_accumulated": 16.1466,
"step": 720
},
{
"epoch": 1.6055601431323976,
"grad_norm": 13.64247989654541,
"learning_rate": 2.104651162790698e-06,
"loss": 1.1232375,
"loss_accumulated": 17.9718,
"step": 730
},
{
"epoch": 1.6275805119735756,
"grad_norm": 17.50322914123535,
"learning_rate": 1.988372093023256e-06,
"loss": 1.08881875,
"loss_accumulated": 17.4211,
"step": 740
},
{
"epoch": 1.6496008808147535,
"grad_norm": 14.39642333984375,
"learning_rate": 1.872093023255814e-06,
"loss": 1.00690625,
"loss_accumulated": 16.1105,
"step": 750
},
{
"epoch": 1.6716212496559317,
"grad_norm": 17.793312072753906,
"learning_rate": 1.7558139534883722e-06,
"loss": 1.029125,
"loss_accumulated": 16.466,
"step": 760
},
{
"epoch": 1.69364161849711,
"grad_norm": 14.634993553161621,
"learning_rate": 1.6395348837209304e-06,
"loss": 1.1446875,
"loss_accumulated": 18.315,
"step": 770
},
{
"epoch": 1.715661987338288,
"grad_norm": 15.071901321411133,
"learning_rate": 1.5232558139534885e-06,
"loss": 1.05595,
"loss_accumulated": 16.8952,
"step": 780
},
{
"epoch": 1.7376823561794659,
"grad_norm": 16.431106567382812,
"learning_rate": 1.4069767441860465e-06,
"loss": 1.1066,
"loss_accumulated": 17.7056,
"step": 790
},
{
"epoch": 1.759702725020644,
"grad_norm": 13.163710594177246,
"learning_rate": 1.2906976744186048e-06,
"loss": 1.12951875,
"loss_accumulated": 18.0723,
"step": 800
},
{
"epoch": 1.759702725020644,
"eval_loss": 1.0527995824813843,
"eval_runtime": 105.1441,
"eval_samples_per_second": 7.685,
"eval_steps_per_second": 7.685,
"step": 800
},
{
"epoch": 1.7817230938618223,
"grad_norm": 21.816795349121094,
"learning_rate": 1.1744186046511628e-06,
"loss": 1.082375,
"loss_accumulated": 17.318,
"step": 810
},
{
"epoch": 1.8037434627030002,
"grad_norm": 14.944353103637695,
"learning_rate": 1.058139534883721e-06,
"loss": 1.02875,
"loss_accumulated": 16.46,
"step": 820
},
{
"epoch": 1.8257638315441782,
"grad_norm": 13.911181449890137,
"learning_rate": 9.418604651162791e-07,
"loss": 1.036625,
"loss_accumulated": 16.586,
"step": 830
},
{
"epoch": 1.8477842003853564,
"grad_norm": 15.232218742370605,
"learning_rate": 8.255813953488373e-07,
"loss": 1.0375875,
"loss_accumulated": 16.6014,
"step": 840
},
{
"epoch": 1.8698045692265346,
"grad_norm": 22.372093200683594,
"learning_rate": 7.093023255813954e-07,
"loss": 1.071775,
"loss_accumulated": 17.1484,
"step": 850
},
{
"epoch": 1.8918249380677126,
"grad_norm": 23.259920120239258,
"learning_rate": 5.930232558139536e-07,
"loss": 1.01205625,
"loss_accumulated": 16.1929,
"step": 860
},
{
"epoch": 1.9138453069088908,
"grad_norm": 14.526731491088867,
"learning_rate": 4.767441860465117e-07,
"loss": 1.09719375,
"loss_accumulated": 17.5551,
"step": 870
},
{
"epoch": 1.935865675750069,
"grad_norm": 18.644268035888672,
"learning_rate": 3.6046511627906984e-07,
"loss": 1.0167625,
"loss_accumulated": 16.2682,
"step": 880
},
{
"epoch": 1.957886044591247,
"grad_norm": 14.494958877563477,
"learning_rate": 2.4418604651162793e-07,
"loss": 1.06646875,
"loss_accumulated": 17.0635,
"step": 890
},
{
"epoch": 1.979906413432425,
"grad_norm": 12.328352928161621,
"learning_rate": 1.2790697674418605e-07,
"loss": 1.05874375,
"loss_accumulated": 16.9399,
"step": 900
},
{
"epoch": 1.979906413432425,
"eval_loss": 1.051900029182434,
"eval_runtime": 105.0206,
"eval_samples_per_second": 7.694,
"eval_steps_per_second": 7.694,
"step": 900
},
{
"epoch": 2.0,
"grad_norm": 5.32742166519165,
"learning_rate": 1.1627906976744186e-08,
"loss": 0.95435625,
"loss_accumulated": 15.2697,
"step": 910
},
{
"epoch": 2.0,
"step": 910,
"total_flos": 4.037627772142704e+17,
"train_loss": 17.02884989308787,
"train_runtime": 7294.6769,
"train_samples_per_second": 1.992,
"train_steps_per_second": 0.125
}
],
"logging_steps": 10,
"max_steps": 910,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.037627772142704e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}