{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.709726443768997,
  "eval_steps": 500,
  "global_step": 2250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007598784194528876,
      "grad_norm": 21.75,
      "learning_rate": 9e-06,
      "loss": 3.4313,
      "step": 10
    },
    {
      "epoch": 0.015197568389057751,
      "grad_norm": 81.0,
      "learning_rate": 1.9e-05,
      "loss": 2.2326,
      "step": 20
    },
    {
      "epoch": 0.022796352583586626,
      "grad_norm": 85.5,
      "learning_rate": 2.9e-05,
      "loss": 1.4425,
      "step": 30
    },
    {
      "epoch": 0.030395136778115502,
      "grad_norm": 2.609375,
      "learning_rate": 3.9000000000000006e-05,
      "loss": 0.9761,
      "step": 40
    },
    {
      "epoch": 0.037993920972644375,
      "grad_norm": 2.890625,
      "learning_rate": 4.9e-05,
      "loss": 0.8592,
      "step": 50
    },
    {
      "epoch": 0.04559270516717325,
      "grad_norm": 2.640625,
      "learning_rate": 5.9e-05,
      "loss": 0.8049,
      "step": 60
    },
    {
      "epoch": 0.05319148936170213,
      "grad_norm": 2.9375,
      "learning_rate": 6.9e-05,
      "loss": 0.9155,
      "step": 70
    },
    {
      "epoch": 0.060790273556231005,
      "grad_norm": 1.9453125,
      "learning_rate": 7.900000000000001e-05,
      "loss": 0.8197,
      "step": 80
    },
    {
      "epoch": 0.06838905775075987,
      "grad_norm": 2.0,
      "learning_rate": 8.900000000000001e-05,
      "loss": 0.7834,
      "step": 90
    },
    {
      "epoch": 0.07598784194528875,
      "grad_norm": 2.203125,
      "learning_rate": 9.900000000000001e-05,
      "loss": 0.8179,
      "step": 100
    },
    {
      "epoch": 0.08358662613981763,
      "grad_norm": 1.984375,
      "learning_rate": 9.958139534883721e-05,
      "loss": 0.7661,
      "step": 110
    },
    {
      "epoch": 0.0911854103343465,
      "grad_norm": 1.9140625,
      "learning_rate": 9.911627906976745e-05,
      "loss": 0.8447,
      "step": 120
    },
    {
      "epoch": 0.09878419452887538,
      "grad_norm": 1.6484375,
      "learning_rate": 9.865116279069768e-05,
      "loss": 0.8758,
      "step": 130
    },
    {
      "epoch": 0.10638297872340426,
      "grad_norm": 2.171875,
      "learning_rate": 9.818604651162792e-05,
      "loss": 0.752,
      "step": 140
    },
    {
      "epoch": 0.11398176291793313,
      "grad_norm": 1.671875,
      "learning_rate": 9.772093023255814e-05,
      "loss": 0.788,
      "step": 150
    },
    {
      "epoch": 0.12158054711246201,
      "grad_norm": 1.390625,
      "learning_rate": 9.725581395348837e-05,
      "loss": 0.7352,
      "step": 160
    },
    {
      "epoch": 0.12917933130699089,
      "grad_norm": 1.5546875,
      "learning_rate": 9.67906976744186e-05,
      "loss": 0.7905,
      "step": 170
    },
    {
      "epoch": 0.13677811550151975,
      "grad_norm": 1.6953125,
      "learning_rate": 9.632558139534884e-05,
      "loss": 0.7821,
      "step": 180
    },
    {
      "epoch": 0.14437689969604864,
      "grad_norm": 1.359375,
      "learning_rate": 9.586046511627908e-05,
      "loss": 0.6274,
      "step": 190
    },
    {
      "epoch": 0.1519756838905775,
      "grad_norm": 1.6875,
      "learning_rate": 9.539534883720931e-05,
      "loss": 0.7823,
      "step": 200
    },
    {
      "epoch": 0.1595744680851064,
      "grad_norm": 1.75,
      "learning_rate": 9.493023255813955e-05,
      "loss": 0.725,
      "step": 210
    },
    {
      "epoch": 0.16717325227963525,
      "grad_norm": 1.3828125,
      "learning_rate": 9.446511627906977e-05,
      "loss": 0.7733,
      "step": 220
    },
    {
      "epoch": 0.17477203647416414,
      "grad_norm": 1.5390625,
      "learning_rate": 9.4e-05,
      "loss": 0.7945,
      "step": 230
    },
    {
      "epoch": 0.182370820668693,
      "grad_norm": 1.5625,
      "learning_rate": 9.353488372093023e-05,
      "loss": 0.7283,
      "step": 240
    },
    {
      "epoch": 0.1899696048632219,
      "grad_norm": 1.578125,
      "learning_rate": 9.306976744186047e-05,
      "loss": 0.6244,
      "step": 250
    },
    {
      "epoch": 0.19756838905775076,
      "grad_norm": 1.546875,
      "learning_rate": 9.26046511627907e-05,
      "loss": 0.6776,
      "step": 260
    },
    {
      "epoch": 0.20516717325227962,
      "grad_norm": 1.234375,
      "learning_rate": 9.213953488372094e-05,
      "loss": 0.6386,
      "step": 270
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 1.53125,
      "learning_rate": 9.167441860465116e-05,
      "loss": 0.6425,
      "step": 280
    },
    {
      "epoch": 0.22036474164133737,
      "grad_norm": 1.453125,
      "learning_rate": 9.12093023255814e-05,
      "loss": 0.5976,
      "step": 290
    },
    {
      "epoch": 0.22796352583586627,
      "grad_norm": 1.40625,
      "learning_rate": 9.074418604651164e-05,
      "loss": 0.6518,
      "step": 300
    },
    {
      "epoch": 0.23556231003039513,
      "grad_norm": 1.71875,
      "learning_rate": 9.027906976744186e-05,
      "loss": 0.6024,
      "step": 310
    },
    {
      "epoch": 0.24316109422492402,
      "grad_norm": 1.3515625,
      "learning_rate": 8.98139534883721e-05,
      "loss": 0.6896,
      "step": 320
    },
    {
      "epoch": 0.2507598784194529,
      "grad_norm": 1.59375,
      "learning_rate": 8.934883720930233e-05,
      "loss": 0.5574,
      "step": 330
    },
    {
      "epoch": 0.25835866261398177,
      "grad_norm": 1.0625,
      "learning_rate": 8.888372093023257e-05,
      "loss": 0.5795,
      "step": 340
    },
    {
      "epoch": 0.26595744680851063,
      "grad_norm": 1.109375,
      "learning_rate": 8.841860465116279e-05,
      "loss": 0.6064,
      "step": 350
    },
    {
      "epoch": 0.2735562310030395,
      "grad_norm": 1.015625,
      "learning_rate": 8.795348837209303e-05,
      "loss": 0.527,
      "step": 360
    },
    {
      "epoch": 0.2811550151975684,
      "grad_norm": 0.92578125,
      "learning_rate": 8.748837209302326e-05,
      "loss": 0.5996,
      "step": 370
    },
    {
      "epoch": 0.2887537993920973,
      "grad_norm": 1.375,
      "learning_rate": 8.70232558139535e-05,
      "loss": 0.5588,
      "step": 380
    },
    {
      "epoch": 0.29635258358662614,
      "grad_norm": 1.546875,
      "learning_rate": 8.655813953488372e-05,
      "loss": 0.5675,
      "step": 390
    },
    {
      "epoch": 0.303951367781155,
      "grad_norm": 1.28125,
      "learning_rate": 8.609302325581396e-05,
      "loss": 0.5708,
      "step": 400
    },
    {
      "epoch": 0.31155015197568386,
      "grad_norm": 1.265625,
      "learning_rate": 8.562790697674418e-05,
      "loss": 0.593,
      "step": 410
    },
    {
      "epoch": 0.3191489361702128,
      "grad_norm": 1.3125,
      "learning_rate": 8.516279069767442e-05,
      "loss": 0.5349,
      "step": 420
    },
    {
      "epoch": 0.32674772036474165,
      "grad_norm": 1.46875,
      "learning_rate": 8.469767441860465e-05,
      "loss": 0.4981,
      "step": 430
    },
    {
      "epoch": 0.3343465045592705,
      "grad_norm": 1.3046875,
      "learning_rate": 8.423255813953489e-05,
      "loss": 0.5477,
      "step": 440
    },
    {
      "epoch": 0.34194528875379937,
      "grad_norm": 1.109375,
      "learning_rate": 8.376744186046513e-05,
      "loss": 0.5359,
      "step": 450
    },
    {
      "epoch": 0.3495440729483283,
      "grad_norm": 1.15625,
      "learning_rate": 8.330232558139536e-05,
      "loss": 0.5612,
      "step": 460
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 1.1796875,
      "learning_rate": 8.283720930232559e-05,
      "loss": 0.5386,
      "step": 470
    },
    {
      "epoch": 0.364741641337386,
      "grad_norm": 1.0625,
      "learning_rate": 8.237209302325581e-05,
      "loss": 0.4443,
      "step": 480
    },
    {
      "epoch": 0.3723404255319149,
      "grad_norm": 1.0703125,
      "learning_rate": 8.190697674418605e-05,
      "loss": 0.5386,
      "step": 490
    },
    {
      "epoch": 0.3799392097264438,
      "grad_norm": 1.140625,
      "learning_rate": 8.144186046511628e-05,
      "loss": 0.4538,
      "step": 500
    },
    {
      "epoch": 0.38753799392097266,
      "grad_norm": 1.390625,
      "learning_rate": 8.097674418604652e-05,
      "loss": 0.4705,
      "step": 510
    },
    {
      "epoch": 0.3951367781155015,
      "grad_norm": 1.15625,
      "learning_rate": 8.051162790697675e-05,
      "loss": 0.5122,
      "step": 520
    },
    {
      "epoch": 0.4027355623100304,
      "grad_norm": 1.2578125,
      "learning_rate": 8.004651162790698e-05,
      "loss": 0.4675,
      "step": 530
    },
    {
      "epoch": 0.41033434650455924,
      "grad_norm": 1.3046875,
      "learning_rate": 7.958139534883721e-05,
      "loss": 0.5141,
      "step": 540
    },
    {
      "epoch": 0.41793313069908816,
      "grad_norm": 1.0625,
      "learning_rate": 7.911627906976744e-05,
      "loss": 0.492,
      "step": 550
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 1.21875,
      "learning_rate": 7.865116279069767e-05,
      "loss": 0.4708,
      "step": 560
    },
    {
      "epoch": 0.4331306990881459,
      "grad_norm": 1.296875,
      "learning_rate": 7.818604651162791e-05,
      "loss": 0.4668,
      "step": 570
    },
    {
      "epoch": 0.44072948328267475,
      "grad_norm": 1.2109375,
      "learning_rate": 7.772093023255815e-05,
      "loss": 0.4105,
      "step": 580
    },
    {
      "epoch": 0.44832826747720367,
      "grad_norm": 1.0859375,
      "learning_rate": 7.725581395348838e-05,
      "loss": 0.4586,
      "step": 590
    },
    {
      "epoch": 0.45592705167173253,
      "grad_norm": 1.2890625,
      "learning_rate": 7.67906976744186e-05,
      "loss": 0.4625,
      "step": 600
    },
    {
      "epoch": 0.4635258358662614,
      "grad_norm": 0.83984375,
      "learning_rate": 7.632558139534884e-05,
      "loss": 0.4149,
      "step": 610
    },
    {
      "epoch": 0.47112462006079026,
      "grad_norm": 0.97265625,
      "learning_rate": 7.586046511627908e-05,
      "loss": 0.4588,
      "step": 620
    },
    {
      "epoch": 0.4787234042553192,
      "grad_norm": 1.21875,
      "learning_rate": 7.53953488372093e-05,
      "loss": 0.4162,
      "step": 630
    },
    {
      "epoch": 0.48632218844984804,
      "grad_norm": 0.9296875,
      "learning_rate": 7.493023255813954e-05,
      "loss": 0.3976,
      "step": 640
    },
    {
      "epoch": 0.4939209726443769,
      "grad_norm": 1.3046875,
      "learning_rate": 7.446511627906977e-05,
      "loss": 0.4161,
      "step": 650
    },
    {
      "epoch": 0.5015197568389058,
      "grad_norm": 1.1640625,
      "learning_rate": 7.4e-05,
      "loss": 0.3689,
      "step": 660
    },
    {
      "epoch": 0.5091185410334347,
      "grad_norm": 0.8828125,
      "learning_rate": 7.353488372093023e-05,
      "loss": 0.427,
      "step": 670
    },
    {
      "epoch": 0.5167173252279635,
      "grad_norm": 0.9921875,
      "learning_rate": 7.306976744186047e-05,
      "loss": 0.3842,
      "step": 680
    },
    {
      "epoch": 0.5243161094224924,
      "grad_norm": 0.96484375,
      "learning_rate": 7.26046511627907e-05,
      "loss": 0.3853,
      "step": 690
    },
    {
      "epoch": 0.5319148936170213,
      "grad_norm": 1.0546875,
      "learning_rate": 7.213953488372094e-05,
      "loss": 0.3235,
      "step": 700
    },
    {
      "epoch": 0.5395136778115501,
      "grad_norm": 1.0546875,
      "learning_rate": 7.167441860465116e-05,
      "loss": 0.3763,
      "step": 710
    },
    {
      "epoch": 0.547112462006079,
      "grad_norm": 0.86328125,
      "learning_rate": 7.12093023255814e-05,
      "loss": 0.3651,
      "step": 720
    },
    {
      "epoch": 0.5547112462006079,
      "grad_norm": 0.97265625,
      "learning_rate": 7.074418604651162e-05,
      "loss": 0.3187,
      "step": 730
    },
    {
      "epoch": 0.5623100303951368,
      "grad_norm": 1.0859375,
      "learning_rate": 7.027906976744186e-05,
      "loss": 0.3752,
      "step": 740
    },
    {
      "epoch": 0.5699088145896657,
      "grad_norm": 1.1328125,
      "learning_rate": 6.98139534883721e-05,
      "loss": 0.3639,
      "step": 750
    },
    {
      "epoch": 0.5775075987841946,
      "grad_norm": 0.9609375,
      "learning_rate": 6.934883720930233e-05,
      "loss": 0.3682,
      "step": 760
    },
    {
      "epoch": 0.5851063829787234,
      "grad_norm": 0.7109375,
      "learning_rate": 6.888372093023257e-05,
      "loss": 0.3418,
      "step": 770
    },
    {
      "epoch": 0.5927051671732523,
      "grad_norm": 0.96875,
      "learning_rate": 6.841860465116279e-05,
      "loss": 0.3567,
      "step": 780
    },
    {
      "epoch": 0.6003039513677811,
      "grad_norm": 0.9375,
      "learning_rate": 6.795348837209301e-05,
      "loss": 0.371,
      "step": 790
    },
    {
      "epoch": 0.60790273556231,
      "grad_norm": 1.15625,
      "learning_rate": 6.748837209302325e-05,
      "loss": 0.334,
      "step": 800
    },
    {
      "epoch": 0.6155015197568389,
      "grad_norm": 0.921875,
      "learning_rate": 6.702325581395349e-05,
      "loss": 0.3043,
      "step": 810
    },
    {
      "epoch": 0.6231003039513677,
      "grad_norm": 1.0390625,
      "learning_rate": 6.655813953488372e-05,
      "loss": 0.3049,
      "step": 820
    },
    {
      "epoch": 0.6306990881458967,
      "grad_norm": 0.9453125,
      "learning_rate": 6.609302325581396e-05,
      "loss": 0.2673,
      "step": 830
    },
    {
      "epoch": 0.6382978723404256,
      "grad_norm": 1.03125,
      "learning_rate": 6.56279069767442e-05,
      "loss": 0.3203,
      "step": 840
    },
    {
      "epoch": 0.6458966565349544,
      "grad_norm": 1.078125,
      "learning_rate": 6.516279069767442e-05,
      "loss": 0.3552,
      "step": 850
    },
    {
      "epoch": 0.6534954407294833,
      "grad_norm": 0.62890625,
      "learning_rate": 6.469767441860466e-05,
      "loss": 0.2805,
      "step": 860
    },
    {
      "epoch": 0.6610942249240122,
      "grad_norm": 1.0703125,
      "learning_rate": 6.423255813953488e-05,
      "loss": 0.3399,
      "step": 870
    },
    {
      "epoch": 0.668693009118541,
      "grad_norm": 0.96875,
      "learning_rate": 6.376744186046512e-05,
      "loss": 0.2798,
      "step": 880
    },
    {
      "epoch": 0.6762917933130699,
      "grad_norm": 1.0234375,
      "learning_rate": 6.330232558139535e-05,
      "loss": 0.3096,
      "step": 890
    },
    {
      "epoch": 0.6838905775075987,
      "grad_norm": 0.8984375,
      "learning_rate": 6.283720930232559e-05,
      "loss": 0.3162,
      "step": 900
    },
    {
      "epoch": 0.6914893617021277,
      "grad_norm": 1.03125,
      "learning_rate": 6.237209302325581e-05,
      "loss": 0.2926,
      "step": 910
    },
    {
      "epoch": 0.6990881458966566,
      "grad_norm": 1.0703125,
      "learning_rate": 6.190697674418605e-05,
      "loss": 0.3684,
      "step": 920
    },
    {
      "epoch": 0.7066869300911854,
      "grad_norm": 0.96875,
      "learning_rate": 6.144186046511628e-05,
      "loss": 0.3586,
      "step": 930
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 1.03125,
      "learning_rate": 6.097674418604652e-05,
      "loss": 0.3382,
      "step": 940
    },
    {
      "epoch": 0.7218844984802432,
      "grad_norm": 1.234375,
      "learning_rate": 6.051162790697674e-05,
      "loss": 0.3272,
      "step": 950
    },
    {
      "epoch": 0.729483282674772,
      "grad_norm": 1.015625,
      "learning_rate": 6.004651162790698e-05,
      "loss": 0.2808,
      "step": 960
    },
    {
      "epoch": 0.7370820668693009,
      "grad_norm": 1.1796875,
      "learning_rate": 5.958139534883721e-05,
      "loss": 0.2965,
      "step": 970
    },
    {
      "epoch": 0.7446808510638298,
      "grad_norm": 0.90234375,
      "learning_rate": 5.9116279069767445e-05,
      "loss": 0.2778,
      "step": 980
    },
    {
      "epoch": 0.7522796352583586,
      "grad_norm": 1.1015625,
      "learning_rate": 5.8651162790697675e-05,
      "loss": 0.3103,
      "step": 990
    },
    {
      "epoch": 0.7598784194528876,
      "grad_norm": 1.125,
      "learning_rate": 5.818604651162791e-05,
      "loss": 0.2904,
      "step": 1000
    },
    {
      "epoch": 0.7674772036474165,
      "grad_norm": 0.76171875,
      "learning_rate": 5.772093023255815e-05,
      "loss": 0.3073,
      "step": 1010
    },
    {
      "epoch": 0.7750759878419453,
      "grad_norm": 0.99609375,
      "learning_rate": 5.725581395348838e-05,
      "loss": 0.3001,
      "step": 1020
    },
    {
      "epoch": 0.7826747720364742,
      "grad_norm": 1.0703125,
      "learning_rate": 5.67906976744186e-05,
      "loss": 0.2669,
      "step": 1030
    },
    {
      "epoch": 0.790273556231003,
      "grad_norm": 0.85546875,
      "learning_rate": 5.6325581395348836e-05,
      "loss": 0.2502,
      "step": 1040
    },
    {
      "epoch": 0.7978723404255319,
      "grad_norm": 0.91015625,
      "learning_rate": 5.586046511627907e-05,
      "loss": 0.306,
      "step": 1050
    },
    {
      "epoch": 0.8054711246200608,
      "grad_norm": 1.0,
      "learning_rate": 5.53953488372093e-05,
      "loss": 0.2415,
      "step": 1060
    },
    {
      "epoch": 0.8130699088145896,
      "grad_norm": 1.09375,
      "learning_rate": 5.493023255813954e-05,
      "loss": 0.2257,
      "step": 1070
    },
    {
      "epoch": 0.8206686930091185,
      "grad_norm": 0.84765625,
      "learning_rate": 5.4465116279069775e-05,
      "loss": 0.2491,
      "step": 1080
    },
    {
      "epoch": 0.8282674772036475,
      "grad_norm": 1.375,
      "learning_rate": 5.4000000000000005e-05,
      "loss": 0.2659,
      "step": 1090
    },
    {
      "epoch": 0.8358662613981763,
      "grad_norm": 0.953125,
      "learning_rate": 5.353488372093024e-05,
      "loss": 0.2436,
      "step": 1100
    },
    {
      "epoch": 0.8434650455927052,
      "grad_norm": 0.91015625,
      "learning_rate": 5.3069767441860464e-05,
      "loss": 0.2539,
      "step": 1110
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 0.7265625,
      "learning_rate": 5.2604651162790694e-05,
      "loss": 0.2449,
      "step": 1120
    },
    {
      "epoch": 0.8586626139817629,
      "grad_norm": 1.03125,
      "learning_rate": 5.213953488372093e-05,
      "loss": 0.2424,
      "step": 1130
    },
    {
      "epoch": 0.8662613981762918,
      "grad_norm": 0.82421875,
      "learning_rate": 5.1674418604651166e-05,
      "loss": 0.2565,
      "step": 1140
    },
    {
      "epoch": 0.8738601823708206,
      "grad_norm": 1.0859375,
      "learning_rate": 5.1209302325581396e-05,
      "loss": 0.242,
      "step": 1150
    },
    {
      "epoch": 0.8814589665653495,
      "grad_norm": 0.97265625,
      "learning_rate": 5.074418604651163e-05,
      "loss": 0.2415,
      "step": 1160
    },
    {
      "epoch": 0.8890577507598785,
      "grad_norm": 1.2578125,
      "learning_rate": 5.027906976744187e-05,
      "loss": 0.2519,
      "step": 1170
    },
    {
      "epoch": 0.8966565349544073,
      "grad_norm": 0.8515625,
      "learning_rate": 4.981395348837209e-05,
      "loss": 0.2545,
      "step": 1180
    },
    {
      "epoch": 0.9042553191489362,
      "grad_norm": 0.69921875,
      "learning_rate": 4.934883720930233e-05,
      "loss": 0.2403,
      "step": 1190
    },
    {
      "epoch": 0.9118541033434651,
      "grad_norm": 0.73046875,
      "learning_rate": 4.8883720930232564e-05,
      "loss": 0.2075,
      "step": 1200
    },
    {
      "epoch": 0.9194528875379939,
      "grad_norm": 0.6875,
      "learning_rate": 4.8418604651162794e-05,
      "loss": 0.198,
      "step": 1210
    },
    {
      "epoch": 0.9270516717325228,
      "grad_norm": 0.88671875,
      "learning_rate": 4.7953488372093023e-05,
      "loss": 0.216,
      "step": 1220
    },
    {
      "epoch": 0.9346504559270516,
      "grad_norm": 0.66015625,
      "learning_rate": 4.748837209302326e-05,
      "loss": 0.1838,
      "step": 1230
    },
    {
      "epoch": 0.9422492401215805,
      "grad_norm": 0.578125,
      "learning_rate": 4.7023255813953496e-05,
      "loss": 0.2367,
      "step": 1240
    },
    {
      "epoch": 0.9498480243161094,
      "grad_norm": 1.1015625,
      "learning_rate": 4.655813953488372e-05,
      "loss": 0.1958,
      "step": 1250
    },
    {
      "epoch": 0.9574468085106383,
      "grad_norm": 0.94921875,
      "learning_rate": 4.6093023255813955e-05,
      "loss": 0.1862,
      "step": 1260
    },
    {
      "epoch": 0.9650455927051672,
      "grad_norm": 0.91015625,
      "learning_rate": 4.562790697674419e-05,
      "loss": 0.2328,
      "step": 1270
    },
    {
      "epoch": 0.9726443768996961,
      "grad_norm": 0.85546875,
      "learning_rate": 4.516279069767442e-05,
      "loss": 0.2483,
      "step": 1280
    },
    {
      "epoch": 0.9802431610942249,
      "grad_norm": 0.9453125,
      "learning_rate": 4.469767441860465e-05,
      "loss": 0.1677,
      "step": 1290
    },
    {
      "epoch": 0.9878419452887538,
      "grad_norm": 0.9375,
      "learning_rate": 4.423255813953489e-05,
      "loss": 0.2192,
      "step": 1300
    },
    {
      "epoch": 0.9954407294832827,
      "grad_norm": 0.859375,
      "learning_rate": 4.376744186046512e-05,
      "loss": 0.2357,
      "step": 1310
    },
    {
      "epoch": 1.0030395136778116,
      "grad_norm": 0.59375,
      "learning_rate": 4.3302325581395353e-05,
      "loss": 0.1562,
      "step": 1320
    },
    {
      "epoch": 1.0106382978723405,
      "grad_norm": 0.73828125,
      "learning_rate": 4.283720930232558e-05,
      "loss": 0.1043,
      "step": 1330
    },
    {
      "epoch": 1.0182370820668694,
      "grad_norm": 0.75390625,
      "learning_rate": 4.237209302325581e-05,
      "loss": 0.106,
      "step": 1340
    },
    {
      "epoch": 1.0258358662613982,
      "grad_norm": 0.859375,
      "learning_rate": 4.190697674418605e-05,
      "loss": 0.1041,
      "step": 1350
    },
    {
      "epoch": 1.033434650455927,
      "grad_norm": 0.98046875,
      "learning_rate": 4.1441860465116285e-05,
      "loss": 0.1001,
      "step": 1360
    },
    {
      "epoch": 1.041033434650456,
      "grad_norm": 0.65625,
      "learning_rate": 4.0976744186046515e-05,
      "loss": 0.0867,
      "step": 1370
    },
    {
      "epoch": 1.0486322188449848,
      "grad_norm": 0.78515625,
      "learning_rate": 4.0511627906976745e-05,
      "loss": 0.1042,
      "step": 1380
    },
    {
      "epoch": 1.0562310030395137,
      "grad_norm": 0.87109375,
      "learning_rate": 4.004651162790698e-05,
      "loss": 0.1049,
      "step": 1390
    },
    {
      "epoch": 1.0638297872340425,
      "grad_norm": 0.578125,
      "learning_rate": 3.958139534883721e-05,
      "loss": 0.0948,
      "step": 1400
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.9140625,
      "learning_rate": 3.911627906976744e-05,
      "loss": 0.0974,
      "step": 1410
    },
    {
      "epoch": 1.0790273556231003,
      "grad_norm": 0.703125,
      "learning_rate": 3.8651162790697677e-05,
      "loss": 0.1062,
      "step": 1420
    },
    {
      "epoch": 1.0866261398176291,
      "grad_norm": 0.8671875,
      "learning_rate": 3.818604651162791e-05,
      "loss": 0.0995,
      "step": 1430
    },
    {
      "epoch": 1.094224924012158,
      "grad_norm": 1.0078125,
      "learning_rate": 3.772093023255814e-05,
      "loss": 0.1188,
      "step": 1440
    },
    {
      "epoch": 1.1018237082066868,
      "grad_norm": 0.70703125,
      "learning_rate": 3.725581395348837e-05,
      "loss": 0.0929,
      "step": 1450
    },
    {
      "epoch": 1.1094224924012157,
      "grad_norm": 0.77734375,
      "learning_rate": 3.679069767441861e-05,
      "loss": 0.098,
      "step": 1460
    },
    {
      "epoch": 1.1170212765957448,
      "grad_norm": 0.6875,
      "learning_rate": 3.632558139534884e-05,
      "loss": 0.0958,
      "step": 1470
    },
    {
      "epoch": 1.1246200607902737,
      "grad_norm": 0.953125,
      "learning_rate": 3.5860465116279075e-05,
      "loss": 0.1008,
      "step": 1480
    },
    {
      "epoch": 1.1322188449848025,
      "grad_norm": 0.8984375,
      "learning_rate": 3.5395348837209304e-05,
      "loss": 0.1111,
      "step": 1490
    },
    {
      "epoch": 1.1398176291793314,
      "grad_norm": 0.68359375,
      "learning_rate": 3.4930232558139534e-05,
      "loss": 0.0917,
      "step": 1500
    },
    {
      "epoch": 1.1474164133738602,
      "grad_norm": 0.48828125,
      "learning_rate": 3.446511627906977e-05,
      "loss": 0.0912,
      "step": 1510
    },
    {
      "epoch": 1.155015197568389,
      "grad_norm": 0.94140625,
      "learning_rate": 3.4000000000000007e-05,
      "loss": 0.0948,
      "step": 1520
    },
    {
      "epoch": 1.162613981762918,
      "grad_norm": 0.83203125,
      "learning_rate": 3.353488372093023e-05,
      "loss": 0.0749,
      "step": 1530
    },
    {
      "epoch": 1.1702127659574468,
      "grad_norm": 0.953125,
      "learning_rate": 3.3069767441860466e-05,
      "loss": 0.1059,
      "step": 1540
    },
    {
      "epoch": 1.1778115501519757,
      "grad_norm": 0.6171875,
      "learning_rate": 3.26046511627907e-05,
      "loss": 0.0942,
      "step": 1550
    },
    {
      "epoch": 1.1854103343465046,
      "grad_norm": 0.48046875,
      "learning_rate": 3.213953488372093e-05,
      "loss": 0.0739,
      "step": 1560
    },
    {
      "epoch": 1.1930091185410334,
      "grad_norm": 1.078125,
      "learning_rate": 3.167441860465116e-05,
      "loss": 0.0931,
      "step": 1570
    },
    {
      "epoch": 1.2006079027355623,
      "grad_norm": 0.7265625,
      "learning_rate": 3.12093023255814e-05,
      "loss": 0.0955,
      "step": 1580
    },
    {
      "epoch": 1.2082066869300911,
      "grad_norm": 1.109375,
      "learning_rate": 3.074418604651163e-05,
      "loss": 0.0842,
      "step": 1590
    },
    {
      "epoch": 1.21580547112462,
      "grad_norm": 0.83203125,
      "learning_rate": 3.0279069767441864e-05,
      "loss": 0.0871,
      "step": 1600
    },
    {
      "epoch": 1.2234042553191489,
      "grad_norm": 1.171875,
      "learning_rate": 2.9813953488372093e-05,
      "loss": 0.0972,
      "step": 1610
    },
    {
      "epoch": 1.2310030395136777,
      "grad_norm": 0.5859375,
      "learning_rate": 2.9348837209302326e-05,
      "loss": 0.0752,
      "step": 1620
    },
    {
      "epoch": 1.2386018237082066,
      "grad_norm": 1.046875,
      "learning_rate": 2.888372093023256e-05,
      "loss": 0.0881,
      "step": 1630
    },
    {
      "epoch": 1.2462006079027357,
      "grad_norm": 0.5390625,
      "learning_rate": 2.8418604651162796e-05,
      "loss": 0.063,
      "step": 1640
    },
    {
      "epoch": 1.2537993920972643,
      "grad_norm": 0.7265625,
      "learning_rate": 2.7953488372093022e-05,
      "loss": 0.0964,
      "step": 1650
    },
    {
      "epoch": 1.2613981762917934,
      "grad_norm": 1.0234375,
      "learning_rate": 2.7488372093023258e-05,
      "loss": 0.0963,
      "step": 1660
    },
    {
      "epoch": 1.2689969604863223,
      "grad_norm": 0.8125,
      "learning_rate": 2.702325581395349e-05,
      "loss": 0.0675,
      "step": 1670
    },
    {
      "epoch": 1.2765957446808511,
      "grad_norm": 0.86328125,
      "learning_rate": 2.6558139534883724e-05,
      "loss": 0.0826,
      "step": 1680
    },
    {
      "epoch": 1.28419452887538,
      "grad_norm": 0.3359375,
      "learning_rate": 2.6093023255813954e-05,
      "loss": 0.0659,
      "step": 1690
    },
    {
      "epoch": 1.2917933130699089,
      "grad_norm": 0.494140625,
      "learning_rate": 2.5627906976744187e-05,
      "loss": 0.0869,
      "step": 1700
    },
    {
      "epoch": 1.2993920972644377,
      "grad_norm": 0.34375,
      "learning_rate": 2.516279069767442e-05,
      "loss": 0.0793,
      "step": 1710
    },
    {
      "epoch": 1.3069908814589666,
      "grad_norm": 0.9375,
      "learning_rate": 2.4697674418604653e-05,
      "loss": 0.0979,
      "step": 1720
    },
    {
      "epoch": 1.3145896656534954,
      "grad_norm": 0.65234375,
      "learning_rate": 2.4232558139534886e-05,
      "loss": 0.0848,
      "step": 1730
    },
    {
      "epoch": 1.3221884498480243,
      "grad_norm": 0.76171875,
      "learning_rate": 2.376744186046512e-05,
      "loss": 0.0944,
      "step": 1740
    },
    {
      "epoch": 1.3297872340425532,
      "grad_norm": 0.74609375,
      "learning_rate": 2.3302325581395352e-05,
      "loss": 0.0668,
      "step": 1750
    },
    {
      "epoch": 1.337386018237082,
      "grad_norm": 0.6328125,
      "learning_rate": 2.283720930232558e-05,
      "loss": 0.0787,
      "step": 1760
    },
    {
      "epoch": 1.344984802431611,
      "grad_norm": 0.79296875,
      "learning_rate": 2.2372093023255818e-05,
      "loss": 0.0708,
      "step": 1770
    },
    {
      "epoch": 1.3525835866261398,
      "grad_norm": 0.69140625,
      "learning_rate": 2.1906976744186047e-05,
      "loss": 0.0571,
      "step": 1780
    },
    {
      "epoch": 1.3601823708206686,
      "grad_norm": 0.734375,
      "learning_rate": 2.144186046511628e-05,
      "loss": 0.0626,
      "step": 1790
    },
    {
      "epoch": 1.3677811550151975,
      "grad_norm": 0.78125,
      "learning_rate": 2.0976744186046513e-05,
      "loss": 0.0687,
      "step": 1800
    },
    {
      "epoch": 1.3753799392097266,
      "grad_norm": 0.82421875,
      "learning_rate": 2.0511627906976746e-05,
      "loss": 0.0714,
      "step": 1810
    },
    {
      "epoch": 1.3829787234042552,
      "grad_norm": 0.74609375,
      "learning_rate": 2.0046511627906976e-05,
      "loss": 0.0639,
      "step": 1820
    },
    {
      "epoch": 1.3905775075987843,
      "grad_norm": 0.37109375,
      "learning_rate": 1.9581395348837212e-05,
      "loss": 0.0674,
      "step": 1830
    },
    {
      "epoch": 1.3981762917933132,
      "grad_norm": 0.4921875,
      "learning_rate": 1.9116279069767442e-05,
      "loss": 0.0664,
      "step": 1840
    },
    {
      "epoch": 1.405775075987842,
      "grad_norm": 0.78515625,
      "learning_rate": 1.8651162790697675e-05,
      "loss": 0.0593,
      "step": 1850
    },
    {
      "epoch": 1.4133738601823709,
      "grad_norm": 0.65625,
      "learning_rate": 1.8186046511627908e-05,
      "loss": 0.0686,
      "step": 1860
    },
    {
      "epoch": 1.4209726443768997,
      "grad_norm": 0.7421875,
      "learning_rate": 1.772093023255814e-05,
      "loss": 0.0844,
      "step": 1870
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.70703125,
      "learning_rate": 1.7255813953488374e-05,
      "loss": 0.0662,
      "step": 1880
    },
    {
      "epoch": 1.4361702127659575,
      "grad_norm": 0.578125,
      "learning_rate": 1.6790697674418607e-05,
      "loss": 0.0768,
      "step": 1890
    },
    {
      "epoch": 1.4437689969604863,
      "grad_norm": 0.462890625,
      "learning_rate": 1.6325581395348837e-05,
      "loss": 0.0686,
      "step": 1900
    },
    {
      "epoch": 1.4513677811550152,
      "grad_norm": 0.494140625,
      "learning_rate": 1.5860465116279073e-05,
      "loss": 0.0604,
      "step": 1910
    },
    {
      "epoch": 1.458966565349544,
      "grad_norm": 0.51171875,
      "learning_rate": 1.5395348837209303e-05,
      "loss": 0.0613,
      "step": 1920
    },
    {
      "epoch": 1.466565349544073,
      "grad_norm": 0.76171875,
      "learning_rate": 1.4930232558139537e-05,
      "loss": 0.0807,
      "step": 1930
    },
    {
      "epoch": 1.4741641337386018,
      "grad_norm": 0.796875,
      "learning_rate": 1.4465116279069768e-05,
      "loss": 0.0923,
      "step": 1940
    },
    {
      "epoch": 1.4817629179331306,
      "grad_norm": 1.015625,
      "learning_rate": 1.4000000000000001e-05,
      "loss": 0.0629,
      "step": 1950
    },
    {
      "epoch": 1.4893617021276595,
      "grad_norm": 0.80078125,
      "learning_rate": 1.3534883720930233e-05,
      "loss": 0.0709,
      "step": 1960
    },
    {
      "epoch": 1.4969604863221884,
      "grad_norm": 0.69140625,
      "learning_rate": 1.3069767441860467e-05,
      "loss": 0.0765,
      "step": 1970
    },
    {
      "epoch": 1.5045592705167175,
      "grad_norm": 0.765625,
      "learning_rate": 1.2604651162790699e-05,
      "loss": 0.07,
      "step": 1980
    },
    {
      "epoch": 1.512158054711246,
      "grad_norm": 0.546875,
      "learning_rate": 1.213953488372093e-05,
      "loss": 0.0614,
      "step": 1990
    },
    {
      "epoch": 1.5197568389057752,
      "grad_norm": 0.60546875,
      "learning_rate": 1.1674418604651163e-05,
      "loss": 0.0509,
      "step": 2000
    },
    {
      "epoch": 1.5273556231003038,
      "grad_norm": 0.443359375,
      "learning_rate": 1.1209302325581396e-05,
      "loss": 0.0666,
      "step": 2010
    },
    {
      "epoch": 1.534954407294833,
      "grad_norm": 0.54296875,
      "learning_rate": 1.0744186046511629e-05,
      "loss": 0.0509,
      "step": 2020
    },
    {
      "epoch": 1.5425531914893615,
      "grad_norm": 0.609375,
      "learning_rate": 1.027906976744186e-05,
      "loss": 0.0511,
      "step": 2030
    },
    {
      "epoch": 1.5501519756838906,
      "grad_norm": 0.98828125,
      "learning_rate": 9.813953488372093e-06,
      "loss": 0.0881,
      "step": 2040
    },
    {
      "epoch": 1.5577507598784195,
      "grad_norm": 0.48046875,
      "learning_rate": 9.348837209302326e-06,
      "loss": 0.0691,
      "step": 2050
    },
    {
      "epoch": 1.5653495440729484,
      "grad_norm": 0.310546875,
      "learning_rate": 8.88372093023256e-06,
      "loss": 0.079,
      "step": 2060
    },
    {
      "epoch": 1.5729483282674772,
      "grad_norm": 0.55859375,
      "learning_rate": 8.41860465116279e-06,
      "loss": 0.0604,
      "step": 2070
    },
    {
      "epoch": 1.580547112462006,
      "grad_norm": 0.361328125,
      "learning_rate": 7.953488372093024e-06,
      "loss": 0.0634,
      "step": 2080
    },
    {
      "epoch": 1.588145896656535,
      "grad_norm": 0.7109375,
      "learning_rate": 7.488372093023257e-06,
      "loss": 0.0644,
      "step": 2090
    },
    {
      "epoch": 1.5957446808510638,
      "grad_norm": 0.87109375,
      "learning_rate": 7.023255813953489e-06,
      "loss": 0.0787,
      "step": 2100
    },
    {
      "epoch": 1.6033434650455927,
      "grad_norm": 0.7265625,
      "learning_rate": 6.558139534883721e-06,
      "loss": 0.0639,
      "step": 2110
    },
    {
      "epoch": 1.6109422492401215,
      "grad_norm": 0.86328125,
      "learning_rate": 6.093023255813954e-06,
      "loss": 0.0706,
      "step": 2120
    },
    {
      "epoch": 1.6185410334346506,
      "grad_norm": 1.046875,
      "learning_rate": 5.627906976744186e-06,
      "loss": 0.0735,
      "step": 2130
    },
    {
      "epoch": 1.6261398176291793,
      "grad_norm": 0.859375,
      "learning_rate": 5.162790697674419e-06,
      "loss": 0.0672,
      "step": 2140
    },
    {
      "epoch": 1.6337386018237083,
      "grad_norm": 0.78515625,
      "learning_rate": 4.697674418604651e-06,
      "loss": 0.0514,
      "step": 2150
    },
    {
      "epoch": 1.641337386018237,
      "grad_norm": 0.447265625,
      "learning_rate": 4.232558139534884e-06,
      "loss": 0.0545,
      "step": 2160
    },
    {
      "epoch": 1.648936170212766,
      "grad_norm": 0.3515625,
      "learning_rate": 3.7674418604651167e-06,
      "loss": 0.0462,
      "step": 2170
    },
    {
      "epoch": 1.6565349544072947,
      "grad_norm": 0.67578125,
      "learning_rate": 3.302325581395349e-06,
      "loss": 0.0616,
      "step": 2180
    },
    {
      "epoch": 1.6641337386018238,
      "grad_norm": 0.6328125,
      "learning_rate": 2.8372093023255815e-06,
      "loss": 0.0588,
      "step": 2190
    },
    {
      "epoch": 1.6717325227963524,
      "grad_norm": 0.61328125,
      "learning_rate": 2.372093023255814e-06,
      "loss": 0.07,
      "step": 2200
    },
    {
      "epoch": 1.6793313069908815,
      "grad_norm": 0.6640625,
      "learning_rate": 1.9069767441860468e-06,
      "loss": 0.0622,
      "step": 2210
    },
    {
      "epoch": 1.6869300911854104,
      "grad_norm": 1.21875,
      "learning_rate": 1.4418604651162792e-06,
      "loss": 0.0677,
      "step": 2220
    },
    {
      "epoch": 1.6945288753799392,
      "grad_norm": 0.8046875,
      "learning_rate": 9.767441860465117e-07,
      "loss": 0.0578,
      "step": 2230
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 1.125,
      "learning_rate": 5.116279069767442e-07,
      "loss": 0.069,
      "step": 2240
    },
    {
      "epoch": 1.709726443768997,
      "grad_norm": 0.66015625,
      "learning_rate": 4.651162790697675e-08,
      "loss": 0.0562,
      "step": 2250
    }
  ],
  "logging_steps": 10,
  "max_steps": 2250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.656510331904e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}