| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.125, |
| "eval_steps": 500, |
| "global_step": 2080, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0, |
| "eval_loss": 0.5193641781806946, |
| "eval_runtime": 15.6757, |
| "eval_samples_per_second": 2.041, |
| "eval_steps_per_second": 0.255, |
| "step": 0 |
| }, |
| { |
| "epoch": 0.0006009615384615385, |
| "grad_norm": 55.403633140710234, |
| "learning_rate": 9e-09, |
| "loss": 0.5113, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.001201923076923077, |
| "grad_norm": 23.47426110450185, |
| "learning_rate": 1.8999999999999998e-08, |
| "loss": 0.4385, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0018028846153846155, |
| "grad_norm": 10.224385416254647, |
| "learning_rate": 2.9e-08, |
| "loss": 0.4586, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.002403846153846154, |
| "grad_norm": 20.74032508741799, |
| "learning_rate": 3.9e-08, |
| "loss": 0.4234, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0030048076923076925, |
| "grad_norm": 21.168707763774822, |
| "learning_rate": 4.9e-08, |
| "loss": 0.526, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.003605769230769231, |
| "grad_norm": 22.648636815908606, |
| "learning_rate": 5.899999999999999e-08, |
| "loss": 0.5596, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.004206730769230769, |
| "grad_norm": 13.508051762249275, |
| "learning_rate": 6.900000000000001e-08, |
| "loss": 0.4981, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.004807692307692308, |
| "grad_norm": 23.403551137207025, |
| "learning_rate": 7.899999999999999e-08, |
| "loss": 0.5008, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.005408653846153846, |
| "grad_norm": 48.6142950712791, |
| "learning_rate": 8.899999999999999e-08, |
| "loss": 0.4585, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.006009615384615385, |
| "grad_norm": 8.884428417822688, |
| "learning_rate": 9.9e-08, |
| "loss": 0.4388, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.006610576923076923, |
| "grad_norm": 9.705307595727518, |
| "learning_rate": 1.09e-07, |
| "loss": 0.4471, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.007211538461538462, |
| "grad_norm": 30.306963311413714, |
| "learning_rate": 1.19e-07, |
| "loss": 0.4293, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.0078125, |
| "grad_norm": 14.67235522750005, |
| "learning_rate": 1.29e-07, |
| "loss": 0.4955, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.008413461538461538, |
| "grad_norm": 41.261340367060896, |
| "learning_rate": 1.3900000000000001e-07, |
| "loss": 0.5347, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.009014423076923076, |
| "grad_norm": 28.145775622131612, |
| "learning_rate": 1.49e-07, |
| "loss": 0.4671, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.009615384615384616, |
| "grad_norm": 13.92342254703078, |
| "learning_rate": 1.59e-07, |
| "loss": 0.4318, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.010216346153846154, |
| "grad_norm": 42.72542828466139, |
| "learning_rate": 1.69e-07, |
| "loss": 0.4232, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.010817307692307692, |
| "grad_norm": 18.910952118646854, |
| "learning_rate": 1.7899999999999997e-07, |
| "loss": 0.5434, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.01141826923076923, |
| "grad_norm": 9.945569021169186, |
| "learning_rate": 1.8899999999999999e-07, |
| "loss": 0.5091, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.01201923076923077, |
| "grad_norm": 12.064577086348633, |
| "learning_rate": 1.99e-07, |
| "loss": 0.4811, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.012620192307692308, |
| "grad_norm": 21.22554429640729, |
| "learning_rate": 2.0899999999999998e-07, |
| "loss": 0.486, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.013221153846153846, |
| "grad_norm": 51.114586227567436, |
| "learning_rate": 2.19e-07, |
| "loss": 0.5489, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.013822115384615384, |
| "grad_norm": 33.767338939816305, |
| "learning_rate": 2.29e-07, |
| "loss": 0.5006, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.014423076923076924, |
| "grad_norm": 20.989645421924674, |
| "learning_rate": 2.3899999999999996e-07, |
| "loss": 0.5026, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.015024038461538462, |
| "grad_norm": 46.94572882286708, |
| "learning_rate": 2.4899999999999997e-07, |
| "loss": 0.4662, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.015625, |
| "grad_norm": 16.1857675192416, |
| "learning_rate": 2.59e-07, |
| "loss": 0.4428, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.01622596153846154, |
| "grad_norm": 22.356923049427976, |
| "learning_rate": 2.69e-07, |
| "loss": 0.466, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.016826923076923076, |
| "grad_norm": 27.919776058404576, |
| "learning_rate": 2.79e-07, |
| "loss": 0.5299, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.017427884615384616, |
| "grad_norm": 20.374817697720214, |
| "learning_rate": 2.8899999999999995e-07, |
| "loss": 0.5575, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.018028846153846152, |
| "grad_norm": 11.1596095155468, |
| "learning_rate": 2.9899999999999996e-07, |
| "loss": 0.5476, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.018629807692307692, |
| "grad_norm": 18.15970681497743, |
| "learning_rate": 3.09e-07, |
| "loss": 0.521, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.019230769230769232, |
| "grad_norm": 7.209018602994909, |
| "learning_rate": 3.19e-07, |
| "loss": 0.4647, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.019831730769230768, |
| "grad_norm": 14.837068682221664, |
| "learning_rate": 3.29e-07, |
| "loss": 0.4864, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.020432692307692308, |
| "grad_norm": 14.235741132745929, |
| "learning_rate": 3.39e-07, |
| "loss": 0.4107, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.021033653846153848, |
| "grad_norm": 27.277651971922182, |
| "learning_rate": 3.4899999999999996e-07, |
| "loss": 0.4536, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.021634615384615384, |
| "grad_norm": 17.10480760395017, |
| "learning_rate": 3.5899999999999997e-07, |
| "loss": 0.4289, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.022235576923076924, |
| "grad_norm": 29.929679141274484, |
| "learning_rate": 3.69e-07, |
| "loss": 0.4647, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.02283653846153846, |
| "grad_norm": 10.398612915945172, |
| "learning_rate": 3.79e-07, |
| "loss": 0.5167, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.0234375, |
| "grad_norm": 9.769808118666157, |
| "learning_rate": 3.89e-07, |
| "loss": 0.4793, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.02403846153846154, |
| "grad_norm": 18.64004530242777, |
| "learning_rate": 3.99e-07, |
| "loss": 0.4865, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.024639423076923076, |
| "grad_norm": 17.699474376401422, |
| "learning_rate": 4.0899999999999997e-07, |
| "loss": 0.4727, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.025240384615384616, |
| "grad_norm": 11.83869387760017, |
| "learning_rate": 4.19e-07, |
| "loss": 0.4803, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.025841346153846152, |
| "grad_norm": 52.6903160822148, |
| "learning_rate": 4.29e-07, |
| "loss": 0.4587, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.026442307692307692, |
| "grad_norm": 8.207347232374364, |
| "learning_rate": 4.39e-07, |
| "loss": 0.4849, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.027043269230769232, |
| "grad_norm": 30.073558497443987, |
| "learning_rate": 4.49e-07, |
| "loss": 0.5023, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.027644230769230768, |
| "grad_norm": 10.373186832780206, |
| "learning_rate": 4.59e-07, |
| "loss": 0.4538, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.028245192307692308, |
| "grad_norm": 25.960320061465094, |
| "learning_rate": 4.689999999999999e-07, |
| "loss": 0.4712, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.028846153846153848, |
| "grad_norm": 12.862144045179527, |
| "learning_rate": 4.79e-07, |
| "loss": 0.4704, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.029447115384615384, |
| "grad_norm": 21.599196101058833, |
| "learning_rate": 4.89e-07, |
| "loss": 0.4695, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.030048076923076924, |
| "grad_norm": 26.447052889694493, |
| "learning_rate": 4.99e-07, |
| "loss": 0.4574, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.03064903846153846, |
| "grad_norm": 19.343061960700307, |
| "learning_rate": 5.09e-07, |
| "loss": 0.4259, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 55.39351897669441, |
| "learning_rate": 5.19e-07, |
| "loss": 0.4362, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.031850961538461536, |
| "grad_norm": 9.138204788385957, |
| "learning_rate": 5.29e-07, |
| "loss": 0.4519, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.03245192307692308, |
| "grad_norm": 9.17406599995409, |
| "learning_rate": 5.39e-07, |
| "loss": 0.4639, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.033052884615384616, |
| "grad_norm": 42.607781411989706, |
| "learning_rate": 5.490000000000001e-07, |
| "loss": 0.4302, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.03365384615384615, |
| "grad_norm": 10.368759637182924, |
| "learning_rate": 5.590000000000001e-07, |
| "loss": 0.4107, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.034254807692307696, |
| "grad_norm": 12.31592718573613, |
| "learning_rate": 5.69e-07, |
| "loss": 0.4403, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.03485576923076923, |
| "grad_norm": 13.670395980874881, |
| "learning_rate": 5.79e-07, |
| "loss": 0.4294, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.03545673076923077, |
| "grad_norm": 16.653414422369462, |
| "learning_rate": 5.89e-07, |
| "loss": 0.439, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.036057692307692304, |
| "grad_norm": 10.215056425825546, |
| "learning_rate": 5.989999999999999e-07, |
| "loss": 0.4069, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.03665865384615385, |
| "grad_norm": 31.589724664408692, |
| "learning_rate": 6.089999999999999e-07, |
| "loss": 0.4354, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.037259615384615384, |
| "grad_norm": 21.847510353862578, |
| "learning_rate": 6.189999999999999e-07, |
| "loss": 0.4638, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.03786057692307692, |
| "grad_norm": 8.076334096727454, |
| "learning_rate": 6.289999999999999e-07, |
| "loss": 0.4312, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.038461538461538464, |
| "grad_norm": 22.52385975163401, |
| "learning_rate": 6.389999999999999e-07, |
| "loss": 0.4364, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.0390625, |
| "grad_norm": 13.204088595346095, |
| "learning_rate": 6.49e-07, |
| "loss": 0.4231, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.039663461538461536, |
| "grad_norm": 33.19312446450432, |
| "learning_rate": 6.59e-07, |
| "loss": 0.4301, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.04026442307692308, |
| "grad_norm": 9.715044908453299, |
| "learning_rate": 6.69e-07, |
| "loss": 0.4201, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.040865384615384616, |
| "grad_norm": 10.574912363532471, |
| "learning_rate": 6.79e-07, |
| "loss": 0.4547, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.04146634615384615, |
| "grad_norm": 22.39620841168723, |
| "learning_rate": 6.889999999999999e-07, |
| "loss": 0.4493, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.042067307692307696, |
| "grad_norm": 8.058186609469454, |
| "learning_rate": 6.989999999999999e-07, |
| "loss": 0.3926, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.04266826923076923, |
| "grad_norm": 21.022586695480996, |
| "learning_rate": 7.089999999999999e-07, |
| "loss": 0.4393, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.04326923076923077, |
| "grad_norm": 12.66012747128014, |
| "learning_rate": 7.189999999999999e-07, |
| "loss": 0.4359, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.043870192307692304, |
| "grad_norm": 5.937649569363215, |
| "learning_rate": 7.289999999999999e-07, |
| "loss": 0.4277, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.04447115384615385, |
| "grad_norm": 16.798833074370908, |
| "learning_rate": 7.389999999999999e-07, |
| "loss": 0.446, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.045072115384615384, |
| "grad_norm": 26.4182799886464, |
| "learning_rate": 7.489999999999999e-07, |
| "loss": 0.4737, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.04567307692307692, |
| "grad_norm": 13.91407808362797, |
| "learning_rate": 7.59e-07, |
| "loss": 0.4796, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.046274038461538464, |
| "grad_norm": 16.734955748274775, |
| "learning_rate": 7.69e-07, |
| "loss": 0.4678, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.046875, |
| "grad_norm": 11.89547913614258, |
| "learning_rate": 7.79e-07, |
| "loss": 0.4536, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.047475961538461536, |
| "grad_norm": 16.860857694776378, |
| "learning_rate": 7.89e-07, |
| "loss": 0.422, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.04807692307692308, |
| "grad_norm": 20.294362719824335, |
| "learning_rate": 7.99e-07, |
| "loss": 0.4675, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.048677884615384616, |
| "grad_norm": 32.163806042850815, |
| "learning_rate": 8.09e-07, |
| "loss": 0.4162, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.04927884615384615, |
| "grad_norm": 8.835869696589675, |
| "learning_rate": 8.189999999999999e-07, |
| "loss": 0.4531, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.049879807692307696, |
| "grad_norm": 27.004489539541865, |
| "learning_rate": 8.289999999999999e-07, |
| "loss": 0.46, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.05048076923076923, |
| "grad_norm": 10.039120612814386, |
| "learning_rate": 8.389999999999999e-07, |
| "loss": 0.4351, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.05108173076923077, |
| "grad_norm": 25.890820231883552, |
| "learning_rate": 8.489999999999999e-07, |
| "loss": 0.4477, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.051682692307692304, |
| "grad_norm": 21.84065301973945, |
| "learning_rate": 8.59e-07, |
| "loss": 0.4028, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.05228365384615385, |
| "grad_norm": 17.894672539322194, |
| "learning_rate": 8.69e-07, |
| "loss": 0.4231, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.052884615384615384, |
| "grad_norm": 33.13377776596334, |
| "learning_rate": 8.79e-07, |
| "loss": 0.4483, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.05348557692307692, |
| "grad_norm": 57.77962521420785, |
| "learning_rate": 8.89e-07, |
| "loss": 0.4528, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.054086538461538464, |
| "grad_norm": 14.605643747393765, |
| "learning_rate": 8.99e-07, |
| "loss": 0.4183, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.0546875, |
| "grad_norm": 30.921239622195746, |
| "learning_rate": 9.09e-07, |
| "loss": 0.4299, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.055288461538461536, |
| "grad_norm": 14.136816366550216, |
| "learning_rate": 9.19e-07, |
| "loss": 0.4105, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.05588942307692308, |
| "grad_norm": 11.231190902163389, |
| "learning_rate": 9.29e-07, |
| "loss": 0.4185, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.056490384615384616, |
| "grad_norm": 33.801234798562184, |
| "learning_rate": 9.389999999999999e-07, |
| "loss": 0.4541, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.05709134615384615, |
| "grad_norm": 17.846337242420834, |
| "learning_rate": 9.489999999999999e-07, |
| "loss": 0.4178, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.057692307692307696, |
| "grad_norm": 19.667393686818674, |
| "learning_rate": 9.589999999999998e-07, |
| "loss": 0.4446, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.05829326923076923, |
| "grad_norm": 10.919623781198071, |
| "learning_rate": 9.69e-07, |
| "loss": 0.4745, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.05889423076923077, |
| "grad_norm": 27.87766553645535, |
| "learning_rate": 9.789999999999999e-07, |
| "loss": 0.4779, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.059495192307692304, |
| "grad_norm": 32.8338113927267, |
| "learning_rate": 9.89e-07, |
| "loss": 0.4962, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.06009615384615385, |
| "grad_norm": 54.56857085663595, |
| "learning_rate": 9.989999999999999e-07, |
| "loss": 0.4523, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.060697115384615384, |
| "grad_norm": 67.84354578257332, |
| "learning_rate": 9.994245524296674e-07, |
| "loss": 0.3837, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.06129807692307692, |
| "grad_norm": 23.915401977778863, |
| "learning_rate": 9.987851662404092e-07, |
| "loss": 0.4318, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.061899038461538464, |
| "grad_norm": 18.692398541022605, |
| "learning_rate": 9.981457800511507e-07, |
| "loss": 0.4429, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 8.96899175552455, |
| "learning_rate": 9.975063938618924e-07, |
| "loss": 0.4265, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.06310096153846154, |
| "grad_norm": 67.76708100008346, |
| "learning_rate": 9.968670076726342e-07, |
| "loss": 0.4004, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.06370192307692307, |
| "grad_norm": 14.95483642619465, |
| "learning_rate": 9.962276214833759e-07, |
| "loss": 0.4134, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.06430288461538461, |
| "grad_norm": 11.936205292399153, |
| "learning_rate": 9.955882352941176e-07, |
| "loss": 0.4206, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.06490384615384616, |
| "grad_norm": 8.144208338195012, |
| "learning_rate": 9.949488491048593e-07, |
| "loss": 0.4, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.0655048076923077, |
| "grad_norm": 19.659029674044277, |
| "learning_rate": 9.94309462915601e-07, |
| "loss": 0.3855, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.06610576923076923, |
| "grad_norm": 14.97116772003002, |
| "learning_rate": 9.936700767263426e-07, |
| "loss": 0.402, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.06670673076923077, |
| "grad_norm": 147.02140982947978, |
| "learning_rate": 9.930306905370843e-07, |
| "loss": 0.481, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.0673076923076923, |
| "grad_norm": 43.55065984134183, |
| "learning_rate": 9.92391304347826e-07, |
| "loss": 0.4902, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.06790865384615384, |
| "grad_norm": 7.879535552657374, |
| "learning_rate": 9.917519181585678e-07, |
| "loss": 0.4542, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.06850961538461539, |
| "grad_norm": 8.151701209609962, |
| "learning_rate": 9.911125319693095e-07, |
| "loss": 0.414, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.06911057692307693, |
| "grad_norm": 14.637959734877429, |
| "learning_rate": 9.904731457800513e-07, |
| "loss": 0.3782, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.06971153846153846, |
| "grad_norm": 4.501693875744606, |
| "learning_rate": 9.898337595907928e-07, |
| "loss": 0.3849, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.0703125, |
| "grad_norm": 35.230763684058516, |
| "learning_rate": 9.891943734015345e-07, |
| "loss": 0.4404, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.07091346153846154, |
| "grad_norm": 12.442031715529945, |
| "learning_rate": 9.885549872122762e-07, |
| "loss": 0.4576, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.07151442307692307, |
| "grad_norm": 18.8069021556737, |
| "learning_rate": 9.879156010230177e-07, |
| "loss": 0.4832, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.07211538461538461, |
| "grad_norm": 15.855907967671428, |
| "learning_rate": 9.872762148337595e-07, |
| "loss": 0.4352, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.07271634615384616, |
| "grad_norm": 14.369715804290616, |
| "learning_rate": 9.866368286445012e-07, |
| "loss": 0.4028, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.0733173076923077, |
| "grad_norm": 177.80186681538973, |
| "learning_rate": 9.85997442455243e-07, |
| "loss": 0.4571, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.07391826923076923, |
| "grad_norm": 11.84969824290971, |
| "learning_rate": 9.853580562659845e-07, |
| "loss": 0.4467, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.07451923076923077, |
| "grad_norm": 68.42006681557945, |
| "learning_rate": 9.847186700767262e-07, |
| "loss": 0.4505, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.0751201923076923, |
| "grad_norm": 16.098276335781833, |
| "learning_rate": 9.84079283887468e-07, |
| "loss": 0.4319, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.07572115384615384, |
| "grad_norm": 16.956837398008382, |
| "learning_rate": 9.834398976982096e-07, |
| "loss": 0.4748, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.07632211538461539, |
| "grad_norm": 13.890105636051898, |
| "learning_rate": 9.828005115089514e-07, |
| "loss": 0.4738, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.07692307692307693, |
| "grad_norm": 14.52112888511607, |
| "learning_rate": 9.821611253196931e-07, |
| "loss": 0.4436, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.07752403846153846, |
| "grad_norm": 14.482587862628632, |
| "learning_rate": 9.815217391304348e-07, |
| "loss": 0.4213, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.078125, |
| "grad_norm": 15.66954308966829, |
| "learning_rate": 9.808823529411764e-07, |
| "loss": 0.4889, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.07872596153846154, |
| "grad_norm": 33.3147760442157, |
| "learning_rate": 9.80242966751918e-07, |
| "loss": 0.4395, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.07932692307692307, |
| "grad_norm": 31.576191017602305, |
| "learning_rate": 9.796035805626598e-07, |
| "loss": 0.3946, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.07992788461538461, |
| "grad_norm": 15.62093132771886, |
| "learning_rate": 9.789641943734016e-07, |
| "loss": 0.4729, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.08052884615384616, |
| "grad_norm": 51.855714781154056, |
| "learning_rate": 9.783248081841433e-07, |
| "loss": 0.4652, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.0811298076923077, |
| "grad_norm": 84.97065453956941, |
| "learning_rate": 9.77685421994885e-07, |
| "loss": 0.4229, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.08173076923076923, |
| "grad_norm": 25.452407688499903, |
| "learning_rate": 9.770460358056265e-07, |
| "loss": 0.4064, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.08233173076923077, |
| "grad_norm": 18.263105399640835, |
| "learning_rate": 9.764066496163683e-07, |
| "loss": 0.4564, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.0829326923076923, |
| "grad_norm": 24.334822911726036, |
| "learning_rate": 9.7576726342711e-07, |
| "loss": 0.4071, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.08353365384615384, |
| "grad_norm": 11.382524106662059, |
| "learning_rate": 9.751278772378515e-07, |
| "loss": 0.4332, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.08413461538461539, |
| "grad_norm": 54.06425057509178, |
| "learning_rate": 9.744884910485932e-07, |
| "loss": 0.4505, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.08473557692307693, |
| "grad_norm": 5.076664593001253, |
| "learning_rate": 9.73849104859335e-07, |
| "loss": 0.4453, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.08533653846153846, |
| "grad_norm": 21.527761337764254, |
| "learning_rate": 9.732097186700767e-07, |
| "loss": 0.4719, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.0859375, |
| "grad_norm": 36.50870455739431, |
| "learning_rate": 9.725703324808182e-07, |
| "loss": 0.4512, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.08653846153846154, |
| "grad_norm": 12.540139357753157, |
| "learning_rate": 9.7193094629156e-07, |
| "loss": 0.4548, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.08713942307692307, |
| "grad_norm": 15.096306703319206, |
| "learning_rate": 9.712915601023017e-07, |
| "loss": 0.5067, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.08774038461538461, |
| "grad_norm": 88.89595612518276, |
| "learning_rate": 9.706521739130434e-07, |
| "loss": 0.469, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.08834134615384616, |
| "grad_norm": 24.05245110004213, |
| "learning_rate": 9.700127877237851e-07, |
| "loss": 0.4336, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.0889423076923077, |
| "grad_norm": 7.772235228925758, |
| "learning_rate": 9.693734015345269e-07, |
| "loss": 0.39, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.08954326923076923, |
| "grad_norm": 49.178387681040576, |
| "learning_rate": 9.687340153452686e-07, |
| "loss": 0.4151, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.09014423076923077, |
| "grad_norm": 9.880025918953056, |
| "learning_rate": 9.680946291560101e-07, |
| "loss": 0.4437, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.0907451923076923, |
| "grad_norm": 84.04802965246304, |
| "learning_rate": 9.674552429667519e-07, |
| "loss": 0.4544, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.09134615384615384, |
| "grad_norm": 23.879300812095252, |
| "learning_rate": 9.668158567774936e-07, |
| "loss": 0.4778, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.09194711538461539, |
| "grad_norm": 8.74377800352843, |
| "learning_rate": 9.661764705882353e-07, |
| "loss": 0.4424, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.09254807692307693, |
| "grad_norm": 25.340394685074983, |
| "learning_rate": 9.65537084398977e-07, |
| "loss": 0.4457, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.09314903846153846, |
| "grad_norm": 24.377319461101425, |
| "learning_rate": 9.648976982097188e-07, |
| "loss": 0.4145, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 27.44647420729, |
| "learning_rate": 9.642583120204603e-07, |
| "loss": 0.3954, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.09435096153846154, |
| "grad_norm": 9.525464831385005, |
| "learning_rate": 9.63618925831202e-07, |
| "loss": 0.3716, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.09495192307692307, |
| "grad_norm": 27.349989017536107, |
| "learning_rate": 9.629795396419438e-07, |
| "loss": 0.3668, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.09555288461538461, |
| "grad_norm": 19.197820909162925, |
| "learning_rate": 9.623401534526855e-07, |
| "loss": 0.4735, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.09615384615384616, |
| "grad_norm": 14.821828935472933, |
| "learning_rate": 9.61700767263427e-07, |
| "loss": 0.4551, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.0967548076923077, |
| "grad_norm": 10.183698117047507, |
| "learning_rate": 9.610613810741687e-07, |
| "loss": 0.4463, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.09735576923076923, |
| "grad_norm": 17.003317993340907, |
| "learning_rate": 9.604219948849105e-07, |
| "loss": 0.4501, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.09795673076923077, |
| "grad_norm": 16.527884738622046, |
| "learning_rate": 9.59782608695652e-07, |
| "loss": 0.4274, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.0985576923076923, |
| "grad_norm": 96.03639510226907, |
| "learning_rate": 9.591432225063937e-07, |
| "loss": 0.4002, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.09915865384615384, |
| "grad_norm": 21.560400331772723, |
| "learning_rate": 9.585038363171354e-07, |
| "loss": 0.3857, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.09975961538461539, |
| "grad_norm": 22.502753235352646, |
| "learning_rate": 9.578644501278772e-07, |
| "loss": 0.3972, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.10036057692307693, |
| "grad_norm": 13.923835321348774, |
| "learning_rate": 9.57225063938619e-07, |
| "loss": 0.4161, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.10096153846153846, |
| "grad_norm": 13.89020798215834, |
| "learning_rate": 9.565856777493606e-07, |
| "loss": 0.4409, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.1015625, |
| "grad_norm": 93.30579652904981, |
| "learning_rate": 9.559462915601024e-07, |
| "loss": 0.4413, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.10216346153846154, |
| "grad_norm": 10.306967482471663, |
| "learning_rate": 9.553069053708439e-07, |
| "loss": 0.4306, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.10276442307692307, |
| "grad_norm": 16.35022677130188, |
| "learning_rate": 9.546675191815856e-07, |
| "loss": 0.4361, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.10336538461538461, |
| "grad_norm": 68.65163238192922, |
| "learning_rate": 9.540281329923273e-07, |
| "loss": 0.4508, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.10396634615384616, |
| "grad_norm": 14.691319864310291, |
| "learning_rate": 9.533887468030691e-07, |
| "loss": 0.4637, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.1045673076923077, |
| "grad_norm": 11.655266762752866, |
| "learning_rate": 9.527493606138107e-07, |
| "loss": 0.4239, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.10516826923076923, |
| "grad_norm": 26.007719217225308, |
| "learning_rate": 9.521099744245524e-07, |
| "loss": 0.4427, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.10576923076923077, |
| "grad_norm": 31.093138088222684, |
| "learning_rate": 9.51470588235294e-07, |
| "loss": 0.459, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.1063701923076923, |
| "grad_norm": 8.469416417535411, |
| "learning_rate": 9.508312020460358e-07, |
| "loss": 0.402, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.10697115384615384, |
| "grad_norm": 16.06555093014836, |
| "learning_rate": 9.501918158567774e-07, |
| "loss": 0.4276, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.10757211538461539, |
| "grad_norm": 46.21222096469417, |
| "learning_rate": 9.495524296675191e-07, |
| "loss": 0.4177, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.10817307692307693, |
| "grad_norm": 11.561230412768193, |
| "learning_rate": 9.489130434782609e-07, |
| "loss": 0.3993, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.10877403846153846, |
| "grad_norm": 33.55933631376849, |
| "learning_rate": 9.482736572890026e-07, |
| "loss": 0.4206, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.109375, |
| "grad_norm": 11.530474264624262, |
| "learning_rate": 9.476342710997442e-07, |
| "loss": 0.4378, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.10997596153846154, |
| "grad_norm": 28.092472805389466, |
| "learning_rate": 9.469948849104858e-07, |
| "loss": 0.4033, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.11057692307692307, |
| "grad_norm": 12.87483056607121, |
| "learning_rate": 9.463554987212276e-07, |
| "loss": 0.3855, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.11117788461538461, |
| "grad_norm": 12.951129649825642, |
| "learning_rate": 9.457161125319693e-07, |
| "loss": 0.4156, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.11177884615384616, |
| "grad_norm": 10.238507527434807, |
| "learning_rate": 9.450767263427109e-07, |
| "loss": 0.4417, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.1123798076923077, |
| "grad_norm": 8.449261275213267, |
| "learning_rate": 9.444373401534527e-07, |
| "loss": 0.4499, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.11298076923076923, |
| "grad_norm": 33.54997769262099, |
| "learning_rate": 9.437979539641944e-07, |
| "loss": 0.3718, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.11358173076923077, |
| "grad_norm": 163.06822397421263, |
| "learning_rate": 9.43158567774936e-07, |
| "loss": 0.3994, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.1141826923076923, |
| "grad_norm": 747.6560259343341, |
| "learning_rate": 9.425191815856776e-07, |
| "loss": 0.41, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.11478365384615384, |
| "grad_norm": 20.948270060235018, |
| "learning_rate": 9.418797953964194e-07, |
| "loss": 0.4012, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.11538461538461539, |
| "grad_norm": 16.592630584743493, |
| "learning_rate": 9.412404092071611e-07, |
| "loss": 0.4071, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.11598557692307693, |
| "grad_norm": 6.4533503474115985, |
| "learning_rate": 9.406010230179028e-07, |
| "loss": 0.3927, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.11658653846153846, |
| "grad_norm": 9.481676872361602, |
| "learning_rate": 9.399616368286445e-07, |
| "loss": 0.4486, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.1171875, |
| "grad_norm": 22.879429418595755, |
| "learning_rate": 9.393222506393862e-07, |
| "loss": 0.4487, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.11778846153846154, |
| "grad_norm": 36.32090030167348, |
| "learning_rate": 9.386828644501278e-07, |
| "loss": 0.4866, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.11838942307692307, |
| "grad_norm": 15.994462091464001, |
| "learning_rate": 9.380434782608695e-07, |
| "loss": 0.4853, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.11899038461538461, |
| "grad_norm": 16.476880149018157, |
| "learning_rate": 9.374040920716112e-07, |
| "loss": 0.4713, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.11959134615384616, |
| "grad_norm": 1316.107870553675, |
| "learning_rate": 9.367647058823529e-07, |
| "loss": 0.4386, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.1201923076923077, |
| "grad_norm": 20.762433776167253, |
| "learning_rate": 9.361253196930946e-07, |
| "loss": 0.4609, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.12079326923076923, |
| "grad_norm": 33.600636933197194, |
| "learning_rate": 9.354859335038364e-07, |
| "loss": 0.4346, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.12139423076923077, |
| "grad_norm": 18.47987496120014, |
| "learning_rate": 9.34846547314578e-07, |
| "loss": 0.3968, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.1219951923076923, |
| "grad_norm": 12.493741832264165, |
| "learning_rate": 9.342071611253196e-07, |
| "loss": 0.3862, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.12259615384615384, |
| "grad_norm": 8.016162557709496, |
| "learning_rate": 9.335677749360613e-07, |
| "loss": 0.3913, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.12319711538461539, |
| "grad_norm": 556.4941548489389, |
| "learning_rate": 9.329283887468031e-07, |
| "loss": 0.4127, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.12379807692307693, |
| "grad_norm": 25.506919407129853, |
| "learning_rate": 9.322890025575447e-07, |
| "loss": 0.4349, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.12439903846153846, |
| "grad_norm": 21.228728941085222, |
| "learning_rate": 9.316496163682864e-07, |
| "loss": 0.4603, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 9.977415169528852, |
| "learning_rate": 9.310102301790282e-07, |
| "loss": 0.4158, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.125, |
| "eval_loss": 0.4175701141357422, |
| "eval_runtime": 13.8604, |
| "eval_samples_per_second": 2.309, |
| "eval_steps_per_second": 0.289, |
| "step": 2080 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 16640, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.037184756875264e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |