{ "best_metric": null, "best_model_checkpoint": null, "epoch": 23.272232304900182, "eval_steps": 100, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14519056261343014, "grad_norm": 20.98495864868164, "learning_rate": 5.882352941176471e-07, "loss": 9.4496, "step": 10 }, { "epoch": 0.29038112522686027, "grad_norm": 16.1613712310791, "learning_rate": 1.1764705882352942e-06, "loss": 8.8125, "step": 20 }, { "epoch": 0.4355716878402904, "grad_norm": 6.7023138999938965, "learning_rate": 1.7647058823529414e-06, "loss": 7.7501, "step": 30 }, { "epoch": 0.5807622504537205, "grad_norm": 4.253345489501953, "learning_rate": 2.3529411764705885e-06, "loss": 7.0378, "step": 40 }, { "epoch": 0.7259528130671506, "grad_norm": 2.4256012439727783, "learning_rate": 2.9411764705882355e-06, "loss": 6.6558, "step": 50 }, { "epoch": 0.8711433756805808, "grad_norm": 1.664031982421875, "learning_rate": 3.529411764705883e-06, "loss": 6.3545, "step": 60 }, { "epoch": 1.0181488203266789, "grad_norm": 1.206116795539856, "learning_rate": 4.11764705882353e-06, "loss": 6.2306, "step": 70 }, { "epoch": 1.1633393829401089, "grad_norm": 1.1027638912200928, "learning_rate": 4.705882352941177e-06, "loss": 5.8555, "step": 80 }, { "epoch": 1.308529945553539, "grad_norm": 1.2014143466949463, "learning_rate": 5.294117647058824e-06, "loss": 5.6277, "step": 90 }, { "epoch": 1.453720508166969, "grad_norm": 1.3407199382781982, "learning_rate": 5.882352941176471e-06, "loss": 5.4611, "step": 100 }, { "epoch": 1.453720508166969, "eval_loss": 5.380606174468994, "eval_runtime": 14.5497, "eval_samples_per_second": 195.88, "eval_steps_per_second": 6.186, "step": 100 }, { "epoch": 1.5989110707803993, "grad_norm": 1.6456304788589478, "learning_rate": 6.470588235294119e-06, "loss": 5.3117, "step": 110 }, { "epoch": 1.7441016333938295, "grad_norm": 2.4888839721679688, "learning_rate": 7.058823529411766e-06, "loss": 5.1868, "step": 120 }, { "epoch": 1.8892921960072595, "grad_norm": 2.137110471725464, "learning_rate": 7.647058823529411e-06, "loss": 5.0505, "step": 130 }, { "epoch": 2.0362976406533577, "grad_norm": 2.2031850814819336, "learning_rate": 8.23529411764706e-06, "loss": 5.0571, "step": 140 }, { "epoch": 2.1814882032667877, "grad_norm": 2.2017667293548584, "learning_rate": 8.823529411764707e-06, "loss": 4.8034, "step": 150 }, { "epoch": 2.3266787658802177, "grad_norm": 2.2998626232147217, "learning_rate": 9.411764705882354e-06, "loss": 4.6938, "step": 160 }, { "epoch": 2.471869328493648, "grad_norm": 2.088256359100342, "learning_rate": 1e-05, "loss": 4.5817, "step": 170 }, { "epoch": 2.617059891107078, "grad_norm": 2.1210474967956543, "learning_rate": 1.0588235294117648e-05, "loss": 4.4561, "step": 180 }, { "epoch": 2.762250453720508, "grad_norm": 2.330993175506592, "learning_rate": 1.1176470588235295e-05, "loss": 4.368, "step": 190 }, { "epoch": 2.907441016333938, "grad_norm": 2.2989509105682373, "learning_rate": 1.1764705882352942e-05, "loss": 4.268, "step": 200 }, { "epoch": 2.907441016333938, "eval_loss": 4.2127814292907715, "eval_runtime": 14.464, "eval_samples_per_second": 197.041, "eval_steps_per_second": 6.222, "step": 200 }, { "epoch": 3.0544464609800364, "grad_norm": 1.6547863483428955, "learning_rate": 1.235294117647059e-05, "loss": 4.2584, "step": 210 }, { "epoch": 3.1996370235934664, "grad_norm": 2.2462234497070312, "learning_rate": 1.2941176470588238e-05, "loss": 4.0844, "step": 220 }, { "epoch": 3.344827586206897, "grad_norm": 2.176753044128418, "learning_rate": 1.3529411764705885e-05, "loss": 3.9943, "step": 230 }, { "epoch": 3.490018148820327, "grad_norm": 2.2812905311584473, "learning_rate": 1.4117647058823532e-05, "loss": 3.9237, "step": 240 }, { "epoch": 3.635208711433757, "grad_norm": 2.2799627780914307, "learning_rate": 1.4705882352941179e-05, "loss": 3.849, "step": 250 }, { "epoch": 3.780399274047187, "grad_norm": 2.3910913467407227, "learning_rate": 1.5294117647058822e-05, "loss": 3.7829, "step": 260 }, { "epoch": 3.925589836660617, "grad_norm": 2.793339490890503, "learning_rate": 1.5882352941176473e-05, "loss": 3.7159, "step": 270 }, { "epoch": 4.072595281306715, "grad_norm": 4.1607465744018555, "learning_rate": 1.647058823529412e-05, "loss": 3.7605, "step": 280 }, { "epoch": 4.217785843920145, "grad_norm": 2.6675713062286377, "learning_rate": 1.7058823529411767e-05, "loss": 3.6045, "step": 290 }, { "epoch": 4.362976406533575, "grad_norm": 3.1864140033721924, "learning_rate": 1.7647058823529414e-05, "loss": 3.5499, "step": 300 }, { "epoch": 4.362976406533575, "eval_loss": 3.5333378314971924, "eval_runtime": 14.5283, "eval_samples_per_second": 196.168, "eval_steps_per_second": 6.195, "step": 300 }, { "epoch": 4.508166969147005, "grad_norm": 3.361107110977173, "learning_rate": 1.823529411764706e-05, "loss": 3.4898, "step": 310 }, { "epoch": 4.653357531760435, "grad_norm": 3.826258897781372, "learning_rate": 1.8823529411764708e-05, "loss": 3.4595, "step": 320 }, { "epoch": 4.798548094373865, "grad_norm": 3.7704880237579346, "learning_rate": 1.9411764705882355e-05, "loss": 3.4113, "step": 330 }, { "epoch": 4.943738656987296, "grad_norm": 3.5223851203918457, "learning_rate": 2e-05, "loss": 3.3734, "step": 340 }, { "epoch": 5.090744101633394, "grad_norm": 3.332577705383301, "learning_rate": 1.9999472984871734e-05, "loss": 3.4133, "step": 350 }, { "epoch": 5.235934664246824, "grad_norm": 4.306556701660156, "learning_rate": 1.9997891995035914e-05, "loss": 3.2955, "step": 360 }, { "epoch": 5.381125226860254, "grad_norm": 4.148169040679932, "learning_rate": 1.999525719713366e-05, "loss": 3.2456, "step": 370 }, { "epoch": 5.526315789473684, "grad_norm": 4.137167930603027, "learning_rate": 1.999156886888064e-05, "loss": 3.2064, "step": 380 }, { "epoch": 5.671506352087114, "grad_norm": 3.123608350753784, "learning_rate": 1.998682739903781e-05, "loss": 3.1841, "step": 390 }, { "epoch": 5.816696914700545, "grad_norm": 4.209367752075195, "learning_rate": 1.9981033287370443e-05, "loss": 3.1453, "step": 400 }, { "epoch": 5.816696914700545, "eval_loss": 3.1412713527679443, "eval_runtime": 14.4671, "eval_samples_per_second": 196.998, "eval_steps_per_second": 6.221, "step": 400 }, { "epoch": 5.961887477313975, "grad_norm": 3.708157539367676, "learning_rate": 1.9974187144595433e-05, "loss": 3.1183, "step": 410 }, { "epoch": 6.108892921960073, "grad_norm": 6.567568778991699, "learning_rate": 1.9966289692316944e-05, "loss": 3.166, "step": 420 }, { "epoch": 6.254083484573503, "grad_norm": 4.073953151702881, "learning_rate": 1.9957341762950346e-05, "loss": 3.0523, "step": 430 }, { "epoch": 6.399274047186933, "grad_norm": 4.078774452209473, "learning_rate": 1.9947344299634464e-05, "loss": 3.018, "step": 440 }, { "epoch": 6.544464609800363, "grad_norm": 2.7505741119384766, "learning_rate": 1.993629835613218e-05, "loss": 2.9874, "step": 450 }, { "epoch": 6.689655172413794, "grad_norm": 4.441661834716797, "learning_rate": 1.992420509671936e-05, "loss": 2.9679, "step": 460 }, { "epoch": 6.834845735027224, "grad_norm": 3.656827211380005, "learning_rate": 1.9911065796062137e-05, "loss": 2.9333, "step": 470 }, { "epoch": 6.980036297640654, "grad_norm": 3.519759178161621, "learning_rate": 1.9896881839082554e-05, "loss": 2.9003, "step": 480 }, { "epoch": 7.127041742286751, "grad_norm": 2.58138108253479, "learning_rate": 1.9881654720812594e-05, "loss": 2.9466, "step": 490 }, { "epoch": 7.272232304900181, "grad_norm": 3.1261541843414307, "learning_rate": 1.9865386046236597e-05, "loss": 2.865, "step": 500 }, { "epoch": 7.272232304900181, "eval_loss": 2.8578507900238037, "eval_runtime": 14.4738, "eval_samples_per_second": 196.908, "eval_steps_per_second": 6.218, "step": 500 }, { "epoch": 7.417422867513611, "grad_norm": 4.163350582122803, "learning_rate": 1.9848077530122083e-05, "loss": 2.8359, "step": 510 }, { "epoch": 7.562613430127042, "grad_norm": 2.9917635917663574, "learning_rate": 1.982973099683902e-05, "loss": 2.7953, "step": 520 }, { "epoch": 7.707803992740472, "grad_norm": 3.293595790863037, "learning_rate": 1.9810348380167527e-05, "loss": 2.7838, "step": 530 }, { "epoch": 7.852994555353902, "grad_norm": 3.7709453105926514, "learning_rate": 1.9789931723094046e-05, "loss": 2.7497, "step": 540 }, { "epoch": 7.998185117967332, "grad_norm": 3.3971333503723145, "learning_rate": 1.9768483177596008e-05, "loss": 2.8238, "step": 550 }, { "epoch": 8.14519056261343, "grad_norm": 4.206657886505127, "learning_rate": 1.9746005004415004e-05, "loss": 2.7141, "step": 560 }, { "epoch": 8.29038112522686, "grad_norm": 3.42154598236084, "learning_rate": 1.9722499572818496e-05, "loss": 2.7061, "step": 570 }, { "epoch": 8.43557168784029, "grad_norm": 2.6466500759124756, "learning_rate": 1.9697969360350098e-05, "loss": 2.6849, "step": 580 }, { "epoch": 8.58076225045372, "grad_norm": 3.4602091312408447, "learning_rate": 1.9672416952568416e-05, "loss": 2.6546, "step": 590 }, { "epoch": 8.72595281306715, "grad_norm": 2.882288694381714, "learning_rate": 1.9645845042774555e-05, "loss": 2.6592, "step": 600 }, { "epoch": 8.72595281306715, "eval_loss": 2.655930280685425, "eval_runtime": 14.4429, "eval_samples_per_second": 197.329, "eval_steps_per_second": 6.231, "step": 600 }, { "epoch": 8.87114337568058, "grad_norm": 2.86531662940979, "learning_rate": 1.961825643172819e-05, "loss": 2.6246, "step": 610 }, { "epoch": 9.01814882032668, "grad_norm": 2.0800743103027344, "learning_rate": 1.9589654027352412e-05, "loss": 2.6887, "step": 620 }, { "epoch": 9.163339382940109, "grad_norm": 2.1428987979888916, "learning_rate": 1.956004084442718e-05, "loss": 2.6034, "step": 630 }, { "epoch": 9.30852994555354, "grad_norm": 2.0337836742401123, "learning_rate": 1.9529420004271568e-05, "loss": 2.6018, "step": 640 }, { "epoch": 9.453720508166969, "grad_norm": 2.25555419921875, "learning_rate": 1.9497794734414782e-05, "loss": 2.5723, "step": 650 }, { "epoch": 9.5989110707804, "grad_norm": 2.281365156173706, "learning_rate": 1.9465168368255946e-05, "loss": 2.5639, "step": 660 }, { "epoch": 9.744101633393829, "grad_norm": 1.9673478603363037, "learning_rate": 1.9431544344712776e-05, "loss": 2.5486, "step": 670 }, { "epoch": 9.88929219600726, "grad_norm": 2.3862695693969727, "learning_rate": 1.9396926207859085e-05, "loss": 2.5319, "step": 680 }, { "epoch": 10.036297640653357, "grad_norm": 2.3560924530029297, "learning_rate": 1.936131760655124e-05, "loss": 2.5827, "step": 690 }, { "epoch": 10.181488203266788, "grad_norm": 3.1034605503082275, "learning_rate": 1.932472229404356e-05, "loss": 2.514, "step": 700 }, { "epoch": 10.181488203266788, "eval_loss": 2.5237531661987305, "eval_runtime": 14.6638, "eval_samples_per_second": 194.356, "eval_steps_per_second": 6.138, "step": 700 }, { "epoch": 10.326678765880217, "grad_norm": 2.507720470428467, "learning_rate": 1.9287144127592704e-05, "loss": 2.5069, "step": 710 }, { "epoch": 10.471869328493648, "grad_norm": 2.375530481338501, "learning_rate": 1.924858706805112e-05, "loss": 2.4914, "step": 720 }, { "epoch": 10.617059891107077, "grad_norm": 2.781869649887085, "learning_rate": 1.920905517944954e-05, "loss": 2.4731, "step": 730 }, { "epoch": 10.762250453720508, "grad_norm": 2.3014352321624756, "learning_rate": 1.9168552628568632e-05, "loss": 2.4679, "step": 740 }, { "epoch": 10.907441016333939, "grad_norm": 2.277211904525757, "learning_rate": 1.9127083684499805e-05, "loss": 2.4708, "step": 750 }, { "epoch": 11.054446460980037, "grad_norm": 2.731947660446167, "learning_rate": 1.9084652718195237e-05, "loss": 2.5219, "step": 760 }, { "epoch": 11.199637023593466, "grad_norm": 2.070516347885132, "learning_rate": 1.9041264202007158e-05, "loss": 2.4407, "step": 770 }, { "epoch": 11.344827586206897, "grad_norm": 2.526477098464966, "learning_rate": 1.8996922709216456e-05, "loss": 2.4408, "step": 780 }, { "epoch": 11.490018148820326, "grad_norm": 2.280230760574341, "learning_rate": 1.8951632913550625e-05, "loss": 2.4217, "step": 790 }, { "epoch": 11.635208711433757, "grad_norm": 2.5102462768554688, "learning_rate": 1.8905399588691165e-05, "loss": 2.4129, "step": 800 }, { "epoch": 11.635208711433757, "eval_loss": 2.4271934032440186, "eval_runtime": 14.4721, "eval_samples_per_second": 196.93, "eval_steps_per_second": 6.219, "step": 800 }, { "epoch": 11.780399274047188, "grad_norm": 2.3329319953918457, "learning_rate": 1.8858227607770398e-05, "loss": 2.4068, "step": 810 }, { "epoch": 11.925589836660617, "grad_norm": 2.9200258255004883, "learning_rate": 1.8810121942857848e-05, "loss": 2.393, "step": 820 }, { "epoch": 12.072595281306715, "grad_norm": 2.835029363632202, "learning_rate": 1.8761087664436137e-05, "loss": 2.4508, "step": 830 }, { "epoch": 12.217785843920145, "grad_norm": 2.7595760822296143, "learning_rate": 1.8711129940866577e-05, "loss": 2.3873, "step": 840 }, { "epoch": 12.362976406533575, "grad_norm": 2.940290689468384, "learning_rate": 1.866025403784439e-05, "loss": 2.3817, "step": 850 }, { "epoch": 12.508166969147005, "grad_norm": 2.936760902404785, "learning_rate": 1.860846531784368e-05, "loss": 2.3642, "step": 860 }, { "epoch": 12.653357531760436, "grad_norm": 3.160423994064331, "learning_rate": 1.8555769239552232e-05, "loss": 2.3586, "step": 870 }, { "epoch": 12.798548094373865, "grad_norm": 2.8737099170684814, "learning_rate": 1.8502171357296144e-05, "loss": 2.3481, "step": 880 }, { "epoch": 12.943738656987296, "grad_norm": 2.706122398376465, "learning_rate": 1.8447677320454367e-05, "loss": 2.3496, "step": 890 }, { "epoch": 13.090744101633394, "grad_norm": 2.1658377647399902, "learning_rate": 1.839229287286327e-05, "loss": 2.3982, "step": 900 }, { "epoch": 13.090744101633394, "eval_loss": 2.3612313270568848, "eval_runtime": 14.4663, "eval_samples_per_second": 197.009, "eval_steps_per_second": 6.221, "step": 900 }, { "epoch": 13.235934664246823, "grad_norm": 2.8399763107299805, "learning_rate": 1.8336023852211197e-05, "loss": 2.3385, "step": 910 }, { "epoch": 13.381125226860254, "grad_norm": 2.8449740409851074, "learning_rate": 1.827887618942318e-05, "loss": 2.329, "step": 920 }, { "epoch": 13.526315789473685, "grad_norm": 3.4475836753845215, "learning_rate": 1.8220855908035783e-05, "loss": 2.3102, "step": 930 }, { "epoch": 13.671506352087114, "grad_norm": 2.820624589920044, "learning_rate": 1.816196912356222e-05, "loss": 2.3118, "step": 940 }, { "epoch": 13.816696914700545, "grad_norm": 2.9867615699768066, "learning_rate": 1.8102222042847735e-05, "loss": 2.3077, "step": 950 }, { "epoch": 13.961887477313974, "grad_norm": 4.323665142059326, "learning_rate": 1.8041620963415418e-05, "loss": 2.3013, "step": 960 }, { "epoch": 14.108892921960072, "grad_norm": 4.4349236488342285, "learning_rate": 1.7980172272802398e-05, "loss": 2.3514, "step": 970 }, { "epoch": 14.254083484573503, "grad_norm": 4.404689311981201, "learning_rate": 1.7917882447886585e-05, "loss": 2.2881, "step": 980 }, { "epoch": 14.399274047186934, "grad_norm": 4.489727020263672, "learning_rate": 1.785475805420399e-05, "loss": 2.2839, "step": 990 }, { "epoch": 14.544464609800363, "grad_norm": 3.8734374046325684, "learning_rate": 1.7790805745256703e-05, "loss": 2.2785, "step": 1000 }, { "epoch": 14.544464609800363, "eval_loss": 2.2907564640045166, "eval_runtime": 14.4981, "eval_samples_per_second": 196.577, "eval_steps_per_second": 6.208, "step": 1000 }, { "epoch": 14.689655172413794, "grad_norm": 4.450066089630127, "learning_rate": 1.772603226181159e-05, "loss": 2.2566, "step": 1010 }, { "epoch": 14.834845735027223, "grad_norm": 4.106595993041992, "learning_rate": 1.766044443118978e-05, "loss": 2.261, "step": 1020 }, { "epoch": 14.980036297640654, "grad_norm": 3.8136234283447266, "learning_rate": 1.7594049166547073e-05, "loss": 2.2535, "step": 1030 }, { "epoch": 15.127041742286751, "grad_norm": 5.098197937011719, "learning_rate": 1.7526853466145248e-05, "loss": 2.3093, "step": 1040 }, { "epoch": 15.272232304900182, "grad_norm": 3.752929210662842, "learning_rate": 1.7458864412614436e-05, "loss": 2.2377, "step": 1050 }, { "epoch": 15.417422867513611, "grad_norm": 3.992673397064209, "learning_rate": 1.7390089172206594e-05, "loss": 2.2416, "step": 1060 }, { "epoch": 15.562613430127042, "grad_norm": 3.857272148132324, "learning_rate": 1.7320534994040148e-05, "loss": 2.2251, "step": 1070 }, { "epoch": 15.707803992740471, "grad_norm": 4.833571434020996, "learning_rate": 1.725020920933593e-05, "loss": 2.2262, "step": 1080 }, { "epoch": 15.852994555353902, "grad_norm": 2.919546127319336, "learning_rate": 1.717911923064442e-05, "loss": 2.2204, "step": 1090 }, { "epoch": 15.998185117967331, "grad_norm": 4.030925273895264, "learning_rate": 1.710727255106447e-05, "loss": 2.274, "step": 1100 }, { "epoch": 15.998185117967331, "eval_loss": 2.2391433715820312, "eval_runtime": 14.4613, "eval_samples_per_second": 197.077, "eval_steps_per_second": 6.223, "step": 1100 }, { "epoch": 16.14519056261343, "grad_norm": 3.9502663612365723, "learning_rate": 1.70346767434535e-05, "loss": 2.2167, "step": 1110 }, { "epoch": 16.29038112522686, "grad_norm": 3.8203535079956055, "learning_rate": 1.696133945962927e-05, "loss": 2.2083, "step": 1120 }, { "epoch": 16.43557168784029, "grad_norm": 4.025405406951904, "learning_rate": 1.6887268429563387e-05, "loss": 2.1931, "step": 1130 }, { "epoch": 16.58076225045372, "grad_norm": 3.5635058879852295, "learning_rate": 1.681247146056654e-05, "loss": 2.1873, "step": 1140 }, { "epoch": 16.72595281306715, "grad_norm": 3.3059027194976807, "learning_rate": 1.6736956436465573e-05, "loss": 2.1844, "step": 1150 }, { "epoch": 16.87114337568058, "grad_norm": 4.572406768798828, "learning_rate": 1.6660731316772503e-05, "loss": 2.1869, "step": 1160 }, { "epoch": 17.01814882032668, "grad_norm": 4.2072858810424805, "learning_rate": 1.6583804135845582e-05, "loss": 2.2311, "step": 1170 }, { "epoch": 17.16333938294011, "grad_norm": 3.021080732345581, "learning_rate": 1.650618300204242e-05, "loss": 2.1811, "step": 1180 }, { "epoch": 17.308529945553538, "grad_norm": 3.794221878051758, "learning_rate": 1.6427876096865394e-05, "loss": 2.173, "step": 1190 }, { "epoch": 17.45372050816697, "grad_norm": 2.7157645225524902, "learning_rate": 1.634889167409923e-05, "loss": 2.1565, "step": 1200 }, { "epoch": 17.45372050816697, "eval_loss": 2.190614938735962, "eval_runtime": 14.4975, "eval_samples_per_second": 196.585, "eval_steps_per_second": 6.208, "step": 1200 }, { "epoch": 17.5989110707804, "grad_norm": 2.9268596172332764, "learning_rate": 1.626923805894107e-05, "loss": 2.1555, "step": 1210 }, { "epoch": 17.74410163339383, "grad_norm": 3.686579942703247, "learning_rate": 1.6188923647122946e-05, "loss": 2.1556, "step": 1220 }, { "epoch": 17.88929219600726, "grad_norm": 4.035010814666748, "learning_rate": 1.610795690402688e-05, "loss": 2.1488, "step": 1230 }, { "epoch": 18.03629764065336, "grad_norm": 3.6302409172058105, "learning_rate": 1.6026346363792565e-05, "loss": 2.187, "step": 1240 }, { "epoch": 18.181488203266788, "grad_norm": 4.382446765899658, "learning_rate": 1.594410062841787e-05, "loss": 2.1317, "step": 1250 }, { "epoch": 18.326678765880217, "grad_norm": 3.663515567779541, "learning_rate": 1.5861228366852148e-05, "loss": 2.1395, "step": 1260 }, { "epoch": 18.471869328493646, "grad_norm": 4.221041679382324, "learning_rate": 1.5777738314082514e-05, "loss": 2.1335, "step": 1270 }, { "epoch": 18.61705989110708, "grad_norm": 2.963515043258667, "learning_rate": 1.5693639270213138e-05, "loss": 2.1366, "step": 1280 }, { "epoch": 18.762250453720508, "grad_norm": 3.6942713260650635, "learning_rate": 1.56089400995377e-05, "loss": 2.1127, "step": 1290 }, { "epoch": 18.907441016333937, "grad_norm": 3.9856643676757812, "learning_rate": 1.552364972960506e-05, "loss": 2.1151, "step": 1300 }, { "epoch": 18.907441016333937, "eval_loss": 2.1343271732330322, "eval_runtime": 14.4663, "eval_samples_per_second": 197.009, "eval_steps_per_second": 6.221, "step": 1300 }, { "epoch": 19.054446460980035, "grad_norm": 3.4405646324157715, "learning_rate": 1.5437777150278268e-05, "loss": 2.1568, "step": 1310 }, { "epoch": 19.199637023593468, "grad_norm": 3.63427472114563, "learning_rate": 1.5351331412787004e-05, "loss": 2.0952, "step": 1320 }, { "epoch": 19.344827586206897, "grad_norm": 3.7811665534973145, "learning_rate": 1.526432162877356e-05, "loss": 2.1059, "step": 1330 }, { "epoch": 19.490018148820326, "grad_norm": 4.82371187210083, "learning_rate": 1.5176756969332428e-05, "loss": 2.1137, "step": 1340 }, { "epoch": 19.635208711433755, "grad_norm": 2.7587039470672607, "learning_rate": 1.5088646664043652e-05, "loss": 2.0951, "step": 1350 }, { "epoch": 19.780399274047188, "grad_norm": 3.3458638191223145, "learning_rate": 1.5000000000000002e-05, "loss": 2.0987, "step": 1360 }, { "epoch": 19.925589836660617, "grad_norm": 3.0390782356262207, "learning_rate": 1.4910826320828085e-05, "loss": 2.0854, "step": 1370 }, { "epoch": 20.072595281306715, "grad_norm": 4.020959854125977, "learning_rate": 1.4821135025703491e-05, "loss": 2.1504, "step": 1380 }, { "epoch": 20.217785843920144, "grad_norm": 3.3670413494110107, "learning_rate": 1.4730935568360103e-05, "loss": 2.0924, "step": 1390 }, { "epoch": 20.362976406533576, "grad_norm": 3.5803141593933105, "learning_rate": 1.4640237456093636e-05, "loss": 2.0649, "step": 1400 }, { "epoch": 20.362976406533576, "eval_loss": 2.100735902786255, "eval_runtime": 14.4583, "eval_samples_per_second": 197.118, "eval_steps_per_second": 6.225, "step": 1400 }, { "epoch": 20.508166969147005, "grad_norm": 3.7825708389282227, "learning_rate": 1.4549050248759546e-05, "loss": 2.0681, "step": 1410 }, { "epoch": 20.653357531760435, "grad_norm": 2.811061382293701, "learning_rate": 1.4457383557765385e-05, "loss": 2.0663, "step": 1420 }, { "epoch": 20.798548094373867, "grad_norm": 2.962312698364258, "learning_rate": 1.4365247045057732e-05, "loss": 2.0614, "step": 1430 }, { "epoch": 20.943738656987296, "grad_norm": 2.2117114067077637, "learning_rate": 1.427265042210381e-05, "loss": 2.0595, "step": 1440 }, { "epoch": 21.090744101633394, "grad_norm": 4.001194953918457, "learning_rate": 1.4179603448867836e-05, "loss": 2.121, "step": 1450 }, { "epoch": 21.235934664246823, "grad_norm": 3.026092767715454, "learning_rate": 1.4086115932782316e-05, "loss": 2.0485, "step": 1460 }, { "epoch": 21.381125226860256, "grad_norm": 3.9385151863098145, "learning_rate": 1.399219772771431e-05, "loss": 2.0477, "step": 1470 }, { "epoch": 21.526315789473685, "grad_norm": 3.3123228549957275, "learning_rate": 1.3897858732926794e-05, "loss": 2.0427, "step": 1480 }, { "epoch": 21.671506352087114, "grad_norm": 3.20367169380188, "learning_rate": 1.3803108892035259e-05, "loss": 2.0504, "step": 1490 }, { "epoch": 21.816696914700543, "grad_norm": 3.5425407886505127, "learning_rate": 1.3707958191959609e-05, "loss": 2.0474, "step": 1500 }, { "epoch": 21.816696914700543, "eval_loss": 2.065798044204712, "eval_runtime": 14.4887, "eval_samples_per_second": 196.705, "eval_steps_per_second": 6.212, "step": 1500 }, { "epoch": 21.961887477313976, "grad_norm": 3.0119071006774902, "learning_rate": 1.3612416661871532e-05, "loss": 2.0418, "step": 1510 }, { "epoch": 22.108892921960074, "grad_norm": 3.1108176708221436, "learning_rate": 1.3516494372137368e-05, "loss": 2.0866, "step": 1520 }, { "epoch": 22.254083484573503, "grad_norm": 3.3773062229156494, "learning_rate": 1.342020143325669e-05, "loss": 2.0343, "step": 1530 }, { "epoch": 22.399274047186932, "grad_norm": 3.3889658451080322, "learning_rate": 1.3323547994796597e-05, "loss": 2.0236, "step": 1540 }, { "epoch": 22.544464609800364, "grad_norm": 2.263582229614258, "learning_rate": 1.322654424432195e-05, "loss": 2.0312, "step": 1550 }, { "epoch": 22.689655172413794, "grad_norm": 2.6817829608917236, "learning_rate": 1.3129200406321545e-05, "loss": 2.0299, "step": 1560 }, { "epoch": 22.834845735027223, "grad_norm": 3.2429702281951904, "learning_rate": 1.3031526741130435e-05, "loss": 2.0159, "step": 1570 }, { "epoch": 22.980036297640652, "grad_norm": 2.677948236465454, "learning_rate": 1.2933533543848462e-05, "loss": 2.0198, "step": 1580 }, { "epoch": 23.12704174228675, "grad_norm": 2.9102084636688232, "learning_rate": 1.283523114325511e-05, "loss": 2.0692, "step": 1590 }, { "epoch": 23.272232304900182, "grad_norm": 3.3689348697662354, "learning_rate": 1.2736629900720832e-05, "loss": 2.0121, "step": 1600 }, { "epoch": 23.272232304900182, "eval_loss": 2.03466534614563, "eval_runtime": 14.4631, "eval_samples_per_second": 197.053, "eval_steps_per_second": 6.223, "step": 1600 } ], "logging_steps": 10, "max_steps": 3400, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.967214202780385e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }