{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 2540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03937007874015748, "grad_norm": 9.399362564086914, "learning_rate": 7.086614173228347e-06, "loss": 1.4557, "step": 10 }, { "epoch": 0.07874015748031496, "grad_norm": 2.942493438720703, "learning_rate": 1.4960629921259845e-05, "loss": 0.4296, "step": 20 }, { "epoch": 0.11811023622047244, "grad_norm": 3.1724367141723633, "learning_rate": 2.283464566929134e-05, "loss": 0.3657, "step": 30 }, { "epoch": 0.15748031496062992, "grad_norm": 1.5789762735366821, "learning_rate": 3.070866141732284e-05, "loss": 0.3486, "step": 40 }, { "epoch": 0.1968503937007874, "grad_norm": 2.097869396209717, "learning_rate": 3.858267716535433e-05, "loss": 0.3044, "step": 50 }, { "epoch": 0.23622047244094488, "grad_norm": 3.880457639694214, "learning_rate": 4.645669291338583e-05, "loss": 0.3367, "step": 60 }, { "epoch": 0.2755905511811024, "grad_norm": 2.8253917694091797, "learning_rate": 5.433070866141733e-05, "loss": 0.3126, "step": 70 }, { "epoch": 0.31496062992125984, "grad_norm": 3.822925090789795, "learning_rate": 6.220472440944882e-05, "loss": 0.3004, "step": 80 }, { "epoch": 0.3543307086614173, "grad_norm": 1.3659324645996094, "learning_rate": 7.007874015748031e-05, "loss": 0.2605, "step": 90 }, { "epoch": 0.3937007874015748, "grad_norm": 1.7165173292160034, "learning_rate": 7.795275590551181e-05, "loss": 0.1676, "step": 100 }, { "epoch": 0.4330708661417323, "grad_norm": 1.704687476158142, "learning_rate": 8.582677165354331e-05, "loss": 0.1404, "step": 110 }, { "epoch": 0.47244094488188976, "grad_norm": 1.3101590871810913, "learning_rate": 9.370078740157481e-05, "loss": 0.1322, "step": 120 }, { "epoch": 0.5118110236220472, "grad_norm": 1.6621087789535522, "learning_rate": 9.999983049408561e-05, "loss": 0.1242, "step": 130 }, { "epoch": 0.5511811023622047, "grad_norm": 0.9743478298187256, "learning_rate": 9.999389790775648e-05, "loss": 0.1027, "step": 140 }, { "epoch": 0.5905511811023622, "grad_norm": 1.9478999376296997, "learning_rate": 9.997949117496292e-05, "loss": 0.1174, "step": 150 }, { "epoch": 0.6299212598425197, "grad_norm": 0.9509850740432739, "learning_rate": 9.995661273769822e-05, "loss": 0.1015, "step": 160 }, { "epoch": 0.6692913385826772, "grad_norm": 0.9505985379219055, "learning_rate": 9.992526647394022e-05, "loss": 0.102, "step": 170 }, { "epoch": 0.7086614173228346, "grad_norm": 1.489611268043518, "learning_rate": 9.988545769699399e-05, "loss": 0.097, "step": 180 }, { "epoch": 0.7480314960629921, "grad_norm": 1.1149543523788452, "learning_rate": 9.983719315459114e-05, "loss": 0.0925, "step": 190 }, { "epoch": 0.7874015748031497, "grad_norm": 1.0860552787780762, "learning_rate": 9.978048102774613e-05, "loss": 0.0964, "step": 200 }, { "epoch": 0.8267716535433071, "grad_norm": 1.2707302570343018, "learning_rate": 9.971533092936954e-05, "loss": 0.0844, "step": 210 }, { "epoch": 0.8661417322834646, "grad_norm": 1.1820255517959595, "learning_rate": 9.964175390263856e-05, "loss": 0.0805, "step": 220 }, { "epoch": 0.905511811023622, "grad_norm": 1.3937278985977173, "learning_rate": 9.955976241912535e-05, "loss": 0.0871, "step": 230 }, { "epoch": 0.9448818897637795, "grad_norm": 0.9341478943824768, "learning_rate": 9.946937037668275e-05, "loss": 0.0826, "step": 240 }, { "epoch": 0.984251968503937, "grad_norm": 1.7772321701049805, "learning_rate": 9.937059309708885e-05, "loss": 0.0873, "step": 250 }, { "epoch": 1.0236220472440944, "grad_norm": 1.2695393562316895, "learning_rate": 9.926344732344967e-05, "loss": 0.0794, "step": 260 }, { "epoch": 1.0629921259842519, "grad_norm": 1.1093697547912598, "learning_rate": 9.914795121736128e-05, "loss": 0.0758, "step": 270 }, { "epoch": 1.1023622047244095, "grad_norm": 0.8029543161392212, "learning_rate": 9.902412435583128e-05, "loss": 0.0678, "step": 280 }, { "epoch": 1.141732283464567, "grad_norm": 0.7547488808631897, "learning_rate": 9.88919877279604e-05, "loss": 0.0761, "step": 290 }, { "epoch": 1.1811023622047245, "grad_norm": 0.8116704225540161, "learning_rate": 9.875156373138489e-05, "loss": 0.057, "step": 300 }, { "epoch": 1.220472440944882, "grad_norm": 0.8954646587371826, "learning_rate": 9.86028761684799e-05, "loss": 0.0738, "step": 310 }, { "epoch": 1.2598425196850394, "grad_norm": 1.016405463218689, "learning_rate": 9.844595024232495e-05, "loss": 0.0901, "step": 320 }, { "epoch": 1.2992125984251968, "grad_norm": 1.395342469215393, "learning_rate": 9.828081255243198e-05, "loss": 0.0796, "step": 330 }, { "epoch": 1.3385826771653544, "grad_norm": 0.8092917203903198, "learning_rate": 9.81074910902365e-05, "loss": 0.0883, "step": 340 }, { "epoch": 1.3779527559055118, "grad_norm": 1.1169452667236328, "learning_rate": 9.792601523435307e-05, "loss": 0.0748, "step": 350 }, { "epoch": 1.4173228346456692, "grad_norm": 0.7994294166564941, "learning_rate": 9.773641574559546e-05, "loss": 0.0862, "step": 360 }, { "epoch": 1.4566929133858268, "grad_norm": 0.7973235249519348, "learning_rate": 9.753872476176254e-05, "loss": 0.0735, "step": 370 }, { "epoch": 1.4960629921259843, "grad_norm": 0.9219651818275452, "learning_rate": 9.73329757921909e-05, "loss": 0.077, "step": 380 }, { "epoch": 1.5354330708661417, "grad_norm": 1.1715006828308105, "learning_rate": 9.711920371207484e-05, "loss": 0.0691, "step": 390 }, { "epoch": 1.574803149606299, "grad_norm": 0.7752212882041931, "learning_rate": 9.68974447565549e-05, "loss": 0.0669, "step": 400 }, { "epoch": 1.6141732283464567, "grad_norm": 1.0260776281356812, "learning_rate": 9.666773651457588e-05, "loss": 0.0623, "step": 410 }, { "epoch": 1.6535433070866141, "grad_norm": 0.8338336944580078, "learning_rate": 9.643011792251538e-05, "loss": 0.0699, "step": 420 }, { "epoch": 1.6929133858267718, "grad_norm": 0.8776105642318726, "learning_rate": 9.618462925758392e-05, "loss": 0.0653, "step": 430 }, { "epoch": 1.7322834645669292, "grad_norm": 0.6896973252296448, "learning_rate": 9.593131213099789e-05, "loss": 0.0586, "step": 440 }, { "epoch": 1.7716535433070866, "grad_norm": 1.0852605104446411, "learning_rate": 9.567020948092616e-05, "loss": 0.0673, "step": 450 }, { "epoch": 1.811023622047244, "grad_norm": 1.0203490257263184, "learning_rate": 9.540136556521203e-05, "loss": 0.0663, "step": 460 }, { "epoch": 1.8503937007874016, "grad_norm": 0.774488091468811, "learning_rate": 9.512482595387132e-05, "loss": 0.0609, "step": 470 }, { "epoch": 1.889763779527559, "grad_norm": 0.5737660527229309, "learning_rate": 9.484063752136805e-05, "loss": 0.0606, "step": 480 }, { "epoch": 1.9291338582677167, "grad_norm": 1.0153898000717163, "learning_rate": 9.454884843866912e-05, "loss": 0.0737, "step": 490 }, { "epoch": 1.968503937007874, "grad_norm": 0.7526334524154663, "learning_rate": 9.424950816507909e-05, "loss": 0.0641, "step": 500 }, { "epoch": 2.0078740157480315, "grad_norm": 0.5760018825531006, "learning_rate": 9.394266743985671e-05, "loss": 0.0674, "step": 510 }, { "epoch": 2.047244094488189, "grad_norm": 0.70269775390625, "learning_rate": 9.36283782736144e-05, "loss": 0.0631, "step": 520 }, { "epoch": 2.0866141732283463, "grad_norm": 0.8864635229110718, "learning_rate": 9.330669393950219e-05, "loss": 0.0654, "step": 530 }, { "epoch": 2.1259842519685037, "grad_norm": 0.7043759226799011, "learning_rate": 9.297766896417793e-05, "loss": 0.0657, "step": 540 }, { "epoch": 2.1653543307086616, "grad_norm": 0.6329500675201416, "learning_rate": 9.264135911856462e-05, "loss": 0.0707, "step": 550 }, { "epoch": 2.204724409448819, "grad_norm": 0.4031962752342224, "learning_rate": 9.22978214083971e-05, "loss": 0.0528, "step": 560 }, { "epoch": 2.2440944881889764, "grad_norm": 0.5401821136474609, "learning_rate": 9.194711406455945e-05, "loss": 0.0654, "step": 570 }, { "epoch": 2.283464566929134, "grad_norm": 0.713798999786377, "learning_rate": 9.158929653321451e-05, "loss": 0.0555, "step": 580 }, { "epoch": 2.322834645669291, "grad_norm": 0.4728735387325287, "learning_rate": 9.122442946572768e-05, "loss": 0.0552, "step": 590 }, { "epoch": 2.362204724409449, "grad_norm": 0.7359452843666077, "learning_rate": 9.085257470838619e-05, "loss": 0.0677, "step": 600 }, { "epoch": 2.4015748031496065, "grad_norm": 0.6030870676040649, "learning_rate": 9.047379529191594e-05, "loss": 0.053, "step": 610 }, { "epoch": 2.440944881889764, "grad_norm": 0.5791817903518677, "learning_rate": 9.008815542079766e-05, "loss": 0.0493, "step": 620 }, { "epoch": 2.4803149606299213, "grad_norm": 0.8772215247154236, "learning_rate": 8.969572046238389e-05, "loss": 0.0721, "step": 630 }, { "epoch": 2.5196850393700787, "grad_norm": 0.8733668923377991, "learning_rate": 8.929655693581904e-05, "loss": 0.0597, "step": 640 }, { "epoch": 2.559055118110236, "grad_norm": 1.0022321939468384, "learning_rate": 8.889073250076421e-05, "loss": 0.0659, "step": 650 }, { "epoch": 2.5984251968503935, "grad_norm": 0.7206939458847046, "learning_rate": 8.84783159459285e-05, "loss": 0.0452, "step": 660 }, { "epoch": 2.637795275590551, "grad_norm": 0.8875113725662231, "learning_rate": 8.805937717740918e-05, "loss": 0.0539, "step": 670 }, { "epoch": 2.677165354330709, "grad_norm": 0.5767335295677185, "learning_rate": 8.763398720684232e-05, "loss": 0.0503, "step": 680 }, { "epoch": 2.716535433070866, "grad_norm": 0.5727648138999939, "learning_rate": 8.72022181393661e-05, "loss": 0.0457, "step": 690 }, { "epoch": 2.7559055118110236, "grad_norm": 0.8125827312469482, "learning_rate": 8.676414316139863e-05, "loss": 0.0607, "step": 700 }, { "epoch": 2.795275590551181, "grad_norm": 0.6720311641693115, "learning_rate": 8.631983652823267e-05, "loss": 0.0665, "step": 710 }, { "epoch": 2.8346456692913384, "grad_norm": 0.6637985706329346, "learning_rate": 8.586937355144908e-05, "loss": 0.068, "step": 720 }, { "epoch": 2.8740157480314963, "grad_norm": 0.7840360999107361, "learning_rate": 8.541283058615124e-05, "loss": 0.0561, "step": 730 }, { "epoch": 2.9133858267716537, "grad_norm": 0.44171687960624695, "learning_rate": 8.495028501802251e-05, "loss": 0.0534, "step": 740 }, { "epoch": 2.952755905511811, "grad_norm": 0.4313163459300995, "learning_rate": 8.448181525020921e-05, "loss": 0.0391, "step": 750 }, { "epoch": 2.9921259842519685, "grad_norm": 0.7261826395988464, "learning_rate": 8.400750069003086e-05, "loss": 0.0486, "step": 760 }, { "epoch": 3.031496062992126, "grad_norm": 0.4469556212425232, "learning_rate": 8.352742173552046e-05, "loss": 0.0511, "step": 770 }, { "epoch": 3.0708661417322833, "grad_norm": 0.9129867553710938, "learning_rate": 8.304165976179667e-05, "loss": 0.0533, "step": 780 }, { "epoch": 3.1102362204724407, "grad_norm": 1.2041122913360596, "learning_rate": 8.255029710727048e-05, "loss": 0.0671, "step": 790 }, { "epoch": 3.1496062992125986, "grad_norm": 0.7420069575309753, "learning_rate": 8.20534170596885e-05, "loss": 0.0685, "step": 800 }, { "epoch": 3.188976377952756, "grad_norm": 0.3230190575122833, "learning_rate": 8.155110384201544e-05, "loss": 0.0647, "step": 810 }, { "epoch": 3.2283464566929134, "grad_norm": 0.6603342890739441, "learning_rate": 8.104344259815794e-05, "loss": 0.0558, "step": 820 }, { "epoch": 3.267716535433071, "grad_norm": 0.5632081031799316, "learning_rate": 8.053051937853248e-05, "loss": 0.0558, "step": 830 }, { "epoch": 3.3070866141732282, "grad_norm": 0.7754299640655518, "learning_rate": 8.001242112547942e-05, "loss": 0.0632, "step": 840 }, { "epoch": 3.3464566929133857, "grad_norm": 0.7823946475982666, "learning_rate": 7.948923565852598e-05, "loss": 0.0662, "step": 850 }, { "epoch": 3.3858267716535435, "grad_norm": 0.7399844527244568, "learning_rate": 7.896105165950059e-05, "loss": 0.052, "step": 860 }, { "epoch": 3.425196850393701, "grad_norm": 0.8208476305007935, "learning_rate": 7.842795865750088e-05, "loss": 0.0486, "step": 870 }, { "epoch": 3.4645669291338583, "grad_norm": 0.5400993227958679, "learning_rate": 7.789004701371825e-05, "loss": 0.0443, "step": 880 }, { "epoch": 3.5039370078740157, "grad_norm": 0.6949036717414856, "learning_rate": 7.734740790612136e-05, "loss": 0.0597, "step": 890 }, { "epoch": 3.543307086614173, "grad_norm": 1.0321848392486572, "learning_rate": 7.680013331400098e-05, "loss": 0.0446, "step": 900 }, { "epoch": 3.5826771653543306, "grad_norm": 0.5193206071853638, "learning_rate": 7.624831600237937e-05, "loss": 0.0499, "step": 910 }, { "epoch": 3.622047244094488, "grad_norm": 0.755699872970581, "learning_rate": 7.569204950628605e-05, "loss": 0.0595, "step": 920 }, { "epoch": 3.661417322834646, "grad_norm": 0.4758411645889282, "learning_rate": 7.513142811490356e-05, "loss": 0.0403, "step": 930 }, { "epoch": 3.7007874015748032, "grad_norm": 0.9744377732276917, "learning_rate": 7.456654685558481e-05, "loss": 0.0566, "step": 940 }, { "epoch": 3.7401574803149606, "grad_norm": 0.46791282296180725, "learning_rate": 7.399750147774575e-05, "loss": 0.0445, "step": 950 }, { "epoch": 3.779527559055118, "grad_norm": 0.4000394642353058, "learning_rate": 7.34243884366355e-05, "loss": 0.0577, "step": 960 }, { "epoch": 3.8188976377952755, "grad_norm": 0.6318042874336243, "learning_rate": 7.28473048769868e-05, "loss": 0.0491, "step": 970 }, { "epoch": 3.8582677165354333, "grad_norm": 0.9635873436927795, "learning_rate": 7.226634861654965e-05, "loss": 0.0501, "step": 980 }, { "epoch": 3.8976377952755907, "grad_norm": 0.7494231462478638, "learning_rate": 7.168161812951084e-05, "loss": 0.0511, "step": 990 }, { "epoch": 3.937007874015748, "grad_norm": 0.6621044874191284, "learning_rate": 7.109321252980218e-05, "loss": 0.0429, "step": 1000 }, { "epoch": 3.9763779527559056, "grad_norm": 0.6507459878921509, "learning_rate": 7.05012315543004e-05, "loss": 0.0514, "step": 1010 }, { "epoch": 4.015748031496063, "grad_norm": 0.6397859454154968, "learning_rate": 6.990577554592134e-05, "loss": 0.051, "step": 1020 }, { "epoch": 4.05511811023622, "grad_norm": 0.8459829688072205, "learning_rate": 6.930694543661149e-05, "loss": 0.0492, "step": 1030 }, { "epoch": 4.094488188976378, "grad_norm": 0.6369995474815369, "learning_rate": 6.870484273023968e-05, "loss": 0.0447, "step": 1040 }, { "epoch": 4.133858267716535, "grad_norm": 0.6142792701721191, "learning_rate": 6.809956948539166e-05, "loss": 0.044, "step": 1050 }, { "epoch": 4.173228346456693, "grad_norm": 0.5256998538970947, "learning_rate": 6.749122829807103e-05, "loss": 0.0427, "step": 1060 }, { "epoch": 4.21259842519685, "grad_norm": 0.7650443911552429, "learning_rate": 6.687992228430872e-05, "loss": 0.0525, "step": 1070 }, { "epoch": 4.251968503937007, "grad_norm": 0.6876934170722961, "learning_rate": 6.62657550626844e-05, "loss": 0.0385, "step": 1080 }, { "epoch": 4.291338582677166, "grad_norm": 0.7614730000495911, "learning_rate": 6.564883073676287e-05, "loss": 0.0543, "step": 1090 }, { "epoch": 4.330708661417323, "grad_norm": 0.591896653175354, "learning_rate": 6.502925387744807e-05, "loss": 0.044, "step": 1100 }, { "epoch": 4.3700787401574805, "grad_norm": 0.8287089467048645, "learning_rate": 6.440712950525791e-05, "loss": 0.0427, "step": 1110 }, { "epoch": 4.409448818897638, "grad_norm": 0.8359081745147705, "learning_rate": 6.3782563072523e-05, "loss": 0.0513, "step": 1120 }, { "epoch": 4.448818897637795, "grad_norm": 0.4965924322605133, "learning_rate": 6.315566044551197e-05, "loss": 0.0503, "step": 1130 }, { "epoch": 4.488188976377953, "grad_norm": 0.5000588297843933, "learning_rate": 6.252652788648691e-05, "loss": 0.0348, "step": 1140 }, { "epoch": 4.52755905511811, "grad_norm": 0.5434101819992065, "learning_rate": 6.18952720356914e-05, "loss": 0.0409, "step": 1150 }, { "epoch": 4.566929133858268, "grad_norm": 0.6852266788482666, "learning_rate": 6.126199989327462e-05, "loss": 0.0437, "step": 1160 }, { "epoch": 4.606299212598425, "grad_norm": 0.684528648853302, "learning_rate": 6.062681880115453e-05, "loss": 0.0447, "step": 1170 }, { "epoch": 4.645669291338582, "grad_norm": 0.6656462550163269, "learning_rate": 5.998983642482296e-05, "loss": 0.0429, "step": 1180 }, { "epoch": 4.68503937007874, "grad_norm": 0.7268936634063721, "learning_rate": 5.935116073509592e-05, "loss": 0.0478, "step": 1190 }, { "epoch": 4.724409448818898, "grad_norm": 0.5984519720077515, "learning_rate": 5.871089998981214e-05, "loss": 0.038, "step": 1200 }, { "epoch": 4.7637795275590555, "grad_norm": 0.6121963858604431, "learning_rate": 5.8069162715483e-05, "loss": 0.0388, "step": 1210 }, { "epoch": 4.803149606299213, "grad_norm": 0.47043073177337646, "learning_rate": 5.742605768889693e-05, "loss": 0.0355, "step": 1220 }, { "epoch": 4.84251968503937, "grad_norm": 0.5659105181694031, "learning_rate": 5.6781693918681275e-05, "loss": 0.0434, "step": 1230 }, { "epoch": 4.881889763779528, "grad_norm": 0.5114152431488037, "learning_rate": 5.613618062682502e-05, "loss": 0.0379, "step": 1240 }, { "epoch": 4.921259842519685, "grad_norm": 0.6941166520118713, "learning_rate": 5.5489627230165176e-05, "loss": 0.046, "step": 1250 }, { "epoch": 4.960629921259843, "grad_norm": 0.6256818771362305, "learning_rate": 5.48421433218403e-05, "loss": 0.0505, "step": 1260 }, { "epoch": 5.0, "grad_norm": 0.8666470646858215, "learning_rate": 5.419383865271402e-05, "loss": 0.0419, "step": 1270 }, { "epoch": 5.039370078740157, "grad_norm": 1.043869972229004, "learning_rate": 5.354482311277193e-05, "loss": 0.0483, "step": 1280 }, { "epoch": 5.078740157480315, "grad_norm": 0.4656646251678467, "learning_rate": 5.289520671249479e-05, "loss": 0.0333, "step": 1290 }, { "epoch": 5.118110236220472, "grad_norm": 0.45656928420066833, "learning_rate": 5.224509956421133e-05, "loss": 0.0373, "step": 1300 }, { "epoch": 5.15748031496063, "grad_norm": 0.7461910247802734, "learning_rate": 5.159461186343385e-05, "loss": 0.0427, "step": 1310 }, { "epoch": 5.196850393700787, "grad_norm": 0.646958589553833, "learning_rate": 5.094385387017967e-05, "loss": 0.0447, "step": 1320 }, { "epoch": 5.2362204724409445, "grad_norm": 0.2877052128314972, "learning_rate": 5.02929358902817e-05, "loss": 0.036, "step": 1330 }, { "epoch": 5.275590551181103, "grad_norm": 0.5020745992660522, "learning_rate": 4.964196825669112e-05, "loss": 0.0485, "step": 1340 }, { "epoch": 5.31496062992126, "grad_norm": 0.494816392660141, "learning_rate": 4.899106131077562e-05, "loss": 0.0446, "step": 1350 }, { "epoch": 5.354330708661418, "grad_norm": 0.4856385290622711, "learning_rate": 4.834032538361607e-05, "loss": 0.0418, "step": 1360 }, { "epoch": 5.393700787401575, "grad_norm": 0.2937265634536743, "learning_rate": 4.768987077730509e-05, "loss": 0.0329, "step": 1370 }, { "epoch": 5.433070866141732, "grad_norm": 0.44668734073638916, "learning_rate": 4.703980774625038e-05, "loss": 0.0373, "step": 1380 }, { "epoch": 5.47244094488189, "grad_norm": 0.49580681324005127, "learning_rate": 4.6390246478486196e-05, "loss": 0.0393, "step": 1390 }, { "epoch": 5.511811023622047, "grad_norm": 0.41949307918548584, "learning_rate": 4.574129707699617e-05, "loss": 0.0336, "step": 1400 }, { "epoch": 5.551181102362205, "grad_norm": 0.7558311223983765, "learning_rate": 4.509306954105028e-05, "loss": 0.036, "step": 1410 }, { "epoch": 5.590551181102362, "grad_norm": 0.6058725118637085, "learning_rate": 4.4445673747559776e-05, "loss": 0.0389, "step": 1420 }, { "epoch": 5.6299212598425195, "grad_norm": 0.5574952960014343, "learning_rate": 4.3799219432452527e-05, "loss": 0.0441, "step": 1430 }, { "epoch": 5.669291338582677, "grad_norm": 0.3628334403038025, "learning_rate": 4.315381617207239e-05, "loss": 0.0306, "step": 1440 }, { "epoch": 5.708661417322834, "grad_norm": 0.6688554286956787, "learning_rate": 4.2509573364605695e-05, "loss": 0.0384, "step": 1450 }, { "epoch": 5.748031496062993, "grad_norm": 0.4437117874622345, "learning_rate": 4.1866600211537734e-05, "loss": 0.0359, "step": 1460 }, { "epoch": 5.78740157480315, "grad_norm": 0.5025441646575928, "learning_rate": 4.122500569914285e-05, "loss": 0.0339, "step": 1470 }, { "epoch": 5.826771653543307, "grad_norm": 0.6300631761550903, "learning_rate": 4.058489858001079e-05, "loss": 0.0412, "step": 1480 }, { "epoch": 5.866141732283465, "grad_norm": 0.33207225799560547, "learning_rate": 3.9946387354612754e-05, "loss": 0.0382, "step": 1490 }, { "epoch": 5.905511811023622, "grad_norm": 0.4854939877986908, "learning_rate": 3.930958025291021e-05, "loss": 0.0355, "step": 1500 }, { "epoch": 5.94488188976378, "grad_norm": 0.4515645205974579, "learning_rate": 3.867458521600943e-05, "loss": 0.0403, "step": 1510 }, { "epoch": 5.984251968503937, "grad_norm": 0.5812086462974548, "learning_rate": 3.804150987786525e-05, "loss": 0.045, "step": 1520 }, { "epoch": 6.0236220472440944, "grad_norm": 0.4998365640640259, "learning_rate": 3.7410461547036534e-05, "loss": 0.0372, "step": 1530 }, { "epoch": 6.062992125984252, "grad_norm": 0.2945879101753235, "learning_rate": 3.6781547188497135e-05, "loss": 0.0355, "step": 1540 }, { "epoch": 6.102362204724409, "grad_norm": 0.4646545946598053, "learning_rate": 3.6154873405504895e-05, "loss": 0.0271, "step": 1550 }, { "epoch": 6.141732283464567, "grad_norm": 0.5919941067695618, "learning_rate": 3.553054642153192e-05, "loss": 0.04, "step": 1560 }, { "epoch": 6.181102362204724, "grad_norm": 0.4582063555717468, "learning_rate": 3.4908672062259487e-05, "loss": 0.0308, "step": 1570 }, { "epoch": 6.2204724409448815, "grad_norm": 0.5543438196182251, "learning_rate": 3.428935573764005e-05, "loss": 0.0319, "step": 1580 }, { "epoch": 6.259842519685039, "grad_norm": 0.3909936249256134, "learning_rate": 3.367270242402999e-05, "loss": 0.0305, "step": 1590 }, { "epoch": 6.299212598425197, "grad_norm": 0.4638426601886749, "learning_rate": 3.30588166463957e-05, "loss": 0.0298, "step": 1600 }, { "epoch": 6.338582677165355, "grad_norm": 0.5119015574455261, "learning_rate": 3.2447802460596124e-05, "loss": 0.0299, "step": 1610 }, { "epoch": 6.377952755905512, "grad_norm": 0.4361736476421356, "learning_rate": 3.183976343574513e-05, "loss": 0.0279, "step": 1620 }, { "epoch": 6.417322834645669, "grad_norm": 0.7282920479774475, "learning_rate": 3.123480263665597e-05, "loss": 0.0305, "step": 1630 }, { "epoch": 6.456692913385827, "grad_norm": 0.4011118710041046, "learning_rate": 3.063302260637151e-05, "loss": 0.0294, "step": 1640 }, { "epoch": 6.496062992125984, "grad_norm": 0.44994768500328064, "learning_rate": 3.0034525348782855e-05, "loss": 0.0256, "step": 1650 }, { "epoch": 6.535433070866142, "grad_norm": 0.4626915454864502, "learning_rate": 2.9439412311339175e-05, "loss": 0.039, "step": 1660 }, { "epoch": 6.574803149606299, "grad_norm": 0.6001310348510742, "learning_rate": 2.8847784367852184e-05, "loss": 0.025, "step": 1670 }, { "epoch": 6.6141732283464565, "grad_norm": 0.38421395421028137, "learning_rate": 2.8259741801397477e-05, "loss": 0.0373, "step": 1680 }, { "epoch": 6.653543307086614, "grad_norm": 0.37881389260292053, "learning_rate": 2.7675384287316363e-05, "loss": 0.034, "step": 1690 }, { "epoch": 6.692913385826771, "grad_norm": 0.43850505352020264, "learning_rate": 2.709481087632041e-05, "loss": 0.0367, "step": 1700 }, { "epoch": 6.73228346456693, "grad_norm": 0.6985974907875061, "learning_rate": 2.6518119977702e-05, "loss": 0.0364, "step": 1710 }, { "epoch": 6.771653543307087, "grad_norm": 0.4026467502117157, "learning_rate": 2.5945409342653726e-05, "loss": 0.0363, "step": 1720 }, { "epoch": 6.811023622047244, "grad_norm": 0.36351197957992554, "learning_rate": 2.5376776047698965e-05, "loss": 0.03, "step": 1730 }, { "epoch": 6.850393700787402, "grad_norm": 0.2879717946052551, "learning_rate": 2.4812316478237353e-05, "loss": 0.0309, "step": 1740 }, { "epoch": 6.889763779527559, "grad_norm": 0.6626470685005188, "learning_rate": 2.4252126312206873e-05, "loss": 0.0431, "step": 1750 }, { "epoch": 6.929133858267717, "grad_norm": 0.4203033447265625, "learning_rate": 2.3696300503866204e-05, "loss": 0.0276, "step": 1760 }, { "epoch": 6.968503937007874, "grad_norm": 0.5207045078277588, "learning_rate": 2.314493326769968e-05, "loss": 0.0294, "step": 1770 }, { "epoch": 7.0078740157480315, "grad_norm": 0.4823305308818817, "learning_rate": 2.259811806244741e-05, "loss": 0.044, "step": 1780 }, { "epoch": 7.047244094488189, "grad_norm": 0.6036306023597717, "learning_rate": 2.2055947575263912e-05, "loss": 0.0283, "step": 1790 }, { "epoch": 7.086614173228346, "grad_norm": 0.5445898175239563, "learning_rate": 2.1518513706007155e-05, "loss": 0.0299, "step": 1800 }, { "epoch": 7.125984251968504, "grad_norm": 0.44029784202575684, "learning_rate": 2.0985907551661206e-05, "loss": 0.0349, "step": 1810 }, { "epoch": 7.165354330708661, "grad_norm": 0.4250989258289337, "learning_rate": 2.0458219390895106e-05, "loss": 0.0301, "step": 1820 }, { "epoch": 7.2047244094488185, "grad_norm": 0.31232279539108276, "learning_rate": 1.9935538668760057e-05, "loss": 0.0421, "step": 1830 }, { "epoch": 7.244094488188976, "grad_norm": 0.5633496642112732, "learning_rate": 1.9417953981528424e-05, "loss": 0.03, "step": 1840 }, { "epoch": 7.283464566929134, "grad_norm": 0.7122541666030884, "learning_rate": 1.890555306167619e-05, "loss": 0.0343, "step": 1850 }, { "epoch": 7.322834645669292, "grad_norm": 0.4240683615207672, "learning_rate": 1.8398422763011985e-05, "loss": 0.0244, "step": 1860 }, { "epoch": 7.362204724409449, "grad_norm": 0.3466864824295044, "learning_rate": 1.789664904595518e-05, "loss": 0.0265, "step": 1870 }, { "epoch": 7.4015748031496065, "grad_norm": 0.7498940229415894, "learning_rate": 1.7400316962965087e-05, "loss": 0.0303, "step": 1880 }, { "epoch": 7.440944881889764, "grad_norm": 0.38159969449043274, "learning_rate": 1.6909510644124455e-05, "loss": 0.0261, "step": 1890 }, { "epoch": 7.480314960629921, "grad_norm": 0.5678858160972595, "learning_rate": 1.642431328287899e-05, "loss": 0.035, "step": 1900 }, { "epoch": 7.519685039370079, "grad_norm": 0.4290473461151123, "learning_rate": 1.594480712193579e-05, "loss": 0.026, "step": 1910 }, { "epoch": 7.559055118110236, "grad_norm": 0.5522475242614746, "learning_rate": 1.547107343932299e-05, "loss": 0.029, "step": 1920 }, { "epoch": 7.5984251968503935, "grad_norm": 0.41189444065093994, "learning_rate": 1.5003192534612675e-05, "loss": 0.0243, "step": 1930 }, { "epoch": 7.637795275590551, "grad_norm": 0.1887262910604477, "learning_rate": 1.4541243715310005e-05, "loss": 0.0255, "step": 1940 }, { "epoch": 7.677165354330708, "grad_norm": 0.5400164127349854, "learning_rate": 1.4085305283410166e-05, "loss": 0.0251, "step": 1950 }, { "epoch": 7.716535433070866, "grad_norm": 0.5268674492835999, "learning_rate": 1.3635454522125946e-05, "loss": 0.036, "step": 1960 }, { "epoch": 7.755905511811024, "grad_norm": 0.6157100200653076, "learning_rate": 1.3191767682788003e-05, "loss": 0.0266, "step": 1970 }, { "epoch": 7.7952755905511815, "grad_norm": 0.3194139301776886, "learning_rate": 1.2754319971919842e-05, "loss": 0.0243, "step": 1980 }, { "epoch": 7.834645669291339, "grad_norm": 0.37460631132125854, "learning_rate": 1.2323185538490229e-05, "loss": 0.0357, "step": 1990 }, { "epoch": 7.874015748031496, "grad_norm": 0.37989112734794617, "learning_rate": 1.1898437461344518e-05, "loss": 0.0318, "step": 2000 }, { "epoch": 7.913385826771654, "grad_norm": 0.42782655358314514, "learning_rate": 1.1480147736817598e-05, "loss": 0.0263, "step": 2010 }, { "epoch": 7.952755905511811, "grad_norm": 0.5276915431022644, "learning_rate": 1.1068387266530267e-05, "loss": 0.025, "step": 2020 }, { "epoch": 7.9921259842519685, "grad_norm": 0.6227043271064758, "learning_rate": 1.0663225845371045e-05, "loss": 0.0296, "step": 2030 }, { "epoch": 8.031496062992126, "grad_norm": 0.2772444784641266, "learning_rate": 1.026473214966584e-05, "loss": 0.0346, "step": 2040 }, { "epoch": 8.070866141732283, "grad_norm": 0.4647983908653259, "learning_rate": 9.872973725536955e-06, "loss": 0.0308, "step": 2050 }, { "epoch": 8.11023622047244, "grad_norm": 0.32798609137535095, "learning_rate": 9.488016977453807e-06, "loss": 0.0248, "step": 2060 }, { "epoch": 8.149606299212598, "grad_norm": 0.2955688238143921, "learning_rate": 9.109927156977122e-06, "loss": 0.0268, "step": 2070 }, { "epoch": 8.188976377952756, "grad_norm": 0.3243492543697357, "learning_rate": 8.738768351698574e-06, "loss": 0.0273, "step": 2080 }, { "epoch": 8.228346456692913, "grad_norm": 0.2228180319070816, "learning_rate": 8.374603474377718e-06, "loss": 0.0276, "step": 2090 }, { "epoch": 8.26771653543307, "grad_norm": 0.4943491816520691, "learning_rate": 8.017494252278019e-06, "loss": 0.0338, "step": 2100 }, { "epoch": 8.307086614173228, "grad_norm": 0.26574063301086426, "learning_rate": 7.667501216703849e-06, "loss": 0.0286, "step": 2110 }, { "epoch": 8.346456692913385, "grad_norm": 0.561998188495636, "learning_rate": 7.324683692740259e-06, "loss": 0.0302, "step": 2120 }, { "epoch": 8.385826771653543, "grad_norm": 0.5270190238952637, "learning_rate": 6.989099789197112e-06, "loss": 0.0309, "step": 2130 }, { "epoch": 8.4251968503937, "grad_norm": 0.49484336376190186, "learning_rate": 6.660806388759505e-06, "loss": 0.0255, "step": 2140 }, { "epoch": 8.464566929133857, "grad_norm": 0.5083401799201965, "learning_rate": 6.339859138345838e-06, "loss": 0.0253, "step": 2150 }, { "epoch": 8.503937007874015, "grad_norm": 0.34489235281944275, "learning_rate": 6.026312439675552e-06, "loss": 0.0216, "step": 2160 }, { "epoch": 8.543307086614174, "grad_norm": 0.41160085797309875, "learning_rate": 5.720219440047797e-06, "loss": 0.0168, "step": 2170 }, { "epoch": 8.582677165354331, "grad_norm": 0.2923263609409332, "learning_rate": 5.421632023332779e-06, "loss": 0.0264, "step": 2180 }, { "epoch": 8.622047244094489, "grad_norm": 0.522784411907196, "learning_rate": 5.130600801177294e-06, "loss": 0.0222, "step": 2190 }, { "epoch": 8.661417322834646, "grad_norm": 0.413273423910141, "learning_rate": 4.8471751044257995e-06, "loss": 0.0312, "step": 2200 }, { "epoch": 8.700787401574804, "grad_norm": 0.4753558039665222, "learning_rate": 4.571402974758715e-06, "loss": 0.0285, "step": 2210 }, { "epoch": 8.740157480314961, "grad_norm": 0.36735445261001587, "learning_rate": 4.303331156549162e-06, "loss": 0.0303, "step": 2220 }, { "epoch": 8.779527559055119, "grad_norm": 0.3742741048336029, "learning_rate": 4.043005088939616e-06, "loss": 0.0166, "step": 2230 }, { "epoch": 8.818897637795276, "grad_norm": 0.2975374460220337, "learning_rate": 3.7904688981398485e-06, "loss": 0.0233, "step": 2240 }, { "epoch": 8.858267716535433, "grad_norm": 0.4250410497188568, "learning_rate": 3.5457653899473197e-06, "loss": 0.0343, "step": 2250 }, { "epoch": 8.89763779527559, "grad_norm": 0.41226938366889954, "learning_rate": 3.3089360424914674e-06, "loss": 0.0329, "step": 2260 }, { "epoch": 8.937007874015748, "grad_norm": 0.18084678053855896, "learning_rate": 3.080020999203026e-06, "loss": 0.0195, "step": 2270 }, { "epoch": 8.976377952755906, "grad_norm": 0.29392436146736145, "learning_rate": 2.8590590620095336e-06, "loss": 0.0257, "step": 2280 }, { "epoch": 9.015748031496063, "grad_norm": 0.2687474489212036, "learning_rate": 2.646087684758325e-06, "loss": 0.0274, "step": 2290 }, { "epoch": 9.05511811023622, "grad_norm": 0.1699579358100891, "learning_rate": 2.4411429668679043e-06, "loss": 0.0217, "step": 2300 }, { "epoch": 9.094488188976378, "grad_norm": 0.2965308725833893, "learning_rate": 2.2442596472089907e-06, "loss": 0.0266, "step": 2310 }, { "epoch": 9.133858267716535, "grad_norm": 0.29086264967918396, "learning_rate": 2.0554710982161607e-06, "loss": 0.0202, "step": 2320 }, { "epoch": 9.173228346456693, "grad_norm": 0.525842010974884, "learning_rate": 1.8748093202311078e-06, "loss": 0.0252, "step": 2330 }, { "epoch": 9.21259842519685, "grad_norm": 0.23323991894721985, "learning_rate": 1.7023049360784193e-06, "loss": 0.0257, "step": 2340 }, { "epoch": 9.251968503937007, "grad_norm": 0.3511553704738617, "learning_rate": 1.5379871858749784e-06, "loss": 0.0341, "step": 2350 }, { "epoch": 9.291338582677165, "grad_norm": 0.2637965977191925, "learning_rate": 1.3818839220735792e-06, "loss": 0.0207, "step": 2360 }, { "epoch": 9.330708661417322, "grad_norm": 0.212178036570549, "learning_rate": 1.2340216047418695e-06, "loss": 0.0207, "step": 2370 }, { "epoch": 9.37007874015748, "grad_norm": 0.38010916113853455, "learning_rate": 1.094425297077295e-06, "loss": 0.0239, "step": 2380 }, { "epoch": 9.409448818897637, "grad_norm": 0.5460922718048096, "learning_rate": 9.631186611587405e-07, "loss": 0.0216, "step": 2390 }, { "epoch": 9.448818897637794, "grad_norm": 0.3007522523403168, "learning_rate": 8.401239539358008e-07, "loss": 0.0301, "step": 2400 }, { "epoch": 9.488188976377952, "grad_norm": 0.279300719499588, "learning_rate": 7.254620234560583e-07, "loss": 0.0313, "step": 2410 }, { "epoch": 9.527559055118111, "grad_norm": 0.6319423913955688, "learning_rate": 6.191523053313386e-07, "loss": 0.0298, "step": 2420 }, { "epoch": 9.566929133858268, "grad_norm": 0.5479516983032227, "learning_rate": 5.212128194432509e-07, "loss": 0.0222, "step": 2430 }, { "epoch": 9.606299212598426, "grad_norm": 0.44871729612350464, "learning_rate": 4.3166016688879205e-07, "loss": 0.021, "step": 2440 }, { "epoch": 9.645669291338583, "grad_norm": 0.5158320665359497, "learning_rate": 3.505095271663705e-07, "loss": 0.0174, "step": 2450 }, { "epoch": 9.68503937007874, "grad_norm": 0.3390265107154846, "learning_rate": 2.7777465560285265e-07, "loss": 0.0224, "step": 2460 }, { "epoch": 9.724409448818898, "grad_norm": 0.39901813864707947, "learning_rate": 2.1346788102196148e-07, "loss": 0.0181, "step": 2470 }, { "epoch": 9.763779527559056, "grad_norm": 0.4687666893005371, "learning_rate": 1.5760010365450938e-07, "loss": 0.0213, "step": 2480 }, { "epoch": 9.803149606299213, "grad_norm": 0.5264361500740051, "learning_rate": 1.1018079329076503e-07, "loss": 0.0249, "step": 2490 }, { "epoch": 9.84251968503937, "grad_norm": 0.5656106472015381, "learning_rate": 7.121798767530385e-08, "loss": 0.0217, "step": 2500 }, { "epoch": 9.881889763779528, "grad_norm": 0.3701815903186798, "learning_rate": 4.071829114455361e-08, "loss": 0.0198, "step": 2510 }, { "epoch": 9.921259842519685, "grad_norm": 0.2982766628265381, "learning_rate": 1.868687350736198e-08, "loss": 0.0219, "step": 2520 }, { "epoch": 9.960629921259843, "grad_norm": 0.3252989947795868, "learning_rate": 5.1274691686697965e-09, "loss": 0.021, "step": 2530 }, { "epoch": 10.0, "grad_norm": 0.5611906051635742, "learning_rate": 4.23764965562512e-11, "loss": 0.0357, "step": 2540 }, { "epoch": 10.0, "step": 2540, "total_flos": 0.0, "train_loss": 0.06096198053106548, "train_runtime": 2181.3746, "train_samples_per_second": 31.434, "train_steps_per_second": 1.164 } ], "logging_steps": 10, "max_steps": 2540, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 27, "trial_name": null, "trial_params": null }