{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10252758988964045, "eval_steps": 500, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005695977216091135, "grad_norm": 21.049573670903968, "learning_rate": 3.4155597722960153e-09, "loss": 0.4919, "step": 10 }, { "epoch": 0.001139195443218227, "grad_norm": 20.80684714819707, "learning_rate": 7.210626185958254e-09, "loss": 0.4437, "step": 20 }, { "epoch": 0.0017087931648273407, "grad_norm": 33.130290630821996, "learning_rate": 1.1005692599620494e-08, "loss": 0.5198, "step": 30 }, { "epoch": 0.002278390886436454, "grad_norm": 12.510945540397115, "learning_rate": 1.4800759013282731e-08, "loss": 0.4968, "step": 40 }, { "epoch": 0.002847988608045568, "grad_norm": 15.943493472335307, "learning_rate": 1.859582542694497e-08, "loss": 0.4409, "step": 50 }, { "epoch": 0.0034175863296546814, "grad_norm": 42.406097387194166, "learning_rate": 2.239089184060721e-08, "loss": 0.4684, "step": 60 }, { "epoch": 0.003987184051263795, "grad_norm": 24.406933558360457, "learning_rate": 2.6185958254269448e-08, "loss": 0.478, "step": 70 }, { "epoch": 0.004556781772872908, "grad_norm": 29.842569661833632, "learning_rate": 2.9981024667931685e-08, "loss": 0.4511, "step": 80 }, { "epoch": 0.005126379494482022, "grad_norm": 42.41367434650834, "learning_rate": 3.3776091081593926e-08, "loss": 0.4326, "step": 90 }, { "epoch": 0.005695977216091136, "grad_norm": 19.885296598548557, "learning_rate": 3.757115749525617e-08, "loss": 0.4347, "step": 100 }, { "epoch": 0.006265574937700249, "grad_norm": 25.71810252475294, "learning_rate": 4.13662239089184e-08, "loss": 0.4735, "step": 110 }, { "epoch": 0.006835172659309363, "grad_norm": 30.792769140339338, "learning_rate": 4.516129032258064e-08, "loss": 0.4933, "step": 120 }, { "epoch": 0.007404770380918476, "grad_norm": 16.942087666916184, "learning_rate": 4.8956356736242883e-08, "loss": 0.508, "step": 130 }, { "epoch": 0.00797436810252759, "grad_norm": 13.25121169703299, "learning_rate": 5.275142314990512e-08, "loss": 0.4854, "step": 140 }, { "epoch": 0.008543965824136704, "grad_norm": 26.66597693714079, "learning_rate": 5.654648956356736e-08, "loss": 0.4939, "step": 150 }, { "epoch": 0.009113563545745816, "grad_norm": 25.56128647346689, "learning_rate": 6.03415559772296e-08, "loss": 0.4886, "step": 160 }, { "epoch": 0.00968316126735493, "grad_norm": 24.66360202992372, "learning_rate": 6.413662239089184e-08, "loss": 0.5195, "step": 170 }, { "epoch": 0.010252758988964043, "grad_norm": 29.34378737951036, "learning_rate": 6.793168880455408e-08, "loss": 0.4434, "step": 180 }, { "epoch": 0.010822356710573158, "grad_norm": 56.85479253251122, "learning_rate": 7.172675521821632e-08, "loss": 0.421, "step": 190 }, { "epoch": 0.011391954432182272, "grad_norm": 32.3202020199265, "learning_rate": 7.552182163187856e-08, "loss": 0.447, "step": 200 }, { "epoch": 0.011961552153791384, "grad_norm": 30.909848483794523, "learning_rate": 7.931688804554079e-08, "loss": 0.4677, "step": 210 }, { "epoch": 0.012531149875400499, "grad_norm": 25.632205852181723, "learning_rate": 8.311195445920303e-08, "loss": 0.4739, "step": 220 }, { "epoch": 0.013100747597009611, "grad_norm": 17.49619627631964, "learning_rate": 8.690702087286526e-08, "loss": 0.4753, "step": 230 }, { "epoch": 0.013670345318618726, "grad_norm": 33.64843713209131, "learning_rate": 9.07020872865275e-08, "loss": 0.485, "step": 240 }, { "epoch": 0.01423994304022784, "grad_norm": 34.99600769438769, "learning_rate": 9.449715370018974e-08, "loss": 0.4711, "step": 250 }, { "epoch": 0.014809540761836952, "grad_norm": 29.524926487531715, "learning_rate": 9.829222011385198e-08, "loss": 0.4665, "step": 260 }, { "epoch": 0.015379138483446067, "grad_norm": 13.568503433545974, "learning_rate": 1.0208728652751421e-07, "loss": 0.4457, "step": 270 }, { "epoch": 0.01594873620505518, "grad_norm": 24.811742737462957, "learning_rate": 1.0588235294117647e-07, "loss": 0.4924, "step": 280 }, { "epoch": 0.016518333926664294, "grad_norm": 14.269729917230158, "learning_rate": 1.0967741935483869e-07, "loss": 0.4603, "step": 290 }, { "epoch": 0.017087931648273408, "grad_norm": 16.274688413588088, "learning_rate": 1.1347248576850095e-07, "loss": 0.5013, "step": 300 }, { "epoch": 0.017657529369882522, "grad_norm": 25.477411052433148, "learning_rate": 1.1726755218216317e-07, "loss": 0.4705, "step": 310 }, { "epoch": 0.018227127091491633, "grad_norm": 19.70072062267979, "learning_rate": 1.2106261859582542e-07, "loss": 0.4611, "step": 320 }, { "epoch": 0.018796724813100747, "grad_norm": 13.857865555631838, "learning_rate": 1.2485768500948766e-07, "loss": 0.4363, "step": 330 }, { "epoch": 0.01936632253470986, "grad_norm": 36.710229582900936, "learning_rate": 1.286527514231499e-07, "loss": 0.4937, "step": 340 }, { "epoch": 0.019935920256318976, "grad_norm": 16.21485980059241, "learning_rate": 1.3244781783681214e-07, "loss": 0.4843, "step": 350 }, { "epoch": 0.020505517977928087, "grad_norm": 18.51035631765946, "learning_rate": 1.3624288425047438e-07, "loss": 0.4953, "step": 360 }, { "epoch": 0.0210751156995372, "grad_norm": 109.65864479439105, "learning_rate": 1.4003795066413662e-07, "loss": 0.4452, "step": 370 }, { "epoch": 0.021644713421146315, "grad_norm": 26.812069103996944, "learning_rate": 1.4383301707779884e-07, "loss": 0.4745, "step": 380 }, { "epoch": 0.02221431114275543, "grad_norm": 750.501418758497, "learning_rate": 1.476280834914611e-07, "loss": 0.4178, "step": 390 }, { "epoch": 0.022783908864364544, "grad_norm": 12.593443551819714, "learning_rate": 1.5142314990512332e-07, "loss": 0.4774, "step": 400 }, { "epoch": 0.023353506585973655, "grad_norm": 41.1730678527844, "learning_rate": 1.5521821631878558e-07, "loss": 0.5109, "step": 410 }, { "epoch": 0.02392310430758277, "grad_norm": 16.51074339271402, "learning_rate": 1.590132827324478e-07, "loss": 0.4692, "step": 420 }, { "epoch": 0.024492702029191883, "grad_norm": 11.523289642261012, "learning_rate": 1.6280834914611007e-07, "loss": 0.417, "step": 430 }, { "epoch": 0.025062299750800997, "grad_norm": 21.107219252953012, "learning_rate": 1.6660341555977228e-07, "loss": 0.3798, "step": 440 }, { "epoch": 0.025631897472410112, "grad_norm": 8.312935431045693, "learning_rate": 1.7039848197343455e-07, "loss": 0.4586, "step": 450 }, { "epoch": 0.026201495194019223, "grad_norm": 17.659714103179056, "learning_rate": 1.7419354838709676e-07, "loss": 0.4637, "step": 460 }, { "epoch": 0.026771092915628337, "grad_norm": 35.28046535756356, "learning_rate": 1.77988614800759e-07, "loss": 0.4591, "step": 470 }, { "epoch": 0.02734069063723745, "grad_norm": 17.475339931922075, "learning_rate": 1.8178368121442124e-07, "loss": 0.4884, "step": 480 }, { "epoch": 0.027910288358846565, "grad_norm": 12.775420545423671, "learning_rate": 1.8557874762808349e-07, "loss": 0.4355, "step": 490 }, { "epoch": 0.02847988608045568, "grad_norm": 22.58921652660408, "learning_rate": 1.8937381404174573e-07, "loss": 0.5028, "step": 500 }, { "epoch": 0.02904948380206479, "grad_norm": 21.649711297403396, "learning_rate": 1.9316888045540797e-07, "loss": 0.4595, "step": 510 }, { "epoch": 0.029619081523673905, "grad_norm": 61.88775690696106, "learning_rate": 1.969639468690702e-07, "loss": 0.4419, "step": 520 }, { "epoch": 0.03018867924528302, "grad_norm": 18.72064581371778, "learning_rate": 1.9999999319386685e-07, "loss": 0.4346, "step": 530 }, { "epoch": 0.030758276966892133, "grad_norm": 20.753732350716753, "learning_rate": 1.9999975497930434e-07, "loss": 0.4233, "step": 540 }, { "epoch": 0.03132787468850125, "grad_norm": 50.13806900167064, "learning_rate": 1.999991764590115e-07, "loss": 0.4471, "step": 550 }, { "epoch": 0.03189747241011036, "grad_norm": 15.313720228827242, "learning_rate": 1.99998257634957e-07, "loss": 0.4284, "step": 560 }, { "epoch": 0.032467070131719476, "grad_norm": 13.936702397387482, "learning_rate": 1.999969985102677e-07, "loss": 0.4901, "step": 570 }, { "epoch": 0.03303666785332859, "grad_norm": 13.608501418916317, "learning_rate": 1.9999539908922847e-07, "loss": 0.4853, "step": 580 }, { "epoch": 0.0336062655749377, "grad_norm": 15.107322868355896, "learning_rate": 1.9999345937728225e-07, "loss": 0.4361, "step": 590 }, { "epoch": 0.034175863296546816, "grad_norm": 32.40056901599672, "learning_rate": 1.9999117938103e-07, "loss": 0.5263, "step": 600 }, { "epoch": 0.034745461018155926, "grad_norm": 9.964058712960675, "learning_rate": 1.9998855910823074e-07, "loss": 0.4443, "step": 610 }, { "epoch": 0.035315058739765044, "grad_norm": 22.67319149302576, "learning_rate": 1.999855985678014e-07, "loss": 0.453, "step": 620 }, { "epoch": 0.035884656461374155, "grad_norm": 32.08276760500707, "learning_rate": 1.9998229776981686e-07, "loss": 0.4704, "step": 630 }, { "epoch": 0.036454254182983266, "grad_norm": 35.57871112406775, "learning_rate": 1.9997865672551e-07, "loss": 0.4567, "step": 640 }, { "epoch": 0.037023851904592384, "grad_norm": 30.629691553067705, "learning_rate": 1.9997467544727151e-07, "loss": 0.4657, "step": 650 }, { "epoch": 0.037593449626201494, "grad_norm": 16.79894739182948, "learning_rate": 1.9997035394864997e-07, "loss": 0.4889, "step": 660 }, { "epoch": 0.03816304734781061, "grad_norm": 24.064206535589015, "learning_rate": 1.9996569224435164e-07, "loss": 0.4541, "step": 670 }, { "epoch": 0.03873264506941972, "grad_norm": 15.092611591816881, "learning_rate": 1.9996069035024073e-07, "loss": 0.4437, "step": 680 }, { "epoch": 0.039302242791028834, "grad_norm": 10.567345786993243, "learning_rate": 1.9995534828333894e-07, "loss": 0.4567, "step": 690 }, { "epoch": 0.03987184051263795, "grad_norm": 15.772143297135706, "learning_rate": 1.9994966606182567e-07, "loss": 0.4429, "step": 700 }, { "epoch": 0.04044143823424706, "grad_norm": 23.454081623401972, "learning_rate": 1.9994364370503791e-07, "loss": 0.4874, "step": 710 }, { "epoch": 0.04101103595585617, "grad_norm": 25.771274144252004, "learning_rate": 1.9993728123347014e-07, "loss": 0.4804, "step": 720 }, { "epoch": 0.04158063367746529, "grad_norm": 38.49240584512041, "learning_rate": 1.9993057866877422e-07, "loss": 0.4851, "step": 730 }, { "epoch": 0.0421502313990744, "grad_norm": 14.490580753823508, "learning_rate": 1.999235360337595e-07, "loss": 0.5076, "step": 740 }, { "epoch": 0.04271982912068352, "grad_norm": 73.59426722231183, "learning_rate": 1.999161533523925e-07, "loss": 0.4716, "step": 750 }, { "epoch": 0.04328942684229263, "grad_norm": 85.76885571174373, "learning_rate": 1.9990843064979692e-07, "loss": 0.4993, "step": 760 }, { "epoch": 0.04385902456390174, "grad_norm": 14.903196926933902, "learning_rate": 1.999003679522537e-07, "loss": 0.4791, "step": 770 }, { "epoch": 0.04442862228551086, "grad_norm": 18.928061501180686, "learning_rate": 1.9989196528720064e-07, "loss": 0.4483, "step": 780 }, { "epoch": 0.04499822000711997, "grad_norm": 19.436313523994027, "learning_rate": 1.9988322268323266e-07, "loss": 0.4549, "step": 790 }, { "epoch": 0.04556781772872909, "grad_norm": 38.861669459737385, "learning_rate": 1.9987414017010133e-07, "loss": 0.4298, "step": 800 }, { "epoch": 0.0461374154503382, "grad_norm": 24.571115668775043, "learning_rate": 1.998647177787151e-07, "loss": 0.4558, "step": 810 }, { "epoch": 0.04670701317194731, "grad_norm": 19.706898075912328, "learning_rate": 1.9985495554113894e-07, "loss": 0.4366, "step": 820 }, { "epoch": 0.04727661089355643, "grad_norm": 33.9617977848147, "learning_rate": 1.998448534905944e-07, "loss": 0.4577, "step": 830 }, { "epoch": 0.04784620861516554, "grad_norm": 42.96396193732204, "learning_rate": 1.9983441166145946e-07, "loss": 0.4851, "step": 840 }, { "epoch": 0.048415806336774656, "grad_norm": 21.29503243417614, "learning_rate": 1.998236300892683e-07, "loss": 0.4807, "step": 850 }, { "epoch": 0.048985404058383766, "grad_norm": 31.984418484238574, "learning_rate": 1.9981250881071133e-07, "loss": 0.4281, "step": 860 }, { "epoch": 0.04955500177999288, "grad_norm": 23.76695346707812, "learning_rate": 1.9980104786363502e-07, "loss": 0.4753, "step": 870 }, { "epoch": 0.050124599501601995, "grad_norm": 51.49266265495552, "learning_rate": 1.9978924728704166e-07, "loss": 0.4453, "step": 880 }, { "epoch": 0.050694197223211106, "grad_norm": 57.45228883254441, "learning_rate": 1.997771071210895e-07, "loss": 0.4547, "step": 890 }, { "epoch": 0.051263794944820223, "grad_norm": 582.5163847711418, "learning_rate": 1.997646274070922e-07, "loss": 0.4011, "step": 900 }, { "epoch": 0.051833392666429334, "grad_norm": 12.943726869700884, "learning_rate": 1.9975180818751908e-07, "loss": 0.4156, "step": 910 }, { "epoch": 0.052402990388038445, "grad_norm": 15.975864494882453, "learning_rate": 1.997386495059948e-07, "loss": 0.3637, "step": 920 }, { "epoch": 0.05297258810964756, "grad_norm": 20.386395091429563, "learning_rate": 1.9972515140729928e-07, "loss": 0.4506, "step": 930 }, { "epoch": 0.053542185831256674, "grad_norm": 16.014612208992087, "learning_rate": 1.9971131393736732e-07, "loss": 0.4581, "step": 940 }, { "epoch": 0.05411178355286579, "grad_norm": 13.034905667217853, "learning_rate": 1.996971371432888e-07, "loss": 0.4288, "step": 950 }, { "epoch": 0.0546813812744749, "grad_norm": 13.755268105445708, "learning_rate": 1.996826210733083e-07, "loss": 0.4704, "step": 960 }, { "epoch": 0.05525097899608401, "grad_norm": 14.709572404543191, "learning_rate": 1.9966776577682488e-07, "loss": 0.4514, "step": 970 }, { "epoch": 0.05582057671769313, "grad_norm": 267.85069242584797, "learning_rate": 1.9965257130439217e-07, "loss": 0.4667, "step": 980 }, { "epoch": 0.05639017443930224, "grad_norm": 36.2529434259595, "learning_rate": 1.9963703770771795e-07, "loss": 0.4929, "step": 990 }, { "epoch": 0.05695977216091136, "grad_norm": 138.25374119669067, "learning_rate": 1.99621165039664e-07, "loss": 0.4887, "step": 1000 }, { "epoch": 0.05752936988252047, "grad_norm": 37.65440236342901, "learning_rate": 1.9960495335424615e-07, "loss": 0.4702, "step": 1010 }, { "epoch": 0.05809896760412958, "grad_norm": 20.49543757338681, "learning_rate": 1.9958840270663377e-07, "loss": 0.4421, "step": 1020 }, { "epoch": 0.0586685653257387, "grad_norm": 135.86209503027462, "learning_rate": 1.9957151315314977e-07, "loss": 0.4665, "step": 1030 }, { "epoch": 0.05923816304734781, "grad_norm": 16.112172807163798, "learning_rate": 1.9955428475127049e-07, "loss": 0.4627, "step": 1040 }, { "epoch": 0.05980776076895693, "grad_norm": 20.582916516737605, "learning_rate": 1.9953671755962525e-07, "loss": 0.4728, "step": 1050 }, { "epoch": 0.06037735849056604, "grad_norm": 60.603879240132336, "learning_rate": 1.995188116379964e-07, "loss": 0.4804, "step": 1060 }, { "epoch": 0.06094695621217515, "grad_norm": 31.78120339860008, "learning_rate": 1.995005670473189e-07, "loss": 0.4671, "step": 1070 }, { "epoch": 0.06151655393378427, "grad_norm": 24.087070640843212, "learning_rate": 1.9948198384968038e-07, "loss": 0.4299, "step": 1080 }, { "epoch": 0.06208615165539338, "grad_norm": 78.72280950118618, "learning_rate": 1.994630621083206e-07, "loss": 0.4402, "step": 1090 }, { "epoch": 0.0626557493770025, "grad_norm": 26.03024474381767, "learning_rate": 1.9944380188763154e-07, "loss": 0.4909, "step": 1100 }, { "epoch": 0.0632253470986116, "grad_norm": 19.38210538566336, "learning_rate": 1.99424203253157e-07, "loss": 0.4324, "step": 1110 }, { "epoch": 0.06379494482022072, "grad_norm": 96.14283327480236, "learning_rate": 1.9940426627159237e-07, "loss": 0.3688, "step": 1120 }, { "epoch": 0.06436454254182983, "grad_norm": 31.026454060990613, "learning_rate": 1.9938399101078453e-07, "loss": 0.5005, "step": 1130 }, { "epoch": 0.06493414026343895, "grad_norm": 12.493715605004533, "learning_rate": 1.9936337753973154e-07, "loss": 0.4489, "step": 1140 }, { "epoch": 0.06550373798504806, "grad_norm": 12.701919393018652, "learning_rate": 1.9934242592858236e-07, "loss": 0.4142, "step": 1150 }, { "epoch": 0.06607333570665717, "grad_norm": 28.82714425455963, "learning_rate": 1.9932113624863676e-07, "loss": 0.4932, "step": 1160 }, { "epoch": 0.06664293342826628, "grad_norm": 37.0311582587264, "learning_rate": 1.9929950857234485e-07, "loss": 0.4971, "step": 1170 }, { "epoch": 0.0672125311498754, "grad_norm": 117.98131014467332, "learning_rate": 1.9927754297330708e-07, "loss": 0.4852, "step": 1180 }, { "epoch": 0.06778212887148452, "grad_norm": 9.731916201335425, "learning_rate": 1.9925523952627379e-07, "loss": 0.4253, "step": 1190 }, { "epoch": 0.06835172659309363, "grad_norm": 38.263236761476804, "learning_rate": 1.992325983071451e-07, "loss": 0.4321, "step": 1200 }, { "epoch": 0.06892132431470274, "grad_norm": 15.381655794971056, "learning_rate": 1.9920961939297058e-07, "loss": 0.441, "step": 1210 }, { "epoch": 0.06949092203631185, "grad_norm": 15.428597063217957, "learning_rate": 1.991863028619489e-07, "loss": 0.4968, "step": 1220 }, { "epoch": 0.07006051975792096, "grad_norm": 45.071961716711186, "learning_rate": 1.9916264879342785e-07, "loss": 0.4305, "step": 1230 }, { "epoch": 0.07063011747953009, "grad_norm": 27.232374368700274, "learning_rate": 1.9913865726790373e-07, "loss": 0.4366, "step": 1240 }, { "epoch": 0.0711997152011392, "grad_norm": 21.18563628403933, "learning_rate": 1.9911432836702127e-07, "loss": 0.4833, "step": 1250 }, { "epoch": 0.07176931292274831, "grad_norm": 13.543561740630265, "learning_rate": 1.990896621735733e-07, "loss": 0.4487, "step": 1260 }, { "epoch": 0.07233891064435742, "grad_norm": 86.58996123562757, "learning_rate": 1.9906465877150058e-07, "loss": 0.4773, "step": 1270 }, { "epoch": 0.07290850836596653, "grad_norm": 14.872934684284921, "learning_rate": 1.9903931824589123e-07, "loss": 0.4457, "step": 1280 }, { "epoch": 0.07347810608757566, "grad_norm": 33.14051909456766, "learning_rate": 1.9901364068298077e-07, "loss": 0.4897, "step": 1290 }, { "epoch": 0.07404770380918477, "grad_norm": 16.442246412948936, "learning_rate": 1.989876261701516e-07, "loss": 0.4477, "step": 1300 }, { "epoch": 0.07461730153079388, "grad_norm": 15.10709676956974, "learning_rate": 1.9896127479593287e-07, "loss": 0.542, "step": 1310 }, { "epoch": 0.07518689925240299, "grad_norm": 63.44702816378984, "learning_rate": 1.9893458665000002e-07, "loss": 0.4563, "step": 1320 }, { "epoch": 0.0757564969740121, "grad_norm": 11.730796276591832, "learning_rate": 1.9890756182317454e-07, "loss": 0.4634, "step": 1330 }, { "epoch": 0.07632609469562122, "grad_norm": 27.47043509263349, "learning_rate": 1.9888020040742367e-07, "loss": 0.5072, "step": 1340 }, { "epoch": 0.07689569241723034, "grad_norm": 52.27539617258087, "learning_rate": 1.9885250249586014e-07, "loss": 0.4645, "step": 1350 }, { "epoch": 0.07746529013883945, "grad_norm": 12.68105188761479, "learning_rate": 1.9882446818274176e-07, "loss": 0.4396, "step": 1360 }, { "epoch": 0.07803488786044856, "grad_norm": 21.818578525864865, "learning_rate": 1.9879609756347113e-07, "loss": 0.4814, "step": 1370 }, { "epoch": 0.07860448558205767, "grad_norm": 58.13255180180692, "learning_rate": 1.987673907345953e-07, "loss": 0.4867, "step": 1380 }, { "epoch": 0.07917408330366678, "grad_norm": 14.770491903827162, "learning_rate": 1.9873834779380556e-07, "loss": 0.4893, "step": 1390 }, { "epoch": 0.0797436810252759, "grad_norm": 16.635013592634976, "learning_rate": 1.987089688399369e-07, "loss": 0.4478, "step": 1400 }, { "epoch": 0.08031327874688501, "grad_norm": 17.65859576674858, "learning_rate": 1.9867925397296784e-07, "loss": 0.4944, "step": 1410 }, { "epoch": 0.08088287646849412, "grad_norm": 86.88472923093384, "learning_rate": 1.9864920329402e-07, "loss": 0.4616, "step": 1420 }, { "epoch": 0.08145247419010324, "grad_norm": 25.736980272449554, "learning_rate": 1.9861881690535784e-07, "loss": 0.4348, "step": 1430 }, { "epoch": 0.08202207191171235, "grad_norm": 21.180899350268074, "learning_rate": 1.9858809491038823e-07, "loss": 0.4156, "step": 1440 }, { "epoch": 0.08259166963332147, "grad_norm": 94.81320893975935, "learning_rate": 1.9855703741366013e-07, "loss": 0.4449, "step": 1450 }, { "epoch": 0.08316126735493058, "grad_norm": 15.599193486437276, "learning_rate": 1.9852564452086424e-07, "loss": 0.4869, "step": 1460 }, { "epoch": 0.08373086507653969, "grad_norm": 20.953628168685903, "learning_rate": 1.9849391633883262e-07, "loss": 0.4373, "step": 1470 }, { "epoch": 0.0843004627981488, "grad_norm": 38.52912326460994, "learning_rate": 1.9846185297553842e-07, "loss": 0.4605, "step": 1480 }, { "epoch": 0.08487006051975791, "grad_norm": 22.139614672186013, "learning_rate": 1.9842945454009527e-07, "loss": 0.4333, "step": 1490 }, { "epoch": 0.08543965824136704, "grad_norm": 15.268704177226823, "learning_rate": 1.9839672114275726e-07, "loss": 0.4497, "step": 1500 }, { "epoch": 0.08600925596297615, "grad_norm": 29.168893226415655, "learning_rate": 1.9836365289491823e-07, "loss": 0.4613, "step": 1510 }, { "epoch": 0.08657885368458526, "grad_norm": 21.606021719728066, "learning_rate": 1.9833024990911165e-07, "loss": 0.4617, "step": 1520 }, { "epoch": 0.08714845140619437, "grad_norm": 32.88438245448298, "learning_rate": 1.9829651229901004e-07, "loss": 0.4377, "step": 1530 }, { "epoch": 0.08771804912780348, "grad_norm": 14.247154488970322, "learning_rate": 1.9826244017942467e-07, "loss": 0.4355, "step": 1540 }, { "epoch": 0.08828764684941261, "grad_norm": 20.872950619341626, "learning_rate": 1.9822803366630527e-07, "loss": 0.4497, "step": 1550 }, { "epoch": 0.08885724457102172, "grad_norm": 17.850902788048966, "learning_rate": 1.9819329287673946e-07, "loss": 0.5261, "step": 1560 }, { "epoch": 0.08942684229263083, "grad_norm": 16.138046631977943, "learning_rate": 1.9815821792895235e-07, "loss": 0.4969, "step": 1570 }, { "epoch": 0.08999644001423994, "grad_norm": 19.458881544939857, "learning_rate": 1.9812280894230636e-07, "loss": 0.4784, "step": 1580 }, { "epoch": 0.09056603773584905, "grad_norm": 17.653853307895435, "learning_rate": 1.9808706603730057e-07, "loss": 0.4463, "step": 1590 }, { "epoch": 0.09113563545745818, "grad_norm": 15.623914718581604, "learning_rate": 1.9805098933557044e-07, "loss": 0.4121, "step": 1600 }, { "epoch": 0.09170523317906729, "grad_norm": 102.23831398557063, "learning_rate": 1.9801457895988732e-07, "loss": 0.4202, "step": 1610 }, { "epoch": 0.0922748309006764, "grad_norm": 18.72931059613502, "learning_rate": 1.9797783503415818e-07, "loss": 0.4243, "step": 1620 }, { "epoch": 0.09284442862228551, "grad_norm": 237.94062671792636, "learning_rate": 1.9794075768342494e-07, "loss": 0.4527, "step": 1630 }, { "epoch": 0.09341402634389462, "grad_norm": 20.831268870843843, "learning_rate": 1.9790334703386428e-07, "loss": 0.4888, "step": 1640 }, { "epoch": 0.09398362406550374, "grad_norm": 20.77514897368684, "learning_rate": 1.9786560321278714e-07, "loss": 0.4432, "step": 1650 }, { "epoch": 0.09455322178711285, "grad_norm": 30.08308934914739, "learning_rate": 1.9782752634863814e-07, "loss": 0.4323, "step": 1660 }, { "epoch": 0.09512281950872196, "grad_norm": 34.03857180920022, "learning_rate": 1.9778911657099544e-07, "loss": 0.4347, "step": 1670 }, { "epoch": 0.09569241723033108, "grad_norm": 41.38196703392111, "learning_rate": 1.9775037401056998e-07, "loss": 0.48, "step": 1680 }, { "epoch": 0.09626201495194019, "grad_norm": 26.728838791837614, "learning_rate": 1.9771129879920522e-07, "loss": 0.4541, "step": 1690 }, { "epoch": 0.09683161267354931, "grad_norm": 17.319090510690177, "learning_rate": 1.976718910698767e-07, "loss": 0.47, "step": 1700 }, { "epoch": 0.09740121039515842, "grad_norm": 24.927565311503443, "learning_rate": 1.9763215095669147e-07, "loss": 0.4775, "step": 1710 }, { "epoch": 0.09797080811676753, "grad_norm": 35.5217833486329, "learning_rate": 1.9759207859488781e-07, "loss": 0.4387, "step": 1720 }, { "epoch": 0.09854040583837664, "grad_norm": 12.75295290738318, "learning_rate": 1.975516741208346e-07, "loss": 0.4733, "step": 1730 }, { "epoch": 0.09911000355998575, "grad_norm": 23.901764027051986, "learning_rate": 1.9751093767203084e-07, "loss": 0.4549, "step": 1740 }, { "epoch": 0.09967960128159488, "grad_norm": 16.507601695686727, "learning_rate": 1.974698693871054e-07, "loss": 0.4483, "step": 1750 }, { "epoch": 0.10024919900320399, "grad_norm": 20.096118428482878, "learning_rate": 1.9742846940581628e-07, "loss": 0.4363, "step": 1760 }, { "epoch": 0.1008187967248131, "grad_norm": 15.396477204440362, "learning_rate": 1.9738673786905045e-07, "loss": 0.4871, "step": 1770 }, { "epoch": 0.10138839444642221, "grad_norm": 42.5121633336015, "learning_rate": 1.9734467491882297e-07, "loss": 0.5204, "step": 1780 }, { "epoch": 0.10195799216803132, "grad_norm": 28.227316949939553, "learning_rate": 1.9730228069827685e-07, "loss": 0.4862, "step": 1790 }, { "epoch": 0.10252758988964045, "grad_norm": 24.685923174663778, "learning_rate": 1.972595553516824e-07, "loss": 0.4545, "step": 1800 } ], "logging_steps": 10, "max_steps": 17557, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 571832082890752.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }