{ "best_metric": 0.943651823029255, "best_model_checkpoint": "./phobert_results/checkpoint-12145", "epoch": 10.0, "eval_steps": 500, "global_step": 17350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02881844380403458, "grad_norm": 4.970236301422119, "learning_rate": 1.994236311239193e-05, "loss": 1.9031, "step": 50 }, { "epoch": 0.05763688760806916, "grad_norm": 9.052480697631836, "learning_rate": 1.9884726224783863e-05, "loss": 1.7452, "step": 100 }, { "epoch": 0.08645533141210375, "grad_norm": 10.461433410644531, "learning_rate": 1.9827089337175795e-05, "loss": 1.6609, "step": 150 }, { "epoch": 0.11527377521613832, "grad_norm": 9.097572326660156, "learning_rate": 1.9769452449567724e-05, "loss": 1.5037, "step": 200 }, { "epoch": 0.1440922190201729, "grad_norm": 11.013162612915039, "learning_rate": 1.9711815561959656e-05, "loss": 1.4436, "step": 250 }, { "epoch": 0.1729106628242075, "grad_norm": 10.979734420776367, "learning_rate": 1.9654178674351588e-05, "loss": 1.3187, "step": 300 }, { "epoch": 0.2017291066282421, "grad_norm": 12.8245849609375, "learning_rate": 1.9596541786743517e-05, "loss": 1.2895, "step": 350 }, { "epoch": 0.23054755043227665, "grad_norm": 13.686446189880371, "learning_rate": 1.953890489913545e-05, "loss": 1.2117, "step": 400 }, { "epoch": 0.25936599423631124, "grad_norm": 11.00157356262207, "learning_rate": 1.9481268011527378e-05, "loss": 1.2032, "step": 450 }, { "epoch": 0.2881844380403458, "grad_norm": 13.596818923950195, "learning_rate": 1.942363112391931e-05, "loss": 1.1336, "step": 500 }, { "epoch": 0.3170028818443804, "grad_norm": 13.817182540893555, "learning_rate": 1.936599423631124e-05, "loss": 1.1091, "step": 550 }, { "epoch": 0.345821325648415, "grad_norm": 6.913839817047119, "learning_rate": 1.930835734870317e-05, "loss": 1.0896, "step": 600 }, { "epoch": 0.3746397694524496, "grad_norm": 11.378829002380371, "learning_rate": 1.9250720461095104e-05, "loss": 0.9832, "step": 650 }, { "epoch": 0.4034582132564842, "grad_norm": 16.737525939941406, "learning_rate": 1.9193083573487033e-05, "loss": 0.978, "step": 700 }, { "epoch": 0.4322766570605187, "grad_norm": 12.594534873962402, "learning_rate": 1.9135446685878965e-05, "loss": 0.9757, "step": 750 }, { "epoch": 0.4610951008645533, "grad_norm": 11.076528549194336, "learning_rate": 1.9077809798270894e-05, "loss": 0.9533, "step": 800 }, { "epoch": 0.4899135446685879, "grad_norm": 12.495336532592773, "learning_rate": 1.9020172910662826e-05, "loss": 0.9399, "step": 850 }, { "epoch": 0.5187319884726225, "grad_norm": 10.985849380493164, "learning_rate": 1.8962536023054755e-05, "loss": 0.8982, "step": 900 }, { "epoch": 0.547550432276657, "grad_norm": 13.00562572479248, "learning_rate": 1.8904899135446687e-05, "loss": 0.9525, "step": 950 }, { "epoch": 0.5763688760806917, "grad_norm": 13.221843719482422, "learning_rate": 1.884726224783862e-05, "loss": 0.9063, "step": 1000 }, { "epoch": 0.6051873198847262, "grad_norm": 15.107306480407715, "learning_rate": 1.878962536023055e-05, "loss": 0.8538, "step": 1050 }, { "epoch": 0.6340057636887608, "grad_norm": 16.602313995361328, "learning_rate": 1.873198847262248e-05, "loss": 0.8626, "step": 1100 }, { "epoch": 0.6628242074927954, "grad_norm": 13.856719017028809, "learning_rate": 1.867435158501441e-05, "loss": 0.8186, "step": 1150 }, { "epoch": 0.69164265129683, "grad_norm": 15.631819725036621, "learning_rate": 1.861671469740634e-05, "loss": 0.8228, "step": 1200 }, { "epoch": 0.7204610951008645, "grad_norm": 25.166427612304688, "learning_rate": 1.855907780979827e-05, "loss": 0.837, "step": 1250 }, { "epoch": 0.7492795389048992, "grad_norm": 18.48745346069336, "learning_rate": 1.8501440922190203e-05, "loss": 0.7323, "step": 1300 }, { "epoch": 0.7780979827089337, "grad_norm": 19.641643524169922, "learning_rate": 1.8443804034582135e-05, "loss": 0.8067, "step": 1350 }, { "epoch": 0.8069164265129684, "grad_norm": 18.942670822143555, "learning_rate": 1.8386167146974067e-05, "loss": 0.8028, "step": 1400 }, { "epoch": 0.8357348703170029, "grad_norm": 17.313879013061523, "learning_rate": 1.8328530259365996e-05, "loss": 0.7598, "step": 1450 }, { "epoch": 0.8645533141210374, "grad_norm": 10.607074737548828, "learning_rate": 1.8270893371757928e-05, "loss": 0.8624, "step": 1500 }, { "epoch": 0.8933717579250721, "grad_norm": 12.565463066101074, "learning_rate": 1.8213256484149857e-05, "loss": 0.7614, "step": 1550 }, { "epoch": 0.9221902017291066, "grad_norm": 21.289403915405273, "learning_rate": 1.815561959654179e-05, "loss": 0.7644, "step": 1600 }, { "epoch": 0.9510086455331412, "grad_norm": 17.60152816772461, "learning_rate": 1.8097982708933718e-05, "loss": 0.7546, "step": 1650 }, { "epoch": 0.9798270893371758, "grad_norm": 17.39754867553711, "learning_rate": 1.804034582132565e-05, "loss": 0.7933, "step": 1700 }, { "epoch": 1.0, "eval_accuracy": 0.7976653696498055, "eval_loss": 0.6221615076065063, "eval_runtime": 37.0936, "eval_samples_per_second": 187.067, "eval_steps_per_second": 11.7, "step": 1735 }, { "epoch": 1.0086455331412103, "grad_norm": 18.93373680114746, "learning_rate": 1.7982708933717582e-05, "loss": 0.7119, "step": 1750 }, { "epoch": 1.037463976945245, "grad_norm": 15.846963882446289, "learning_rate": 1.792507204610951e-05, "loss": 0.5235, "step": 1800 }, { "epoch": 1.0662824207492796, "grad_norm": 9.528020858764648, "learning_rate": 1.7867435158501444e-05, "loss": 0.5441, "step": 1850 }, { "epoch": 1.0951008645533142, "grad_norm": 22.303991317749023, "learning_rate": 1.7809798270893372e-05, "loss": 0.5471, "step": 1900 }, { "epoch": 1.1239193083573487, "grad_norm": 13.73653507232666, "learning_rate": 1.7752161383285305e-05, "loss": 0.5555, "step": 1950 }, { "epoch": 1.1527377521613833, "grad_norm": 16.266706466674805, "learning_rate": 1.7694524495677234e-05, "loss": 0.6031, "step": 2000 }, { "epoch": 1.181556195965418, "grad_norm": 12.598456382751465, "learning_rate": 1.7636887608069166e-05, "loss": 0.5668, "step": 2050 }, { "epoch": 1.2103746397694524, "grad_norm": 14.683968544006348, "learning_rate": 1.7579250720461095e-05, "loss": 0.5748, "step": 2100 }, { "epoch": 1.239193083573487, "grad_norm": 15.514312744140625, "learning_rate": 1.7521613832853027e-05, "loss": 0.5473, "step": 2150 }, { "epoch": 1.2680115273775217, "grad_norm": 12.627583503723145, "learning_rate": 1.746397694524496e-05, "loss": 0.4907, "step": 2200 }, { "epoch": 1.2968299711815563, "grad_norm": 9.212458610534668, "learning_rate": 1.7406340057636888e-05, "loss": 0.5064, "step": 2250 }, { "epoch": 1.3256484149855907, "grad_norm": 9.678365707397461, "learning_rate": 1.734870317002882e-05, "loss": 0.5415, "step": 2300 }, { "epoch": 1.3544668587896254, "grad_norm": 26.177536010742188, "learning_rate": 1.729106628242075e-05, "loss": 0.4821, "step": 2350 }, { "epoch": 1.38328530259366, "grad_norm": 11.909614562988281, "learning_rate": 1.723342939481268e-05, "loss": 0.4962, "step": 2400 }, { "epoch": 1.4121037463976944, "grad_norm": 16.271272659301758, "learning_rate": 1.717579250720461e-05, "loss": 0.5287, "step": 2450 }, { "epoch": 1.440922190201729, "grad_norm": 25.189302444458008, "learning_rate": 1.7118155619596542e-05, "loss": 0.5096, "step": 2500 }, { "epoch": 1.4697406340057637, "grad_norm": 23.7978515625, "learning_rate": 1.7060518731988475e-05, "loss": 0.4859, "step": 2550 }, { "epoch": 1.4985590778097984, "grad_norm": 6.646437168121338, "learning_rate": 1.7002881844380407e-05, "loss": 0.4467, "step": 2600 }, { "epoch": 1.527377521613833, "grad_norm": 18.11911964416504, "learning_rate": 1.6945244956772336e-05, "loss": 0.4939, "step": 2650 }, { "epoch": 1.5561959654178674, "grad_norm": 17.764862060546875, "learning_rate": 1.6887608069164268e-05, "loss": 0.4876, "step": 2700 }, { "epoch": 1.585014409221902, "grad_norm": 16.96575927734375, "learning_rate": 1.6829971181556197e-05, "loss": 0.4426, "step": 2750 }, { "epoch": 1.6138328530259365, "grad_norm": 20.623233795166016, "learning_rate": 1.6772334293948126e-05, "loss": 0.5638, "step": 2800 }, { "epoch": 1.6426512968299711, "grad_norm": 16.03717041015625, "learning_rate": 1.6714697406340058e-05, "loss": 0.4558, "step": 2850 }, { "epoch": 1.6714697406340058, "grad_norm": 27.027576446533203, "learning_rate": 1.665706051873199e-05, "loss": 0.4227, "step": 2900 }, { "epoch": 1.7002881844380404, "grad_norm": 16.704652786254883, "learning_rate": 1.6599423631123922e-05, "loss": 0.4521, "step": 2950 }, { "epoch": 1.729106628242075, "grad_norm": 19.68289566040039, "learning_rate": 1.654178674351585e-05, "loss": 0.4419, "step": 3000 }, { "epoch": 1.7579250720461095, "grad_norm": 13.75253677368164, "learning_rate": 1.6484149855907783e-05, "loss": 0.4428, "step": 3050 }, { "epoch": 1.7867435158501441, "grad_norm": 44.23019790649414, "learning_rate": 1.6426512968299712e-05, "loss": 0.4416, "step": 3100 }, { "epoch": 1.8155619596541785, "grad_norm": 14.690359115600586, "learning_rate": 1.6368876080691644e-05, "loss": 0.3885, "step": 3150 }, { "epoch": 1.8443804034582132, "grad_norm": 12.004839897155762, "learning_rate": 1.6311239193083573e-05, "loss": 0.3744, "step": 3200 }, { "epoch": 1.8731988472622478, "grad_norm": 10.716139793395996, "learning_rate": 1.6253602305475506e-05, "loss": 0.398, "step": 3250 }, { "epoch": 1.9020172910662825, "grad_norm": 17.37589454650879, "learning_rate": 1.6195965417867438e-05, "loss": 0.4649, "step": 3300 }, { "epoch": 1.9308357348703171, "grad_norm": 13.404267311096191, "learning_rate": 1.613832853025937e-05, "loss": 0.3985, "step": 3350 }, { "epoch": 1.9596541786743515, "grad_norm": 14.350135803222656, "learning_rate": 1.60806916426513e-05, "loss": 0.4282, "step": 3400 }, { "epoch": 1.9884726224783862, "grad_norm": 20.828453063964844, "learning_rate": 1.6023054755043228e-05, "loss": 0.3764, "step": 3450 }, { "epoch": 2.0, "eval_accuracy": 0.8651102464332037, "eval_loss": 0.4364924728870392, "eval_runtime": 37.9177, "eval_samples_per_second": 183.002, "eval_steps_per_second": 11.446, "step": 3470 }, { "epoch": 2.0172910662824206, "grad_norm": 10.62299633026123, "learning_rate": 1.596541786743516e-05, "loss": 0.3511, "step": 3500 }, { "epoch": 2.0461095100864553, "grad_norm": 21.19414710998535, "learning_rate": 1.590778097982709e-05, "loss": 0.3166, "step": 3550 }, { "epoch": 2.07492795389049, "grad_norm": 9.603983879089355, "learning_rate": 1.585014409221902e-05, "loss": 0.257, "step": 3600 }, { "epoch": 2.1037463976945245, "grad_norm": 14.70697021484375, "learning_rate": 1.5792507204610953e-05, "loss": 0.2606, "step": 3650 }, { "epoch": 2.132564841498559, "grad_norm": 25.806259155273438, "learning_rate": 1.5734870317002882e-05, "loss": 0.2505, "step": 3700 }, { "epoch": 2.161383285302594, "grad_norm": 20.40934181213379, "learning_rate": 1.5677233429394814e-05, "loss": 0.3198, "step": 3750 }, { "epoch": 2.1902017291066285, "grad_norm": 34.36139678955078, "learning_rate": 1.5619596541786747e-05, "loss": 0.2689, "step": 3800 }, { "epoch": 2.2190201729106627, "grad_norm": 23.640975952148438, "learning_rate": 1.5561959654178675e-05, "loss": 0.2744, "step": 3850 }, { "epoch": 2.2478386167146973, "grad_norm": 15.340378761291504, "learning_rate": 1.5504322766570608e-05, "loss": 0.332, "step": 3900 }, { "epoch": 2.276657060518732, "grad_norm": 29.153423309326172, "learning_rate": 1.5446685878962537e-05, "loss": 0.2945, "step": 3950 }, { "epoch": 2.3054755043227666, "grad_norm": 9.021677017211914, "learning_rate": 1.538904899135447e-05, "loss": 0.284, "step": 4000 }, { "epoch": 2.3342939481268012, "grad_norm": 32.12895584106445, "learning_rate": 1.5331412103746398e-05, "loss": 0.2429, "step": 4050 }, { "epoch": 2.363112391930836, "grad_norm": 36.411773681640625, "learning_rate": 1.527377521613833e-05, "loss": 0.2723, "step": 4100 }, { "epoch": 2.39193083573487, "grad_norm": 18.912567138671875, "learning_rate": 1.521613832853026e-05, "loss": 0.2702, "step": 4150 }, { "epoch": 2.4207492795389047, "grad_norm": 17.08293342590332, "learning_rate": 1.5158501440922191e-05, "loss": 0.3458, "step": 4200 }, { "epoch": 2.4495677233429394, "grad_norm": 24.516544342041016, "learning_rate": 1.5100864553314123e-05, "loss": 0.2542, "step": 4250 }, { "epoch": 2.478386167146974, "grad_norm": 3.7018916606903076, "learning_rate": 1.5043227665706052e-05, "loss": 0.2958, "step": 4300 }, { "epoch": 2.5072046109510087, "grad_norm": 24.21662712097168, "learning_rate": 1.4985590778097984e-05, "loss": 0.2693, "step": 4350 }, { "epoch": 2.5360230547550433, "grad_norm": 36.85274124145508, "learning_rate": 1.4927953890489915e-05, "loss": 0.2769, "step": 4400 }, { "epoch": 2.564841498559078, "grad_norm": 26.23798179626465, "learning_rate": 1.4870317002881847e-05, "loss": 0.2443, "step": 4450 }, { "epoch": 2.5936599423631126, "grad_norm": 40.98781204223633, "learning_rate": 1.4812680115273776e-05, "loss": 0.2789, "step": 4500 }, { "epoch": 2.6224783861671472, "grad_norm": 33.100223541259766, "learning_rate": 1.4755043227665706e-05, "loss": 0.2776, "step": 4550 }, { "epoch": 2.6512968299711814, "grad_norm": 158.5146942138672, "learning_rate": 1.4697406340057639e-05, "loss": 0.2852, "step": 4600 }, { "epoch": 2.680115273775216, "grad_norm": 19.569507598876953, "learning_rate": 1.4639769452449568e-05, "loss": 0.2743, "step": 4650 }, { "epoch": 2.7089337175792507, "grad_norm": 15.44676685333252, "learning_rate": 1.45821325648415e-05, "loss": 0.2746, "step": 4700 }, { "epoch": 2.7377521613832854, "grad_norm": 11.913484573364258, "learning_rate": 1.452449567723343e-05, "loss": 0.2309, "step": 4750 }, { "epoch": 2.76657060518732, "grad_norm": 17.194211959838867, "learning_rate": 1.4466858789625363e-05, "loss": 0.2497, "step": 4800 }, { "epoch": 2.795389048991354, "grad_norm": 1.0725657939910889, "learning_rate": 1.4409221902017291e-05, "loss": 0.2903, "step": 4850 }, { "epoch": 2.824207492795389, "grad_norm": 10.74899959564209, "learning_rate": 1.4351585014409224e-05, "loss": 0.2968, "step": 4900 }, { "epoch": 2.8530259365994235, "grad_norm": 29.902647018432617, "learning_rate": 1.4293948126801154e-05, "loss": 0.2935, "step": 4950 }, { "epoch": 2.881844380403458, "grad_norm": 11.288642883300781, "learning_rate": 1.4236311239193086e-05, "loss": 0.2928, "step": 5000 }, { "epoch": 2.910662824207493, "grad_norm": 14.153397560119629, "learning_rate": 1.4178674351585015e-05, "loss": 0.1799, "step": 5050 }, { "epoch": 2.9394812680115274, "grad_norm": 15.782369613647461, "learning_rate": 1.4121037463976946e-05, "loss": 0.2154, "step": 5100 }, { "epoch": 2.968299711815562, "grad_norm": 3.8253252506256104, "learning_rate": 1.4063400576368878e-05, "loss": 0.2628, "step": 5150 }, { "epoch": 2.9971181556195967, "grad_norm": 6.50337553024292, "learning_rate": 1.4005763688760807e-05, "loss": 0.2194, "step": 5200 }, { "epoch": 3.0, "eval_accuracy": 0.9028678483931403, "eval_loss": 0.3793924152851105, "eval_runtime": 36.9217, "eval_samples_per_second": 187.938, "eval_steps_per_second": 11.755, "step": 5205 }, { "epoch": 3.025936599423631, "grad_norm": 37.30765914916992, "learning_rate": 1.3948126801152739e-05, "loss": 0.1946, "step": 5250 }, { "epoch": 3.0547550432276656, "grad_norm": 38.157142639160156, "learning_rate": 1.389048991354467e-05, "loss": 0.1694, "step": 5300 }, { "epoch": 3.0835734870317, "grad_norm": 22.551681518554688, "learning_rate": 1.3832853025936602e-05, "loss": 0.2151, "step": 5350 }, { "epoch": 3.112391930835735, "grad_norm": 3.1120803356170654, "learning_rate": 1.377521613832853e-05, "loss": 0.1945, "step": 5400 }, { "epoch": 3.1412103746397695, "grad_norm": 24.61436653137207, "learning_rate": 1.3717579250720463e-05, "loss": 0.13, "step": 5450 }, { "epoch": 3.170028818443804, "grad_norm": 15.621273040771484, "learning_rate": 1.3659942363112394e-05, "loss": 0.1603, "step": 5500 }, { "epoch": 3.1988472622478388, "grad_norm": 10.645062446594238, "learning_rate": 1.3602305475504324e-05, "loss": 0.1814, "step": 5550 }, { "epoch": 3.227665706051873, "grad_norm": 37.47835159301758, "learning_rate": 1.3544668587896255e-05, "loss": 0.1527, "step": 5600 }, { "epoch": 3.2564841498559076, "grad_norm": 0.6854817271232605, "learning_rate": 1.3487031700288185e-05, "loss": 0.209, "step": 5650 }, { "epoch": 3.2853025936599423, "grad_norm": 1.353789210319519, "learning_rate": 1.3429394812680117e-05, "loss": 0.1542, "step": 5700 }, { "epoch": 3.314121037463977, "grad_norm": 9.835870742797852, "learning_rate": 1.3371757925072046e-05, "loss": 0.2365, "step": 5750 }, { "epoch": 3.3429394812680115, "grad_norm": 13.969425201416016, "learning_rate": 1.3314121037463979e-05, "loss": 0.1715, "step": 5800 }, { "epoch": 3.371757925072046, "grad_norm": 21.74212074279785, "learning_rate": 1.3256484149855909e-05, "loss": 0.1726, "step": 5850 }, { "epoch": 3.400576368876081, "grad_norm": 6.624606609344482, "learning_rate": 1.319884726224784e-05, "loss": 0.1997, "step": 5900 }, { "epoch": 3.4293948126801155, "grad_norm": 24.518829345703125, "learning_rate": 1.314121037463977e-05, "loss": 0.1027, "step": 5950 }, { "epoch": 3.4582132564841497, "grad_norm": 1.1079559326171875, "learning_rate": 1.3083573487031702e-05, "loss": 0.1708, "step": 6000 }, { "epoch": 3.4870317002881843, "grad_norm": 89.6827163696289, "learning_rate": 1.3025936599423631e-05, "loss": 0.2025, "step": 6050 }, { "epoch": 3.515850144092219, "grad_norm": 3.5336265563964844, "learning_rate": 1.2968299711815563e-05, "loss": 0.188, "step": 6100 }, { "epoch": 3.5446685878962536, "grad_norm": 3.5633816719055176, "learning_rate": 1.2910662824207494e-05, "loss": 0.1638, "step": 6150 }, { "epoch": 3.5734870317002883, "grad_norm": 1.318528175354004, "learning_rate": 1.2853025936599423e-05, "loss": 0.142, "step": 6200 }, { "epoch": 3.602305475504323, "grad_norm": 18.405723571777344, "learning_rate": 1.2795389048991355e-05, "loss": 0.1707, "step": 6250 }, { "epoch": 3.631123919308357, "grad_norm": 2.065215587615967, "learning_rate": 1.2737752161383286e-05, "loss": 0.1858, "step": 6300 }, { "epoch": 3.6599423631123917, "grad_norm": 1.1660606861114502, "learning_rate": 1.2680115273775218e-05, "loss": 0.1406, "step": 6350 }, { "epoch": 3.6887608069164264, "grad_norm": 0.28399357199668884, "learning_rate": 1.2622478386167147e-05, "loss": 0.1398, "step": 6400 }, { "epoch": 3.717579250720461, "grad_norm": 9.657651901245117, "learning_rate": 1.2564841498559079e-05, "loss": 0.1322, "step": 6450 }, { "epoch": 3.7463976945244957, "grad_norm": 42.729248046875, "learning_rate": 1.250720461095101e-05, "loss": 0.2353, "step": 6500 }, { "epoch": 3.7752161383285303, "grad_norm": 1.589701771736145, "learning_rate": 1.2449567723342942e-05, "loss": 0.2107, "step": 6550 }, { "epoch": 3.804034582132565, "grad_norm": 9.485612869262695, "learning_rate": 1.239193083573487e-05, "loss": 0.2303, "step": 6600 }, { "epoch": 3.8328530259365996, "grad_norm": 15.697033882141113, "learning_rate": 1.2334293948126803e-05, "loss": 0.1243, "step": 6650 }, { "epoch": 3.8616714697406342, "grad_norm": 13.186999320983887, "learning_rate": 1.2276657060518733e-05, "loss": 0.1802, "step": 6700 }, { "epoch": 3.8904899135446684, "grad_norm": 0.07961810380220413, "learning_rate": 1.2219020172910662e-05, "loss": 0.1776, "step": 6750 }, { "epoch": 3.919308357348703, "grad_norm": 19.50320053100586, "learning_rate": 1.2161383285302594e-05, "loss": 0.1825, "step": 6800 }, { "epoch": 3.9481268011527377, "grad_norm": 0.9466652870178223, "learning_rate": 1.2103746397694525e-05, "loss": 0.247, "step": 6850 }, { "epoch": 3.9769452449567724, "grad_norm": 0.9334861636161804, "learning_rate": 1.2046109510086457e-05, "loss": 0.1826, "step": 6900 }, { "epoch": 4.0, "eval_accuracy": 0.9244847960801268, "eval_loss": 0.34017300605773926, "eval_runtime": 37.1217, "eval_samples_per_second": 186.926, "eval_steps_per_second": 11.691, "step": 6940 }, { "epoch": 4.005763688760807, "grad_norm": 38.7332763671875, "learning_rate": 1.1988472622478386e-05, "loss": 0.1516, "step": 6950 }, { "epoch": 4.034582132564841, "grad_norm": 1.7459765672683716, "learning_rate": 1.1930835734870318e-05, "loss": 0.1305, "step": 7000 }, { "epoch": 4.063400576368876, "grad_norm": 1.0959562063217163, "learning_rate": 1.1873198847262249e-05, "loss": 0.1054, "step": 7050 }, { "epoch": 4.0922190201729105, "grad_norm": 26.440959930419922, "learning_rate": 1.1815561959654181e-05, "loss": 0.1426, "step": 7100 }, { "epoch": 4.121037463976945, "grad_norm": 0.2572319805622101, "learning_rate": 1.175792507204611e-05, "loss": 0.0893, "step": 7150 }, { "epoch": 4.14985590778098, "grad_norm": 2.3376266956329346, "learning_rate": 1.1700288184438042e-05, "loss": 0.1211, "step": 7200 }, { "epoch": 4.178674351585014, "grad_norm": 27.235889434814453, "learning_rate": 1.1642651296829973e-05, "loss": 0.1125, "step": 7250 }, { "epoch": 4.207492795389049, "grad_norm": 0.729739785194397, "learning_rate": 1.1585014409221902e-05, "loss": 0.1263, "step": 7300 }, { "epoch": 4.236311239193084, "grad_norm": 95.56041717529297, "learning_rate": 1.1527377521613834e-05, "loss": 0.1196, "step": 7350 }, { "epoch": 4.265129682997118, "grad_norm": 0.6992059350013733, "learning_rate": 1.1469740634005764e-05, "loss": 0.1289, "step": 7400 }, { "epoch": 4.293948126801153, "grad_norm": 5.825665473937988, "learning_rate": 1.1412103746397697e-05, "loss": 0.0838, "step": 7450 }, { "epoch": 4.322766570605188, "grad_norm": 12.848553657531738, "learning_rate": 1.1354466858789625e-05, "loss": 0.109, "step": 7500 }, { "epoch": 4.351585014409222, "grad_norm": 0.7549734711647034, "learning_rate": 1.1296829971181558e-05, "loss": 0.1136, "step": 7550 }, { "epoch": 4.380403458213257, "grad_norm": 11.627622604370117, "learning_rate": 1.1239193083573488e-05, "loss": 0.1387, "step": 7600 }, { "epoch": 4.409221902017291, "grad_norm": 2.81147837638855, "learning_rate": 1.1181556195965419e-05, "loss": 0.0966, "step": 7650 }, { "epoch": 4.438040345821325, "grad_norm": 68.66207885742188, "learning_rate": 1.112391930835735e-05, "loss": 0.1238, "step": 7700 }, { "epoch": 4.46685878962536, "grad_norm": 0.8019499182701111, "learning_rate": 1.1066282420749282e-05, "loss": 0.1062, "step": 7750 }, { "epoch": 4.495677233429395, "grad_norm": 48.5037841796875, "learning_rate": 1.100864553314121e-05, "loss": 0.0796, "step": 7800 }, { "epoch": 4.524495677233429, "grad_norm": 1.2362786531448364, "learning_rate": 1.0951008645533141e-05, "loss": 0.1219, "step": 7850 }, { "epoch": 4.553314121037464, "grad_norm": 8.287176132202148, "learning_rate": 1.0893371757925073e-05, "loss": 0.1102, "step": 7900 }, { "epoch": 4.582132564841499, "grad_norm": 0.1790919452905655, "learning_rate": 1.0835734870317004e-05, "loss": 0.0486, "step": 7950 }, { "epoch": 4.610951008645533, "grad_norm": 36.70028305053711, "learning_rate": 1.0778097982708934e-05, "loss": 0.1433, "step": 8000 }, { "epoch": 4.639769452449568, "grad_norm": 22.68642807006836, "learning_rate": 1.0720461095100865e-05, "loss": 0.1598, "step": 8050 }, { "epoch": 4.6685878962536025, "grad_norm": 0.10349202156066895, "learning_rate": 1.0662824207492797e-05, "loss": 0.0742, "step": 8100 }, { "epoch": 4.697406340057637, "grad_norm": 14.959641456604004, "learning_rate": 1.0605187319884726e-05, "loss": 0.1328, "step": 8150 }, { "epoch": 4.726224783861672, "grad_norm": 1.050648808479309, "learning_rate": 1.0547550432276658e-05, "loss": 0.137, "step": 8200 }, { "epoch": 4.755043227665706, "grad_norm": 2.9457409381866455, "learning_rate": 1.0489913544668589e-05, "loss": 0.0624, "step": 8250 }, { "epoch": 4.78386167146974, "grad_norm": 0.5071529746055603, "learning_rate": 1.0432276657060521e-05, "loss": 0.1035, "step": 8300 }, { "epoch": 4.812680115273775, "grad_norm": 19.162033081054688, "learning_rate": 1.037463976945245e-05, "loss": 0.1705, "step": 8350 }, { "epoch": 4.8414985590778095, "grad_norm": 1.8972758054733276, "learning_rate": 1.031700288184438e-05, "loss": 0.1543, "step": 8400 }, { "epoch": 4.870317002881844, "grad_norm": 0.05384368821978569, "learning_rate": 1.0259365994236313e-05, "loss": 0.1049, "step": 8450 }, { "epoch": 4.899135446685879, "grad_norm": 9.819040298461914, "learning_rate": 1.0201729106628241e-05, "loss": 0.1305, "step": 8500 }, { "epoch": 4.927953890489913, "grad_norm": 2.4009037017822266, "learning_rate": 1.0144092219020174e-05, "loss": 0.1294, "step": 8550 }, { "epoch": 4.956772334293948, "grad_norm": 44.24007034301758, "learning_rate": 1.0086455331412104e-05, "loss": 0.1563, "step": 8600 }, { "epoch": 4.985590778097983, "grad_norm": 0.5839857459068298, "learning_rate": 1.0028818443804036e-05, "loss": 0.1116, "step": 8650 }, { "epoch": 5.0, "eval_accuracy": 0.9311139933708027, "eval_loss": 0.3880128860473633, "eval_runtime": 37.4244, "eval_samples_per_second": 185.414, "eval_steps_per_second": 11.597, "step": 8675 }, { "epoch": 5.014409221902017, "grad_norm": 23.385534286499023, "learning_rate": 9.971181556195965e-06, "loss": 0.1166, "step": 8700 }, { "epoch": 5.043227665706052, "grad_norm": 0.4506791830062866, "learning_rate": 9.913544668587897e-06, "loss": 0.057, "step": 8750 }, { "epoch": 5.072046109510087, "grad_norm": 49.61991882324219, "learning_rate": 9.855907780979828e-06, "loss": 0.0734, "step": 8800 }, { "epoch": 5.100864553314121, "grad_norm": 0.0602734349668026, "learning_rate": 9.798270893371759e-06, "loss": 0.0681, "step": 8850 }, { "epoch": 5.129682997118156, "grad_norm": 0.023745020851492882, "learning_rate": 9.740634005763689e-06, "loss": 0.072, "step": 8900 }, { "epoch": 5.1585014409221905, "grad_norm": 31.497879028320312, "learning_rate": 9.68299711815562e-06, "loss": 0.0474, "step": 8950 }, { "epoch": 5.187319884726225, "grad_norm": 1.734765648841858, "learning_rate": 9.625360230547552e-06, "loss": 0.0688, "step": 9000 }, { "epoch": 5.216138328530259, "grad_norm": 98.79872131347656, "learning_rate": 9.567723342939482e-06, "loss": 0.1255, "step": 9050 }, { "epoch": 5.244956772334294, "grad_norm": 6.518857002258301, "learning_rate": 9.510086455331413e-06, "loss": 0.0974, "step": 9100 }, { "epoch": 5.273775216138328, "grad_norm": 31.412202835083008, "learning_rate": 9.452449567723344e-06, "loss": 0.0412, "step": 9150 }, { "epoch": 5.302593659942363, "grad_norm": 2.6872849464416504, "learning_rate": 9.394812680115276e-06, "loss": 0.0679, "step": 9200 }, { "epoch": 5.3314121037463975, "grad_norm": 28.229368209838867, "learning_rate": 9.337175792507205e-06, "loss": 0.0817, "step": 9250 }, { "epoch": 5.360230547550432, "grad_norm": 0.24797913432121277, "learning_rate": 9.279538904899135e-06, "loss": 0.0768, "step": 9300 }, { "epoch": 5.389048991354467, "grad_norm": 0.3930692970752716, "learning_rate": 9.221902017291067e-06, "loss": 0.0945, "step": 9350 }, { "epoch": 5.417867435158501, "grad_norm": 3.4596047401428223, "learning_rate": 9.164265129682998e-06, "loss": 0.0729, "step": 9400 }, { "epoch": 5.446685878962536, "grad_norm": 3.12446665763855, "learning_rate": 9.106628242074928e-06, "loss": 0.0695, "step": 9450 }, { "epoch": 5.475504322766571, "grad_norm": 0.043711356818675995, "learning_rate": 9.048991354466859e-06, "loss": 0.0656, "step": 9500 }, { "epoch": 5.504322766570605, "grad_norm": 63.43326950073242, "learning_rate": 8.991354466858791e-06, "loss": 0.0684, "step": 9550 }, { "epoch": 5.53314121037464, "grad_norm": 2.399331569671631, "learning_rate": 8.933717579250722e-06, "loss": 0.1072, "step": 9600 }, { "epoch": 5.561959654178675, "grad_norm": 0.027092283591628075, "learning_rate": 8.876080691642652e-06, "loss": 0.1134, "step": 9650 }, { "epoch": 5.590778097982709, "grad_norm": 0.9653186798095703, "learning_rate": 8.818443804034583e-06, "loss": 0.0805, "step": 9700 }, { "epoch": 5.619596541786743, "grad_norm": 0.43673309683799744, "learning_rate": 8.760806916426513e-06, "loss": 0.0879, "step": 9750 }, { "epoch": 5.648414985590778, "grad_norm": 0.3906983435153961, "learning_rate": 8.703170028818444e-06, "loss": 0.1068, "step": 9800 }, { "epoch": 5.677233429394812, "grad_norm": 16.26913070678711, "learning_rate": 8.645533141210375e-06, "loss": 0.1136, "step": 9850 }, { "epoch": 5.706051873198847, "grad_norm": 2.990417242050171, "learning_rate": 8.587896253602305e-06, "loss": 0.0682, "step": 9900 }, { "epoch": 5.734870317002882, "grad_norm": 0.40273919701576233, "learning_rate": 8.530259365994237e-06, "loss": 0.0974, "step": 9950 }, { "epoch": 5.763688760806916, "grad_norm": 0.6563950777053833, "learning_rate": 8.472622478386168e-06, "loss": 0.0612, "step": 10000 }, { "epoch": 5.792507204610951, "grad_norm": 0.04713226109743118, "learning_rate": 8.414985590778098e-06, "loss": 0.058, "step": 10050 }, { "epoch": 5.821325648414986, "grad_norm": 155.84222412109375, "learning_rate": 8.357348703170029e-06, "loss": 0.0765, "step": 10100 }, { "epoch": 5.85014409221902, "grad_norm": 0.14352348446846008, "learning_rate": 8.299711815561961e-06, "loss": 0.1251, "step": 10150 }, { "epoch": 5.878962536023055, "grad_norm": 0.05221036821603775, "learning_rate": 8.242074927953892e-06, "loss": 0.0724, "step": 10200 }, { "epoch": 5.9077809798270895, "grad_norm": 44.994606018066406, "learning_rate": 8.184438040345822e-06, "loss": 0.0597, "step": 10250 }, { "epoch": 5.936599423631124, "grad_norm": 57.5409049987793, "learning_rate": 8.126801152737753e-06, "loss": 0.0747, "step": 10300 }, { "epoch": 5.965417867435159, "grad_norm": 40.75551986694336, "learning_rate": 8.069164265129685e-06, "loss": 0.11, "step": 10350 }, { "epoch": 5.994236311239193, "grad_norm": 21.53359603881836, "learning_rate": 8.011527377521614e-06, "loss": 0.0721, "step": 10400 }, { "epoch": 6.0, "eval_accuracy": 0.9414901282605562, "eval_loss": 0.37702837586402893, "eval_runtime": 37.3615, "eval_samples_per_second": 185.726, "eval_steps_per_second": 11.616, "step": 10410 }, { "epoch": 6.023054755043228, "grad_norm": 11.95662784576416, "learning_rate": 7.953890489913544e-06, "loss": 0.0444, "step": 10450 }, { "epoch": 6.051873198847262, "grad_norm": 0.17058134078979492, "learning_rate": 7.896253602305477e-06, "loss": 0.0413, "step": 10500 }, { "epoch": 6.0806916426512965, "grad_norm": 0.023899082094430923, "learning_rate": 7.838616714697407e-06, "loss": 0.0283, "step": 10550 }, { "epoch": 6.109510086455331, "grad_norm": 5.194774627685547, "learning_rate": 7.780979827089338e-06, "loss": 0.029, "step": 10600 }, { "epoch": 6.138328530259366, "grad_norm": 47.05008316040039, "learning_rate": 7.723342939481268e-06, "loss": 0.0639, "step": 10650 }, { "epoch": 6.1671469740634, "grad_norm": 1.5677424669265747, "learning_rate": 7.665706051873199e-06, "loss": 0.013, "step": 10700 }, { "epoch": 6.195965417867435, "grad_norm": 0.054807789623737335, "learning_rate": 7.60806916426513e-06, "loss": 0.0314, "step": 10750 }, { "epoch": 6.22478386167147, "grad_norm": 0.005258807446807623, "learning_rate": 7.550432276657062e-06, "loss": 0.0773, "step": 10800 }, { "epoch": 6.253602305475504, "grad_norm": 6.556185722351074, "learning_rate": 7.492795389048992e-06, "loss": 0.055, "step": 10850 }, { "epoch": 6.282420749279539, "grad_norm": 0.009452180936932564, "learning_rate": 7.4351585014409235e-06, "loss": 0.0428, "step": 10900 }, { "epoch": 6.311239193083574, "grad_norm": 1.1048903465270996, "learning_rate": 7.377521613832853e-06, "loss": 0.023, "step": 10950 }, { "epoch": 6.340057636887608, "grad_norm": 0.10645721852779388, "learning_rate": 7.319884726224784e-06, "loss": 0.0403, "step": 11000 }, { "epoch": 6.368876080691643, "grad_norm": 0.03314146026968956, "learning_rate": 7.262247838616715e-06, "loss": 0.0632, "step": 11050 }, { "epoch": 6.3976945244956775, "grad_norm": 0.010250965133309364, "learning_rate": 7.204610951008646e-06, "loss": 0.0461, "step": 11100 }, { "epoch": 6.426512968299712, "grad_norm": 0.663324773311615, "learning_rate": 7.146974063400577e-06, "loss": 0.0706, "step": 11150 }, { "epoch": 6.455331412103746, "grad_norm": 0.016964536160230637, "learning_rate": 7.089337175792508e-06, "loss": 0.0623, "step": 11200 }, { "epoch": 6.484149855907781, "grad_norm": 0.8198705911636353, "learning_rate": 7.031700288184439e-06, "loss": 0.0486, "step": 11250 }, { "epoch": 6.512968299711815, "grad_norm": 0.014807182364165783, "learning_rate": 6.9740634005763696e-06, "loss": 0.0125, "step": 11300 }, { "epoch": 6.54178674351585, "grad_norm": 101.77655029296875, "learning_rate": 6.916426512968301e-06, "loss": 0.0436, "step": 11350 }, { "epoch": 6.5706051873198845, "grad_norm": 0.8024958968162537, "learning_rate": 6.8587896253602315e-06, "loss": 0.0525, "step": 11400 }, { "epoch": 6.599423631123919, "grad_norm": 0.010165904648602009, "learning_rate": 6.801152737752162e-06, "loss": 0.0255, "step": 11450 }, { "epoch": 6.628242074927954, "grad_norm": 0.31986117362976074, "learning_rate": 6.743515850144093e-06, "loss": 0.0765, "step": 11500 }, { "epoch": 6.6570605187319885, "grad_norm": 0.01843302696943283, "learning_rate": 6.685878962536023e-06, "loss": 0.0396, "step": 11550 }, { "epoch": 6.685878962536023, "grad_norm": 51.702552795410156, "learning_rate": 6.6282420749279545e-06, "loss": 0.1131, "step": 11600 }, { "epoch": 6.714697406340058, "grad_norm": 0.10061511397361755, "learning_rate": 6.570605187319885e-06, "loss": 0.0528, "step": 11650 }, { "epoch": 6.743515850144092, "grad_norm": 0.010481205768883228, "learning_rate": 6.512968299711816e-06, "loss": 0.0537, "step": 11700 }, { "epoch": 6.772334293948127, "grad_norm": 0.07575374096632004, "learning_rate": 6.455331412103747e-06, "loss": 0.0478, "step": 11750 }, { "epoch": 6.801152737752162, "grad_norm": 0.0058944206684827805, "learning_rate": 6.3976945244956775e-06, "loss": 0.065, "step": 11800 }, { "epoch": 6.829971181556196, "grad_norm": 0.9527530074119568, "learning_rate": 6.340057636887609e-06, "loss": 0.0505, "step": 11850 }, { "epoch": 6.858789625360231, "grad_norm": 22.32554054260254, "learning_rate": 6.2824207492795395e-06, "loss": 0.0858, "step": 11900 }, { "epoch": 6.887608069164266, "grad_norm": 0.04536249861121178, "learning_rate": 6.224783861671471e-06, "loss": 0.0618, "step": 11950 }, { "epoch": 6.916426512968299, "grad_norm": 0.030509620904922485, "learning_rate": 6.167146974063401e-06, "loss": 0.06, "step": 12000 }, { "epoch": 6.945244956772334, "grad_norm": 4.554119110107422, "learning_rate": 6.109510086455331e-06, "loss": 0.0478, "step": 12050 }, { "epoch": 6.974063400576369, "grad_norm": 0.03110508993268013, "learning_rate": 6.0518731988472625e-06, "loss": 0.0486, "step": 12100 }, { "epoch": 7.0, "eval_accuracy": 0.943651823029255, "eval_loss": 0.4001205563545227, "eval_runtime": 43.9647, "eval_samples_per_second": 157.831, "eval_steps_per_second": 9.872, "step": 12145 }, { "epoch": 7.002881844380403, "grad_norm": 3.505014181137085, "learning_rate": 5.994236311239193e-06, "loss": 0.0951, "step": 12150 }, { "epoch": 7.031700288184438, "grad_norm": 0.1990944892168045, "learning_rate": 5.9365994236311244e-06, "loss": 0.0131, "step": 12200 }, { "epoch": 7.060518731988473, "grad_norm": 0.19344107806682587, "learning_rate": 5.878962536023055e-06, "loss": 0.0247, "step": 12250 }, { "epoch": 7.089337175792507, "grad_norm": 0.5416840314865112, "learning_rate": 5.821325648414986e-06, "loss": 0.069, "step": 12300 }, { "epoch": 7.118155619596542, "grad_norm": 0.029165951535105705, "learning_rate": 5.763688760806917e-06, "loss": 0.029, "step": 12350 }, { "epoch": 7.1469740634005765, "grad_norm": 0.0034562216605991125, "learning_rate": 5.706051873198848e-06, "loss": 0.0337, "step": 12400 }, { "epoch": 7.175792507204611, "grad_norm": 0.048965733498334885, "learning_rate": 5.648414985590779e-06, "loss": 0.0214, "step": 12450 }, { "epoch": 7.204610951008646, "grad_norm": 0.05087200552225113, "learning_rate": 5.590778097982709e-06, "loss": 0.0071, "step": 12500 }, { "epoch": 7.23342939481268, "grad_norm": 0.0026253710966557264, "learning_rate": 5.533141210374641e-06, "loss": 0.0254, "step": 12550 }, { "epoch": 7.262247838616715, "grad_norm": 0.01383883971720934, "learning_rate": 5.4755043227665705e-06, "loss": 0.0354, "step": 12600 }, { "epoch": 7.291066282420749, "grad_norm": 0.06278753280639648, "learning_rate": 5.417867435158502e-06, "loss": 0.0499, "step": 12650 }, { "epoch": 7.3198847262247835, "grad_norm": 91.43658447265625, "learning_rate": 5.360230547550432e-06, "loss": 0.0162, "step": 12700 }, { "epoch": 7.348703170028818, "grad_norm": 0.042137544602155685, "learning_rate": 5.302593659942363e-06, "loss": 0.0183, "step": 12750 }, { "epoch": 7.377521613832853, "grad_norm": 0.5085373520851135, "learning_rate": 5.244956772334294e-06, "loss": 0.0623, "step": 12800 }, { "epoch": 7.406340057636887, "grad_norm": 0.07125350832939148, "learning_rate": 5.187319884726225e-06, "loss": 0.0403, "step": 12850 }, { "epoch": 7.435158501440922, "grad_norm": 0.0029079755768179893, "learning_rate": 5.129682997118156e-06, "loss": 0.0294, "step": 12900 }, { "epoch": 7.463976945244957, "grad_norm": 0.19476212561130524, "learning_rate": 5.072046109510087e-06, "loss": 0.0136, "step": 12950 }, { "epoch": 7.492795389048991, "grad_norm": 0.0114585617557168, "learning_rate": 5.014409221902018e-06, "loss": 0.0207, "step": 13000 }, { "epoch": 7.521613832853026, "grad_norm": 72.99742889404297, "learning_rate": 4.956772334293949e-06, "loss": 0.031, "step": 13050 }, { "epoch": 7.550432276657061, "grad_norm": 0.003232621820643544, "learning_rate": 4.899135446685879e-06, "loss": 0.0328, "step": 13100 }, { "epoch": 7.579250720461095, "grad_norm": 32.08872985839844, "learning_rate": 4.84149855907781e-06, "loss": 0.025, "step": 13150 }, { "epoch": 7.60806916426513, "grad_norm": 101.10234832763672, "learning_rate": 4.783861671469741e-06, "loss": 0.0514, "step": 13200 }, { "epoch": 7.636887608069165, "grad_norm": 0.02673684060573578, "learning_rate": 4.726224783861672e-06, "loss": 0.0423, "step": 13250 }, { "epoch": 7.665706051873199, "grad_norm": 0.09100785851478577, "learning_rate": 4.668587896253602e-06, "loss": 0.0128, "step": 13300 }, { "epoch": 7.694524495677234, "grad_norm": 0.014528327621519566, "learning_rate": 4.610951008645534e-06, "loss": 0.0401, "step": 13350 }, { "epoch": 7.7233429394812685, "grad_norm": 0.005020576063543558, "learning_rate": 4.553314121037464e-06, "loss": 0.0407, "step": 13400 }, { "epoch": 7.752161383285302, "grad_norm": 0.006958292331546545, "learning_rate": 4.495677233429396e-06, "loss": 0.0355, "step": 13450 }, { "epoch": 7.780979827089337, "grad_norm": 0.005842278711497784, "learning_rate": 4.438040345821326e-06, "loss": 0.0928, "step": 13500 }, { "epoch": 7.8097982708933715, "grad_norm": 0.0038527839351445436, "learning_rate": 4.380403458213257e-06, "loss": 0.049, "step": 13550 }, { "epoch": 7.838616714697406, "grad_norm": 0.10149471461772919, "learning_rate": 4.322766570605187e-06, "loss": 0.0537, "step": 13600 }, { "epoch": 7.867435158501441, "grad_norm": 0.20959939062595367, "learning_rate": 4.265129682997119e-06, "loss": 0.0314, "step": 13650 }, { "epoch": 7.8962536023054755, "grad_norm": 0.15081147849559784, "learning_rate": 4.207492795389049e-06, "loss": 0.0347, "step": 13700 }, { "epoch": 7.92507204610951, "grad_norm": 0.0020617288537323475, "learning_rate": 4.149855907780981e-06, "loss": 0.0193, "step": 13750 }, { "epoch": 7.953890489913545, "grad_norm": 1.0864264965057373, "learning_rate": 4.092219020172911e-06, "loss": 0.0307, "step": 13800 }, { "epoch": 7.982708933717579, "grad_norm": 0.8607079386711121, "learning_rate": 4.0345821325648425e-06, "loss": 0.0353, "step": 13850 }, { "epoch": 8.0, "eval_accuracy": 0.9432194840755153, "eval_loss": 0.4078510105609894, "eval_runtime": 37.2866, "eval_samples_per_second": 186.099, "eval_steps_per_second": 11.64, "step": 13880 }, { "epoch": 8.011527377521613, "grad_norm": 1.9995282888412476, "learning_rate": 3.976945244956772e-06, "loss": 0.0205, "step": 13900 }, { "epoch": 8.040345821325648, "grad_norm": 0.00989406555891037, "learning_rate": 3.919308357348704e-06, "loss": 0.0358, "step": 13950 }, { "epoch": 8.069164265129682, "grad_norm": 0.029149776324629784, "learning_rate": 3.861671469740634e-06, "loss": 0.0201, "step": 14000 }, { "epoch": 8.097982708933717, "grad_norm": 0.013825014233589172, "learning_rate": 3.804034582132565e-06, "loss": 0.0201, "step": 14050 }, { "epoch": 8.126801152737752, "grad_norm": 0.002229891484603286, "learning_rate": 3.746397694524496e-06, "loss": 0.0152, "step": 14100 }, { "epoch": 8.155619596541786, "grad_norm": 0.22877122461795807, "learning_rate": 3.6887608069164266e-06, "loss": 0.017, "step": 14150 }, { "epoch": 8.184438040345821, "grad_norm": 0.014974354766309261, "learning_rate": 3.6311239193083576e-06, "loss": 0.01, "step": 14200 }, { "epoch": 8.213256484149856, "grad_norm": 49.01778030395508, "learning_rate": 3.5734870317002885e-06, "loss": 0.0324, "step": 14250 }, { "epoch": 8.24207492795389, "grad_norm": 2.350428581237793, "learning_rate": 3.5158501440922195e-06, "loss": 0.0119, "step": 14300 }, { "epoch": 8.270893371757925, "grad_norm": 0.037689752876758575, "learning_rate": 3.4582132564841505e-06, "loss": 0.0205, "step": 14350 }, { "epoch": 8.29971181556196, "grad_norm": 0.15642857551574707, "learning_rate": 3.400576368876081e-06, "loss": 0.0346, "step": 14400 }, { "epoch": 8.328530259365994, "grad_norm": 0.0028990712016820908, "learning_rate": 3.3429394812680116e-06, "loss": 0.0316, "step": 14450 }, { "epoch": 8.357348703170029, "grad_norm": 42.16065979003906, "learning_rate": 3.2853025936599425e-06, "loss": 0.021, "step": 14500 }, { "epoch": 8.386167146974064, "grad_norm": 0.1258460283279419, "learning_rate": 3.2276657060518735e-06, "loss": 0.0258, "step": 14550 }, { "epoch": 8.414985590778098, "grad_norm": 0.015242321416735649, "learning_rate": 3.1700288184438045e-06, "loss": 0.0116, "step": 14600 }, { "epoch": 8.443804034582133, "grad_norm": 10.062729835510254, "learning_rate": 3.1123919308357354e-06, "loss": 0.0279, "step": 14650 }, { "epoch": 8.472622478386167, "grad_norm": 0.005074836779385805, "learning_rate": 3.0547550432276656e-06, "loss": 0.0294, "step": 14700 }, { "epoch": 8.501440922190202, "grad_norm": 0.029905997216701508, "learning_rate": 2.9971181556195965e-06, "loss": 0.0352, "step": 14750 }, { "epoch": 8.530259365994237, "grad_norm": 134.86546325683594, "learning_rate": 2.9394812680115275e-06, "loss": 0.0458, "step": 14800 }, { "epoch": 8.559077809798271, "grad_norm": 0.03388785198330879, "learning_rate": 2.8818443804034585e-06, "loss": 0.0014, "step": 14850 }, { "epoch": 8.587896253602306, "grad_norm": 0.0030293685849756002, "learning_rate": 2.8242074927953894e-06, "loss": 0.0098, "step": 14900 }, { "epoch": 8.61671469740634, "grad_norm": 0.008076900616288185, "learning_rate": 2.7665706051873204e-06, "loss": 0.0196, "step": 14950 }, { "epoch": 8.645533141210375, "grad_norm": 3.2185356616973877, "learning_rate": 2.708933717579251e-06, "loss": 0.0191, "step": 15000 }, { "epoch": 8.67435158501441, "grad_norm": 0.04700352996587753, "learning_rate": 2.6512968299711815e-06, "loss": 0.021, "step": 15050 }, { "epoch": 8.703170028818445, "grad_norm": 0.07113181054592133, "learning_rate": 2.5936599423631124e-06, "loss": 0.0325, "step": 15100 }, { "epoch": 8.73198847262248, "grad_norm": 36.98723220825195, "learning_rate": 2.5360230547550434e-06, "loss": 0.0391, "step": 15150 }, { "epoch": 8.760806916426514, "grad_norm": 0.005969559773802757, "learning_rate": 2.4783861671469744e-06, "loss": 0.0041, "step": 15200 }, { "epoch": 8.789625360230547, "grad_norm": 0.05540904030203819, "learning_rate": 2.420749279538905e-06, "loss": 0.0175, "step": 15250 }, { "epoch": 8.818443804034581, "grad_norm": 5.187469482421875, "learning_rate": 2.363112391930836e-06, "loss": 0.0574, "step": 15300 }, { "epoch": 8.847262247838616, "grad_norm": 0.02656296268105507, "learning_rate": 2.305475504322767e-06, "loss": 0.0076, "step": 15350 }, { "epoch": 8.87608069164265, "grad_norm": 62.193111419677734, "learning_rate": 2.247838616714698e-06, "loss": 0.0349, "step": 15400 }, { "epoch": 8.904899135446685, "grad_norm": 0.0021070409566164017, "learning_rate": 2.1902017291066284e-06, "loss": 0.0124, "step": 15450 }, { "epoch": 8.93371757925072, "grad_norm": 0.007772780489176512, "learning_rate": 2.1325648414985593e-06, "loss": 0.0116, "step": 15500 }, { "epoch": 8.962536023054755, "grad_norm": 0.012445935048162937, "learning_rate": 2.0749279538904903e-06, "loss": 0.0432, "step": 15550 }, { "epoch": 8.99135446685879, "grad_norm": 0.005035657435655594, "learning_rate": 2.0172910662824213e-06, "loss": 0.0311, "step": 15600 }, { "epoch": 9.0, "eval_accuracy": 0.9409136763222367, "eval_loss": 0.45458710193634033, "eval_runtime": 37.2035, "eval_samples_per_second": 186.515, "eval_steps_per_second": 11.666, "step": 15615 }, { "epoch": 9.020172910662824, "grad_norm": 0.31200042366981506, "learning_rate": 1.959654178674352e-06, "loss": 0.0366, "step": 15650 }, { "epoch": 9.048991354466859, "grad_norm": 0.0019912293646484613, "learning_rate": 1.9020172910662826e-06, "loss": 0.0083, "step": 15700 }, { "epoch": 9.077809798270893, "grad_norm": 0.00532187195494771, "learning_rate": 1.8443804034582133e-06, "loss": 0.0064, "step": 15750 }, { "epoch": 9.106628242074928, "grad_norm": 0.0012970577226951718, "learning_rate": 1.7867435158501443e-06, "loss": 0.0279, "step": 15800 }, { "epoch": 9.135446685878962, "grad_norm": 0.002228269586339593, "learning_rate": 1.7291066282420752e-06, "loss": 0.0084, "step": 15850 }, { "epoch": 9.164265129682997, "grad_norm": 0.006390728056430817, "learning_rate": 1.6714697406340058e-06, "loss": 0.0265, "step": 15900 }, { "epoch": 9.193083573487032, "grad_norm": 0.00681735435500741, "learning_rate": 1.6138328530259367e-06, "loss": 0.0183, "step": 15950 }, { "epoch": 9.221902017291066, "grad_norm": 0.0014511954504996538, "learning_rate": 1.5561959654178677e-06, "loss": 0.0135, "step": 16000 }, { "epoch": 9.250720461095101, "grad_norm": 0.05415060743689537, "learning_rate": 1.4985590778097983e-06, "loss": 0.0243, "step": 16050 }, { "epoch": 9.279538904899136, "grad_norm": 0.003793516429141164, "learning_rate": 1.4409221902017292e-06, "loss": 0.031, "step": 16100 }, { "epoch": 9.30835734870317, "grad_norm": 0.0018004026496782899, "learning_rate": 1.3832853025936602e-06, "loss": 0.0193, "step": 16150 }, { "epoch": 9.337175792507205, "grad_norm": 0.9138274192810059, "learning_rate": 1.3256484149855907e-06, "loss": 0.021, "step": 16200 }, { "epoch": 9.36599423631124, "grad_norm": 0.0023099486716091633, "learning_rate": 1.2680115273775217e-06, "loss": 0.0052, "step": 16250 }, { "epoch": 9.394812680115274, "grad_norm": 0.004157908260822296, "learning_rate": 1.2103746397694525e-06, "loss": 0.016, "step": 16300 }, { "epoch": 9.423631123919309, "grad_norm": 0.00460004573687911, "learning_rate": 1.1527377521613834e-06, "loss": 0.0248, "step": 16350 }, { "epoch": 9.452449567723344, "grad_norm": 0.024985365569591522, "learning_rate": 1.0951008645533142e-06, "loss": 0.0312, "step": 16400 }, { "epoch": 9.481268011527378, "grad_norm": 0.09961481392383575, "learning_rate": 1.0374639769452451e-06, "loss": 0.041, "step": 16450 }, { "epoch": 9.510086455331413, "grad_norm": 0.004411065485328436, "learning_rate": 9.79827089337176e-07, "loss": 0.0116, "step": 16500 }, { "epoch": 9.538904899135447, "grad_norm": 0.0155422892421484, "learning_rate": 9.221902017291067e-07, "loss": 0.0178, "step": 16550 }, { "epoch": 9.56772334293948, "grad_norm": 0.003015684662386775, "learning_rate": 8.645533141210376e-07, "loss": 0.0107, "step": 16600 }, { "epoch": 9.596541786743515, "grad_norm": 0.17505097389221191, "learning_rate": 8.069164265129684e-07, "loss": 0.0057, "step": 16650 }, { "epoch": 9.62536023054755, "grad_norm": 0.02643350511789322, "learning_rate": 7.492795389048991e-07, "loss": 0.0131, "step": 16700 }, { "epoch": 9.654178674351584, "grad_norm": 0.0013237950624898076, "learning_rate": 6.916426512968301e-07, "loss": 0.0009, "step": 16750 }, { "epoch": 9.682997118155619, "grad_norm": 0.10223102569580078, "learning_rate": 6.340057636887609e-07, "loss": 0.0116, "step": 16800 }, { "epoch": 9.711815561959654, "grad_norm": 0.018011650070548058, "learning_rate": 5.763688760806917e-07, "loss": 0.0551, "step": 16850 }, { "epoch": 9.740634005763688, "grad_norm": 0.017454462125897408, "learning_rate": 5.187319884726226e-07, "loss": 0.0011, "step": 16900 }, { "epoch": 9.769452449567723, "grad_norm": 0.0015755228232592344, "learning_rate": 4.6109510086455333e-07, "loss": 0.013, "step": 16950 }, { "epoch": 9.798270893371757, "grad_norm": 0.009967944584786892, "learning_rate": 4.034582132564842e-07, "loss": 0.0189, "step": 17000 }, { "epoch": 9.827089337175792, "grad_norm": 31.352365493774414, "learning_rate": 3.4582132564841505e-07, "loss": 0.0139, "step": 17050 }, { "epoch": 9.855907780979827, "grad_norm": 0.010829967446625233, "learning_rate": 2.8818443804034586e-07, "loss": 0.0091, "step": 17100 }, { "epoch": 9.884726224783861, "grad_norm": 0.002866011345759034, "learning_rate": 2.3054755043227666e-07, "loss": 0.0264, "step": 17150 }, { "epoch": 9.913544668587896, "grad_norm": 0.0031210549641400576, "learning_rate": 1.7291066282420752e-07, "loss": 0.0137, "step": 17200 }, { "epoch": 9.94236311239193, "grad_norm": 0.020828669890761375, "learning_rate": 1.1527377521613833e-07, "loss": 0.0216, "step": 17250 }, { "epoch": 9.971181556195965, "grad_norm": 0.13391831517219543, "learning_rate": 5.7636887608069166e-08, "loss": 0.0065, "step": 17300 }, { "epoch": 10.0, "grad_norm": 0.0037497931625694036, "learning_rate": 0.0, "loss": 0.009, "step": 17350 }, { "epoch": 10.0, "eval_accuracy": 0.9424989191526156, "eval_loss": 0.4487462341785431, "eval_runtime": 37.4007, "eval_samples_per_second": 185.531, "eval_steps_per_second": 11.604, "step": 17350 } ], "logging_steps": 50, "max_steps": 17350, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 3.6512244667008e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }