| { | |
| "best_metric": 0.943651823029255, | |
| "best_model_checkpoint": "./phobert_results/checkpoint-12145", | |
| "epoch": 10.0, | |
| "eval_steps": 500, | |
| "global_step": 17350, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02881844380403458, | |
| "grad_norm": 4.970236301422119, | |
| "learning_rate": 1.994236311239193e-05, | |
| "loss": 1.9031, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05763688760806916, | |
| "grad_norm": 9.052480697631836, | |
| "learning_rate": 1.9884726224783863e-05, | |
| "loss": 1.7452, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.08645533141210375, | |
| "grad_norm": 10.461433410644531, | |
| "learning_rate": 1.9827089337175795e-05, | |
| "loss": 1.6609, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.11527377521613832, | |
| "grad_norm": 9.097572326660156, | |
| "learning_rate": 1.9769452449567724e-05, | |
| "loss": 1.5037, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1440922190201729, | |
| "grad_norm": 11.013162612915039, | |
| "learning_rate": 1.9711815561959656e-05, | |
| "loss": 1.4436, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1729106628242075, | |
| "grad_norm": 10.979734420776367, | |
| "learning_rate": 1.9654178674351588e-05, | |
| "loss": 1.3187, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2017291066282421, | |
| "grad_norm": 12.8245849609375, | |
| "learning_rate": 1.9596541786743517e-05, | |
| "loss": 1.2895, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.23054755043227665, | |
| "grad_norm": 13.686446189880371, | |
| "learning_rate": 1.953890489913545e-05, | |
| "loss": 1.2117, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.25936599423631124, | |
| "grad_norm": 11.00157356262207, | |
| "learning_rate": 1.9481268011527378e-05, | |
| "loss": 1.2032, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2881844380403458, | |
| "grad_norm": 13.596818923950195, | |
| "learning_rate": 1.942363112391931e-05, | |
| "loss": 1.1336, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3170028818443804, | |
| "grad_norm": 13.817182540893555, | |
| "learning_rate": 1.936599423631124e-05, | |
| "loss": 1.1091, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.345821325648415, | |
| "grad_norm": 6.913839817047119, | |
| "learning_rate": 1.930835734870317e-05, | |
| "loss": 1.0896, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3746397694524496, | |
| "grad_norm": 11.378829002380371, | |
| "learning_rate": 1.9250720461095104e-05, | |
| "loss": 0.9832, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.4034582132564842, | |
| "grad_norm": 16.737525939941406, | |
| "learning_rate": 1.9193083573487033e-05, | |
| "loss": 0.978, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4322766570605187, | |
| "grad_norm": 12.594534873962402, | |
| "learning_rate": 1.9135446685878965e-05, | |
| "loss": 0.9757, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4610951008645533, | |
| "grad_norm": 11.076528549194336, | |
| "learning_rate": 1.9077809798270894e-05, | |
| "loss": 0.9533, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4899135446685879, | |
| "grad_norm": 12.495336532592773, | |
| "learning_rate": 1.9020172910662826e-05, | |
| "loss": 0.9399, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5187319884726225, | |
| "grad_norm": 10.985849380493164, | |
| "learning_rate": 1.8962536023054755e-05, | |
| "loss": 0.8982, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.547550432276657, | |
| "grad_norm": 13.00562572479248, | |
| "learning_rate": 1.8904899135446687e-05, | |
| "loss": 0.9525, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5763688760806917, | |
| "grad_norm": 13.221843719482422, | |
| "learning_rate": 1.884726224783862e-05, | |
| "loss": 0.9063, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6051873198847262, | |
| "grad_norm": 15.107306480407715, | |
| "learning_rate": 1.878962536023055e-05, | |
| "loss": 0.8538, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6340057636887608, | |
| "grad_norm": 16.602313995361328, | |
| "learning_rate": 1.873198847262248e-05, | |
| "loss": 0.8626, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6628242074927954, | |
| "grad_norm": 13.856719017028809, | |
| "learning_rate": 1.867435158501441e-05, | |
| "loss": 0.8186, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.69164265129683, | |
| "grad_norm": 15.631819725036621, | |
| "learning_rate": 1.861671469740634e-05, | |
| "loss": 0.8228, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7204610951008645, | |
| "grad_norm": 25.166427612304688, | |
| "learning_rate": 1.855907780979827e-05, | |
| "loss": 0.837, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.7492795389048992, | |
| "grad_norm": 18.48745346069336, | |
| "learning_rate": 1.8501440922190203e-05, | |
| "loss": 0.7323, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7780979827089337, | |
| "grad_norm": 19.641643524169922, | |
| "learning_rate": 1.8443804034582135e-05, | |
| "loss": 0.8067, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.8069164265129684, | |
| "grad_norm": 18.942670822143555, | |
| "learning_rate": 1.8386167146974067e-05, | |
| "loss": 0.8028, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.8357348703170029, | |
| "grad_norm": 17.313879013061523, | |
| "learning_rate": 1.8328530259365996e-05, | |
| "loss": 0.7598, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.8645533141210374, | |
| "grad_norm": 10.607074737548828, | |
| "learning_rate": 1.8270893371757928e-05, | |
| "loss": 0.8624, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8933717579250721, | |
| "grad_norm": 12.565463066101074, | |
| "learning_rate": 1.8213256484149857e-05, | |
| "loss": 0.7614, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.9221902017291066, | |
| "grad_norm": 21.289403915405273, | |
| "learning_rate": 1.815561959654179e-05, | |
| "loss": 0.7644, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.9510086455331412, | |
| "grad_norm": 17.60152816772461, | |
| "learning_rate": 1.8097982708933718e-05, | |
| "loss": 0.7546, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9798270893371758, | |
| "grad_norm": 17.39754867553711, | |
| "learning_rate": 1.804034582132565e-05, | |
| "loss": 0.7933, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.7976653696498055, | |
| "eval_loss": 0.6221615076065063, | |
| "eval_runtime": 37.0936, | |
| "eval_samples_per_second": 187.067, | |
| "eval_steps_per_second": 11.7, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 1.0086455331412103, | |
| "grad_norm": 18.93373680114746, | |
| "learning_rate": 1.7982708933717582e-05, | |
| "loss": 0.7119, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.037463976945245, | |
| "grad_norm": 15.846963882446289, | |
| "learning_rate": 1.792507204610951e-05, | |
| "loss": 0.5235, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.0662824207492796, | |
| "grad_norm": 9.528020858764648, | |
| "learning_rate": 1.7867435158501444e-05, | |
| "loss": 0.5441, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.0951008645533142, | |
| "grad_norm": 22.303991317749023, | |
| "learning_rate": 1.7809798270893372e-05, | |
| "loss": 0.5471, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.1239193083573487, | |
| "grad_norm": 13.73653507232666, | |
| "learning_rate": 1.7752161383285305e-05, | |
| "loss": 0.5555, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.1527377521613833, | |
| "grad_norm": 16.266706466674805, | |
| "learning_rate": 1.7694524495677234e-05, | |
| "loss": 0.6031, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.181556195965418, | |
| "grad_norm": 12.598456382751465, | |
| "learning_rate": 1.7636887608069166e-05, | |
| "loss": 0.5668, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.2103746397694524, | |
| "grad_norm": 14.683968544006348, | |
| "learning_rate": 1.7579250720461095e-05, | |
| "loss": 0.5748, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.239193083573487, | |
| "grad_norm": 15.514312744140625, | |
| "learning_rate": 1.7521613832853027e-05, | |
| "loss": 0.5473, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.2680115273775217, | |
| "grad_norm": 12.627583503723145, | |
| "learning_rate": 1.746397694524496e-05, | |
| "loss": 0.4907, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.2968299711815563, | |
| "grad_norm": 9.212458610534668, | |
| "learning_rate": 1.7406340057636888e-05, | |
| "loss": 0.5064, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.3256484149855907, | |
| "grad_norm": 9.678365707397461, | |
| "learning_rate": 1.734870317002882e-05, | |
| "loss": 0.5415, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.3544668587896254, | |
| "grad_norm": 26.177536010742188, | |
| "learning_rate": 1.729106628242075e-05, | |
| "loss": 0.4821, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.38328530259366, | |
| "grad_norm": 11.909614562988281, | |
| "learning_rate": 1.723342939481268e-05, | |
| "loss": 0.4962, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.4121037463976944, | |
| "grad_norm": 16.271272659301758, | |
| "learning_rate": 1.717579250720461e-05, | |
| "loss": 0.5287, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.440922190201729, | |
| "grad_norm": 25.189302444458008, | |
| "learning_rate": 1.7118155619596542e-05, | |
| "loss": 0.5096, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.4697406340057637, | |
| "grad_norm": 23.7978515625, | |
| "learning_rate": 1.7060518731988475e-05, | |
| "loss": 0.4859, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.4985590778097984, | |
| "grad_norm": 6.646437168121338, | |
| "learning_rate": 1.7002881844380407e-05, | |
| "loss": 0.4467, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.527377521613833, | |
| "grad_norm": 18.11911964416504, | |
| "learning_rate": 1.6945244956772336e-05, | |
| "loss": 0.4939, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.5561959654178674, | |
| "grad_norm": 17.764862060546875, | |
| "learning_rate": 1.6887608069164268e-05, | |
| "loss": 0.4876, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.585014409221902, | |
| "grad_norm": 16.96575927734375, | |
| "learning_rate": 1.6829971181556197e-05, | |
| "loss": 0.4426, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.6138328530259365, | |
| "grad_norm": 20.623233795166016, | |
| "learning_rate": 1.6772334293948126e-05, | |
| "loss": 0.5638, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.6426512968299711, | |
| "grad_norm": 16.03717041015625, | |
| "learning_rate": 1.6714697406340058e-05, | |
| "loss": 0.4558, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.6714697406340058, | |
| "grad_norm": 27.027576446533203, | |
| "learning_rate": 1.665706051873199e-05, | |
| "loss": 0.4227, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.7002881844380404, | |
| "grad_norm": 16.704652786254883, | |
| "learning_rate": 1.6599423631123922e-05, | |
| "loss": 0.4521, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.729106628242075, | |
| "grad_norm": 19.68289566040039, | |
| "learning_rate": 1.654178674351585e-05, | |
| "loss": 0.4419, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.7579250720461095, | |
| "grad_norm": 13.75253677368164, | |
| "learning_rate": 1.6484149855907783e-05, | |
| "loss": 0.4428, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.7867435158501441, | |
| "grad_norm": 44.23019790649414, | |
| "learning_rate": 1.6426512968299712e-05, | |
| "loss": 0.4416, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.8155619596541785, | |
| "grad_norm": 14.690359115600586, | |
| "learning_rate": 1.6368876080691644e-05, | |
| "loss": 0.3885, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.8443804034582132, | |
| "grad_norm": 12.004839897155762, | |
| "learning_rate": 1.6311239193083573e-05, | |
| "loss": 0.3744, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.8731988472622478, | |
| "grad_norm": 10.716139793395996, | |
| "learning_rate": 1.6253602305475506e-05, | |
| "loss": 0.398, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.9020172910662825, | |
| "grad_norm": 17.37589454650879, | |
| "learning_rate": 1.6195965417867438e-05, | |
| "loss": 0.4649, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.9308357348703171, | |
| "grad_norm": 13.404267311096191, | |
| "learning_rate": 1.613832853025937e-05, | |
| "loss": 0.3985, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.9596541786743515, | |
| "grad_norm": 14.350135803222656, | |
| "learning_rate": 1.60806916426513e-05, | |
| "loss": 0.4282, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.9884726224783862, | |
| "grad_norm": 20.828453063964844, | |
| "learning_rate": 1.6023054755043228e-05, | |
| "loss": 0.3764, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.8651102464332037, | |
| "eval_loss": 0.4364924728870392, | |
| "eval_runtime": 37.9177, | |
| "eval_samples_per_second": 183.002, | |
| "eval_steps_per_second": 11.446, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.0172910662824206, | |
| "grad_norm": 10.62299633026123, | |
| "learning_rate": 1.596541786743516e-05, | |
| "loss": 0.3511, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.0461095100864553, | |
| "grad_norm": 21.19414710998535, | |
| "learning_rate": 1.590778097982709e-05, | |
| "loss": 0.3166, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.07492795389049, | |
| "grad_norm": 9.603983879089355, | |
| "learning_rate": 1.585014409221902e-05, | |
| "loss": 0.257, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.1037463976945245, | |
| "grad_norm": 14.70697021484375, | |
| "learning_rate": 1.5792507204610953e-05, | |
| "loss": 0.2606, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.132564841498559, | |
| "grad_norm": 25.806259155273438, | |
| "learning_rate": 1.5734870317002882e-05, | |
| "loss": 0.2505, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.161383285302594, | |
| "grad_norm": 20.40934181213379, | |
| "learning_rate": 1.5677233429394814e-05, | |
| "loss": 0.3198, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.1902017291066285, | |
| "grad_norm": 34.36139678955078, | |
| "learning_rate": 1.5619596541786747e-05, | |
| "loss": 0.2689, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.2190201729106627, | |
| "grad_norm": 23.640975952148438, | |
| "learning_rate": 1.5561959654178675e-05, | |
| "loss": 0.2744, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.2478386167146973, | |
| "grad_norm": 15.340378761291504, | |
| "learning_rate": 1.5504322766570608e-05, | |
| "loss": 0.332, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.276657060518732, | |
| "grad_norm": 29.153423309326172, | |
| "learning_rate": 1.5446685878962537e-05, | |
| "loss": 0.2945, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.3054755043227666, | |
| "grad_norm": 9.021677017211914, | |
| "learning_rate": 1.538904899135447e-05, | |
| "loss": 0.284, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.3342939481268012, | |
| "grad_norm": 32.12895584106445, | |
| "learning_rate": 1.5331412103746398e-05, | |
| "loss": 0.2429, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.363112391930836, | |
| "grad_norm": 36.411773681640625, | |
| "learning_rate": 1.527377521613833e-05, | |
| "loss": 0.2723, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.39193083573487, | |
| "grad_norm": 18.912567138671875, | |
| "learning_rate": 1.521613832853026e-05, | |
| "loss": 0.2702, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.4207492795389047, | |
| "grad_norm": 17.08293342590332, | |
| "learning_rate": 1.5158501440922191e-05, | |
| "loss": 0.3458, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.4495677233429394, | |
| "grad_norm": 24.516544342041016, | |
| "learning_rate": 1.5100864553314123e-05, | |
| "loss": 0.2542, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.478386167146974, | |
| "grad_norm": 3.7018916606903076, | |
| "learning_rate": 1.5043227665706052e-05, | |
| "loss": 0.2958, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.5072046109510087, | |
| "grad_norm": 24.21662712097168, | |
| "learning_rate": 1.4985590778097984e-05, | |
| "loss": 0.2693, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.5360230547550433, | |
| "grad_norm": 36.85274124145508, | |
| "learning_rate": 1.4927953890489915e-05, | |
| "loss": 0.2769, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.564841498559078, | |
| "grad_norm": 26.23798179626465, | |
| "learning_rate": 1.4870317002881847e-05, | |
| "loss": 0.2443, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.5936599423631126, | |
| "grad_norm": 40.98781204223633, | |
| "learning_rate": 1.4812680115273776e-05, | |
| "loss": 0.2789, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.6224783861671472, | |
| "grad_norm": 33.100223541259766, | |
| "learning_rate": 1.4755043227665706e-05, | |
| "loss": 0.2776, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.6512968299711814, | |
| "grad_norm": 158.5146942138672, | |
| "learning_rate": 1.4697406340057639e-05, | |
| "loss": 0.2852, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.680115273775216, | |
| "grad_norm": 19.569507598876953, | |
| "learning_rate": 1.4639769452449568e-05, | |
| "loss": 0.2743, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.7089337175792507, | |
| "grad_norm": 15.44676685333252, | |
| "learning_rate": 1.45821325648415e-05, | |
| "loss": 0.2746, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.7377521613832854, | |
| "grad_norm": 11.913484573364258, | |
| "learning_rate": 1.452449567723343e-05, | |
| "loss": 0.2309, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 2.76657060518732, | |
| "grad_norm": 17.194211959838867, | |
| "learning_rate": 1.4466858789625363e-05, | |
| "loss": 0.2497, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.795389048991354, | |
| "grad_norm": 1.0725657939910889, | |
| "learning_rate": 1.4409221902017291e-05, | |
| "loss": 0.2903, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 2.824207492795389, | |
| "grad_norm": 10.74899959564209, | |
| "learning_rate": 1.4351585014409224e-05, | |
| "loss": 0.2968, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.8530259365994235, | |
| "grad_norm": 29.902647018432617, | |
| "learning_rate": 1.4293948126801154e-05, | |
| "loss": 0.2935, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 2.881844380403458, | |
| "grad_norm": 11.288642883300781, | |
| "learning_rate": 1.4236311239193086e-05, | |
| "loss": 0.2928, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.910662824207493, | |
| "grad_norm": 14.153397560119629, | |
| "learning_rate": 1.4178674351585015e-05, | |
| "loss": 0.1799, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 2.9394812680115274, | |
| "grad_norm": 15.782369613647461, | |
| "learning_rate": 1.4121037463976946e-05, | |
| "loss": 0.2154, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.968299711815562, | |
| "grad_norm": 3.8253252506256104, | |
| "learning_rate": 1.4063400576368878e-05, | |
| "loss": 0.2628, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 2.9971181556195967, | |
| "grad_norm": 6.50337553024292, | |
| "learning_rate": 1.4005763688760807e-05, | |
| "loss": 0.2194, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.9028678483931403, | |
| "eval_loss": 0.3793924152851105, | |
| "eval_runtime": 36.9217, | |
| "eval_samples_per_second": 187.938, | |
| "eval_steps_per_second": 11.755, | |
| "step": 5205 | |
| }, | |
| { | |
| "epoch": 3.025936599423631, | |
| "grad_norm": 37.30765914916992, | |
| "learning_rate": 1.3948126801152739e-05, | |
| "loss": 0.1946, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 3.0547550432276656, | |
| "grad_norm": 38.157142639160156, | |
| "learning_rate": 1.389048991354467e-05, | |
| "loss": 0.1694, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.0835734870317, | |
| "grad_norm": 22.551681518554688, | |
| "learning_rate": 1.3832853025936602e-05, | |
| "loss": 0.2151, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 3.112391930835735, | |
| "grad_norm": 3.1120803356170654, | |
| "learning_rate": 1.377521613832853e-05, | |
| "loss": 0.1945, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.1412103746397695, | |
| "grad_norm": 24.61436653137207, | |
| "learning_rate": 1.3717579250720463e-05, | |
| "loss": 0.13, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 3.170028818443804, | |
| "grad_norm": 15.621273040771484, | |
| "learning_rate": 1.3659942363112394e-05, | |
| "loss": 0.1603, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.1988472622478388, | |
| "grad_norm": 10.645062446594238, | |
| "learning_rate": 1.3602305475504324e-05, | |
| "loss": 0.1814, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 3.227665706051873, | |
| "grad_norm": 37.47835159301758, | |
| "learning_rate": 1.3544668587896255e-05, | |
| "loss": 0.1527, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.2564841498559076, | |
| "grad_norm": 0.6854817271232605, | |
| "learning_rate": 1.3487031700288185e-05, | |
| "loss": 0.209, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 3.2853025936599423, | |
| "grad_norm": 1.353789210319519, | |
| "learning_rate": 1.3429394812680117e-05, | |
| "loss": 0.1542, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 3.314121037463977, | |
| "grad_norm": 9.835870742797852, | |
| "learning_rate": 1.3371757925072046e-05, | |
| "loss": 0.2365, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 3.3429394812680115, | |
| "grad_norm": 13.969425201416016, | |
| "learning_rate": 1.3314121037463979e-05, | |
| "loss": 0.1715, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 3.371757925072046, | |
| "grad_norm": 21.74212074279785, | |
| "learning_rate": 1.3256484149855909e-05, | |
| "loss": 0.1726, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 3.400576368876081, | |
| "grad_norm": 6.624606609344482, | |
| "learning_rate": 1.319884726224784e-05, | |
| "loss": 0.1997, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 3.4293948126801155, | |
| "grad_norm": 24.518829345703125, | |
| "learning_rate": 1.314121037463977e-05, | |
| "loss": 0.1027, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 3.4582132564841497, | |
| "grad_norm": 1.1079559326171875, | |
| "learning_rate": 1.3083573487031702e-05, | |
| "loss": 0.1708, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.4870317002881843, | |
| "grad_norm": 89.6827163696289, | |
| "learning_rate": 1.3025936599423631e-05, | |
| "loss": 0.2025, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 3.515850144092219, | |
| "grad_norm": 3.5336265563964844, | |
| "learning_rate": 1.2968299711815563e-05, | |
| "loss": 0.188, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 3.5446685878962536, | |
| "grad_norm": 3.5633816719055176, | |
| "learning_rate": 1.2910662824207494e-05, | |
| "loss": 0.1638, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 3.5734870317002883, | |
| "grad_norm": 1.318528175354004, | |
| "learning_rate": 1.2853025936599423e-05, | |
| "loss": 0.142, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 3.602305475504323, | |
| "grad_norm": 18.405723571777344, | |
| "learning_rate": 1.2795389048991355e-05, | |
| "loss": 0.1707, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 3.631123919308357, | |
| "grad_norm": 2.065215587615967, | |
| "learning_rate": 1.2737752161383286e-05, | |
| "loss": 0.1858, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 3.6599423631123917, | |
| "grad_norm": 1.1660606861114502, | |
| "learning_rate": 1.2680115273775218e-05, | |
| "loss": 0.1406, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 3.6887608069164264, | |
| "grad_norm": 0.28399357199668884, | |
| "learning_rate": 1.2622478386167147e-05, | |
| "loss": 0.1398, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 3.717579250720461, | |
| "grad_norm": 9.657651901245117, | |
| "learning_rate": 1.2564841498559079e-05, | |
| "loss": 0.1322, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 3.7463976945244957, | |
| "grad_norm": 42.729248046875, | |
| "learning_rate": 1.250720461095101e-05, | |
| "loss": 0.2353, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.7752161383285303, | |
| "grad_norm": 1.589701771736145, | |
| "learning_rate": 1.2449567723342942e-05, | |
| "loss": 0.2107, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.804034582132565, | |
| "grad_norm": 9.485612869262695, | |
| "learning_rate": 1.239193083573487e-05, | |
| "loss": 0.2303, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.8328530259365996, | |
| "grad_norm": 15.697033882141113, | |
| "learning_rate": 1.2334293948126803e-05, | |
| "loss": 0.1243, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.8616714697406342, | |
| "grad_norm": 13.186999320983887, | |
| "learning_rate": 1.2276657060518733e-05, | |
| "loss": 0.1802, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.8904899135446684, | |
| "grad_norm": 0.07961810380220413, | |
| "learning_rate": 1.2219020172910662e-05, | |
| "loss": 0.1776, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.919308357348703, | |
| "grad_norm": 19.50320053100586, | |
| "learning_rate": 1.2161383285302594e-05, | |
| "loss": 0.1825, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.9481268011527377, | |
| "grad_norm": 0.9466652870178223, | |
| "learning_rate": 1.2103746397694525e-05, | |
| "loss": 0.247, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.9769452449567724, | |
| "grad_norm": 0.9334861636161804, | |
| "learning_rate": 1.2046109510086457e-05, | |
| "loss": 0.1826, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.9244847960801268, | |
| "eval_loss": 0.34017300605773926, | |
| "eval_runtime": 37.1217, | |
| "eval_samples_per_second": 186.926, | |
| "eval_steps_per_second": 11.691, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 4.005763688760807, | |
| "grad_norm": 38.7332763671875, | |
| "learning_rate": 1.1988472622478386e-05, | |
| "loss": 0.1516, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 4.034582132564841, | |
| "grad_norm": 1.7459765672683716, | |
| "learning_rate": 1.1930835734870318e-05, | |
| "loss": 0.1305, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.063400576368876, | |
| "grad_norm": 1.0959562063217163, | |
| "learning_rate": 1.1873198847262249e-05, | |
| "loss": 0.1054, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 4.0922190201729105, | |
| "grad_norm": 26.440959930419922, | |
| "learning_rate": 1.1815561959654181e-05, | |
| "loss": 0.1426, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 4.121037463976945, | |
| "grad_norm": 0.2572319805622101, | |
| "learning_rate": 1.175792507204611e-05, | |
| "loss": 0.0893, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 4.14985590778098, | |
| "grad_norm": 2.3376266956329346, | |
| "learning_rate": 1.1700288184438042e-05, | |
| "loss": 0.1211, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 4.178674351585014, | |
| "grad_norm": 27.235889434814453, | |
| "learning_rate": 1.1642651296829973e-05, | |
| "loss": 0.1125, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 4.207492795389049, | |
| "grad_norm": 0.729739785194397, | |
| "learning_rate": 1.1585014409221902e-05, | |
| "loss": 0.1263, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 4.236311239193084, | |
| "grad_norm": 95.56041717529297, | |
| "learning_rate": 1.1527377521613834e-05, | |
| "loss": 0.1196, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 4.265129682997118, | |
| "grad_norm": 0.6992059350013733, | |
| "learning_rate": 1.1469740634005764e-05, | |
| "loss": 0.1289, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 4.293948126801153, | |
| "grad_norm": 5.825665473937988, | |
| "learning_rate": 1.1412103746397697e-05, | |
| "loss": 0.0838, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 4.322766570605188, | |
| "grad_norm": 12.848553657531738, | |
| "learning_rate": 1.1354466858789625e-05, | |
| "loss": 0.109, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 4.351585014409222, | |
| "grad_norm": 0.7549734711647034, | |
| "learning_rate": 1.1296829971181558e-05, | |
| "loss": 0.1136, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 4.380403458213257, | |
| "grad_norm": 11.627622604370117, | |
| "learning_rate": 1.1239193083573488e-05, | |
| "loss": 0.1387, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 4.409221902017291, | |
| "grad_norm": 2.81147837638855, | |
| "learning_rate": 1.1181556195965419e-05, | |
| "loss": 0.0966, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 4.438040345821325, | |
| "grad_norm": 68.66207885742188, | |
| "learning_rate": 1.112391930835735e-05, | |
| "loss": 0.1238, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 4.46685878962536, | |
| "grad_norm": 0.8019499182701111, | |
| "learning_rate": 1.1066282420749282e-05, | |
| "loss": 0.1062, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 4.495677233429395, | |
| "grad_norm": 48.5037841796875, | |
| "learning_rate": 1.100864553314121e-05, | |
| "loss": 0.0796, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 4.524495677233429, | |
| "grad_norm": 1.2362786531448364, | |
| "learning_rate": 1.0951008645533141e-05, | |
| "loss": 0.1219, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 4.553314121037464, | |
| "grad_norm": 8.287176132202148, | |
| "learning_rate": 1.0893371757925073e-05, | |
| "loss": 0.1102, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 4.582132564841499, | |
| "grad_norm": 0.1790919452905655, | |
| "learning_rate": 1.0835734870317004e-05, | |
| "loss": 0.0486, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 4.610951008645533, | |
| "grad_norm": 36.70028305053711, | |
| "learning_rate": 1.0778097982708934e-05, | |
| "loss": 0.1433, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 4.639769452449568, | |
| "grad_norm": 22.68642807006836, | |
| "learning_rate": 1.0720461095100865e-05, | |
| "loss": 0.1598, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 4.6685878962536025, | |
| "grad_norm": 0.10349202156066895, | |
| "learning_rate": 1.0662824207492797e-05, | |
| "loss": 0.0742, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 4.697406340057637, | |
| "grad_norm": 14.959641456604004, | |
| "learning_rate": 1.0605187319884726e-05, | |
| "loss": 0.1328, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 4.726224783861672, | |
| "grad_norm": 1.050648808479309, | |
| "learning_rate": 1.0547550432276658e-05, | |
| "loss": 0.137, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 4.755043227665706, | |
| "grad_norm": 2.9457409381866455, | |
| "learning_rate": 1.0489913544668589e-05, | |
| "loss": 0.0624, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 4.78386167146974, | |
| "grad_norm": 0.5071529746055603, | |
| "learning_rate": 1.0432276657060521e-05, | |
| "loss": 0.1035, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 4.812680115273775, | |
| "grad_norm": 19.162033081054688, | |
| "learning_rate": 1.037463976945245e-05, | |
| "loss": 0.1705, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 4.8414985590778095, | |
| "grad_norm": 1.8972758054733276, | |
| "learning_rate": 1.031700288184438e-05, | |
| "loss": 0.1543, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 4.870317002881844, | |
| "grad_norm": 0.05384368821978569, | |
| "learning_rate": 1.0259365994236313e-05, | |
| "loss": 0.1049, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 4.899135446685879, | |
| "grad_norm": 9.819040298461914, | |
| "learning_rate": 1.0201729106628241e-05, | |
| "loss": 0.1305, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 4.927953890489913, | |
| "grad_norm": 2.4009037017822266, | |
| "learning_rate": 1.0144092219020174e-05, | |
| "loss": 0.1294, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 4.956772334293948, | |
| "grad_norm": 44.24007034301758, | |
| "learning_rate": 1.0086455331412104e-05, | |
| "loss": 0.1563, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 4.985590778097983, | |
| "grad_norm": 0.5839857459068298, | |
| "learning_rate": 1.0028818443804036e-05, | |
| "loss": 0.1116, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.9311139933708027, | |
| "eval_loss": 0.3880128860473633, | |
| "eval_runtime": 37.4244, | |
| "eval_samples_per_second": 185.414, | |
| "eval_steps_per_second": 11.597, | |
| "step": 8675 | |
| }, | |
| { | |
| "epoch": 5.014409221902017, | |
| "grad_norm": 23.385534286499023, | |
| "learning_rate": 9.971181556195965e-06, | |
| "loss": 0.1166, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 5.043227665706052, | |
| "grad_norm": 0.4506791830062866, | |
| "learning_rate": 9.913544668587897e-06, | |
| "loss": 0.057, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 5.072046109510087, | |
| "grad_norm": 49.61991882324219, | |
| "learning_rate": 9.855907780979828e-06, | |
| "loss": 0.0734, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 5.100864553314121, | |
| "grad_norm": 0.0602734349668026, | |
| "learning_rate": 9.798270893371759e-06, | |
| "loss": 0.0681, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 5.129682997118156, | |
| "grad_norm": 0.023745020851492882, | |
| "learning_rate": 9.740634005763689e-06, | |
| "loss": 0.072, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 5.1585014409221905, | |
| "grad_norm": 31.497879028320312, | |
| "learning_rate": 9.68299711815562e-06, | |
| "loss": 0.0474, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 5.187319884726225, | |
| "grad_norm": 1.734765648841858, | |
| "learning_rate": 9.625360230547552e-06, | |
| "loss": 0.0688, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 5.216138328530259, | |
| "grad_norm": 98.79872131347656, | |
| "learning_rate": 9.567723342939482e-06, | |
| "loss": 0.1255, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 5.244956772334294, | |
| "grad_norm": 6.518857002258301, | |
| "learning_rate": 9.510086455331413e-06, | |
| "loss": 0.0974, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 5.273775216138328, | |
| "grad_norm": 31.412202835083008, | |
| "learning_rate": 9.452449567723344e-06, | |
| "loss": 0.0412, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 5.302593659942363, | |
| "grad_norm": 2.6872849464416504, | |
| "learning_rate": 9.394812680115276e-06, | |
| "loss": 0.0679, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 5.3314121037463975, | |
| "grad_norm": 28.229368209838867, | |
| "learning_rate": 9.337175792507205e-06, | |
| "loss": 0.0817, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 5.360230547550432, | |
| "grad_norm": 0.24797913432121277, | |
| "learning_rate": 9.279538904899135e-06, | |
| "loss": 0.0768, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 5.389048991354467, | |
| "grad_norm": 0.3930692970752716, | |
| "learning_rate": 9.221902017291067e-06, | |
| "loss": 0.0945, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 5.417867435158501, | |
| "grad_norm": 3.4596047401428223, | |
| "learning_rate": 9.164265129682998e-06, | |
| "loss": 0.0729, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 5.446685878962536, | |
| "grad_norm": 3.12446665763855, | |
| "learning_rate": 9.106628242074928e-06, | |
| "loss": 0.0695, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 5.475504322766571, | |
| "grad_norm": 0.043711356818675995, | |
| "learning_rate": 9.048991354466859e-06, | |
| "loss": 0.0656, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 5.504322766570605, | |
| "grad_norm": 63.43326950073242, | |
| "learning_rate": 8.991354466858791e-06, | |
| "loss": 0.0684, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 5.53314121037464, | |
| "grad_norm": 2.399331569671631, | |
| "learning_rate": 8.933717579250722e-06, | |
| "loss": 0.1072, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 5.561959654178675, | |
| "grad_norm": 0.027092283591628075, | |
| "learning_rate": 8.876080691642652e-06, | |
| "loss": 0.1134, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 5.590778097982709, | |
| "grad_norm": 0.9653186798095703, | |
| "learning_rate": 8.818443804034583e-06, | |
| "loss": 0.0805, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 5.619596541786743, | |
| "grad_norm": 0.43673309683799744, | |
| "learning_rate": 8.760806916426513e-06, | |
| "loss": 0.0879, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 5.648414985590778, | |
| "grad_norm": 0.3906983435153961, | |
| "learning_rate": 8.703170028818444e-06, | |
| "loss": 0.1068, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 5.677233429394812, | |
| "grad_norm": 16.26913070678711, | |
| "learning_rate": 8.645533141210375e-06, | |
| "loss": 0.1136, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 5.706051873198847, | |
| "grad_norm": 2.990417242050171, | |
| "learning_rate": 8.587896253602305e-06, | |
| "loss": 0.0682, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 5.734870317002882, | |
| "grad_norm": 0.40273919701576233, | |
| "learning_rate": 8.530259365994237e-06, | |
| "loss": 0.0974, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 5.763688760806916, | |
| "grad_norm": 0.6563950777053833, | |
| "learning_rate": 8.472622478386168e-06, | |
| "loss": 0.0612, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 5.792507204610951, | |
| "grad_norm": 0.04713226109743118, | |
| "learning_rate": 8.414985590778098e-06, | |
| "loss": 0.058, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 5.821325648414986, | |
| "grad_norm": 155.84222412109375, | |
| "learning_rate": 8.357348703170029e-06, | |
| "loss": 0.0765, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 5.85014409221902, | |
| "grad_norm": 0.14352348446846008, | |
| "learning_rate": 8.299711815561961e-06, | |
| "loss": 0.1251, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 5.878962536023055, | |
| "grad_norm": 0.05221036821603775, | |
| "learning_rate": 8.242074927953892e-06, | |
| "loss": 0.0724, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 5.9077809798270895, | |
| "grad_norm": 44.994606018066406, | |
| "learning_rate": 8.184438040345822e-06, | |
| "loss": 0.0597, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 5.936599423631124, | |
| "grad_norm": 57.5409049987793, | |
| "learning_rate": 8.126801152737753e-06, | |
| "loss": 0.0747, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 5.965417867435159, | |
| "grad_norm": 40.75551986694336, | |
| "learning_rate": 8.069164265129685e-06, | |
| "loss": 0.11, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 5.994236311239193, | |
| "grad_norm": 21.53359603881836, | |
| "learning_rate": 8.011527377521614e-06, | |
| "loss": 0.0721, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_accuracy": 0.9414901282605562, | |
| "eval_loss": 0.37702837586402893, | |
| "eval_runtime": 37.3615, | |
| "eval_samples_per_second": 185.726, | |
| "eval_steps_per_second": 11.616, | |
| "step": 10410 | |
| }, | |
| { | |
| "epoch": 6.023054755043228, | |
| "grad_norm": 11.95662784576416, | |
| "learning_rate": 7.953890489913544e-06, | |
| "loss": 0.0444, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 6.051873198847262, | |
| "grad_norm": 0.17058134078979492, | |
| "learning_rate": 7.896253602305477e-06, | |
| "loss": 0.0413, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 6.0806916426512965, | |
| "grad_norm": 0.023899082094430923, | |
| "learning_rate": 7.838616714697407e-06, | |
| "loss": 0.0283, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 6.109510086455331, | |
| "grad_norm": 5.194774627685547, | |
| "learning_rate": 7.780979827089338e-06, | |
| "loss": 0.029, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 6.138328530259366, | |
| "grad_norm": 47.05008316040039, | |
| "learning_rate": 7.723342939481268e-06, | |
| "loss": 0.0639, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 6.1671469740634, | |
| "grad_norm": 1.5677424669265747, | |
| "learning_rate": 7.665706051873199e-06, | |
| "loss": 0.013, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 6.195965417867435, | |
| "grad_norm": 0.054807789623737335, | |
| "learning_rate": 7.60806916426513e-06, | |
| "loss": 0.0314, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 6.22478386167147, | |
| "grad_norm": 0.005258807446807623, | |
| "learning_rate": 7.550432276657062e-06, | |
| "loss": 0.0773, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 6.253602305475504, | |
| "grad_norm": 6.556185722351074, | |
| "learning_rate": 7.492795389048992e-06, | |
| "loss": 0.055, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 6.282420749279539, | |
| "grad_norm": 0.009452180936932564, | |
| "learning_rate": 7.4351585014409235e-06, | |
| "loss": 0.0428, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 6.311239193083574, | |
| "grad_norm": 1.1048903465270996, | |
| "learning_rate": 7.377521613832853e-06, | |
| "loss": 0.023, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 6.340057636887608, | |
| "grad_norm": 0.10645721852779388, | |
| "learning_rate": 7.319884726224784e-06, | |
| "loss": 0.0403, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 6.368876080691643, | |
| "grad_norm": 0.03314146026968956, | |
| "learning_rate": 7.262247838616715e-06, | |
| "loss": 0.0632, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 6.3976945244956775, | |
| "grad_norm": 0.010250965133309364, | |
| "learning_rate": 7.204610951008646e-06, | |
| "loss": 0.0461, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 6.426512968299712, | |
| "grad_norm": 0.663324773311615, | |
| "learning_rate": 7.146974063400577e-06, | |
| "loss": 0.0706, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 6.455331412103746, | |
| "grad_norm": 0.016964536160230637, | |
| "learning_rate": 7.089337175792508e-06, | |
| "loss": 0.0623, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 6.484149855907781, | |
| "grad_norm": 0.8198705911636353, | |
| "learning_rate": 7.031700288184439e-06, | |
| "loss": 0.0486, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 6.512968299711815, | |
| "grad_norm": 0.014807182364165783, | |
| "learning_rate": 6.9740634005763696e-06, | |
| "loss": 0.0125, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 6.54178674351585, | |
| "grad_norm": 101.77655029296875, | |
| "learning_rate": 6.916426512968301e-06, | |
| "loss": 0.0436, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 6.5706051873198845, | |
| "grad_norm": 0.8024958968162537, | |
| "learning_rate": 6.8587896253602315e-06, | |
| "loss": 0.0525, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 6.599423631123919, | |
| "grad_norm": 0.010165904648602009, | |
| "learning_rate": 6.801152737752162e-06, | |
| "loss": 0.0255, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 6.628242074927954, | |
| "grad_norm": 0.31986117362976074, | |
| "learning_rate": 6.743515850144093e-06, | |
| "loss": 0.0765, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 6.6570605187319885, | |
| "grad_norm": 0.01843302696943283, | |
| "learning_rate": 6.685878962536023e-06, | |
| "loss": 0.0396, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 6.685878962536023, | |
| "grad_norm": 51.702552795410156, | |
| "learning_rate": 6.6282420749279545e-06, | |
| "loss": 0.1131, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 6.714697406340058, | |
| "grad_norm": 0.10061511397361755, | |
| "learning_rate": 6.570605187319885e-06, | |
| "loss": 0.0528, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 6.743515850144092, | |
| "grad_norm": 0.010481205768883228, | |
| "learning_rate": 6.512968299711816e-06, | |
| "loss": 0.0537, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 6.772334293948127, | |
| "grad_norm": 0.07575374096632004, | |
| "learning_rate": 6.455331412103747e-06, | |
| "loss": 0.0478, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 6.801152737752162, | |
| "grad_norm": 0.0058944206684827805, | |
| "learning_rate": 6.3976945244956775e-06, | |
| "loss": 0.065, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 6.829971181556196, | |
| "grad_norm": 0.9527530074119568, | |
| "learning_rate": 6.340057636887609e-06, | |
| "loss": 0.0505, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 6.858789625360231, | |
| "grad_norm": 22.32554054260254, | |
| "learning_rate": 6.2824207492795395e-06, | |
| "loss": 0.0858, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 6.887608069164266, | |
| "grad_norm": 0.04536249861121178, | |
| "learning_rate": 6.224783861671471e-06, | |
| "loss": 0.0618, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 6.916426512968299, | |
| "grad_norm": 0.030509620904922485, | |
| "learning_rate": 6.167146974063401e-06, | |
| "loss": 0.06, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 6.945244956772334, | |
| "grad_norm": 4.554119110107422, | |
| "learning_rate": 6.109510086455331e-06, | |
| "loss": 0.0478, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 6.974063400576369, | |
| "grad_norm": 0.03110508993268013, | |
| "learning_rate": 6.0518731988472625e-06, | |
| "loss": 0.0486, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_accuracy": 0.943651823029255, | |
| "eval_loss": 0.4001205563545227, | |
| "eval_runtime": 43.9647, | |
| "eval_samples_per_second": 157.831, | |
| "eval_steps_per_second": 9.872, | |
| "step": 12145 | |
| }, | |
| { | |
| "epoch": 7.002881844380403, | |
| "grad_norm": 3.505014181137085, | |
| "learning_rate": 5.994236311239193e-06, | |
| "loss": 0.0951, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 7.031700288184438, | |
| "grad_norm": 0.1990944892168045, | |
| "learning_rate": 5.9365994236311244e-06, | |
| "loss": 0.0131, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 7.060518731988473, | |
| "grad_norm": 0.19344107806682587, | |
| "learning_rate": 5.878962536023055e-06, | |
| "loss": 0.0247, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 7.089337175792507, | |
| "grad_norm": 0.5416840314865112, | |
| "learning_rate": 5.821325648414986e-06, | |
| "loss": 0.069, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 7.118155619596542, | |
| "grad_norm": 0.029165951535105705, | |
| "learning_rate": 5.763688760806917e-06, | |
| "loss": 0.029, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 7.1469740634005765, | |
| "grad_norm": 0.0034562216605991125, | |
| "learning_rate": 5.706051873198848e-06, | |
| "loss": 0.0337, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 7.175792507204611, | |
| "grad_norm": 0.048965733498334885, | |
| "learning_rate": 5.648414985590779e-06, | |
| "loss": 0.0214, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 7.204610951008646, | |
| "grad_norm": 0.05087200552225113, | |
| "learning_rate": 5.590778097982709e-06, | |
| "loss": 0.0071, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 7.23342939481268, | |
| "grad_norm": 0.0026253710966557264, | |
| "learning_rate": 5.533141210374641e-06, | |
| "loss": 0.0254, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 7.262247838616715, | |
| "grad_norm": 0.01383883971720934, | |
| "learning_rate": 5.4755043227665705e-06, | |
| "loss": 0.0354, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 7.291066282420749, | |
| "grad_norm": 0.06278753280639648, | |
| "learning_rate": 5.417867435158502e-06, | |
| "loss": 0.0499, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 7.3198847262247835, | |
| "grad_norm": 91.43658447265625, | |
| "learning_rate": 5.360230547550432e-06, | |
| "loss": 0.0162, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 7.348703170028818, | |
| "grad_norm": 0.042137544602155685, | |
| "learning_rate": 5.302593659942363e-06, | |
| "loss": 0.0183, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 7.377521613832853, | |
| "grad_norm": 0.5085373520851135, | |
| "learning_rate": 5.244956772334294e-06, | |
| "loss": 0.0623, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 7.406340057636887, | |
| "grad_norm": 0.07125350832939148, | |
| "learning_rate": 5.187319884726225e-06, | |
| "loss": 0.0403, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 7.435158501440922, | |
| "grad_norm": 0.0029079755768179893, | |
| "learning_rate": 5.129682997118156e-06, | |
| "loss": 0.0294, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 7.463976945244957, | |
| "grad_norm": 0.19476212561130524, | |
| "learning_rate": 5.072046109510087e-06, | |
| "loss": 0.0136, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 7.492795389048991, | |
| "grad_norm": 0.0114585617557168, | |
| "learning_rate": 5.014409221902018e-06, | |
| "loss": 0.0207, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 7.521613832853026, | |
| "grad_norm": 72.99742889404297, | |
| "learning_rate": 4.956772334293949e-06, | |
| "loss": 0.031, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 7.550432276657061, | |
| "grad_norm": 0.003232621820643544, | |
| "learning_rate": 4.899135446685879e-06, | |
| "loss": 0.0328, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 7.579250720461095, | |
| "grad_norm": 32.08872985839844, | |
| "learning_rate": 4.84149855907781e-06, | |
| "loss": 0.025, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 7.60806916426513, | |
| "grad_norm": 101.10234832763672, | |
| "learning_rate": 4.783861671469741e-06, | |
| "loss": 0.0514, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 7.636887608069165, | |
| "grad_norm": 0.02673684060573578, | |
| "learning_rate": 4.726224783861672e-06, | |
| "loss": 0.0423, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 7.665706051873199, | |
| "grad_norm": 0.09100785851478577, | |
| "learning_rate": 4.668587896253602e-06, | |
| "loss": 0.0128, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 7.694524495677234, | |
| "grad_norm": 0.014528327621519566, | |
| "learning_rate": 4.610951008645534e-06, | |
| "loss": 0.0401, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 7.7233429394812685, | |
| "grad_norm": 0.005020576063543558, | |
| "learning_rate": 4.553314121037464e-06, | |
| "loss": 0.0407, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 7.752161383285302, | |
| "grad_norm": 0.006958292331546545, | |
| "learning_rate": 4.495677233429396e-06, | |
| "loss": 0.0355, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 7.780979827089337, | |
| "grad_norm": 0.005842278711497784, | |
| "learning_rate": 4.438040345821326e-06, | |
| "loss": 0.0928, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 7.8097982708933715, | |
| "grad_norm": 0.0038527839351445436, | |
| "learning_rate": 4.380403458213257e-06, | |
| "loss": 0.049, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 7.838616714697406, | |
| "grad_norm": 0.10149471461772919, | |
| "learning_rate": 4.322766570605187e-06, | |
| "loss": 0.0537, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 7.867435158501441, | |
| "grad_norm": 0.20959939062595367, | |
| "learning_rate": 4.265129682997119e-06, | |
| "loss": 0.0314, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 7.8962536023054755, | |
| "grad_norm": 0.15081147849559784, | |
| "learning_rate": 4.207492795389049e-06, | |
| "loss": 0.0347, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 7.92507204610951, | |
| "grad_norm": 0.0020617288537323475, | |
| "learning_rate": 4.149855907780981e-06, | |
| "loss": 0.0193, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 7.953890489913545, | |
| "grad_norm": 1.0864264965057373, | |
| "learning_rate": 4.092219020172911e-06, | |
| "loss": 0.0307, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 7.982708933717579, | |
| "grad_norm": 0.8607079386711121, | |
| "learning_rate": 4.0345821325648425e-06, | |
| "loss": 0.0353, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.9432194840755153, | |
| "eval_loss": 0.4078510105609894, | |
| "eval_runtime": 37.2866, | |
| "eval_samples_per_second": 186.099, | |
| "eval_steps_per_second": 11.64, | |
| "step": 13880 | |
| }, | |
| { | |
| "epoch": 8.011527377521613, | |
| "grad_norm": 1.9995282888412476, | |
| "learning_rate": 3.976945244956772e-06, | |
| "loss": 0.0205, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 8.040345821325648, | |
| "grad_norm": 0.00989406555891037, | |
| "learning_rate": 3.919308357348704e-06, | |
| "loss": 0.0358, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 8.069164265129682, | |
| "grad_norm": 0.029149776324629784, | |
| "learning_rate": 3.861671469740634e-06, | |
| "loss": 0.0201, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 8.097982708933717, | |
| "grad_norm": 0.013825014233589172, | |
| "learning_rate": 3.804034582132565e-06, | |
| "loss": 0.0201, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 8.126801152737752, | |
| "grad_norm": 0.002229891484603286, | |
| "learning_rate": 3.746397694524496e-06, | |
| "loss": 0.0152, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 8.155619596541786, | |
| "grad_norm": 0.22877122461795807, | |
| "learning_rate": 3.6887608069164266e-06, | |
| "loss": 0.017, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 8.184438040345821, | |
| "grad_norm": 0.014974354766309261, | |
| "learning_rate": 3.6311239193083576e-06, | |
| "loss": 0.01, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 8.213256484149856, | |
| "grad_norm": 49.01778030395508, | |
| "learning_rate": 3.5734870317002885e-06, | |
| "loss": 0.0324, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 8.24207492795389, | |
| "grad_norm": 2.350428581237793, | |
| "learning_rate": 3.5158501440922195e-06, | |
| "loss": 0.0119, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 8.270893371757925, | |
| "grad_norm": 0.037689752876758575, | |
| "learning_rate": 3.4582132564841505e-06, | |
| "loss": 0.0205, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 8.29971181556196, | |
| "grad_norm": 0.15642857551574707, | |
| "learning_rate": 3.400576368876081e-06, | |
| "loss": 0.0346, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 8.328530259365994, | |
| "grad_norm": 0.0028990712016820908, | |
| "learning_rate": 3.3429394812680116e-06, | |
| "loss": 0.0316, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 8.357348703170029, | |
| "grad_norm": 42.16065979003906, | |
| "learning_rate": 3.2853025936599425e-06, | |
| "loss": 0.021, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 8.386167146974064, | |
| "grad_norm": 0.1258460283279419, | |
| "learning_rate": 3.2276657060518735e-06, | |
| "loss": 0.0258, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 8.414985590778098, | |
| "grad_norm": 0.015242321416735649, | |
| "learning_rate": 3.1700288184438045e-06, | |
| "loss": 0.0116, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 8.443804034582133, | |
| "grad_norm": 10.062729835510254, | |
| "learning_rate": 3.1123919308357354e-06, | |
| "loss": 0.0279, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 8.472622478386167, | |
| "grad_norm": 0.005074836779385805, | |
| "learning_rate": 3.0547550432276656e-06, | |
| "loss": 0.0294, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 8.501440922190202, | |
| "grad_norm": 0.029905997216701508, | |
| "learning_rate": 2.9971181556195965e-06, | |
| "loss": 0.0352, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 8.530259365994237, | |
| "grad_norm": 134.86546325683594, | |
| "learning_rate": 2.9394812680115275e-06, | |
| "loss": 0.0458, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 8.559077809798271, | |
| "grad_norm": 0.03388785198330879, | |
| "learning_rate": 2.8818443804034585e-06, | |
| "loss": 0.0014, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 8.587896253602306, | |
| "grad_norm": 0.0030293685849756002, | |
| "learning_rate": 2.8242074927953894e-06, | |
| "loss": 0.0098, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 8.61671469740634, | |
| "grad_norm": 0.008076900616288185, | |
| "learning_rate": 2.7665706051873204e-06, | |
| "loss": 0.0196, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 8.645533141210375, | |
| "grad_norm": 3.2185356616973877, | |
| "learning_rate": 2.708933717579251e-06, | |
| "loss": 0.0191, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 8.67435158501441, | |
| "grad_norm": 0.04700352996587753, | |
| "learning_rate": 2.6512968299711815e-06, | |
| "loss": 0.021, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 8.703170028818445, | |
| "grad_norm": 0.07113181054592133, | |
| "learning_rate": 2.5936599423631124e-06, | |
| "loss": 0.0325, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 8.73198847262248, | |
| "grad_norm": 36.98723220825195, | |
| "learning_rate": 2.5360230547550434e-06, | |
| "loss": 0.0391, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 8.760806916426514, | |
| "grad_norm": 0.005969559773802757, | |
| "learning_rate": 2.4783861671469744e-06, | |
| "loss": 0.0041, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 8.789625360230547, | |
| "grad_norm": 0.05540904030203819, | |
| "learning_rate": 2.420749279538905e-06, | |
| "loss": 0.0175, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 8.818443804034581, | |
| "grad_norm": 5.187469482421875, | |
| "learning_rate": 2.363112391930836e-06, | |
| "loss": 0.0574, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 8.847262247838616, | |
| "grad_norm": 0.02656296268105507, | |
| "learning_rate": 2.305475504322767e-06, | |
| "loss": 0.0076, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 8.87608069164265, | |
| "grad_norm": 62.193111419677734, | |
| "learning_rate": 2.247838616714698e-06, | |
| "loss": 0.0349, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 8.904899135446685, | |
| "grad_norm": 0.0021070409566164017, | |
| "learning_rate": 2.1902017291066284e-06, | |
| "loss": 0.0124, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 8.93371757925072, | |
| "grad_norm": 0.007772780489176512, | |
| "learning_rate": 2.1325648414985593e-06, | |
| "loss": 0.0116, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 8.962536023054755, | |
| "grad_norm": 0.012445935048162937, | |
| "learning_rate": 2.0749279538904903e-06, | |
| "loss": 0.0432, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 8.99135446685879, | |
| "grad_norm": 0.005035657435655594, | |
| "learning_rate": 2.0172910662824213e-06, | |
| "loss": 0.0311, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_accuracy": 0.9409136763222367, | |
| "eval_loss": 0.45458710193634033, | |
| "eval_runtime": 37.2035, | |
| "eval_samples_per_second": 186.515, | |
| "eval_steps_per_second": 11.666, | |
| "step": 15615 | |
| }, | |
| { | |
| "epoch": 9.020172910662824, | |
| "grad_norm": 0.31200042366981506, | |
| "learning_rate": 1.959654178674352e-06, | |
| "loss": 0.0366, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 9.048991354466859, | |
| "grad_norm": 0.0019912293646484613, | |
| "learning_rate": 1.9020172910662826e-06, | |
| "loss": 0.0083, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 9.077809798270893, | |
| "grad_norm": 0.00532187195494771, | |
| "learning_rate": 1.8443804034582133e-06, | |
| "loss": 0.0064, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 9.106628242074928, | |
| "grad_norm": 0.0012970577226951718, | |
| "learning_rate": 1.7867435158501443e-06, | |
| "loss": 0.0279, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 9.135446685878962, | |
| "grad_norm": 0.002228269586339593, | |
| "learning_rate": 1.7291066282420752e-06, | |
| "loss": 0.0084, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 9.164265129682997, | |
| "grad_norm": 0.006390728056430817, | |
| "learning_rate": 1.6714697406340058e-06, | |
| "loss": 0.0265, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 9.193083573487032, | |
| "grad_norm": 0.00681735435500741, | |
| "learning_rate": 1.6138328530259367e-06, | |
| "loss": 0.0183, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 9.221902017291066, | |
| "grad_norm": 0.0014511954504996538, | |
| "learning_rate": 1.5561959654178677e-06, | |
| "loss": 0.0135, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 9.250720461095101, | |
| "grad_norm": 0.05415060743689537, | |
| "learning_rate": 1.4985590778097983e-06, | |
| "loss": 0.0243, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 9.279538904899136, | |
| "grad_norm": 0.003793516429141164, | |
| "learning_rate": 1.4409221902017292e-06, | |
| "loss": 0.031, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 9.30835734870317, | |
| "grad_norm": 0.0018004026496782899, | |
| "learning_rate": 1.3832853025936602e-06, | |
| "loss": 0.0193, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 9.337175792507205, | |
| "grad_norm": 0.9138274192810059, | |
| "learning_rate": 1.3256484149855907e-06, | |
| "loss": 0.021, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 9.36599423631124, | |
| "grad_norm": 0.0023099486716091633, | |
| "learning_rate": 1.2680115273775217e-06, | |
| "loss": 0.0052, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 9.394812680115274, | |
| "grad_norm": 0.004157908260822296, | |
| "learning_rate": 1.2103746397694525e-06, | |
| "loss": 0.016, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 9.423631123919309, | |
| "grad_norm": 0.00460004573687911, | |
| "learning_rate": 1.1527377521613834e-06, | |
| "loss": 0.0248, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 9.452449567723344, | |
| "grad_norm": 0.024985365569591522, | |
| "learning_rate": 1.0951008645533142e-06, | |
| "loss": 0.0312, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 9.481268011527378, | |
| "grad_norm": 0.09961481392383575, | |
| "learning_rate": 1.0374639769452451e-06, | |
| "loss": 0.041, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 9.510086455331413, | |
| "grad_norm": 0.004411065485328436, | |
| "learning_rate": 9.79827089337176e-07, | |
| "loss": 0.0116, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 9.538904899135447, | |
| "grad_norm": 0.0155422892421484, | |
| "learning_rate": 9.221902017291067e-07, | |
| "loss": 0.0178, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 9.56772334293948, | |
| "grad_norm": 0.003015684662386775, | |
| "learning_rate": 8.645533141210376e-07, | |
| "loss": 0.0107, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 9.596541786743515, | |
| "grad_norm": 0.17505097389221191, | |
| "learning_rate": 8.069164265129684e-07, | |
| "loss": 0.0057, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 9.62536023054755, | |
| "grad_norm": 0.02643350511789322, | |
| "learning_rate": 7.492795389048991e-07, | |
| "loss": 0.0131, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 9.654178674351584, | |
| "grad_norm": 0.0013237950624898076, | |
| "learning_rate": 6.916426512968301e-07, | |
| "loss": 0.0009, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 9.682997118155619, | |
| "grad_norm": 0.10223102569580078, | |
| "learning_rate": 6.340057636887609e-07, | |
| "loss": 0.0116, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 9.711815561959654, | |
| "grad_norm": 0.018011650070548058, | |
| "learning_rate": 5.763688760806917e-07, | |
| "loss": 0.0551, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 9.740634005763688, | |
| "grad_norm": 0.017454462125897408, | |
| "learning_rate": 5.187319884726226e-07, | |
| "loss": 0.0011, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 9.769452449567723, | |
| "grad_norm": 0.0015755228232592344, | |
| "learning_rate": 4.6109510086455333e-07, | |
| "loss": 0.013, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 9.798270893371757, | |
| "grad_norm": 0.009967944584786892, | |
| "learning_rate": 4.034582132564842e-07, | |
| "loss": 0.0189, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 9.827089337175792, | |
| "grad_norm": 31.352365493774414, | |
| "learning_rate": 3.4582132564841505e-07, | |
| "loss": 0.0139, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 9.855907780979827, | |
| "grad_norm": 0.010829967446625233, | |
| "learning_rate": 2.8818443804034586e-07, | |
| "loss": 0.0091, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 9.884726224783861, | |
| "grad_norm": 0.002866011345759034, | |
| "learning_rate": 2.3054755043227666e-07, | |
| "loss": 0.0264, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 9.913544668587896, | |
| "grad_norm": 0.0031210549641400576, | |
| "learning_rate": 1.7291066282420752e-07, | |
| "loss": 0.0137, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 9.94236311239193, | |
| "grad_norm": 0.020828669890761375, | |
| "learning_rate": 1.1527377521613833e-07, | |
| "loss": 0.0216, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 9.971181556195965, | |
| "grad_norm": 0.13391831517219543, | |
| "learning_rate": 5.7636887608069166e-08, | |
| "loss": 0.0065, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.0037497931625694036, | |
| "learning_rate": 0.0, | |
| "loss": 0.009, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 0.9424989191526156, | |
| "eval_loss": 0.4487462341785431, | |
| "eval_runtime": 37.4007, | |
| "eval_samples_per_second": 185.531, | |
| "eval_steps_per_second": 11.604, | |
| "step": 17350 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 17350, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "total_flos": 3.6512244667008e+16, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |