| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.9999432860086583, |
| "eval_steps": 500, |
| "global_step": 52896, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00945233189027733, |
| "grad_norm": 29.5, |
| "learning_rate": 9.960000000000001e-06, |
| "loss": 4.1004, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.01890466378055466, |
| "grad_norm": 20.125, |
| "learning_rate": 9.999448050049255e-06, |
| "loss": 4.0386, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.028356995670831994, |
| "grad_norm": 16.625, |
| "learning_rate": 9.997783447634044e-06, |
| "loss": 4.1263, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.03780932756110932, |
| "grad_norm": 18.5, |
| "learning_rate": 9.995006554320588e-06, |
| "loss": 3.9826, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.047261659451386655, |
| "grad_norm": 18.625, |
| "learning_rate": 9.991117988125487e-06, |
| "loss": 4.0093, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.05671399134166399, |
| "grad_norm": 12.375, |
| "learning_rate": 9.986118614475757e-06, |
| "loss": 3.9503, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.06616632323194133, |
| "grad_norm": 15.8125, |
| "learning_rate": 9.980009546016204e-06, |
| "loss": 4.0245, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.07561865512221864, |
| "grad_norm": 17.5, |
| "learning_rate": 9.972792142361807e-06, |
| "loss": 3.9901, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.08507098701249598, |
| "grad_norm": 22.75, |
| "learning_rate": 9.964468009795128e-06, |
| "loss": 3.9098, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.09452331890277331, |
| "grad_norm": 19.5, |
| "learning_rate": 9.95503900090882e-06, |
| "loss": 3.9788, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.10397565079305064, |
| "grad_norm": 17.375, |
| "learning_rate": 9.944507214193314e-06, |
| "loss": 4.0492, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.11342798268332797, |
| "grad_norm": 16.75, |
| "learning_rate": 9.932874993569803e-06, |
| "loss": 3.9152, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.1228803145736053, |
| "grad_norm": 19.625, |
| "learning_rate": 9.92014492786856e-06, |
| "loss": 4.0289, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.13233264646388265, |
| "grad_norm": 26.75, |
| "learning_rate": 9.906319850252806e-06, |
| "loss": 3.9419, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.14178497835415999, |
| "grad_norm": 17.875, |
| "learning_rate": 9.891402837588142e-06, |
| "loss": 3.9255, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.1512373102444373, |
| "grad_norm": 17.0, |
| "learning_rate": 9.875397209757793e-06, |
| "loss": 4.068, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.16068964213471462, |
| "grad_norm": 22.375, |
| "learning_rate": 9.858306528923734e-06, |
| "loss": 3.9229, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.17014197402499195, |
| "grad_norm": 15.8125, |
| "learning_rate": 9.840134598733906e-06, |
| "loss": 3.8975, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.1795943059152693, |
| "grad_norm": 17.875, |
| "learning_rate": 9.8208854634757e-06, |
| "loss": 3.9159, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.18904663780554662, |
| "grad_norm": 19.0, |
| "learning_rate": 9.800563407175856e-06, |
| "loss": 3.9892, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.19849896969582395, |
| "grad_norm": 24.0, |
| "learning_rate": 9.779172952647035e-06, |
| "loss": 3.9846, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.20795130158610128, |
| "grad_norm": 15.9375, |
| "learning_rate": 9.756718860481235e-06, |
| "loss": 4.0746, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.21740363347637862, |
| "grad_norm": 19.25, |
| "learning_rate": 9.733206127990285e-06, |
| "loss": 3.9736, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.22685596536665595, |
| "grad_norm": 17.25, |
| "learning_rate": 9.708639988093663e-06, |
| "loss": 3.9673, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.23630829725693328, |
| "grad_norm": 17.375, |
| "learning_rate": 9.683025908153868e-06, |
| "loss": 3.9672, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.2457606291472106, |
| "grad_norm": 17.375, |
| "learning_rate": 9.656369588759628e-06, |
| "loss": 3.9812, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.255212961037488, |
| "grad_norm": 17.625, |
| "learning_rate": 9.628676962457194e-06, |
| "loss": 3.9659, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.2646652929277653, |
| "grad_norm": 18.0, |
| "learning_rate": 9.599954192430004e-06, |
| "loss": 3.9614, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.27411762481804264, |
| "grad_norm": 17.125, |
| "learning_rate": 9.570207671127034e-06, |
| "loss": 3.9424, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.28356995670831997, |
| "grad_norm": 24.125, |
| "learning_rate": 9.539444018840107e-06, |
| "loss": 3.9533, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.29302228859859725, |
| "grad_norm": 20.375, |
| "learning_rate": 9.507670082230507e-06, |
| "loss": 4.0344, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.3024746204888746, |
| "grad_norm": 18.375, |
| "learning_rate": 9.474892932805209e-06, |
| "loss": 3.8986, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.3119269523791519, |
| "grad_norm": 19.5, |
| "learning_rate": 9.441119865343054e-06, |
| "loss": 3.9415, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.32137928426942924, |
| "grad_norm": 20.125, |
| "learning_rate": 9.406358396271266e-06, |
| "loss": 3.9542, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.3308316161597066, |
| "grad_norm": 25.375, |
| "learning_rate": 9.370616261992605e-06, |
| "loss": 4.0098, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.3402839480499839, |
| "grad_norm": 19.625, |
| "learning_rate": 9.33390141716358e-06, |
| "loss": 3.9839, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.34973627994026124, |
| "grad_norm": 19.5, |
| "learning_rate": 9.296222032924092e-06, |
| "loss": 3.9886, |
| "step": 9250 |
| }, |
| { |
| "epoch": 0.3591886118305386, |
| "grad_norm": 16.75, |
| "learning_rate": 9.257586495078882e-06, |
| "loss": 3.8992, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.3686409437208159, |
| "grad_norm": 17.5, |
| "learning_rate": 9.21800340223122e-06, |
| "loss": 4.0108, |
| "step": 9750 |
| }, |
| { |
| "epoch": 0.37809327561109324, |
| "grad_norm": 17.375, |
| "learning_rate": 9.177481563869226e-06, |
| "loss": 3.9957, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.38754560750137057, |
| "grad_norm": 19.625, |
| "learning_rate": 9.136029998405253e-06, |
| "loss": 3.966, |
| "step": 10250 |
| }, |
| { |
| "epoch": 0.3969979393916479, |
| "grad_norm": 20.125, |
| "learning_rate": 9.093657931168782e-06, |
| "loss": 4.0057, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.40645027128192524, |
| "grad_norm": 25.5, |
| "learning_rate": 9.050374792353265e-06, |
| "loss": 4.0049, |
| "step": 10750 |
| }, |
| { |
| "epoch": 0.41590260317220257, |
| "grad_norm": 17.5, |
| "learning_rate": 9.006190214917363e-06, |
| "loss": 4.046, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.4253549350624799, |
| "grad_norm": 15.25, |
| "learning_rate": 8.961114032441067e-06, |
| "loss": 4.0138, |
| "step": 11250 |
| }, |
| { |
| "epoch": 0.43480726695275723, |
| "grad_norm": 58.5, |
| "learning_rate": 8.915156276937175e-06, |
| "loss": 4.0145, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.44425959884303456, |
| "grad_norm": 18.25, |
| "learning_rate": 8.868327176618592e-06, |
| "loss": 3.9748, |
| "step": 11750 |
| }, |
| { |
| "epoch": 0.4537119307333119, |
| "grad_norm": 17.0, |
| "learning_rate": 8.82063715362197e-06, |
| "loss": 4.0039, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.46316426262358923, |
| "grad_norm": 17.875, |
| "learning_rate": 8.772096821688194e-06, |
| "loss": 4.1231, |
| "step": 12250 |
| }, |
| { |
| "epoch": 0.47261659451386656, |
| "grad_norm": 19.25, |
| "learning_rate": 8.722716983800226e-06, |
| "loss": 4.0778, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.4820689264041439, |
| "grad_norm": 14.0625, |
| "learning_rate": 8.672508629778809e-06, |
| "loss": 3.9998, |
| "step": 12750 |
| }, |
| { |
| "epoch": 0.4915212582944212, |
| "grad_norm": 20.0, |
| "learning_rate": 8.621482933836634e-06, |
| "loss": 4.0298, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.5009735901846986, |
| "grad_norm": 21.5, |
| "learning_rate": 8.569651252091418e-06, |
| "loss": 3.9807, |
| "step": 13250 |
| }, |
| { |
| "epoch": 0.510425922074976, |
| "grad_norm": 17.75, |
| "learning_rate": 8.517025120038536e-06, |
| "loss": 4.084, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.5198782539652532, |
| "grad_norm": 19.0, |
| "learning_rate": 8.463616249983718e-06, |
| "loss": 4.1373, |
| "step": 13750 |
| }, |
| { |
| "epoch": 0.5293305858555306, |
| "grad_norm": 19.75, |
| "learning_rate": 8.409436528436381e-06, |
| "loss": 4.0691, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.5387829177458079, |
| "grad_norm": 17.25, |
| "learning_rate": 8.354498013464228e-06, |
| "loss": 4.0686, |
| "step": 14250 |
| }, |
| { |
| "epoch": 0.5482352496360853, |
| "grad_norm": 16.875, |
| "learning_rate": 8.298812932009622e-06, |
| "loss": 4.0066, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.5576875815263626, |
| "grad_norm": 19.625, |
| "learning_rate": 8.242393677168406e-06, |
| "loss": 4.0525, |
| "step": 14750 |
| }, |
| { |
| "epoch": 0.5671399134166399, |
| "grad_norm": 20.5, |
| "learning_rate": 8.185252805431732e-06, |
| "loss": 4.0993, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.5765922453069172, |
| "grad_norm": 20.375, |
| "learning_rate": 8.127403033891532e-06, |
| "loss": 4.0902, |
| "step": 15250 |
| }, |
| { |
| "epoch": 0.5860445771971945, |
| "grad_norm": 21.125, |
| "learning_rate": 8.068857237410237e-06, |
| "loss": 4.0273, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.5954969090874719, |
| "grad_norm": 19.875, |
| "learning_rate": 8.00962844575539e-06, |
| "loss": 4.0831, |
| "step": 15750 |
| }, |
| { |
| "epoch": 0.6049492409777492, |
| "grad_norm": 243.0, |
| "learning_rate": 7.949729840699784e-06, |
| "loss": 4.0758, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.6144015728680265, |
| "grad_norm": 17.875, |
| "learning_rate": 7.889174753087767e-06, |
| "loss": 4.0918, |
| "step": 16250 |
| }, |
| { |
| "epoch": 0.6238539047583038, |
| "grad_norm": 19.125, |
| "learning_rate": 7.827976659868368e-06, |
| "loss": 4.0538, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.6333062366485812, |
| "grad_norm": 19.625, |
| "learning_rate": 7.766149181095916e-06, |
| "loss": 4.1164, |
| "step": 16750 |
| }, |
| { |
| "epoch": 0.6427585685388585, |
| "grad_norm": 16.5, |
| "learning_rate": 7.703706076898803e-06, |
| "loss": 4.0626, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.6522109004291359, |
| "grad_norm": 14.375, |
| "learning_rate": 7.640661244417064e-06, |
| "loss": 4.0444, |
| "step": 17250 |
| }, |
| { |
| "epoch": 0.6616632323194132, |
| "grad_norm": 19.0, |
| "learning_rate": 7.577028714709484e-06, |
| "loss": 4.0429, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.6711155642096905, |
| "grad_norm": 17.875, |
| "learning_rate": 7.512822649630893e-06, |
| "loss": 4.0362, |
| "step": 17750 |
| }, |
| { |
| "epoch": 0.6805678960999678, |
| "grad_norm": 15.625, |
| "learning_rate": 7.44805733868033e-06, |
| "loss": 4.0806, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.6900202279902452, |
| "grad_norm": 17.625, |
| "learning_rate": 7.382747195820834e-06, |
| "loss": 4.0933, |
| "step": 18250 |
| }, |
| { |
| "epoch": 0.6994725598805225, |
| "grad_norm": 16.875, |
| "learning_rate": 7.316906756271515e-06, |
| "loss": 4.0495, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.7089248917707999, |
| "grad_norm": 20.375, |
| "learning_rate": 7.250550673272639e-06, |
| "loss": 4.0599, |
| "step": 18750 |
| }, |
| { |
| "epoch": 0.7183772236610771, |
| "grad_norm": 19.375, |
| "learning_rate": 7.1836937148244445e-06, |
| "loss": 4.0653, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.7278295555513545, |
| "grad_norm": 16.5, |
| "learning_rate": 7.1163507604004326e-06, |
| "loss": 4.0266, |
| "step": 19250 |
| }, |
| { |
| "epoch": 0.7372818874416318, |
| "grad_norm": 20.75, |
| "learning_rate": 7.048536797635832e-06, |
| "loss": 4.0484, |
| "step": 19500 |
| }, |
| { |
| "epoch": 0.7467342193319092, |
| "grad_norm": 18.25, |
| "learning_rate": 6.9802669189920005e-06, |
| "loss": 4.043, |
| "step": 19750 |
| }, |
| { |
| "epoch": 0.7561865512221865, |
| "grad_norm": 47.0, |
| "learning_rate": 6.911556318397493e-06, |
| "loss": 4.0716, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.7656388831124639, |
| "grad_norm": 15.625, |
| "learning_rate": 6.8424202878665515e-06, |
| "loss": 4.059, |
| "step": 20250 |
| }, |
| { |
| "epoch": 0.7750912150027411, |
| "grad_norm": 18.5, |
| "learning_rate": 6.772874214095761e-06, |
| "loss": 3.9974, |
| "step": 20500 |
| }, |
| { |
| "epoch": 0.7845435468930185, |
| "grad_norm": 16.0, |
| "learning_rate": 6.702933575039631e-06, |
| "loss": 4.0551, |
| "step": 20750 |
| }, |
| { |
| "epoch": 0.7939958787832958, |
| "grad_norm": 15.5, |
| "learning_rate": 6.6326139364658795e-06, |
| "loss": 4.1337, |
| "step": 21000 |
| }, |
| { |
| "epoch": 0.8034482106735732, |
| "grad_norm": 19.125, |
| "learning_rate": 6.561930948491155e-06, |
| "loss": 4.0849, |
| "step": 21250 |
| }, |
| { |
| "epoch": 0.8129005425638505, |
| "grad_norm": 19.125, |
| "learning_rate": 6.4909003420980065e-06, |
| "loss": 4.069, |
| "step": 21500 |
| }, |
| { |
| "epoch": 0.8223528744541279, |
| "grad_norm": 18.875, |
| "learning_rate": 6.419537925633836e-06, |
| "loss": 4.0218, |
| "step": 21750 |
| }, |
| { |
| "epoch": 0.8318052063444051, |
| "grad_norm": 17.125, |
| "learning_rate": 6.34785958129265e-06, |
| "loss": 3.9901, |
| "step": 22000 |
| }, |
| { |
| "epoch": 0.8412575382346825, |
| "grad_norm": 17.625, |
| "learning_rate": 6.275881261580363e-06, |
| "loss": 4.0088, |
| "step": 22250 |
| }, |
| { |
| "epoch": 0.8507098701249598, |
| "grad_norm": 17.0, |
| "learning_rate": 6.2036189857644616e-06, |
| "loss": 4.0448, |
| "step": 22500 |
| }, |
| { |
| "epoch": 0.8601622020152372, |
| "grad_norm": 15.8125, |
| "learning_rate": 6.131088836308805e-06, |
| "loss": 4.0443, |
| "step": 22750 |
| }, |
| { |
| "epoch": 0.8696145339055145, |
| "grad_norm": 22.875, |
| "learning_rate": 6.058306955294365e-06, |
| "loss": 4.0573, |
| "step": 23000 |
| }, |
| { |
| "epoch": 0.8790668657957919, |
| "grad_norm": 54.0, |
| "learning_rate": 5.9852895408266955e-06, |
| "loss": 4.0054, |
| "step": 23250 |
| }, |
| { |
| "epoch": 0.8885191976860691, |
| "grad_norm": 21.0, |
| "learning_rate": 5.9120528434309245e-06, |
| "loss": 4.0112, |
| "step": 23500 |
| }, |
| { |
| "epoch": 0.8979715295763465, |
| "grad_norm": 19.5, |
| "learning_rate": 5.838613162435106e-06, |
| "loss": 4.0095, |
| "step": 23750 |
| }, |
| { |
| "epoch": 0.9074238614666238, |
| "grad_norm": 22.375, |
| "learning_rate": 5.764986842342675e-06, |
| "loss": 3.9941, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.9168761933569012, |
| "grad_norm": 15.875, |
| "learning_rate": 5.6911902691948786e-06, |
| "loss": 3.9703, |
| "step": 24250 |
| }, |
| { |
| "epoch": 0.9263285252471785, |
| "grad_norm": 18.0, |
| "learning_rate": 5.617239866923945e-06, |
| "loss": 3.9949, |
| "step": 24500 |
| }, |
| { |
| "epoch": 0.9357808571374558, |
| "grad_norm": 18.25, |
| "learning_rate": 5.543152093697826e-06, |
| "loss": 4.0225, |
| "step": 24750 |
| }, |
| { |
| "epoch": 0.9452331890277331, |
| "grad_norm": 17.625, |
| "learning_rate": 5.4689434382573156e-06, |
| "loss": 3.998, |
| "step": 25000 |
| }, |
| { |
| "epoch": 0.9546855209180105, |
| "grad_norm": 16.5, |
| "learning_rate": 5.39463041624638e-06, |
| "loss": 3.9813, |
| "step": 25250 |
| }, |
| { |
| "epoch": 0.9641378528082878, |
| "grad_norm": 17.125, |
| "learning_rate": 5.320229566536474e-06, |
| "loss": 3.9089, |
| "step": 25500 |
| }, |
| { |
| "epoch": 0.9735901846985652, |
| "grad_norm": 17.875, |
| "learning_rate": 5.245757447545706e-06, |
| "loss": 4.0302, |
| "step": 25750 |
| }, |
| { |
| "epoch": 0.9830425165888425, |
| "grad_norm": 17.25, |
| "learning_rate": 5.171230633553656e-06, |
| "loss": 3.9841, |
| "step": 26000 |
| }, |
| { |
| "epoch": 0.9924948484791198, |
| "grad_norm": 16.125, |
| "learning_rate": 5.096665711012646e-06, |
| "loss": 3.9648, |
| "step": 26250 |
| }, |
| { |
| "epoch": 1.0019282757056165, |
| "grad_norm": 21.125, |
| "learning_rate": 5.0220792748563195e-06, |
| "loss": 3.8978, |
| "step": 26500 |
| }, |
| { |
| "epoch": 1.011380607595894, |
| "grad_norm": 25.75, |
| "learning_rate": 4.94748792480632e-06, |
| "loss": 3.4963, |
| "step": 26750 |
| }, |
| { |
| "epoch": 1.0208329394861713, |
| "grad_norm": 17.375, |
| "learning_rate": 4.872908261677911e-06, |
| "loss": 3.6178, |
| "step": 27000 |
| }, |
| { |
| "epoch": 1.0302852713764485, |
| "grad_norm": 25.375, |
| "learning_rate": 4.7983568836853564e-06, |
| "loss": 3.5309, |
| "step": 27250 |
| }, |
| { |
| "epoch": 1.0397376032667258, |
| "grad_norm": 26.625, |
| "learning_rate": 4.723850382747863e-06, |
| "loss": 3.4875, |
| "step": 27500 |
| }, |
| { |
| "epoch": 1.0491899351570033, |
| "grad_norm": 23.25, |
| "learning_rate": 4.649405340796947e-06, |
| "loss": 3.5433, |
| "step": 27750 |
| }, |
| { |
| "epoch": 1.0586422670472806, |
| "grad_norm": 17.125, |
| "learning_rate": 4.575038326086007e-06, |
| "loss": 3.5867, |
| "step": 28000 |
| }, |
| { |
| "epoch": 1.0680945989375579, |
| "grad_norm": 17.875, |
| "learning_rate": 4.500765889502937e-06, |
| "loss": 3.5986, |
| "step": 28250 |
| }, |
| { |
| "epoch": 1.0775469308278351, |
| "grad_norm": 18.0, |
| "learning_rate": 4.426604560886636e-06, |
| "loss": 3.5402, |
| "step": 28500 |
| }, |
| { |
| "epoch": 1.0869992627181126, |
| "grad_norm": 17.75, |
| "learning_rate": 4.3525708453481505e-06, |
| "loss": 3.5184, |
| "step": 28750 |
| }, |
| { |
| "epoch": 1.09645159460839, |
| "grad_norm": 19.875, |
| "learning_rate": 4.278681219597375e-06, |
| "loss": 3.5137, |
| "step": 29000 |
| }, |
| { |
| "epoch": 1.1059039264986672, |
| "grad_norm": 22.375, |
| "learning_rate": 4.204952128276027e-06, |
| "loss": 3.6344, |
| "step": 29250 |
| }, |
| { |
| "epoch": 1.1153562583889445, |
| "grad_norm": 19.25, |
| "learning_rate": 4.131399980297796e-06, |
| "loss": 3.5423, |
| "step": 29500 |
| }, |
| { |
| "epoch": 1.124808590279222, |
| "grad_norm": 24.25, |
| "learning_rate": 4.058041145196414e-06, |
| "loss": 3.5973, |
| "step": 29750 |
| }, |
| { |
| "epoch": 1.1342609221694993, |
| "grad_norm": 20.875, |
| "learning_rate": 3.98489194948251e-06, |
| "loss": 3.5454, |
| "step": 30000 |
| }, |
| { |
| "epoch": 1.1437132540597765, |
| "grad_norm": 19.25, |
| "learning_rate": 3.911968673010038e-06, |
| "loss": 3.5508, |
| "step": 30250 |
| }, |
| { |
| "epoch": 1.1531655859500538, |
| "grad_norm": 20.0, |
| "learning_rate": 3.839287545353076e-06, |
| "loss": 3.4747, |
| "step": 30500 |
| }, |
| { |
| "epoch": 1.1626179178403313, |
| "grad_norm": 21.25, |
| "learning_rate": 3.7668647421938275e-06, |
| "loss": 3.538, |
| "step": 30750 |
| }, |
| { |
| "epoch": 1.1720702497306086, |
| "grad_norm": 26.125, |
| "learning_rate": 3.694716381722609e-06, |
| "loss": 3.5677, |
| "step": 31000 |
| }, |
| { |
| "epoch": 1.1815225816208859, |
| "grad_norm": 25.75, |
| "learning_rate": 3.6228585210506427e-06, |
| "loss": 3.5912, |
| "step": 31250 |
| }, |
| { |
| "epoch": 1.1909749135111631, |
| "grad_norm": 17.5, |
| "learning_rate": 3.551307152636431e-06, |
| "loss": 3.5178, |
| "step": 31500 |
| }, |
| { |
| "epoch": 1.2004272454014404, |
| "grad_norm": 21.875, |
| "learning_rate": 3.4800782007265265e-06, |
| "loss": 3.5475, |
| "step": 31750 |
| }, |
| { |
| "epoch": 1.209879577291718, |
| "grad_norm": 28.0, |
| "learning_rate": 3.409187517811486e-06, |
| "loss": 3.5383, |
| "step": 32000 |
| }, |
| { |
| "epoch": 1.2193319091819952, |
| "grad_norm": 29.625, |
| "learning_rate": 3.3386508810977856e-06, |
| "loss": 3.5525, |
| "step": 32250 |
| }, |
| { |
| "epoch": 1.2287842410722725, |
| "grad_norm": 18.75, |
| "learning_rate": 3.2684839889964988e-06, |
| "loss": 3.5202, |
| "step": 32500 |
| }, |
| { |
| "epoch": 1.23823657296255, |
| "grad_norm": 24.0, |
| "learning_rate": 3.1987024576295012e-06, |
| "loss": 3.5926, |
| "step": 32750 |
| }, |
| { |
| "epoch": 1.2476889048528272, |
| "grad_norm": 21.0, |
| "learning_rate": 3.1293218173540074e-06, |
| "loss": 3.4852, |
| "step": 33000 |
| }, |
| { |
| "epoch": 1.2571412367431045, |
| "grad_norm": 20.125, |
| "learning_rate": 3.060357509306171e-06, |
| "loss": 3.4994, |
| "step": 33250 |
| }, |
| { |
| "epoch": 1.2665935686333818, |
| "grad_norm": 20.875, |
| "learning_rate": 2.9918248819645624e-06, |
| "loss": 3.566, |
| "step": 33500 |
| }, |
| { |
| "epoch": 1.276045900523659, |
| "grad_norm": 39.75, |
| "learning_rate": 2.923739187734258e-06, |
| "loss": 3.517, |
| "step": 33750 |
| }, |
| { |
| "epoch": 1.2854982324139366, |
| "grad_norm": 18.875, |
| "learning_rate": 2.8561155795523133e-06, |
| "loss": 3.4938, |
| "step": 34000 |
| }, |
| { |
| "epoch": 1.2949505643042138, |
| "grad_norm": 20.625, |
| "learning_rate": 2.788969107515369e-06, |
| "loss": 3.501, |
| "step": 34250 |
| }, |
| { |
| "epoch": 1.3044028961944911, |
| "grad_norm": 22.5, |
| "learning_rate": 2.722314715530156e-06, |
| "loss": 3.574, |
| "step": 34500 |
| }, |
| { |
| "epoch": 1.3138552280847686, |
| "grad_norm": 22.375, |
| "learning_rate": 2.6561672379876236e-06, |
| "loss": 3.4953, |
| "step": 34750 |
| }, |
| { |
| "epoch": 1.323307559975046, |
| "grad_norm": 20.375, |
| "learning_rate": 2.590541396461438e-06, |
| "loss": 3.4766, |
| "step": 35000 |
| }, |
| { |
| "epoch": 1.3327598918653232, |
| "grad_norm": 23.0, |
| "learning_rate": 2.5254517964316084e-06, |
| "loss": 3.4905, |
| "step": 35250 |
| }, |
| { |
| "epoch": 1.3422122237556005, |
| "grad_norm": 24.0, |
| "learning_rate": 2.4609129240339253e-06, |
| "loss": 3.5543, |
| "step": 35500 |
| }, |
| { |
| "epoch": 1.3516645556458777, |
| "grad_norm": 23.125, |
| "learning_rate": 2.39693914283598e-06, |
| "loss": 3.5577, |
| "step": 35750 |
| }, |
| { |
| "epoch": 1.3611168875361552, |
| "grad_norm": 19.125, |
| "learning_rate": 2.333544690640451e-06, |
| "loss": 3.515, |
| "step": 36000 |
| }, |
| { |
| "epoch": 1.3705692194264325, |
| "grad_norm": 22.625, |
| "learning_rate": 2.270743676316383e-06, |
| "loss": 3.506, |
| "step": 36250 |
| }, |
| { |
| "epoch": 1.3800215513167098, |
| "grad_norm": 18.625, |
| "learning_rate": 2.20855007665916e-06, |
| "loss": 3.5909, |
| "step": 36500 |
| }, |
| { |
| "epoch": 1.3894738832069873, |
| "grad_norm": 20.875, |
| "learning_rate": 2.1469777332798804e-06, |
| "loss": 3.5444, |
| "step": 36750 |
| }, |
| { |
| "epoch": 1.3989262150972646, |
| "grad_norm": 19.375, |
| "learning_rate": 2.086040349524807e-06, |
| "loss": 3.5885, |
| "step": 37000 |
| }, |
| { |
| "epoch": 1.4083785469875418, |
| "grad_norm": 20.5, |
| "learning_rate": 2.025751487425591e-06, |
| "loss": 3.5437, |
| "step": 37250 |
| }, |
| { |
| "epoch": 1.4178308788778191, |
| "grad_norm": 22.75, |
| "learning_rate": 1.9661245646809546e-06, |
| "loss": 3.5815, |
| "step": 37500 |
| }, |
| { |
| "epoch": 1.4272832107680964, |
| "grad_norm": 24.375, |
| "learning_rate": 1.9071728516704897e-06, |
| "loss": 3.5147, |
| "step": 37750 |
| }, |
| { |
| "epoch": 1.4367355426583739, |
| "grad_norm": 25.75, |
| "learning_rate": 1.8489094685012394e-06, |
| "loss": 3.5288, |
| "step": 38000 |
| }, |
| { |
| "epoch": 1.4461878745486512, |
| "grad_norm": 16.25, |
| "learning_rate": 1.7913473820877353e-06, |
| "loss": 3.5381, |
| "step": 38250 |
| }, |
| { |
| "epoch": 1.4556402064389284, |
| "grad_norm": 22.375, |
| "learning_rate": 1.7344994032661116e-06, |
| "loss": 3.5954, |
| "step": 38500 |
| }, |
| { |
| "epoch": 1.465092538329206, |
| "grad_norm": 19.125, |
| "learning_rate": 1.6783781839429785e-06, |
| "loss": 3.5212, |
| "step": 38750 |
| }, |
| { |
| "epoch": 1.4745448702194832, |
| "grad_norm": 23.25, |
| "learning_rate": 1.6229962142796469e-06, |
| "loss": 3.4585, |
| "step": 39000 |
| }, |
| { |
| "epoch": 1.4839972021097605, |
| "grad_norm": 18.375, |
| "learning_rate": 1.5683658199123524e-06, |
| "loss": 3.5231, |
| "step": 39250 |
| }, |
| { |
| "epoch": 1.4934495340000378, |
| "grad_norm": 25.0, |
| "learning_rate": 1.5144991592091162e-06, |
| "loss": 3.5881, |
| "step": 39500 |
| }, |
| { |
| "epoch": 1.502901865890315, |
| "grad_norm": 22.25, |
| "learning_rate": 1.461408220563803e-06, |
| "loss": 3.5278, |
| "step": 39750 |
| }, |
| { |
| "epoch": 1.5123541977805925, |
| "grad_norm": 18.625, |
| "learning_rate": 1.4091048197280227e-06, |
| "loss": 3.5224, |
| "step": 40000 |
| }, |
| { |
| "epoch": 1.5218065296708698, |
| "grad_norm": 21.5, |
| "learning_rate": 1.3576005971814627e-06, |
| "loss": 3.5465, |
| "step": 40250 |
| }, |
| { |
| "epoch": 1.531258861561147, |
| "grad_norm": 19.375, |
| "learning_rate": 1.3069070155412145e-06, |
| "loss": 3.5318, |
| "step": 40500 |
| }, |
| { |
| "epoch": 1.5407111934514246, |
| "grad_norm": 19.5, |
| "learning_rate": 1.2570353570106864e-06, |
| "loss": 3.5316, |
| "step": 40750 |
| }, |
| { |
| "epoch": 1.5501635253417017, |
| "grad_norm": 23.5, |
| "learning_rate": 1.2079967208686787e-06, |
| "loss": 3.5112, |
| "step": 41000 |
| }, |
| { |
| "epoch": 1.5596158572319792, |
| "grad_norm": 21.25, |
| "learning_rate": 1.159802020999159e-06, |
| "loss": 3.5891, |
| "step": 41250 |
| }, |
| { |
| "epoch": 1.5690681891222564, |
| "grad_norm": 22.375, |
| "learning_rate": 1.112461983462304e-06, |
| "loss": 3.5365, |
| "step": 41500 |
| }, |
| { |
| "epoch": 1.5785205210125337, |
| "grad_norm": 18.25, |
| "learning_rate": 1.0659871441073422e-06, |
| "loss": 3.5665, |
| "step": 41750 |
| }, |
| { |
| "epoch": 1.5879728529028112, |
| "grad_norm": 37.75, |
| "learning_rate": 1.020387846227724e-06, |
| "loss": 3.5764, |
| "step": 42000 |
| }, |
| { |
| "epoch": 1.5974251847930885, |
| "grad_norm": 21.75, |
| "learning_rate": 9.756742382591577e-07, |
| "loss": 3.6041, |
| "step": 42250 |
| }, |
| { |
| "epoch": 1.6068775166833658, |
| "grad_norm": 24.375, |
| "learning_rate": 9.318562715210039e-07, |
| "loss": 3.6046, |
| "step": 42500 |
| }, |
| { |
| "epoch": 1.6163298485736433, |
| "grad_norm": 18.375, |
| "learning_rate": 8.889436980015336e-07, |
| "loss": 3.5789, |
| "step": 42750 |
| }, |
| { |
| "epoch": 1.6257821804639203, |
| "grad_norm": 19.875, |
| "learning_rate": 8.469460681875674e-07, |
| "loss": 3.588, |
| "step": 43000 |
| }, |
| { |
| "epoch": 1.6352345123541978, |
| "grad_norm": 20.375, |
| "learning_rate": 8.058727289389485e-07, |
| "loss": 3.571, |
| "step": 43250 |
| }, |
| { |
| "epoch": 1.644686844244475, |
| "grad_norm": 33.25, |
| "learning_rate": 7.657328214083226e-07, |
| "loss": 3.5252, |
| "step": 43500 |
| }, |
| { |
| "epoch": 1.6541391761347524, |
| "grad_norm": 20.625, |
| "learning_rate": 7.26535279006727e-07, |
| "loss": 3.5418, |
| "step": 43750 |
| }, |
| { |
| "epoch": 1.6635915080250299, |
| "grad_norm": 20.25, |
| "learning_rate": 6.882888254153902e-07, |
| "loss": 3.475, |
| "step": 44000 |
| }, |
| { |
| "epoch": 1.6730438399153071, |
| "grad_norm": 22.25, |
| "learning_rate": 6.51001972644218e-07, |
| "loss": 3.6097, |
| "step": 44250 |
| }, |
| { |
| "epoch": 1.6824961718055844, |
| "grad_norm": 20.125, |
| "learning_rate": 6.146830191373909e-07, |
| "loss": 3.5361, |
| "step": 44500 |
| }, |
| { |
| "epoch": 1.691948503695862, |
| "grad_norm": 19.375, |
| "learning_rate": 5.793400479264849e-07, |
| "loss": 3.5127, |
| "step": 44750 |
| }, |
| { |
| "epoch": 1.701400835586139, |
| "grad_norm": 22.5, |
| "learning_rate": 5.449809248315402e-07, |
| "loss": 3.5631, |
| "step": 45000 |
| }, |
| { |
| "epoch": 1.7108531674764165, |
| "grad_norm": 20.125, |
| "learning_rate": 5.11613296710467e-07, |
| "loss": 3.4704, |
| "step": 45250 |
| }, |
| { |
| "epoch": 1.7203054993666937, |
| "grad_norm": 25.75, |
| "learning_rate": 4.792445897571845e-07, |
| "loss": 3.5528, |
| "step": 45500 |
| }, |
| { |
| "epoch": 1.729757831256971, |
| "grad_norm": 24.0, |
| "learning_rate": 4.478820078488749e-07, |
| "loss": 3.515, |
| "step": 45750 |
| }, |
| { |
| "epoch": 1.7392101631472485, |
| "grad_norm": 17.375, |
| "learning_rate": 4.175325309427064e-07, |
| "loss": 3.5246, |
| "step": 46000 |
| }, |
| { |
| "epoch": 1.7486624950375258, |
| "grad_norm": 21.875, |
| "learning_rate": 3.882029135223975e-07, |
| "loss": 3.504, |
| "step": 46250 |
| }, |
| { |
| "epoch": 1.758114826927803, |
| "grad_norm": 23.0, |
| "learning_rate": 3.598996830949619e-07, |
| "loss": 3.5885, |
| "step": 46500 |
| }, |
| { |
| "epoch": 1.7675671588180806, |
| "grad_norm": 29.0, |
| "learning_rate": 3.326291387379654e-07, |
| "loss": 3.5235, |
| "step": 46750 |
| }, |
| { |
| "epoch": 1.7770194907083576, |
| "grad_norm": 20.0, |
| "learning_rate": 3.0639734969762524e-07, |
| "loss": 3.5873, |
| "step": 47000 |
| }, |
| { |
| "epoch": 1.7864718225986351, |
| "grad_norm": 20.75, |
| "learning_rate": 2.8121015403805406e-07, |
| "loss": 3.501, |
| "step": 47250 |
| }, |
| { |
| "epoch": 1.7959241544889124, |
| "grad_norm": 19.375, |
| "learning_rate": 2.570731573419638e-07, |
| "loss": 3.4923, |
| "step": 47500 |
| }, |
| { |
| "epoch": 1.8053764863791897, |
| "grad_norm": 23.625, |
| "learning_rate": 2.3399173146309906e-07, |
| "loss": 3.4967, |
| "step": 47750 |
| }, |
| { |
| "epoch": 1.8148288182694672, |
| "grad_norm": 16.875, |
| "learning_rate": 2.119710133306996e-07, |
| "loss": 3.5254, |
| "step": 48000 |
| }, |
| { |
| "epoch": 1.8242811501597445, |
| "grad_norm": 19.625, |
| "learning_rate": 1.9101590380623925e-07, |
| "loss": 3.4922, |
| "step": 48250 |
| }, |
| { |
| "epoch": 1.8337334820500217, |
| "grad_norm": 23.125, |
| "learning_rate": 1.711310665927046e-07, |
| "loss": 3.5446, |
| "step": 48500 |
| }, |
| { |
| "epoch": 1.8431858139402992, |
| "grad_norm": 25.375, |
| "learning_rate": 1.5232092719666025e-07, |
| "loss": 3.5395, |
| "step": 48750 |
| }, |
| { |
| "epoch": 1.8526381458305763, |
| "grad_norm": 20.0, |
| "learning_rate": 1.3458967194331485e-07, |
| "loss": 3.5714, |
| "step": 49000 |
| }, |
| { |
| "epoch": 1.8620904777208538, |
| "grad_norm": 23.0, |
| "learning_rate": 1.1794124704483324e-07, |
| "loss": 3.5383, |
| "step": 49250 |
| }, |
| { |
| "epoch": 1.871542809611131, |
| "grad_norm": 16.875, |
| "learning_rate": 1.0237935772207608e-07, |
| "loss": 3.5393, |
| "step": 49500 |
| }, |
| { |
| "epoch": 1.8809951415014083, |
| "grad_norm": 18.25, |
| "learning_rate": 8.790746737997569e-08, |
| "loss": 3.5028, |
| "step": 49750 |
| }, |
| { |
| "epoch": 1.8904474733916858, |
| "grad_norm": 19.5, |
| "learning_rate": 7.452879683673809e-08, |
| "loss": 3.5613, |
| "step": 50000 |
| }, |
| { |
| "epoch": 1.899899805281963, |
| "grad_norm": 21.25, |
| "learning_rate": 6.224632360702143e-08, |
| "loss": 3.5304, |
| "step": 50250 |
| }, |
| { |
| "epoch": 1.9093521371722404, |
| "grad_norm": 21.75, |
| "learning_rate": 5.1062781239271665e-08, |
| "loss": 3.5906, |
| "step": 50500 |
| }, |
| { |
| "epoch": 1.9188044690625177, |
| "grad_norm": 21.75, |
| "learning_rate": 4.0980658707355234e-08, |
| "loss": 3.5713, |
| "step": 50750 |
| }, |
| { |
| "epoch": 1.928256800952795, |
| "grad_norm": 25.125, |
| "learning_rate": 3.2002199856617236e-08, |
| "loss": 3.5918, |
| "step": 51000 |
| }, |
| { |
| "epoch": 1.9377091328430724, |
| "grad_norm": 19.625, |
| "learning_rate": 2.412940290450083e-08, |
| "loss": 3.4949, |
| "step": 51250 |
| }, |
| { |
| "epoch": 1.9471614647333497, |
| "grad_norm": 21.5, |
| "learning_rate": 1.736401999582804e-08, |
| "loss": 3.5472, |
| "step": 51500 |
| }, |
| { |
| "epoch": 1.956613796623627, |
| "grad_norm": 23.375, |
| "learning_rate": 1.1707556812851074e-08, |
| "loss": 3.5542, |
| "step": 51750 |
| }, |
| { |
| "epoch": 1.9660661285139045, |
| "grad_norm": 24.25, |
| "learning_rate": 7.161272240148731e-09, |
| "loss": 3.5946, |
| "step": 52000 |
| }, |
| { |
| "epoch": 1.9755184604041816, |
| "grad_norm": 20.875, |
| "learning_rate": 3.726178084456078e-09, |
| "loss": 3.5614, |
| "step": 52250 |
| }, |
| { |
| "epoch": 1.984970792294459, |
| "grad_norm": 19.625, |
| "learning_rate": 1.4030388494790104e-09, |
| "loss": 3.5535, |
| "step": 52500 |
| }, |
| { |
| "epoch": 1.9944231241847363, |
| "grad_norm": 24.25, |
| "learning_rate": 1.923715657464742e-10, |
| "loss": 3.5299, |
| "step": 52750 |
| } |
| ], |
| "logging_steps": 250, |
| "max_steps": 52896, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 250, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.8374993889748353e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|