{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 1804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004434589800443459, "grad_norm": 8.667221069335938, "learning_rate": 2.197802197802198e-07, "loss": 1.8642287254333496, "step": 2 }, { "epoch": 0.008869179600886918, "grad_norm": 8.565436363220215, "learning_rate": 6.593406593406594e-07, "loss": 2.1231369972229004, "step": 4 }, { "epoch": 0.013303769401330377, "grad_norm": 3.455594301223755, "learning_rate": 1.098901098901099e-06, "loss": 1.89163339138031, "step": 6 }, { "epoch": 0.017738359201773836, "grad_norm": 1.2133512496948242, "learning_rate": 1.5384615384615387e-06, "loss": 1.7869961261749268, "step": 8 }, { "epoch": 0.022172949002217297, "grad_norm": 4.369198799133301, "learning_rate": 1.9780219780219782e-06, "loss": 1.5530983209609985, "step": 10 }, { "epoch": 0.026607538802660754, "grad_norm": 6.942768096923828, "learning_rate": 2.4175824175824177e-06, "loss": 1.5389991998672485, "step": 12 }, { "epoch": 0.031042128603104215, "grad_norm": 2.6731338500976562, "learning_rate": 2.8571428571428573e-06, "loss": 1.016729474067688, "step": 14 }, { "epoch": 0.03547671840354767, "grad_norm": 6.184485912322998, "learning_rate": 3.2967032967032968e-06, "loss": 0.9146304130554199, "step": 16 }, { "epoch": 0.03991130820399113, "grad_norm": 1.2878278493881226, "learning_rate": 3.7362637362637367e-06, "loss": 0.9207720756530762, "step": 18 }, { "epoch": 0.04434589800443459, "grad_norm": 3.814846992492676, "learning_rate": 4.175824175824177e-06, "loss": 1.1863677501678467, "step": 20 }, { "epoch": 0.04878048780487805, "grad_norm": 6.385776042938232, "learning_rate": 4.615384615384616e-06, "loss": 1.320576548576355, "step": 22 }, { "epoch": 0.05321507760532151, "grad_norm": 1.8087103366851807, "learning_rate": 5.054945054945055e-06, "loss": 1.4690011739730835, "step": 24 }, { "epoch": 0.057649667405764965, "grad_norm": 1.2201143503189087, "learning_rate": 5.494505494505495e-06, "loss": 1.2042573690414429, "step": 26 }, { "epoch": 0.06208425720620843, "grad_norm": 1.1334809064865112, "learning_rate": 5.934065934065935e-06, "loss": 1.040234923362732, "step": 28 }, { "epoch": 0.06651884700665188, "grad_norm": 1.8232964277267456, "learning_rate": 6.373626373626373e-06, "loss": 1.169386386871338, "step": 30 }, { "epoch": 0.07095343680709534, "grad_norm": 2.845280885696411, "learning_rate": 6.813186813186814e-06, "loss": 1.0687059164047241, "step": 32 }, { "epoch": 0.07538802660753881, "grad_norm": 0.9850583672523499, "learning_rate": 7.252747252747253e-06, "loss": 1.4395848512649536, "step": 34 }, { "epoch": 0.07982261640798226, "grad_norm": 3.9745352268218994, "learning_rate": 7.692307692307694e-06, "loss": 1.1598668098449707, "step": 36 }, { "epoch": 0.08425720620842572, "grad_norm": 4.545168399810791, "learning_rate": 8.131868131868132e-06, "loss": 1.515150785446167, "step": 38 }, { "epoch": 0.08869179600886919, "grad_norm": 2.1682798862457275, "learning_rate": 8.571428571428571e-06, "loss": 0.7324872612953186, "step": 40 }, { "epoch": 0.09312638580931264, "grad_norm": 0.8849499225616455, "learning_rate": 9.010989010989011e-06, "loss": 1.3759689331054688, "step": 42 }, { "epoch": 0.0975609756097561, "grad_norm": 1.3699463605880737, "learning_rate": 9.450549450549452e-06, "loss": 1.3197706937789917, "step": 44 }, { "epoch": 0.10199556541019955, "grad_norm": 1.9112352132797241, "learning_rate": 9.890109890109892e-06, "loss": 1.355299472808838, "step": 46 }, { "epoch": 0.10643015521064302, "grad_norm": 4.099548816680908, "learning_rate": 1.0329670329670332e-05, "loss": 1.583066701889038, "step": 48 }, { "epoch": 0.11086474501108648, "grad_norm": 1.642897605895996, "learning_rate": 1.076923076923077e-05, "loss": 1.4291114807128906, "step": 50 }, { "epoch": 0.11529933481152993, "grad_norm": 2.4580774307250977, "learning_rate": 1.120879120879121e-05, "loss": 1.3548572063446045, "step": 52 }, { "epoch": 0.1197339246119734, "grad_norm": 1.3751822710037231, "learning_rate": 1.164835164835165e-05, "loss": 1.367175579071045, "step": 54 }, { "epoch": 0.12416851441241686, "grad_norm": 5.466642379760742, "learning_rate": 1.2087912087912089e-05, "loss": 1.136663794517517, "step": 56 }, { "epoch": 0.1286031042128603, "grad_norm": 1.1212538480758667, "learning_rate": 1.2527472527472529e-05, "loss": 1.3549809455871582, "step": 58 }, { "epoch": 0.13303769401330376, "grad_norm": 0.9152220487594604, "learning_rate": 1.296703296703297e-05, "loss": 1.4315998554229736, "step": 60 }, { "epoch": 0.13747228381374724, "grad_norm": 1.2018588781356812, "learning_rate": 1.3406593406593406e-05, "loss": 1.3350356817245483, "step": 62 }, { "epoch": 0.1419068736141907, "grad_norm": 1.993096947669983, "learning_rate": 1.3846153846153847e-05, "loss": 1.3968464136123657, "step": 64 }, { "epoch": 0.14634146341463414, "grad_norm": 1.467322826385498, "learning_rate": 1.4285714285714287e-05, "loss": 1.4635425806045532, "step": 66 }, { "epoch": 0.15077605321507762, "grad_norm": 0.7607141137123108, "learning_rate": 1.4725274725274727e-05, "loss": 1.3317251205444336, "step": 68 }, { "epoch": 0.15521064301552107, "grad_norm": 7.009274959564209, "learning_rate": 1.5164835164835166e-05, "loss": 1.3160146474838257, "step": 70 }, { "epoch": 0.15964523281596452, "grad_norm": 1.0283435583114624, "learning_rate": 1.5604395604395605e-05, "loss": 1.3538073301315308, "step": 72 }, { "epoch": 0.164079822616408, "grad_norm": 0.7895150184631348, "learning_rate": 1.6043956043956047e-05, "loss": 1.2550619840621948, "step": 74 }, { "epoch": 0.16851441241685144, "grad_norm": 0.7530434131622314, "learning_rate": 1.6483516483516486e-05, "loss": 1.355035424232483, "step": 76 }, { "epoch": 0.1729490022172949, "grad_norm": 0.6738516688346863, "learning_rate": 1.6923076923076924e-05, "loss": 1.396584391593933, "step": 78 }, { "epoch": 0.17738359201773837, "grad_norm": 1.0755456686019897, "learning_rate": 1.7362637362637363e-05, "loss": 1.3568543195724487, "step": 80 }, { "epoch": 0.18181818181818182, "grad_norm": 1.3478541374206543, "learning_rate": 1.78021978021978e-05, "loss": 0.8403951525688171, "step": 82 }, { "epoch": 0.18625277161862527, "grad_norm": 0.7471117973327637, "learning_rate": 1.8241758241758244e-05, "loss": 1.0819566249847412, "step": 84 }, { "epoch": 0.19068736141906872, "grad_norm": 3.1562721729278564, "learning_rate": 1.8681318681318682e-05, "loss": 1.0565105676651, "step": 86 }, { "epoch": 0.1951219512195122, "grad_norm": 0.9117481708526611, "learning_rate": 1.9120879120879124e-05, "loss": 1.336931586265564, "step": 88 }, { "epoch": 0.19955654101995565, "grad_norm": 1.8324049711227417, "learning_rate": 1.9560439560439563e-05, "loss": 1.4609527587890625, "step": 90 }, { "epoch": 0.2039911308203991, "grad_norm": 0.8476412892341614, "learning_rate": 2e-05, "loss": 1.317558765411377, "step": 92 }, { "epoch": 0.20842572062084258, "grad_norm": 0.6812918782234192, "learning_rate": 1.999993945796182e-05, "loss": 1.309884786605835, "step": 94 }, { "epoch": 0.21286031042128603, "grad_norm": 1.9555091857910156, "learning_rate": 1.9999757832661787e-05, "loss": 1.8222039937973022, "step": 96 }, { "epoch": 0.21729490022172948, "grad_norm": 1.6802914142608643, "learning_rate": 1.9999455126543454e-05, "loss": 1.0341295003890991, "step": 98 }, { "epoch": 0.22172949002217296, "grad_norm": 0.9253756403923035, "learning_rate": 1.9999031343679364e-05, "loss": 1.2889328002929688, "step": 100 }, { "epoch": 0.2261640798226164, "grad_norm": 0.9691144824028015, "learning_rate": 1.9998486489770998e-05, "loss": 1.4229637384414673, "step": 102 }, { "epoch": 0.23059866962305986, "grad_norm": 0.7583999037742615, "learning_rate": 1.999782057214871e-05, "loss": 1.1750223636627197, "step": 104 }, { "epoch": 0.23503325942350334, "grad_norm": 0.7559353709220886, "learning_rate": 1.999703359977161e-05, "loss": 1.3722642660140991, "step": 106 }, { "epoch": 0.2394678492239468, "grad_norm": 1.8747915029525757, "learning_rate": 1.9996125583227458e-05, "loss": 1.5751910209655762, "step": 108 }, { "epoch": 0.24390243902439024, "grad_norm": 0.7324615120887756, "learning_rate": 1.999509653473251e-05, "loss": 1.1686367988586426, "step": 110 }, { "epoch": 0.24833702882483372, "grad_norm": 5.023177623748779, "learning_rate": 1.999394646813137e-05, "loss": 1.368462324142456, "step": 112 }, { "epoch": 0.25277161862527714, "grad_norm": 2.301079750061035, "learning_rate": 1.9992675398896784e-05, "loss": 0.8811516761779785, "step": 114 }, { "epoch": 0.2572062084257206, "grad_norm": 0.6491958498954773, "learning_rate": 1.9991283344129452e-05, "loss": 1.4907201528549194, "step": 116 }, { "epoch": 0.2616407982261641, "grad_norm": 0.6563892364501953, "learning_rate": 1.998977032255777e-05, "loss": 1.224129557609558, "step": 118 }, { "epoch": 0.2660753880266075, "grad_norm": 0.708153486251831, "learning_rate": 1.9988136354537615e-05, "loss": 1.3663833141326904, "step": 120 }, { "epoch": 0.270509977827051, "grad_norm": 1.3739961385726929, "learning_rate": 1.9986381462052048e-05, "loss": 1.2798233032226562, "step": 122 }, { "epoch": 0.2749445676274945, "grad_norm": 1.1927521228790283, "learning_rate": 1.9984505668711006e-05, "loss": 1.6487520933151245, "step": 124 }, { "epoch": 0.2793791574279379, "grad_norm": 1.0914132595062256, "learning_rate": 1.998250899975102e-05, "loss": 0.9563515186309814, "step": 126 }, { "epoch": 0.2838137472283814, "grad_norm": 0.6142106056213379, "learning_rate": 1.9980391482034844e-05, "loss": 1.2922307252883911, "step": 128 }, { "epoch": 0.28824833702882485, "grad_norm": 0.9818975925445557, "learning_rate": 1.9978153144051108e-05, "loss": 1.0446155071258545, "step": 130 }, { "epoch": 0.2926829268292683, "grad_norm": 1.9593212604522705, "learning_rate": 1.9975794015913936e-05, "loss": 1.0657705068588257, "step": 132 }, { "epoch": 0.29711751662971175, "grad_norm": 2.4713385105133057, "learning_rate": 1.9973314129362533e-05, "loss": 1.0481352806091309, "step": 134 }, { "epoch": 0.30155210643015523, "grad_norm": 9.34296703338623, "learning_rate": 1.997071351776076e-05, "loss": 1.2620774507522583, "step": 136 }, { "epoch": 0.30598669623059865, "grad_norm": 2.3951597213745117, "learning_rate": 1.996799221609669e-05, "loss": 0.8199646472930908, "step": 138 }, { "epoch": 0.31042128603104213, "grad_norm": 1.0207390785217285, "learning_rate": 1.9965150260982137e-05, "loss": 1.2821062803268433, "step": 140 }, { "epoch": 0.3148558758314856, "grad_norm": 0.808794379234314, "learning_rate": 1.9962187690652157e-05, "loss": 1.0488530397415161, "step": 142 }, { "epoch": 0.31929046563192903, "grad_norm": 1.9180113077163696, "learning_rate": 1.9959104544964536e-05, "loss": 1.0375815629959106, "step": 144 }, { "epoch": 0.3237250554323725, "grad_norm": 1.6617244482040405, "learning_rate": 1.9955900865399257e-05, "loss": 1.0013810396194458, "step": 146 }, { "epoch": 0.328159645232816, "grad_norm": 0.9878438711166382, "learning_rate": 1.9952576695057944e-05, "loss": 1.4907773733139038, "step": 148 }, { "epoch": 0.3325942350332594, "grad_norm": 1.895961046218872, "learning_rate": 1.9949132078663268e-05, "loss": 1.254366397857666, "step": 150 }, { "epoch": 0.3370288248337029, "grad_norm": 1.2166792154312134, "learning_rate": 1.9945567062558368e-05, "loss": 1.1661312580108643, "step": 152 }, { "epoch": 0.34146341463414637, "grad_norm": 0.8109827637672424, "learning_rate": 1.9941881694706206e-05, "loss": 1.3392776250839233, "step": 154 }, { "epoch": 0.3458980044345898, "grad_norm": 2.391664505004883, "learning_rate": 1.993807602468893e-05, "loss": 1.3111441135406494, "step": 156 }, { "epoch": 0.35033259423503327, "grad_norm": 0.941863477230072, "learning_rate": 1.9934150103707217e-05, "loss": 1.3535107374191284, "step": 158 }, { "epoch": 0.35476718403547675, "grad_norm": 0.6483902335166931, "learning_rate": 1.9930103984579564e-05, "loss": 1.3064088821411133, "step": 160 }, { "epoch": 0.35920177383592017, "grad_norm": 0.682521641254425, "learning_rate": 1.9925937721741595e-05, "loss": 0.9179922938346863, "step": 162 }, { "epoch": 0.36363636363636365, "grad_norm": 1.1320902109146118, "learning_rate": 1.992165137124532e-05, "loss": 1.0206555128097534, "step": 164 }, { "epoch": 0.36807095343680707, "grad_norm": 0.8146328926086426, "learning_rate": 1.9917244990758385e-05, "loss": 1.3475308418273926, "step": 166 }, { "epoch": 0.37250554323725055, "grad_norm": 1.6250571012496948, "learning_rate": 1.9912718639563285e-05, "loss": 1.31868577003479, "step": 168 }, { "epoch": 0.376940133037694, "grad_norm": 0.8682546615600586, "learning_rate": 1.9908072378556585e-05, "loss": 1.2381749153137207, "step": 170 }, { "epoch": 0.38137472283813745, "grad_norm": 2.6199824810028076, "learning_rate": 1.990330627024809e-05, "loss": 0.8625264167785645, "step": 172 }, { "epoch": 0.3858093126385809, "grad_norm": 1.7685837745666504, "learning_rate": 1.989842037876e-05, "loss": 1.7184687852859497, "step": 174 }, { "epoch": 0.3902439024390244, "grad_norm": 0.9849699139595032, "learning_rate": 1.9893414769826053e-05, "loss": 1.369092583656311, "step": 176 }, { "epoch": 0.3946784922394678, "grad_norm": 1.686566948890686, "learning_rate": 1.9888289510790643e-05, "loss": 1.383589744567871, "step": 178 }, { "epoch": 0.3991130820399113, "grad_norm": 0.793823778629303, "learning_rate": 1.988304467060791e-05, "loss": 1.1963413953781128, "step": 180 }, { "epoch": 0.4035476718403548, "grad_norm": 0.6959115266799927, "learning_rate": 1.9877680319840813e-05, "loss": 1.335618257522583, "step": 182 }, { "epoch": 0.4079822616407982, "grad_norm": 1.3807117938995361, "learning_rate": 1.987219653066018e-05, "loss": 0.8666111826896667, "step": 184 }, { "epoch": 0.4124168514412417, "grad_norm": 1.2673057317733765, "learning_rate": 1.9866593376843743e-05, "loss": 1.0503551959991455, "step": 186 }, { "epoch": 0.41685144124168516, "grad_norm": 0.8807701468467712, "learning_rate": 1.9860870933775128e-05, "loss": 1.0260038375854492, "step": 188 }, { "epoch": 0.4212860310421286, "grad_norm": 2.0024898052215576, "learning_rate": 1.9855029278442865e-05, "loss": 1.1095020771026611, "step": 190 }, { "epoch": 0.42572062084257206, "grad_norm": 2.057466745376587, "learning_rate": 1.984906848943934e-05, "loss": 1.100471019744873, "step": 192 }, { "epoch": 0.43015521064301554, "grad_norm": 1.057753324508667, "learning_rate": 1.9842988646959723e-05, "loss": 1.3441250324249268, "step": 194 }, { "epoch": 0.43458980044345896, "grad_norm": 1.4172452688217163, "learning_rate": 1.983678983280093e-05, "loss": 1.6131374835968018, "step": 196 }, { "epoch": 0.43902439024390244, "grad_norm": 1.8611360788345337, "learning_rate": 1.983047213036047e-05, "loss": 1.3363574743270874, "step": 198 }, { "epoch": 0.4434589800443459, "grad_norm": 1.018568992614746, "learning_rate": 1.9824035624635368e-05, "loss": 1.2478539943695068, "step": 200 }, { "epoch": 0.44789356984478934, "grad_norm": 1.5161771774291992, "learning_rate": 1.9817480402220995e-05, "loss": 1.3159914016723633, "step": 202 }, { "epoch": 0.4523281596452328, "grad_norm": 1.3039575815200806, "learning_rate": 1.9810806551309903e-05, "loss": 1.2693634033203125, "step": 204 }, { "epoch": 0.4567627494456763, "grad_norm": 1.2496814727783203, "learning_rate": 1.9804014161690672e-05, "loss": 1.1507153511047363, "step": 206 }, { "epoch": 0.4611973392461197, "grad_norm": 4.592546463012695, "learning_rate": 1.979710332474665e-05, "loss": 1.1844661235809326, "step": 208 }, { "epoch": 0.4656319290465632, "grad_norm": 0.856142520904541, "learning_rate": 1.9790074133454765e-05, "loss": 0.7224380970001221, "step": 210 }, { "epoch": 0.4700665188470067, "grad_norm": 10.285343170166016, "learning_rate": 1.9782926682384248e-05, "loss": 0.8978222012519836, "step": 212 }, { "epoch": 0.4745011086474501, "grad_norm": 0.5395671129226685, "learning_rate": 1.977566106769538e-05, "loss": 1.1894056797027588, "step": 214 }, { "epoch": 0.4789356984478936, "grad_norm": 0.6740292310714722, "learning_rate": 1.976827738713819e-05, "loss": 1.2027900218963623, "step": 216 }, { "epoch": 0.48337028824833705, "grad_norm": 0.9326871037483215, "learning_rate": 1.976077574005114e-05, "loss": 1.1885857582092285, "step": 218 }, { "epoch": 0.4878048780487805, "grad_norm": 2.4017117023468018, "learning_rate": 1.9753156227359783e-05, "loss": 1.32407546043396, "step": 220 }, { "epoch": 0.49223946784922396, "grad_norm": 1.3293203115463257, "learning_rate": 1.9745418951575415e-05, "loss": 1.2708196640014648, "step": 222 }, { "epoch": 0.49667405764966743, "grad_norm": 0.804009199142456, "learning_rate": 1.9737564016793696e-05, "loss": 1.2493350505828857, "step": 224 }, { "epoch": 0.5011086474501109, "grad_norm": 0.6624335050582886, "learning_rate": 1.972959152869323e-05, "loss": 1.236510992050171, "step": 226 }, { "epoch": 0.5055432372505543, "grad_norm": 1.1144077777862549, "learning_rate": 1.972150159453417e-05, "loss": 1.2882966995239258, "step": 228 }, { "epoch": 0.5099778270509978, "grad_norm": 2.013320207595825, "learning_rate": 1.9713294323156768e-05, "loss": 1.8960356712341309, "step": 230 }, { "epoch": 0.5144124168514412, "grad_norm": 0.9120582342147827, "learning_rate": 1.9704969824979893e-05, "loss": 1.0289053916931152, "step": 232 }, { "epoch": 0.5188470066518847, "grad_norm": 1.22536301612854, "learning_rate": 1.9696528211999567e-05, "loss": 1.3444561958312988, "step": 234 }, { "epoch": 0.5232815964523282, "grad_norm": 0.7821425199508667, "learning_rate": 1.9687969597787445e-05, "loss": 1.1790920495986938, "step": 236 }, { "epoch": 0.5277161862527716, "grad_norm": 0.8863709568977356, "learning_rate": 1.967929409748929e-05, "loss": 1.0798450708389282, "step": 238 }, { "epoch": 0.532150776053215, "grad_norm": 0.5844965577125549, "learning_rate": 1.967050182782344e-05, "loss": 1.3156877756118774, "step": 240 }, { "epoch": 0.5365853658536586, "grad_norm": 0.46499544382095337, "learning_rate": 1.96615929070792e-05, "loss": 1.3678405284881592, "step": 242 }, { "epoch": 0.541019955654102, "grad_norm": 12.881924629211426, "learning_rate": 1.9652567455115287e-05, "loss": 1.0557224750518799, "step": 244 }, { "epoch": 0.5454545454545454, "grad_norm": 1.112845778465271, "learning_rate": 1.9643425593358212e-05, "loss": 1.308203101158142, "step": 246 }, { "epoch": 0.549889135254989, "grad_norm": 1.1576392650604248, "learning_rate": 1.9634167444800618e-05, "loss": 1.5463697910308838, "step": 248 }, { "epoch": 0.5543237250554324, "grad_norm": 1.7358508110046387, "learning_rate": 1.9624793133999663e-05, "loss": 1.3133127689361572, "step": 250 }, { "epoch": 0.5587583148558758, "grad_norm": 1.8306182622909546, "learning_rate": 1.9615302787075317e-05, "loss": 0.7901706695556641, "step": 252 }, { "epoch": 0.5631929046563193, "grad_norm": 1.574388861656189, "learning_rate": 1.9605696531708687e-05, "loss": 1.5300947427749634, "step": 254 }, { "epoch": 0.5676274944567627, "grad_norm": 0.6506041884422302, "learning_rate": 1.9595974497140275e-05, "loss": 1.3747804164886475, "step": 256 }, { "epoch": 0.5720620842572062, "grad_norm": 0.5146905779838562, "learning_rate": 1.958613681416825e-05, "loss": 1.3938028812408447, "step": 258 }, { "epoch": 0.5764966740576497, "grad_norm": 0.540286123752594, "learning_rate": 1.95761836151467e-05, "loss": 1.3356225490570068, "step": 260 }, { "epoch": 0.5809312638580931, "grad_norm": 1.7904235124588013, "learning_rate": 1.9566115033983843e-05, "loss": 0.817384660243988, "step": 262 }, { "epoch": 0.5853658536585366, "grad_norm": 1.602072834968567, "learning_rate": 1.955593120614021e-05, "loss": 1.4035075902938843, "step": 264 }, { "epoch": 0.5898004434589801, "grad_norm": 0.951567530632019, "learning_rate": 1.954563226862685e-05, "loss": 1.164678692817688, "step": 266 }, { "epoch": 0.5942350332594235, "grad_norm": 0.9935126900672913, "learning_rate": 1.953521836000346e-05, "loss": 1.6089775562286377, "step": 268 }, { "epoch": 0.5986696230598669, "grad_norm": 1.1680865287780762, "learning_rate": 1.9524689620376552e-05, "loss": 1.2622849941253662, "step": 270 }, { "epoch": 0.6031042128603105, "grad_norm": 0.8502325415611267, "learning_rate": 1.9514046191397532e-05, "loss": 1.2814254760742188, "step": 272 }, { "epoch": 0.6075388026607539, "grad_norm": 0.822547972202301, "learning_rate": 1.950328821626081e-05, "loss": 1.278984785079956, "step": 274 }, { "epoch": 0.6119733924611973, "grad_norm": 0.9921445250511169, "learning_rate": 1.9492415839701902e-05, "loss": 1.2716035842895508, "step": 276 }, { "epoch": 0.6164079822616408, "grad_norm": 2.4694504737854004, "learning_rate": 1.9481429207995424e-05, "loss": 1.2899194955825806, "step": 278 }, { "epoch": 0.6208425720620843, "grad_norm": 0.6362584829330444, "learning_rate": 1.9470328468953176e-05, "loss": 1.3732231855392456, "step": 280 }, { "epoch": 0.6252771618625277, "grad_norm": 1.02562415599823, "learning_rate": 1.9459113771922128e-05, "loss": 1.0229641199111938, "step": 282 }, { "epoch": 0.6297117516629712, "grad_norm": 0.6536508798599243, "learning_rate": 1.944778526778242e-05, "loss": 1.315395474433899, "step": 284 }, { "epoch": 0.6341463414634146, "grad_norm": 0.7477055788040161, "learning_rate": 1.9436343108945323e-05, "loss": 1.3944462537765503, "step": 286 }, { "epoch": 0.6385809312638581, "grad_norm": 0.5282856822013855, "learning_rate": 1.9424787449351194e-05, "loss": 1.3006008863449097, "step": 288 }, { "epoch": 0.6430155210643016, "grad_norm": 0.4401320219039917, "learning_rate": 1.9413118444467408e-05, "loss": 1.2911877632141113, "step": 290 }, { "epoch": 0.647450110864745, "grad_norm": 1.1002235412597656, "learning_rate": 1.9401336251286264e-05, "loss": 1.43943190574646, "step": 292 }, { "epoch": 0.6518847006651884, "grad_norm": 0.5872219204902649, "learning_rate": 1.9389441028322874e-05, "loss": 1.026016116142273, "step": 294 }, { "epoch": 0.656319290465632, "grad_norm": 0.5707578659057617, "learning_rate": 1.9377432935613016e-05, "loss": 1.0756226778030396, "step": 296 }, { "epoch": 0.6607538802660754, "grad_norm": 0.6579997539520264, "learning_rate": 1.936531213471101e-05, "loss": 1.2744524478912354, "step": 298 }, { "epoch": 0.6651884700665188, "grad_norm": 4.350220680236816, "learning_rate": 1.935307878868752e-05, "loss": 1.3224852085113525, "step": 300 }, { "epoch": 0.6696230598669624, "grad_norm": 0.5770370364189148, "learning_rate": 1.9340733062127373e-05, "loss": 1.2629750967025757, "step": 302 }, { "epoch": 0.6740576496674058, "grad_norm": 0.7492507696151733, "learning_rate": 1.9328275121127325e-05, "loss": 1.3276405334472656, "step": 304 }, { "epoch": 0.6784922394678492, "grad_norm": 0.9730760455131531, "learning_rate": 1.9315705133293857e-05, "loss": 1.2819868326187134, "step": 306 }, { "epoch": 0.6829268292682927, "grad_norm": 0.6775749921798706, "learning_rate": 1.9303023267740902e-05, "loss": 1.0328669548034668, "step": 308 }, { "epoch": 0.6873614190687362, "grad_norm": 0.6441645622253418, "learning_rate": 1.9290229695087562e-05, "loss": 1.2884297370910645, "step": 310 }, { "epoch": 0.6917960088691796, "grad_norm": 0.6459354162216187, "learning_rate": 1.9277324587455833e-05, "loss": 1.3426930904388428, "step": 312 }, { "epoch": 0.6962305986696231, "grad_norm": 0.715065598487854, "learning_rate": 1.9264308118468274e-05, "loss": 1.2753427028656006, "step": 314 }, { "epoch": 0.7006651884700665, "grad_norm": 0.7442984580993652, "learning_rate": 1.9251180463245675e-05, "loss": 1.4162836074829102, "step": 316 }, { "epoch": 0.70509977827051, "grad_norm": 0.6542792916297913, "learning_rate": 1.9237941798404708e-05, "loss": 1.1363985538482666, "step": 318 }, { "epoch": 0.7095343680709535, "grad_norm": 1.2111639976501465, "learning_rate": 1.922459230205553e-05, "loss": 1.0583592653274536, "step": 320 }, { "epoch": 0.7139689578713969, "grad_norm": 1.301080346107483, "learning_rate": 1.921113215379943e-05, "loss": 1.300571322441101, "step": 322 }, { "epoch": 0.7184035476718403, "grad_norm": 0.6510736346244812, "learning_rate": 1.9197561534726347e-05, "loss": 1.1844992637634277, "step": 324 }, { "epoch": 0.7228381374722838, "grad_norm": 2.18395733833313, "learning_rate": 1.9183880627412496e-05, "loss": 1.2481743097305298, "step": 326 }, { "epoch": 0.7272727272727273, "grad_norm": 0.287090003490448, "learning_rate": 1.9170089615917884e-05, "loss": 0.9507350325584412, "step": 328 }, { "epoch": 0.7317073170731707, "grad_norm": 1.0061886310577393, "learning_rate": 1.915618868578383e-05, "loss": 0.961956799030304, "step": 330 }, { "epoch": 0.7361419068736141, "grad_norm": 1.3770501613616943, "learning_rate": 1.9142178024030475e-05, "loss": 1.4702783823013306, "step": 332 }, { "epoch": 0.7405764966740577, "grad_norm": 4.349529266357422, "learning_rate": 1.9128057819154264e-05, "loss": 1.3034319877624512, "step": 334 }, { "epoch": 0.7450110864745011, "grad_norm": 0.5903530716896057, "learning_rate": 1.911382826112542e-05, "loss": 1.2915682792663574, "step": 336 }, { "epoch": 0.7494456762749445, "grad_norm": 1.9257632493972778, "learning_rate": 1.909948954138538e-05, "loss": 0.859005868434906, "step": 338 }, { "epoch": 0.753880266075388, "grad_norm": 4.127213001251221, "learning_rate": 1.908504185284421e-05, "loss": 0.4267387092113495, "step": 340 }, { "epoch": 0.7583148558758315, "grad_norm": 0.7010712027549744, "learning_rate": 1.9070485389878023e-05, "loss": 0.9848529696464539, "step": 342 }, { "epoch": 0.7627494456762749, "grad_norm": 0.5236337780952454, "learning_rate": 1.9055820348326358e-05, "loss": 1.400795340538025, "step": 344 }, { "epoch": 0.7671840354767184, "grad_norm": 0.49078261852264404, "learning_rate": 1.9041046925489552e-05, "loss": 1.304659128189087, "step": 346 }, { "epoch": 0.7716186252771619, "grad_norm": 0.8199257850646973, "learning_rate": 1.902616532012608e-05, "loss": 1.1828995943069458, "step": 348 }, { "epoch": 0.7760532150776053, "grad_norm": 0.66054368019104, "learning_rate": 1.9011175732449878e-05, "loss": 1.2884124517440796, "step": 350 }, { "epoch": 0.7804878048780488, "grad_norm": 1.6785452365875244, "learning_rate": 1.8996078364127655e-05, "loss": 1.2245346307754517, "step": 352 }, { "epoch": 0.7849223946784922, "grad_norm": 1.8945343494415283, "learning_rate": 1.898087341827618e-05, "loss": 1.0871098041534424, "step": 354 }, { "epoch": 0.7893569844789357, "grad_norm": 1.0700933933258057, "learning_rate": 1.896556109945954e-05, "loss": 1.2871757745742798, "step": 356 }, { "epoch": 0.7937915742793792, "grad_norm": 1.8673183917999268, "learning_rate": 1.8950141613686404e-05, "loss": 1.358439564704895, "step": 358 }, { "epoch": 0.7982261640798226, "grad_norm": 0.603571891784668, "learning_rate": 1.8934615168407237e-05, "loss": 1.295249104499817, "step": 360 }, { "epoch": 0.802660753880266, "grad_norm": 1.2410091161727905, "learning_rate": 1.891898197251151e-05, "loss": 0.8056436777114868, "step": 362 }, { "epoch": 0.8070953436807096, "grad_norm": 1.200040578842163, "learning_rate": 1.8903242236324907e-05, "loss": 1.4234434366226196, "step": 364 }, { "epoch": 0.811529933481153, "grad_norm": 0.47995078563690186, "learning_rate": 1.888739617160647e-05, "loss": 1.261313557624817, "step": 366 }, { "epoch": 0.8159645232815964, "grad_norm": 1.1774096488952637, "learning_rate": 1.8871443991545768e-05, "loss": 1.0709372758865356, "step": 368 }, { "epoch": 0.8203991130820399, "grad_norm": 1.2842013835906982, "learning_rate": 1.885538591076002e-05, "loss": 0.9137963652610779, "step": 370 }, { "epoch": 0.8248337028824834, "grad_norm": 0.7302650809288025, "learning_rate": 1.8839222145291217e-05, "loss": 1.29634690284729, "step": 372 }, { "epoch": 0.8292682926829268, "grad_norm": 0.611242413520813, "learning_rate": 1.88229529126032e-05, "loss": 1.2931721210479736, "step": 374 }, { "epoch": 0.8337028824833703, "grad_norm": 0.4736635684967041, "learning_rate": 1.8806578431578747e-05, "loss": 1.2644020318984985, "step": 376 }, { "epoch": 0.8381374722838137, "grad_norm": 1.342694640159607, "learning_rate": 1.8790098922516637e-05, "loss": 1.3544180393218994, "step": 378 }, { "epoch": 0.8425720620842572, "grad_norm": 1.1542247533798218, "learning_rate": 1.8773514607128647e-05, "loss": 0.9301992654800415, "step": 380 }, { "epoch": 0.8470066518847007, "grad_norm": 0.5767038464546204, "learning_rate": 1.875682570853662e-05, "loss": 1.3983073234558105, "step": 382 }, { "epoch": 0.8514412416851441, "grad_norm": 0.7877940535545349, "learning_rate": 1.8740032451269438e-05, "loss": 1.195070743560791, "step": 384 }, { "epoch": 0.8558758314855875, "grad_norm": 0.44601938128471375, "learning_rate": 1.8723135061259977e-05, "loss": 1.3003090620040894, "step": 386 }, { "epoch": 0.8603104212860311, "grad_norm": 0.35469481348991394, "learning_rate": 1.8706133765842126e-05, "loss": 1.2766008377075195, "step": 388 }, { "epoch": 0.8647450110864745, "grad_norm": 0.4653916358947754, "learning_rate": 1.8689028793747673e-05, "loss": 1.3040666580200195, "step": 390 }, { "epoch": 0.8691796008869179, "grad_norm": 0.6527778506278992, "learning_rate": 1.8671820375103256e-05, "loss": 1.0266871452331543, "step": 392 }, { "epoch": 0.8736141906873615, "grad_norm": 0.5240263938903809, "learning_rate": 1.8654508741427272e-05, "loss": 1.2564506530761719, "step": 394 }, { "epoch": 0.8780487804878049, "grad_norm": 0.437155157327652, "learning_rate": 1.863709412562672e-05, "loss": 1.246124505996704, "step": 396 }, { "epoch": 0.8824833702882483, "grad_norm": 0.8538821935653687, "learning_rate": 1.8619576761994137e-05, "loss": 1.2513529062271118, "step": 398 }, { "epoch": 0.8869179600886918, "grad_norm": 0.49160391092300415, "learning_rate": 1.860195688620438e-05, "loss": 0.6274079084396362, "step": 400 }, { "epoch": 0.8913525498891353, "grad_norm": 0.4428112506866455, "learning_rate": 1.8584234735311497e-05, "loss": 1.119248390197754, "step": 402 }, { "epoch": 0.8957871396895787, "grad_norm": 0.24119903147220612, "learning_rate": 1.8566410547745514e-05, "loss": 1.0662287473678589, "step": 404 }, { "epoch": 0.9002217294900222, "grad_norm": 1.1826022863388062, "learning_rate": 1.8548484563309243e-05, "loss": 1.3069649934768677, "step": 406 }, { "epoch": 0.9046563192904656, "grad_norm": 2.6790738105773926, "learning_rate": 1.853045702317505e-05, "loss": 1.210648536682129, "step": 408 }, { "epoch": 0.9090909090909091, "grad_norm": 0.8086345195770264, "learning_rate": 1.85123281698816e-05, "loss": 1.22344172000885, "step": 410 }, { "epoch": 0.9135254988913526, "grad_norm": 0.46482929587364197, "learning_rate": 1.8494098247330613e-05, "loss": 1.2734506130218506, "step": 412 }, { "epoch": 0.917960088691796, "grad_norm": 0.6504107117652893, "learning_rate": 1.847576750078357e-05, "loss": 1.2879432439804077, "step": 414 }, { "epoch": 0.9223946784922394, "grad_norm": 2.2455458641052246, "learning_rate": 1.8457336176858425e-05, "loss": 1.043541431427002, "step": 416 }, { "epoch": 0.926829268292683, "grad_norm": 0.47505319118499756, "learning_rate": 1.8438804523526258e-05, "loss": 1.339963674545288, "step": 418 }, { "epoch": 0.9312638580931264, "grad_norm": 1.4694486856460571, "learning_rate": 1.8420172790107983e-05, "loss": 0.8636243939399719, "step": 420 }, { "epoch": 0.9356984478935698, "grad_norm": 1.0414270162582397, "learning_rate": 1.8401441227270953e-05, "loss": 1.5467491149902344, "step": 422 }, { "epoch": 0.9401330376940134, "grad_norm": 1.4648017883300781, "learning_rate": 1.838261008702561e-05, "loss": 1.1460201740264893, "step": 424 }, { "epoch": 0.9445676274944568, "grad_norm": 0.5038813352584839, "learning_rate": 1.8363679622722096e-05, "loss": 1.2603991031646729, "step": 426 }, { "epoch": 0.9490022172949002, "grad_norm": 0.6404750347137451, "learning_rate": 1.8344650089046826e-05, "loss": 1.1844969987869263, "step": 428 }, { "epoch": 0.9534368070953437, "grad_norm": 2.21321439743042, "learning_rate": 1.832552174201908e-05, "loss": 0.8131325840950012, "step": 430 }, { "epoch": 0.9578713968957872, "grad_norm": 0.49369490146636963, "learning_rate": 1.830629483898755e-05, "loss": 1.2790230512619019, "step": 432 }, { "epoch": 0.9623059866962306, "grad_norm": 6.766321659088135, "learning_rate": 1.8286969638626882e-05, "loss": 1.2089905738830566, "step": 434 }, { "epoch": 0.9667405764966741, "grad_norm": 0.7347844839096069, "learning_rate": 1.826754640093419e-05, "loss": 1.3173238039016724, "step": 436 }, { "epoch": 0.9711751662971175, "grad_norm": 0.564915657043457, "learning_rate": 1.824802538722556e-05, "loss": 1.2954607009887695, "step": 438 }, { "epoch": 0.975609756097561, "grad_norm": 2.1599206924438477, "learning_rate": 1.8228406860132545e-05, "loss": 0.8611724376678467, "step": 440 }, { "epoch": 0.9800443458980045, "grad_norm": 0.5106037259101868, "learning_rate": 1.8208691083598607e-05, "loss": 1.1488136053085327, "step": 442 }, { "epoch": 0.9844789356984479, "grad_norm": 0.9815554618835449, "learning_rate": 1.8188878322875594e-05, "loss": 1.3558589220046997, "step": 444 }, { "epoch": 0.9889135254988913, "grad_norm": 0.6858358979225159, "learning_rate": 1.8168968844520157e-05, "loss": 1.2466365098953247, "step": 446 }, { "epoch": 0.9933481152993349, "grad_norm": 1.2758557796478271, "learning_rate": 1.8148962916390154e-05, "loss": 1.2831544876098633, "step": 448 }, { "epoch": 0.9977827050997783, "grad_norm": 0.47892308235168457, "learning_rate": 1.8128860807641076e-05, "loss": 1.1054222583770752, "step": 450 }, { "epoch": 1.0022172949002217, "grad_norm": 1.9382197856903076, "learning_rate": 1.810866278872239e-05, "loss": 1.0697418451309204, "step": 452 }, { "epoch": 1.0066518847006651, "grad_norm": 1.1436439752578735, "learning_rate": 1.8088369131373925e-05, "loss": 1.2170673608779907, "step": 454 }, { "epoch": 1.0110864745011086, "grad_norm": 0.7328348159790039, "learning_rate": 1.8067980108622217e-05, "loss": 1.1548501253128052, "step": 456 }, { "epoch": 1.0155210643015522, "grad_norm": 0.8741162419319153, "learning_rate": 1.8047495994776817e-05, "loss": 0.7017601132392883, "step": 458 }, { "epoch": 1.0199556541019956, "grad_norm": 0.3321545124053955, "learning_rate": 1.8026917065426605e-05, "loss": 0.7321120500564575, "step": 460 }, { "epoch": 1.024390243902439, "grad_norm": 1.3679202795028687, "learning_rate": 1.800624359743611e-05, "loss": 0.5792034864425659, "step": 462 }, { "epoch": 1.0288248337028825, "grad_norm": 0.7569698095321655, "learning_rate": 1.798547586894175e-05, "loss": 0.7689359188079834, "step": 464 }, { "epoch": 1.033259423503326, "grad_norm": 0.7644620537757874, "learning_rate": 1.7964614159348103e-05, "loss": 0.698060154914856, "step": 466 }, { "epoch": 1.0376940133037693, "grad_norm": 1.2388887405395508, "learning_rate": 1.794365874932415e-05, "loss": 0.8797460198402405, "step": 468 }, { "epoch": 1.042128603104213, "grad_norm": 0.9471485018730164, "learning_rate": 1.7922609920799493e-05, "loss": 0.6286487579345703, "step": 470 }, { "epoch": 1.0465631929046564, "grad_norm": 2.5266878604888916, "learning_rate": 1.790146795696059e-05, "loss": 1.0638426542282104, "step": 472 }, { "epoch": 1.0509977827050998, "grad_norm": 0.6257596015930176, "learning_rate": 1.7880233142246884e-05, "loss": 1.0050872564315796, "step": 474 }, { "epoch": 1.0554323725055432, "grad_norm": 0.5379915237426758, "learning_rate": 1.7858905762347044e-05, "loss": 0.9805111289024353, "step": 476 }, { "epoch": 1.0598669623059866, "grad_norm": 0.8328865170478821, "learning_rate": 1.783748610419508e-05, "loss": 1.1784859895706177, "step": 478 }, { "epoch": 1.06430155210643, "grad_norm": 0.44074714183807373, "learning_rate": 1.7815974455966488e-05, "loss": 0.6814610958099365, "step": 480 }, { "epoch": 1.0687361419068737, "grad_norm": 1.1742632389068604, "learning_rate": 1.7794371107074398e-05, "loss": 1.1012016534805298, "step": 482 }, { "epoch": 1.0731707317073171, "grad_norm": 0.37530067563056946, "learning_rate": 1.7772676348165637e-05, "loss": 0.9307145476341248, "step": 484 }, { "epoch": 1.0776053215077606, "grad_norm": 0.42450839281082153, "learning_rate": 1.7750890471116858e-05, "loss": 0.963620662689209, "step": 486 }, { "epoch": 1.082039911308204, "grad_norm": 0.47807762026786804, "learning_rate": 1.7729013769030596e-05, "loss": 0.7537004351615906, "step": 488 }, { "epoch": 1.0864745011086474, "grad_norm": 0.4078989028930664, "learning_rate": 1.7707046536231325e-05, "loss": 0.854632556438446, "step": 490 }, { "epoch": 1.0909090909090908, "grad_norm": 0.6203530430793762, "learning_rate": 1.76849890682615e-05, "loss": 0.9603514671325684, "step": 492 }, { "epoch": 1.0953436807095343, "grad_norm": 1.7032476663589478, "learning_rate": 1.7662841661877574e-05, "loss": 1.0737708806991577, "step": 494 }, { "epoch": 1.099778270509978, "grad_norm": 1.1234840154647827, "learning_rate": 1.7640604615046025e-05, "loss": 0.9386560320854187, "step": 496 }, { "epoch": 1.1042128603104213, "grad_norm": 0.427051842212677, "learning_rate": 1.7618278226939327e-05, "loss": 0.9625406265258789, "step": 498 }, { "epoch": 1.1086474501108647, "grad_norm": 0.7077636122703552, "learning_rate": 1.7595862797931936e-05, "loss": 0.6286700367927551, "step": 500 }, { "epoch": 1.1130820399113082, "grad_norm": 0.5965766310691833, "learning_rate": 1.757335862959624e-05, "loss": 0.9419457316398621, "step": 502 }, { "epoch": 1.1175166297117516, "grad_norm": 0.7379962801933289, "learning_rate": 1.755076602469851e-05, "loss": 0.8069853186607361, "step": 504 }, { "epoch": 1.1219512195121952, "grad_norm": 1.0986132621765137, "learning_rate": 1.7528085287194827e-05, "loss": 0.8290332555770874, "step": 506 }, { "epoch": 1.1263858093126387, "grad_norm": 1.4528342485427856, "learning_rate": 1.750531672222698e-05, "loss": 0.6308746933937073, "step": 508 }, { "epoch": 1.130820399113082, "grad_norm": 0.7668278217315674, "learning_rate": 1.7482460636118377e-05, "loss": 1.0766762495040894, "step": 510 }, { "epoch": 1.1352549889135255, "grad_norm": 1.3378920555114746, "learning_rate": 1.745951733636992e-05, "loss": 0.5383997559547424, "step": 512 }, { "epoch": 1.139689578713969, "grad_norm": 1.2324367761611938, "learning_rate": 1.7436487131655855e-05, "loss": 0.4129646420478821, "step": 514 }, { "epoch": 1.1441241685144123, "grad_norm": 0.6832541823387146, "learning_rate": 1.7413370331819634e-05, "loss": 0.8020773530006409, "step": 516 }, { "epoch": 1.1485587583148558, "grad_norm": 1.0301239490509033, "learning_rate": 1.7390167247869743e-05, "loss": 0.9460446238517761, "step": 518 }, { "epoch": 1.1529933481152994, "grad_norm": 1.7787998914718628, "learning_rate": 1.7366878191975516e-05, "loss": 1.080168604850769, "step": 520 }, { "epoch": 1.1574279379157428, "grad_norm": 1.1747550964355469, "learning_rate": 1.7343503477462927e-05, "loss": 0.534135639667511, "step": 522 }, { "epoch": 1.1618625277161863, "grad_norm": 0.5435235500335693, "learning_rate": 1.7320043418810394e-05, "loss": 0.9134470820426941, "step": 524 }, { "epoch": 1.1662971175166297, "grad_norm": 0.5852527022361755, "learning_rate": 1.729649833164453e-05, "loss": 1.0747884511947632, "step": 526 }, { "epoch": 1.170731707317073, "grad_norm": 0.5314655900001526, "learning_rate": 1.727286853273591e-05, "loss": 0.6440135836601257, "step": 528 }, { "epoch": 1.1751662971175167, "grad_norm": 0.5095431208610535, "learning_rate": 1.7249154339994788e-05, "loss": 0.8419979810714722, "step": 530 }, { "epoch": 1.1796008869179602, "grad_norm": 0.4051227569580078, "learning_rate": 1.7225356072466856e-05, "loss": 0.8316261768341064, "step": 532 }, { "epoch": 1.1840354767184036, "grad_norm": 0.3643783628940582, "learning_rate": 1.720147405032891e-05, "loss": 0.9231957197189331, "step": 534 }, { "epoch": 1.188470066518847, "grad_norm": 0.32051339745521545, "learning_rate": 1.7177508594884576e-05, "loss": 0.6917131543159485, "step": 536 }, { "epoch": 1.1929046563192904, "grad_norm": 0.6921893358230591, "learning_rate": 1.7153460028559964e-05, "loss": 1.00527024269104, "step": 538 }, { "epoch": 1.1973392461197339, "grad_norm": 0.6226311922073364, "learning_rate": 1.7129328674899354e-05, "loss": 0.7679756879806519, "step": 540 }, { "epoch": 1.2017738359201773, "grad_norm": 1.1230734586715698, "learning_rate": 1.7105114858560813e-05, "loss": 0.6591505408287048, "step": 542 }, { "epoch": 1.206208425720621, "grad_norm": 0.9631316661834717, "learning_rate": 1.7080818905311853e-05, "loss": 0.9413385987281799, "step": 544 }, { "epoch": 1.2106430155210643, "grad_norm": 0.3299412727355957, "learning_rate": 1.7056441142025037e-05, "loss": 0.7805101275444031, "step": 546 }, { "epoch": 1.2150776053215078, "grad_norm": 0.6347978115081787, "learning_rate": 1.703198189667358e-05, "loss": 1.2124230861663818, "step": 548 }, { "epoch": 1.2195121951219512, "grad_norm": 1.2306925058364868, "learning_rate": 1.7007441498326943e-05, "loss": 0.6341520547866821, "step": 550 }, { "epoch": 1.2239467849223946, "grad_norm": 0.6283694505691528, "learning_rate": 1.6982820277146403e-05, "loss": 0.971120297908783, "step": 552 }, { "epoch": 1.2283813747228383, "grad_norm": 2.13574481010437, "learning_rate": 1.6958118564380596e-05, "loss": 0.7344387173652649, "step": 554 }, { "epoch": 1.2328159645232817, "grad_norm": 0.3253254294395447, "learning_rate": 1.6933336692361097e-05, "loss": 0.7349171042442322, "step": 556 }, { "epoch": 1.237250554323725, "grad_norm": 2.8170223236083984, "learning_rate": 1.6908474994497912e-05, "loss": 0.588421106338501, "step": 558 }, { "epoch": 1.2416851441241685, "grad_norm": 1.3332557678222656, "learning_rate": 1.688353380527501e-05, "loss": 1.1083375215530396, "step": 560 }, { "epoch": 1.246119733924612, "grad_norm": 1.18131685256958, "learning_rate": 1.6858513460245818e-05, "loss": 0.8837442398071289, "step": 562 }, { "epoch": 1.2505543237250554, "grad_norm": 0.6048891544342041, "learning_rate": 1.6833414296028717e-05, "loss": 0.6526999473571777, "step": 564 }, { "epoch": 1.2549889135254988, "grad_norm": 0.5266470909118652, "learning_rate": 1.680823665030249e-05, "loss": 0.8695023655891418, "step": 566 }, { "epoch": 1.2594235033259422, "grad_norm": 0.5137091279029846, "learning_rate": 1.6782980861801804e-05, "loss": 0.8212327361106873, "step": 568 }, { "epoch": 1.2638580931263859, "grad_norm": 0.5950433015823364, "learning_rate": 1.6757647270312637e-05, "loss": 1.1734381914138794, "step": 570 }, { "epoch": 1.2682926829268293, "grad_norm": 0.4560319185256958, "learning_rate": 1.6732236216667722e-05, "loss": 0.739474892616272, "step": 572 }, { "epoch": 1.2727272727272727, "grad_norm": 0.6213061809539795, "learning_rate": 1.6706748042741935e-05, "loss": 1.2839826345443726, "step": 574 }, { "epoch": 1.2771618625277161, "grad_norm": 0.5989497900009155, "learning_rate": 1.6681183091447722e-05, "loss": 0.9160253405570984, "step": 576 }, { "epoch": 1.2815964523281598, "grad_norm": 1.3319306373596191, "learning_rate": 1.6655541706730476e-05, "loss": 1.093945860862732, "step": 578 }, { "epoch": 1.2860310421286032, "grad_norm": 0.5771936774253845, "learning_rate": 1.6629824233563908e-05, "loss": 1.0052553415298462, "step": 580 }, { "epoch": 1.2904656319290466, "grad_norm": 2.056089401245117, "learning_rate": 1.6604031017945403e-05, "loss": 1.3277779817581177, "step": 582 }, { "epoch": 1.29490022172949, "grad_norm": 0.4700315594673157, "learning_rate": 1.657816240689137e-05, "loss": 0.7094478607177734, "step": 584 }, { "epoch": 1.2993348115299335, "grad_norm": 0.4772210419178009, "learning_rate": 1.6552218748432572e-05, "loss": 0.7443241477012634, "step": 586 }, { "epoch": 1.3037694013303769, "grad_norm": 1.3316142559051514, "learning_rate": 1.6526200391609445e-05, "loss": 0.5478697419166565, "step": 588 }, { "epoch": 1.3082039911308203, "grad_norm": 2.8271443843841553, "learning_rate": 1.6500107686467407e-05, "loss": 1.0316827297210693, "step": 590 }, { "epoch": 1.3126385809312637, "grad_norm": 0.5958804488182068, "learning_rate": 1.6473940984052125e-05, "loss": 0.9526193141937256, "step": 592 }, { "epoch": 1.3170731707317074, "grad_norm": 0.8103643655776978, "learning_rate": 1.644770063640483e-05, "loss": 0.956438422203064, "step": 594 }, { "epoch": 1.3215077605321508, "grad_norm": 0.49165335297584534, "learning_rate": 1.6421386996557546e-05, "loss": 1.1481645107269287, "step": 596 }, { "epoch": 1.3259423503325942, "grad_norm": 0.7782723903656006, "learning_rate": 1.6395000418528362e-05, "loss": 0.9521985650062561, "step": 598 }, { "epoch": 1.3303769401330376, "grad_norm": 0.4783051609992981, "learning_rate": 1.636854125731666e-05, "loss": 0.47762957215309143, "step": 600 }, { "epoch": 1.3348115299334813, "grad_norm": 0.8502888679504395, "learning_rate": 1.6342009868898332e-05, "loss": 0.7853302955627441, "step": 602 }, { "epoch": 1.3392461197339247, "grad_norm": 0.7362395524978638, "learning_rate": 1.6315406610221017e-05, "loss": 0.842612087726593, "step": 604 }, { "epoch": 1.3436807095343681, "grad_norm": 0.31031566858291626, "learning_rate": 1.6288731839199265e-05, "loss": 0.8278278708457947, "step": 606 }, { "epoch": 1.3481152993348116, "grad_norm": 0.6640880703926086, "learning_rate": 1.6261985914709745e-05, "loss": 1.028430461883545, "step": 608 }, { "epoch": 1.352549889135255, "grad_norm": 1.618883490562439, "learning_rate": 1.6235169196586408e-05, "loss": 1.1671243906021118, "step": 610 }, { "epoch": 1.3569844789356984, "grad_norm": 0.8194751739501953, "learning_rate": 1.6208282045615648e-05, "loss": 0.717631459236145, "step": 612 }, { "epoch": 1.3614190687361418, "grad_norm": 0.5236591100692749, "learning_rate": 1.618132482353145e-05, "loss": 1.0824005603790283, "step": 614 }, { "epoch": 1.3658536585365852, "grad_norm": 0.30997705459594727, "learning_rate": 1.6154297893010516e-05, "loss": 0.705600917339325, "step": 616 }, { "epoch": 1.370288248337029, "grad_norm": 0.5286486744880676, "learning_rate": 1.6127201617667396e-05, "loss": 0.8719974756240845, "step": 618 }, { "epoch": 1.3747228381374723, "grad_norm": 0.5527012348175049, "learning_rate": 1.6100036362049576e-05, "loss": 0.10983101278543472, "step": 620 }, { "epoch": 1.3791574279379157, "grad_norm": 0.4935061037540436, "learning_rate": 1.6072802491632612e-05, "loss": 0.9561376571655273, "step": 622 }, { "epoch": 1.3835920177383592, "grad_norm": 0.8581332564353943, "learning_rate": 1.6045500372815173e-05, "loss": 0.9489790201187134, "step": 624 }, { "epoch": 1.3880266075388026, "grad_norm": 1.1202986240386963, "learning_rate": 1.6018130372914123e-05, "loss": 0.9768886566162109, "step": 626 }, { "epoch": 1.3924611973392462, "grad_norm": 0.5203282833099365, "learning_rate": 1.5990692860159597e-05, "loss": 0.8944608569145203, "step": 628 }, { "epoch": 1.3968957871396896, "grad_norm": 0.44260817766189575, "learning_rate": 1.5963188203690025e-05, "loss": 1.0010405778884888, "step": 630 }, { "epoch": 1.401330376940133, "grad_norm": 0.5329799652099609, "learning_rate": 1.5935616773547182e-05, "loss": 0.8816275000572205, "step": 632 }, { "epoch": 1.4057649667405765, "grad_norm": 0.8102928400039673, "learning_rate": 1.5907978940671183e-05, "loss": 0.9644457101821899, "step": 634 }, { "epoch": 1.41019955654102, "grad_norm": 0.551501989364624, "learning_rate": 1.5880275076895537e-05, "loss": 0.9486368894577026, "step": 636 }, { "epoch": 1.4146341463414633, "grad_norm": 4.090445041656494, "learning_rate": 1.58525055549421e-05, "loss": 0.6854583024978638, "step": 638 }, { "epoch": 1.4190687361419068, "grad_norm": 0.5645637512207031, "learning_rate": 1.5824670748416085e-05, "loss": 0.900244414806366, "step": 640 }, { "epoch": 1.4235033259423504, "grad_norm": 0.7116575837135315, "learning_rate": 1.5796771031801034e-05, "loss": 0.8295862674713135, "step": 642 }, { "epoch": 1.4279379157427938, "grad_norm": 0.7264999747276306, "learning_rate": 1.5768806780453766e-05, "loss": 0.6157872676849365, "step": 644 }, { "epoch": 1.4323725055432373, "grad_norm": 0.608518123626709, "learning_rate": 1.5740778370599344e-05, "loss": 1.0620026588439941, "step": 646 }, { "epoch": 1.4368070953436807, "grad_norm": 0.5453920364379883, "learning_rate": 1.5712686179326004e-05, "loss": 1.2050490379333496, "step": 648 }, { "epoch": 1.441241685144124, "grad_norm": 0.42610880732536316, "learning_rate": 1.5684530584580077e-05, "loss": 1.1291793584823608, "step": 650 }, { "epoch": 1.4456762749445677, "grad_norm": 2.327178716659546, "learning_rate": 1.565631196516093e-05, "loss": 0.8947151899337769, "step": 652 }, { "epoch": 1.4501108647450112, "grad_norm": 0.7120440602302551, "learning_rate": 1.5628030700715824e-05, "loss": 0.8887991905212402, "step": 654 }, { "epoch": 1.4545454545454546, "grad_norm": 1.0359218120574951, "learning_rate": 1.5599687171734853e-05, "loss": 0.7058618664741516, "step": 656 }, { "epoch": 1.458980044345898, "grad_norm": 0.5742671489715576, "learning_rate": 1.5571281759545793e-05, "loss": 0.7722383141517639, "step": 658 }, { "epoch": 1.4634146341463414, "grad_norm": 0.6867632865905762, "learning_rate": 1.5542814846308996e-05, "loss": 0.9778433442115784, "step": 660 }, { "epoch": 1.4678492239467849, "grad_norm": 0.42144981026649475, "learning_rate": 1.5514286815012222e-05, "loss": 0.9305572509765625, "step": 662 }, { "epoch": 1.4722838137472283, "grad_norm": 0.5244068503379822, "learning_rate": 1.548569804946551e-05, "loss": 0.7543381452560425, "step": 664 }, { "epoch": 1.476718403547672, "grad_norm": 0.4360713064670563, "learning_rate": 1.5457048934296e-05, "loss": 0.4798527956008911, "step": 666 }, { "epoch": 1.4811529933481153, "grad_norm": 0.905125081539154, "learning_rate": 1.5428339854942757e-05, "loss": 0.5245689749717712, "step": 668 }, { "epoch": 1.4855875831485588, "grad_norm": 0.6136901378631592, "learning_rate": 1.539957119765161e-05, "loss": 0.9089503884315491, "step": 670 }, { "epoch": 1.4900221729490022, "grad_norm": 0.4613928496837616, "learning_rate": 1.537074334946992e-05, "loss": 0.9715514779090881, "step": 672 }, { "epoch": 1.4944567627494456, "grad_norm": 0.6848336458206177, "learning_rate": 1.5341856698241397e-05, "loss": 0.6604840755462646, "step": 674 }, { "epoch": 1.4988913525498893, "grad_norm": 0.7074861526489258, "learning_rate": 1.531291163260087e-05, "loss": 0.6721962094306946, "step": 676 }, { "epoch": 1.5033259423503327, "grad_norm": 0.7671158909797668, "learning_rate": 1.5283908541969064e-05, "loss": 1.0287514925003052, "step": 678 }, { "epoch": 1.507760532150776, "grad_norm": 0.46018627285957336, "learning_rate": 1.5254847816547366e-05, "loss": 0.5789790153503418, "step": 680 }, { "epoch": 1.5121951219512195, "grad_norm": 0.5391964316368103, "learning_rate": 1.522572984731256e-05, "loss": 0.5692949295043945, "step": 682 }, { "epoch": 1.516629711751663, "grad_norm": 0.523459792137146, "learning_rate": 1.5196555026011585e-05, "loss": 0.934548556804657, "step": 684 }, { "epoch": 1.5210643015521064, "grad_norm": 0.328876793384552, "learning_rate": 1.5167323745156248e-05, "loss": 0.9151366949081421, "step": 686 }, { "epoch": 1.5254988913525498, "grad_norm": 0.5242407321929932, "learning_rate": 1.5138036398017953e-05, "loss": 0.5513712763786316, "step": 688 }, { "epoch": 1.5299334811529932, "grad_norm": 0.38611844182014465, "learning_rate": 1.510869337862241e-05, "loss": 0.281048059463501, "step": 690 }, { "epoch": 1.5343680709534369, "grad_norm": 1.463240146636963, "learning_rate": 1.507929508174433e-05, "loss": 0.8684556484222412, "step": 692 }, { "epoch": 1.5388026607538803, "grad_norm": 1.3095505237579346, "learning_rate": 1.5049841902902119e-05, "loss": 0.8829594254493713, "step": 694 }, { "epoch": 1.5432372505543237, "grad_norm": 1.3540315628051758, "learning_rate": 1.5020334238352546e-05, "loss": 0.5511650443077087, "step": 696 }, { "epoch": 1.5476718403547673, "grad_norm": 0.36952298879623413, "learning_rate": 1.499077248508542e-05, "loss": 1.02639639377594, "step": 698 }, { "epoch": 1.5521064301552108, "grad_norm": 1.0932236909866333, "learning_rate": 1.496115704081826e-05, "loss": 1.0058600902557373, "step": 700 }, { "epoch": 1.5565410199556542, "grad_norm": 0.49011874198913574, "learning_rate": 1.4931488303990916e-05, "loss": 1.0263029336929321, "step": 702 }, { "epoch": 1.5609756097560976, "grad_norm": 1.3680771589279175, "learning_rate": 1.4901766673760232e-05, "loss": 0.824455738067627, "step": 704 }, { "epoch": 1.565410199556541, "grad_norm": 0.5223835110664368, "learning_rate": 1.4871992549994673e-05, "loss": 0.4502509832382202, "step": 706 }, { "epoch": 1.5698447893569845, "grad_norm": 0.5144345164299011, "learning_rate": 1.4842166333268932e-05, "loss": 1.0360265970230103, "step": 708 }, { "epoch": 1.5742793791574279, "grad_norm": 1.030713438987732, "learning_rate": 1.481228842485856e-05, "loss": 0.8033937215805054, "step": 710 }, { "epoch": 1.5787139689578713, "grad_norm": 0.8714462518692017, "learning_rate": 1.4782359226734544e-05, "loss": 0.6804985404014587, "step": 712 }, { "epoch": 1.5831485587583147, "grad_norm": 0.4418451488018036, "learning_rate": 1.475237914155792e-05, "loss": 0.9747523665428162, "step": 714 }, { "epoch": 1.5875831485587582, "grad_norm": 0.4844651520252228, "learning_rate": 1.472234857267435e-05, "loss": 0.9988541603088379, "step": 716 }, { "epoch": 1.5920177383592018, "grad_norm": 1.146903395652771, "learning_rate": 1.4692267924108683e-05, "loss": 1.0589611530303955, "step": 718 }, { "epoch": 1.5964523281596452, "grad_norm": 1.1565581560134888, "learning_rate": 1.466213760055954e-05, "loss": 0.5897700786590576, "step": 720 }, { "epoch": 1.6008869179600886, "grad_norm": 0.23559361696243286, "learning_rate": 1.4631958007393854e-05, "loss": 0.4846925735473633, "step": 722 }, { "epoch": 1.6053215077605323, "grad_norm": 0.4940757751464844, "learning_rate": 1.4601729550641417e-05, "loss": 1.0242489576339722, "step": 724 }, { "epoch": 1.6097560975609757, "grad_norm": 1.7630901336669922, "learning_rate": 1.4571452636989433e-05, "loss": 1.0372512340545654, "step": 726 }, { "epoch": 1.6141906873614191, "grad_norm": 0.36424028873443604, "learning_rate": 1.4541127673777021e-05, "loss": 0.7359429001808167, "step": 728 }, { "epoch": 1.6186252771618626, "grad_norm": 0.4631586968898773, "learning_rate": 1.451075506898975e-05, "loss": 0.9926391839981079, "step": 730 }, { "epoch": 1.623059866962306, "grad_norm": 0.43977200984954834, "learning_rate": 1.4480335231254164e-05, "loss": 0.9845470786094666, "step": 732 }, { "epoch": 1.6274944567627494, "grad_norm": 0.5064222812652588, "learning_rate": 1.4449868569832253e-05, "loss": 0.9982655048370361, "step": 734 }, { "epoch": 1.6319290465631928, "grad_norm": 0.2603287994861603, "learning_rate": 1.4419355494615963e-05, "loss": 0.45653244853019714, "step": 736 }, { "epoch": 1.6363636363636362, "grad_norm": 0.5068104863166809, "learning_rate": 1.4388796416121696e-05, "loss": 1.2514511346817017, "step": 738 }, { "epoch": 1.6407982261640797, "grad_norm": 0.39673784375190735, "learning_rate": 1.4358191745484755e-05, "loss": 0.9661815166473389, "step": 740 }, { "epoch": 1.6452328159645233, "grad_norm": 0.9892500638961792, "learning_rate": 1.432754189445384e-05, "loss": 1.1122088432312012, "step": 742 }, { "epoch": 1.6496674057649667, "grad_norm": 0.6944724917411804, "learning_rate": 1.4296847275385495e-05, "loss": 0.7954747080802917, "step": 744 }, { "epoch": 1.6541019955654102, "grad_norm": 1.078669548034668, "learning_rate": 1.4266108301238564e-05, "loss": 0.856575071811676, "step": 746 }, { "epoch": 1.6585365853658538, "grad_norm": 0.7615432143211365, "learning_rate": 1.4235325385568636e-05, "loss": 0.6531709432601929, "step": 748 }, { "epoch": 1.6629711751662972, "grad_norm": 0.47316062450408936, "learning_rate": 1.4204498942522482e-05, "loss": 0.971373975276947, "step": 750 }, { "epoch": 1.6674057649667406, "grad_norm": 0.4431406259536743, "learning_rate": 1.4173629386832473e-05, "loss": 0.7244459390640259, "step": 752 }, { "epoch": 1.671840354767184, "grad_norm": 0.5017882585525513, "learning_rate": 1.4142717133811013e-05, "loss": 0.5894262790679932, "step": 754 }, { "epoch": 1.6762749445676275, "grad_norm": 0.7016173005104065, "learning_rate": 1.4111762599344952e-05, "loss": 1.006710171699524, "step": 756 }, { "epoch": 1.680709534368071, "grad_norm": 0.8765194416046143, "learning_rate": 1.4080766199889976e-05, "loss": 0.9072303771972656, "step": 758 }, { "epoch": 1.6851441241685143, "grad_norm": 1.2686158418655396, "learning_rate": 1.404972835246502e-05, "loss": 0.8974109292030334, "step": 760 }, { "epoch": 1.6895787139689578, "grad_norm": 0.8306912183761597, "learning_rate": 1.401864947464665e-05, "loss": 0.8825592994689941, "step": 762 }, { "epoch": 1.6940133037694012, "grad_norm": 1.107991337776184, "learning_rate": 1.3987529984563444e-05, "loss": 0.9357943534851074, "step": 764 }, { "epoch": 1.6984478935698448, "grad_norm": 1.4103295803070068, "learning_rate": 1.3956370300890374e-05, "loss": 1.0407212972640991, "step": 766 }, { "epoch": 1.7028824833702882, "grad_norm": 1.0025876760482788, "learning_rate": 1.392517084284316e-05, "loss": 0.6954239010810852, "step": 768 }, { "epoch": 1.7073170731707317, "grad_norm": 0.5951728224754333, "learning_rate": 1.3893932030172642e-05, "loss": 0.9474072456359863, "step": 770 }, { "epoch": 1.7117516629711753, "grad_norm": 1.6196831464767456, "learning_rate": 1.386265428315913e-05, "loss": 0.9979518055915833, "step": 772 }, { "epoch": 1.7161862527716187, "grad_norm": 0.4795306622982025, "learning_rate": 1.3831338022606748e-05, "loss": 0.8625308275222778, "step": 774 }, { "epoch": 1.7206208425720622, "grad_norm": 0.9456951022148132, "learning_rate": 1.3799983669837768e-05, "loss": 0.9803452491760254, "step": 776 }, { "epoch": 1.7250554323725056, "grad_norm": 0.46205422282218933, "learning_rate": 1.3768591646686957e-05, "loss": 1.0163923501968384, "step": 778 }, { "epoch": 1.729490022172949, "grad_norm": 0.6149927377700806, "learning_rate": 1.3737162375495883e-05, "loss": 0.5648257732391357, "step": 780 }, { "epoch": 1.7339246119733924, "grad_norm": 0.35180729627609253, "learning_rate": 1.3705696279107238e-05, "loss": 0.9397526979446411, "step": 782 }, { "epoch": 1.7383592017738358, "grad_norm": 0.3703164756298065, "learning_rate": 1.3674193780859163e-05, "loss": 0.6409098505973816, "step": 784 }, { "epoch": 1.7427937915742793, "grad_norm": 0.4282858371734619, "learning_rate": 1.3642655304579535e-05, "loss": 0.7513792514801025, "step": 786 }, { "epoch": 1.7472283813747227, "grad_norm": 0.3827633857727051, "learning_rate": 1.3611081274580269e-05, "loss": 0.6845064759254456, "step": 788 }, { "epoch": 1.7516629711751663, "grad_norm": 1.2396421432495117, "learning_rate": 1.3579472115651623e-05, "loss": 0.6268539428710938, "step": 790 }, { "epoch": 1.7560975609756098, "grad_norm": 0.40521490573883057, "learning_rate": 1.354782825305646e-05, "loss": 0.6478447914123535, "step": 792 }, { "epoch": 1.7605321507760532, "grad_norm": 0.32460105419158936, "learning_rate": 1.3516150112524542e-05, "loss": 0.8190337419509888, "step": 794 }, { "epoch": 1.7649667405764968, "grad_norm": 0.8050366640090942, "learning_rate": 1.3484438120246806e-05, "loss": 0.8022271394729614, "step": 796 }, { "epoch": 1.7694013303769403, "grad_norm": 0.4470427930355072, "learning_rate": 1.3452692702869619e-05, "loss": 0.9513342380523682, "step": 798 }, { "epoch": 1.7738359201773837, "grad_norm": 0.43522873520851135, "learning_rate": 1.3420914287489037e-05, "loss": 0.9605931043624878, "step": 800 }, { "epoch": 1.778270509977827, "grad_norm": 0.6569511890411377, "learning_rate": 1.3389103301645065e-05, "loss": 0.9895227551460266, "step": 802 }, { "epoch": 1.7827050997782705, "grad_norm": 0.5629826188087463, "learning_rate": 1.3357260173315918e-05, "loss": 1.1033282279968262, "step": 804 }, { "epoch": 1.787139689578714, "grad_norm": 0.4114173352718353, "learning_rate": 1.332538533091223e-05, "loss": 0.74909508228302, "step": 806 }, { "epoch": 1.7915742793791574, "grad_norm": 0.39374831318855286, "learning_rate": 1.3293479203271322e-05, "loss": 0.9650196433067322, "step": 808 }, { "epoch": 1.7960088691796008, "grad_norm": 1.316881537437439, "learning_rate": 1.3261542219651415e-05, "loss": 0.5823323130607605, "step": 810 }, { "epoch": 1.8004434589800442, "grad_norm": 0.8751013278961182, "learning_rate": 1.3229574809725859e-05, "loss": 0.5940043926239014, "step": 812 }, { "epoch": 1.8048780487804879, "grad_norm": 1.2625625133514404, "learning_rate": 1.3197577403577355e-05, "loss": 0.9879517555236816, "step": 814 }, { "epoch": 1.8093126385809313, "grad_norm": 2.798226833343506, "learning_rate": 1.3165550431692164e-05, "loss": 0.8953067064285278, "step": 816 }, { "epoch": 1.8137472283813747, "grad_norm": 0.4607974588871002, "learning_rate": 1.3133494324954328e-05, "loss": 0.4630458652973175, "step": 818 }, { "epoch": 1.8181818181818183, "grad_norm": 0.7473888993263245, "learning_rate": 1.3101409514639847e-05, "loss": 1.0197738409042358, "step": 820 }, { "epoch": 1.8226164079822618, "grad_norm": 0.7188895344734192, "learning_rate": 1.3069296432410905e-05, "loss": 1.0835227966308594, "step": 822 }, { "epoch": 1.8270509977827052, "grad_norm": 0.7948015928268433, "learning_rate": 1.3037155510310047e-05, "loss": 1.1620758771896362, "step": 824 }, { "epoch": 1.8314855875831486, "grad_norm": 2.9718968868255615, "learning_rate": 1.3004987180754367e-05, "loss": 0.9052017331123352, "step": 826 }, { "epoch": 1.835920177383592, "grad_norm": 2.999119281768799, "learning_rate": 1.29727918765297e-05, "loss": 0.8258069753646851, "step": 828 }, { "epoch": 1.8403547671840355, "grad_norm": 0.6131216287612915, "learning_rate": 1.2940570030784783e-05, "loss": 0.9284101128578186, "step": 830 }, { "epoch": 1.8447893569844789, "grad_norm": 1.4488681554794312, "learning_rate": 1.290832207702544e-05, "loss": 0.9328111410140991, "step": 832 }, { "epoch": 1.8492239467849223, "grad_norm": 0.4498242139816284, "learning_rate": 1.2876048449108756e-05, "loss": 0.9122157096862793, "step": 834 }, { "epoch": 1.8536585365853657, "grad_norm": 0.4527730643749237, "learning_rate": 1.2843749581237216e-05, "loss": 0.951221227645874, "step": 836 }, { "epoch": 1.8580931263858091, "grad_norm": 0.5404245257377625, "learning_rate": 1.2811425907952887e-05, "loss": 0.904753565788269, "step": 838 }, { "epoch": 1.8625277161862528, "grad_norm": 0.6924629807472229, "learning_rate": 1.2779077864131566e-05, "loss": 1.0605340003967285, "step": 840 }, { "epoch": 1.8669623059866962, "grad_norm": 0.4970324635505676, "learning_rate": 1.274670588497691e-05, "loss": 0.5903202295303345, "step": 842 }, { "epoch": 1.8713968957871396, "grad_norm": 0.793752133846283, "learning_rate": 1.2714310406014613e-05, "loss": 0.7120020389556885, "step": 844 }, { "epoch": 1.8758314855875833, "grad_norm": 0.8532220721244812, "learning_rate": 1.2681891863086526e-05, "loss": 0.7570974230766296, "step": 846 }, { "epoch": 1.8802660753880267, "grad_norm": 0.6667500734329224, "learning_rate": 1.2649450692344798e-05, "loss": 1.010290265083313, "step": 848 }, { "epoch": 1.8847006651884701, "grad_norm": 0.5184866786003113, "learning_rate": 1.2616987330246e-05, "loss": 0.9949779510498047, "step": 850 }, { "epoch": 1.8891352549889135, "grad_norm": 0.41842129826545715, "learning_rate": 1.2584502213545273e-05, "loss": 0.6566750407218933, "step": 852 }, { "epoch": 1.893569844789357, "grad_norm": 0.47411566972732544, "learning_rate": 1.2551995779290431e-05, "loss": 0.9789588451385498, "step": 854 }, { "epoch": 1.8980044345898004, "grad_norm": 0.41428887844085693, "learning_rate": 1.2519468464816094e-05, "loss": 0.8622305989265442, "step": 856 }, { "epoch": 1.9024390243902438, "grad_norm": 0.5540589094161987, "learning_rate": 1.2486920707737795e-05, "loss": 0.7378232479095459, "step": 858 }, { "epoch": 1.9068736141906872, "grad_norm": 0.9826019406318665, "learning_rate": 1.2454352945946105e-05, "loss": 0.7468891143798828, "step": 860 }, { "epoch": 1.9113082039911307, "grad_norm": 1.3631356954574585, "learning_rate": 1.2421765617600732e-05, "loss": 0.9804845452308655, "step": 862 }, { "epoch": 1.9157427937915743, "grad_norm": 0.6108648777008057, "learning_rate": 1.238915916112462e-05, "loss": 0.7339483499526978, "step": 864 }, { "epoch": 1.9201773835920177, "grad_norm": 1.0804190635681152, "learning_rate": 1.2356534015198067e-05, "loss": 0.6702901721000671, "step": 866 }, { "epoch": 1.9246119733924612, "grad_norm": 0.8905138373374939, "learning_rate": 1.2323890618752818e-05, "loss": 1.140580415725708, "step": 868 }, { "epoch": 1.9290465631929048, "grad_norm": 0.4676206409931183, "learning_rate": 1.229122941096615e-05, "loss": 0.9294151663780212, "step": 870 }, { "epoch": 1.9334811529933482, "grad_norm": 0.30312380194664, "learning_rate": 1.225855083125497e-05, "loss": 0.6089338660240173, "step": 872 }, { "epoch": 1.9379157427937916, "grad_norm": 0.8847364783287048, "learning_rate": 1.22258553192699e-05, "loss": 0.645588219165802, "step": 874 }, { "epoch": 1.942350332594235, "grad_norm": 0.6345183253288269, "learning_rate": 1.219314331488938e-05, "loss": 0.6743212938308716, "step": 876 }, { "epoch": 1.9467849223946785, "grad_norm": 1.4533907175064087, "learning_rate": 1.2160415258213719e-05, "loss": 0.8229029774665833, "step": 878 }, { "epoch": 1.951219512195122, "grad_norm": 0.656122624874115, "learning_rate": 1.2127671589559195e-05, "loss": 0.8455672860145569, "step": 880 }, { "epoch": 1.9556541019955653, "grad_norm": 1.9663106203079224, "learning_rate": 1.2094912749452134e-05, "loss": 0.6619812846183777, "step": 882 }, { "epoch": 1.9600886917960088, "grad_norm": 0.43535202741622925, "learning_rate": 1.2062139178622963e-05, "loss": 0.81618332862854, "step": 884 }, { "epoch": 1.9645232815964522, "grad_norm": 0.42277711629867554, "learning_rate": 1.20293513180003e-05, "loss": 0.9992027878761292, "step": 886 }, { "epoch": 1.9689578713968958, "grad_norm": 0.40196138620376587, "learning_rate": 1.199654960870502e-05, "loss": 0.9606343507766724, "step": 888 }, { "epoch": 1.9733924611973392, "grad_norm": 0.42394229769706726, "learning_rate": 1.1963734492044299e-05, "loss": 0.9592314958572388, "step": 890 }, { "epoch": 1.9778270509977827, "grad_norm": 0.549923300743103, "learning_rate": 1.193090640950571e-05, "loss": 1.0462260246276855, "step": 892 }, { "epoch": 1.9822616407982263, "grad_norm": 0.6976901292800903, "learning_rate": 1.1898065802751254e-05, "loss": 0.9654414653778076, "step": 894 }, { "epoch": 1.9866962305986697, "grad_norm": 1.4644861221313477, "learning_rate": 1.1865213113611438e-05, "loss": 0.8772508502006531, "step": 896 }, { "epoch": 1.9911308203991132, "grad_norm": 0.6265084147453308, "learning_rate": 1.1832348784079319e-05, "loss": 0.9136525988578796, "step": 898 }, { "epoch": 1.9955654101995566, "grad_norm": 0.4937969148159027, "learning_rate": 1.1799473256304567e-05, "loss": 0.7895318269729614, "step": 900 }, { "epoch": 2.0, "grad_norm": 0.5102665424346924, "learning_rate": 1.17665869725875e-05, "loss": 0.9466162919998169, "step": 902 }, { "epoch": 2.0044345898004434, "grad_norm": 0.4070099890232086, "learning_rate": 1.1733690375373147e-05, "loss": 0.715006411075592, "step": 904 }, { "epoch": 2.008869179600887, "grad_norm": 0.5904584527015686, "learning_rate": 1.1700783907245304e-05, "loss": 0.6284165978431702, "step": 906 }, { "epoch": 2.0133037694013303, "grad_norm": 0.4084486961364746, "learning_rate": 1.1667868010920555e-05, "loss": 0.4244351387023926, "step": 908 }, { "epoch": 2.0177383592017737, "grad_norm": 0.8332369923591614, "learning_rate": 1.1634943129242337e-05, "loss": 0.5955982208251953, "step": 910 }, { "epoch": 2.022172949002217, "grad_norm": 0.8778854012489319, "learning_rate": 1.160200970517497e-05, "loss": 0.50541752576828, "step": 912 }, { "epoch": 2.0266075388026605, "grad_norm": 4.370595932006836, "learning_rate": 1.1569068181797699e-05, "loss": 0.5145138502120972, "step": 914 }, { "epoch": 2.0310421286031044, "grad_norm": 1.4196687936782837, "learning_rate": 1.1536119002298737e-05, "loss": 0.47636979818344116, "step": 916 }, { "epoch": 2.035476718403548, "grad_norm": 0.7198065519332886, "learning_rate": 1.1503162609969314e-05, "loss": 0.5563622713088989, "step": 918 }, { "epoch": 2.0399113082039912, "grad_norm": 0.43456801772117615, "learning_rate": 1.1470199448197677e-05, "loss": 0.5351572632789612, "step": 920 }, { "epoch": 2.0443458980044347, "grad_norm": 0.5137150287628174, "learning_rate": 1.1437229960463163e-05, "loss": 0.5629701614379883, "step": 922 }, { "epoch": 2.048780487804878, "grad_norm": 0.3429313004016876, "learning_rate": 1.1404254590330213e-05, "loss": 0.15150287747383118, "step": 924 }, { "epoch": 2.0532150776053215, "grad_norm": 0.5494690537452698, "learning_rate": 1.137127378144241e-05, "loss": 0.5665069222450256, "step": 926 }, { "epoch": 2.057649667405765, "grad_norm": 1.4760738611221313, "learning_rate": 1.1338287977516507e-05, "loss": 0.23657920956611633, "step": 928 }, { "epoch": 2.0620842572062084, "grad_norm": 0.3918812870979309, "learning_rate": 1.1305297622336457e-05, "loss": 0.3985291123390198, "step": 930 }, { "epoch": 2.066518847006652, "grad_norm": 3.808762788772583, "learning_rate": 1.1272303159747451e-05, "loss": 0.46506452560424805, "step": 932 }, { "epoch": 2.070953436807095, "grad_norm": 0.577021062374115, "learning_rate": 1.1239305033649934e-05, "loss": 0.5112553834915161, "step": 934 }, { "epoch": 2.0753880266075386, "grad_norm": 0.7988712787628174, "learning_rate": 1.1206303687993644e-05, "loss": 0.7404617071151733, "step": 936 }, { "epoch": 2.079822616407982, "grad_norm": 0.4242592751979828, "learning_rate": 1.1173299566771626e-05, "loss": 0.33282893896102905, "step": 938 }, { "epoch": 2.084257206208426, "grad_norm": 0.46631020307540894, "learning_rate": 1.1140293114014282e-05, "loss": 0.4563349485397339, "step": 940 }, { "epoch": 2.0886917960088693, "grad_norm": 1.1207689046859741, "learning_rate": 1.1107284773783367e-05, "loss": 0.5358268022537231, "step": 942 }, { "epoch": 2.0931263858093128, "grad_norm": 0.6466286182403564, "learning_rate": 1.1074274990166036e-05, "loss": 0.406946063041687, "step": 944 }, { "epoch": 2.097560975609756, "grad_norm": 0.6163548827171326, "learning_rate": 1.1041264207268861e-05, "loss": 0.5453028678894043, "step": 946 }, { "epoch": 2.1019955654101996, "grad_norm": 0.7833722233772278, "learning_rate": 1.1008252869211864e-05, "loss": 0.5683756470680237, "step": 948 }, { "epoch": 2.106430155210643, "grad_norm": 0.8931224942207336, "learning_rate": 1.0975241420122524e-05, "loss": 0.4366806149482727, "step": 950 }, { "epoch": 2.1108647450110865, "grad_norm": 0.5928601026535034, "learning_rate": 1.0942230304129831e-05, "loss": 0.4392179846763611, "step": 952 }, { "epoch": 2.11529933481153, "grad_norm": 1.2183982133865356, "learning_rate": 1.0909219965358275e-05, "loss": 0.49065983295440674, "step": 954 }, { "epoch": 2.1197339246119733, "grad_norm": 0.6264125108718872, "learning_rate": 1.0876210847921905e-05, "loss": 0.5899641513824463, "step": 956 }, { "epoch": 2.1241685144124167, "grad_norm": 0.6409426927566528, "learning_rate": 1.0843203395918327e-05, "loss": 0.4045730531215668, "step": 958 }, { "epoch": 2.12860310421286, "grad_norm": 1.60128653049469, "learning_rate": 1.0810198053422747e-05, "loss": 0.22457213699817657, "step": 960 }, { "epoch": 2.1330376940133036, "grad_norm": 1.197357177734375, "learning_rate": 1.0777195264481988e-05, "loss": 0.3387850224971771, "step": 962 }, { "epoch": 2.1374722838137474, "grad_norm": 0.8524078130722046, "learning_rate": 1.0744195473108522e-05, "loss": 0.44860363006591797, "step": 964 }, { "epoch": 2.141906873614191, "grad_norm": 0.548141598701477, "learning_rate": 1.071119912327448e-05, "loss": 0.7017822861671448, "step": 966 }, { "epoch": 2.1463414634146343, "grad_norm": 0.4515199363231659, "learning_rate": 1.0678206658905712e-05, "loss": 0.3781665563583374, "step": 968 }, { "epoch": 2.1507760532150777, "grad_norm": 0.6646062731742859, "learning_rate": 1.0645218523875773e-05, "loss": 0.51128089427948, "step": 970 }, { "epoch": 2.155210643015521, "grad_norm": 0.5504773855209351, "learning_rate": 1.0612235161999987e-05, "loss": 0.3802485764026642, "step": 972 }, { "epoch": 2.1596452328159645, "grad_norm": 0.527137279510498, "learning_rate": 1.057925701702945e-05, "loss": 0.6255434155464172, "step": 974 }, { "epoch": 2.164079822616408, "grad_norm": 0.8251080513000488, "learning_rate": 1.0546284532645077e-05, "loss": 0.49471452832221985, "step": 976 }, { "epoch": 2.1685144124168514, "grad_norm": 1.3507685661315918, "learning_rate": 1.0513318152451627e-05, "loss": 0.3210045397281647, "step": 978 }, { "epoch": 2.172949002217295, "grad_norm": 0.6633515357971191, "learning_rate": 1.0480358319971731e-05, "loss": 0.6007053852081299, "step": 980 }, { "epoch": 2.1773835920177382, "grad_norm": 0.36952632665634155, "learning_rate": 1.0447405478639929e-05, "loss": 0.2838934361934662, "step": 982 }, { "epoch": 2.1818181818181817, "grad_norm": 0.5688261985778809, "learning_rate": 1.0414460071796712e-05, "loss": 0.18350011110305786, "step": 984 }, { "epoch": 2.186252771618625, "grad_norm": 1.1831949949264526, "learning_rate": 1.0381522542682536e-05, "loss": 0.40068039298057556, "step": 986 }, { "epoch": 2.1906873614190685, "grad_norm": 1.4388840198516846, "learning_rate": 1.0348593334431878e-05, "loss": 0.23880073428153992, "step": 988 }, { "epoch": 2.1951219512195124, "grad_norm": 0.6307854652404785, "learning_rate": 1.0315672890067271e-05, "loss": 0.5894753932952881, "step": 990 }, { "epoch": 2.199556541019956, "grad_norm": 2.421830415725708, "learning_rate": 1.0282761652493334e-05, "loss": 0.4432171583175659, "step": 992 }, { "epoch": 2.203991130820399, "grad_norm": 0.5128687620162964, "learning_rate": 1.024986006449083e-05, "loss": 0.48450496792793274, "step": 994 }, { "epoch": 2.2084257206208426, "grad_norm": 0.5676178932189941, "learning_rate": 1.0216968568710679e-05, "loss": 0.5746522545814514, "step": 996 }, { "epoch": 2.212860310421286, "grad_norm": 0.5976463556289673, "learning_rate": 1.0184087607668039e-05, "loss": 0.5264995694160461, "step": 998 }, { "epoch": 2.2172949002217295, "grad_norm": 0.7051799297332764, "learning_rate": 1.0151217623736338e-05, "loss": 0.46825850009918213, "step": 1000 }, { "epoch": 2.221729490022173, "grad_norm": 0.8515892624855042, "learning_rate": 1.0118359059141313e-05, "loss": 0.27047228813171387, "step": 1002 }, { "epoch": 2.2261640798226163, "grad_norm": 0.4068000316619873, "learning_rate": 1.0085512355955067e-05, "loss": 0.5676589608192444, "step": 1004 }, { "epoch": 2.2305986696230597, "grad_norm": 0.8601819276809692, "learning_rate": 1.0052677956090125e-05, "loss": 0.46005040407180786, "step": 1006 }, { "epoch": 2.235033259423503, "grad_norm": 0.7253012657165527, "learning_rate": 1.0019856301293482e-05, "loss": 0.5689443945884705, "step": 1008 }, { "epoch": 2.2394678492239466, "grad_norm": 0.46540704369544983, "learning_rate": 9.987047833140668e-06, "loss": 0.3451939523220062, "step": 1010 }, { "epoch": 2.2439024390243905, "grad_norm": 2.0232350826263428, "learning_rate": 9.954252993029803e-06, "loss": 0.5826783776283264, "step": 1012 }, { "epoch": 2.248337028824834, "grad_norm": 3.809951066970825, "learning_rate": 9.921472222175654e-06, "loss": 0.5647210478782654, "step": 1014 }, { "epoch": 2.2527716186252773, "grad_norm": 1.0120117664337158, "learning_rate": 9.888705961603709e-06, "loss": 0.6450280547142029, "step": 1016 }, { "epoch": 2.2572062084257207, "grad_norm": 0.6231004595756531, "learning_rate": 9.85595465214423e-06, "loss": 0.24749194085597992, "step": 1018 }, { "epoch": 2.261640798226164, "grad_norm": 0.5251925587654114, "learning_rate": 9.823218734426336e-06, "loss": 0.5488971471786499, "step": 1020 }, { "epoch": 2.2660753880266076, "grad_norm": 0.22870703041553497, "learning_rate": 9.79049864887207e-06, "loss": 0.39323848485946655, "step": 1022 }, { "epoch": 2.270509977827051, "grad_norm": 1.2425155639648438, "learning_rate": 9.757794835690463e-06, "loss": 0.8195447325706482, "step": 1024 }, { "epoch": 2.2749445676274944, "grad_norm": 1.2200350761413574, "learning_rate": 9.72510773487164e-06, "loss": 0.39812397956848145, "step": 1026 }, { "epoch": 2.279379157427938, "grad_norm": 0.5721977353096008, "learning_rate": 9.692437786180852e-06, "loss": 0.5707634687423706, "step": 1028 }, { "epoch": 2.2838137472283813, "grad_norm": 0.47224897146224976, "learning_rate": 9.659785429152615e-06, "loss": 0.6199125051498413, "step": 1030 }, { "epoch": 2.2882483370288247, "grad_norm": 1.0250192880630493, "learning_rate": 9.627151103084763e-06, "loss": 0.41856324672698975, "step": 1032 }, { "epoch": 2.292682926829268, "grad_norm": 0.947811484336853, "learning_rate": 9.594535247032543e-06, "loss": 0.32791462540626526, "step": 1034 }, { "epoch": 2.2971175166297115, "grad_norm": 0.6266341805458069, "learning_rate": 9.561938299802709e-06, "loss": 0.5352550745010376, "step": 1036 }, { "epoch": 2.3015521064301554, "grad_norm": 4.217014789581299, "learning_rate": 9.529360699947624e-06, "loss": 0.6385710835456848, "step": 1038 }, { "epoch": 2.305986696230599, "grad_norm": 0.8212743401527405, "learning_rate": 9.496802885759349e-06, "loss": 0.4557139277458191, "step": 1040 }, { "epoch": 2.3104212860310422, "grad_norm": 1.0060659646987915, "learning_rate": 9.464265295263762e-06, "loss": 0.7039799690246582, "step": 1042 }, { "epoch": 2.3148558758314857, "grad_norm": 12.946681022644043, "learning_rate": 9.431748366214648e-06, "loss": 0.4291222095489502, "step": 1044 }, { "epoch": 2.319290465631929, "grad_norm": 0.5580220222473145, "learning_rate": 9.399252536087822e-06, "loss": 0.6024729013442993, "step": 1046 }, { "epoch": 2.3237250554323725, "grad_norm": 0.607992947101593, "learning_rate": 9.366778242075236e-06, "loss": 0.5440095663070679, "step": 1048 }, { "epoch": 2.328159645232816, "grad_norm": 0.6783135533332825, "learning_rate": 9.334325921079104e-06, "loss": 0.6058806777000427, "step": 1050 }, { "epoch": 2.3325942350332594, "grad_norm": 0.6938934922218323, "learning_rate": 9.301896009706012e-06, "loss": 0.4494543671607971, "step": 1052 }, { "epoch": 2.337028824833703, "grad_norm": 0.477782279253006, "learning_rate": 9.269488944261058e-06, "loss": 0.4361210763454437, "step": 1054 }, { "epoch": 2.341463414634146, "grad_norm": 0.5728092193603516, "learning_rate": 9.237105160741976e-06, "loss": 0.5449360609054565, "step": 1056 }, { "epoch": 2.3458980044345896, "grad_norm": 0.18092034757137299, "learning_rate": 9.204745094833265e-06, "loss": 0.3745296895503998, "step": 1058 }, { "epoch": 2.3503325942350335, "grad_norm": 0.5357985496520996, "learning_rate": 9.172409181900337e-06, "loss": 0.6852684020996094, "step": 1060 }, { "epoch": 2.354767184035477, "grad_norm": 0.790863037109375, "learning_rate": 9.140097856983647e-06, "loss": 0.2813524603843689, "step": 1062 }, { "epoch": 2.3592017738359203, "grad_norm": 0.2192503809928894, "learning_rate": 9.107811554792863e-06, "loss": 0.3573903739452362, "step": 1064 }, { "epoch": 2.3636363636363638, "grad_norm": 1.4538520574569702, "learning_rate": 9.075550709700992e-06, "loss": 0.5834711790084839, "step": 1066 }, { "epoch": 2.368070953436807, "grad_norm": 0.641722559928894, "learning_rate": 9.043315755738545e-06, "loss": 0.5266854763031006, "step": 1068 }, { "epoch": 2.3725055432372506, "grad_norm": 0.6017807126045227, "learning_rate": 9.011107126587705e-06, "loss": 0.5866771936416626, "step": 1070 }, { "epoch": 2.376940133037694, "grad_norm": 0.707431435585022, "learning_rate": 8.978925255576484e-06, "loss": 0.4829937517642975, "step": 1072 }, { "epoch": 2.3813747228381374, "grad_norm": 0.2395654022693634, "learning_rate": 8.946770575672897e-06, "loss": 0.04968187212944031, "step": 1074 }, { "epoch": 2.385809312638581, "grad_norm": 0.5818225741386414, "learning_rate": 8.914643519479134e-06, "loss": 0.3766881227493286, "step": 1076 }, { "epoch": 2.3902439024390243, "grad_norm": 0.4298112094402313, "learning_rate": 8.882544519225737e-06, "loss": 0.1799193024635315, "step": 1078 }, { "epoch": 2.3946784922394677, "grad_norm": 1.3011754751205444, "learning_rate": 8.850474006765806e-06, "loss": 0.5404252409934998, "step": 1080 }, { "epoch": 2.399113082039911, "grad_norm": 0.6072801351547241, "learning_rate": 8.818432413569153e-06, "loss": 0.42710888385772705, "step": 1082 }, { "epoch": 2.4035476718403546, "grad_norm": 0.8172256350517273, "learning_rate": 8.78642017071653e-06, "loss": 0.4754990339279175, "step": 1084 }, { "epoch": 2.4079822616407984, "grad_norm": 0.4423505961894989, "learning_rate": 8.754437708893803e-06, "loss": 0.5498704314231873, "step": 1086 }, { "epoch": 2.412416851441242, "grad_norm": 0.292689710855484, "learning_rate": 8.722485458386183e-06, "loss": 0.14969071745872498, "step": 1088 }, { "epoch": 2.4168514412416853, "grad_norm": 0.5658117532730103, "learning_rate": 8.690563849072416e-06, "loss": 0.593338131904602, "step": 1090 }, { "epoch": 2.4212860310421287, "grad_norm": 1.8885061740875244, "learning_rate": 8.65867331041901e-06, "loss": 0.3968830704689026, "step": 1092 }, { "epoch": 2.425720620842572, "grad_norm": 1.8343939781188965, "learning_rate": 8.62681427147446e-06, "loss": 0.28023120760917664, "step": 1094 }, { "epoch": 2.4301552106430155, "grad_norm": 1.2832564115524292, "learning_rate": 8.594987160863464e-06, "loss": 0.3517853617668152, "step": 1096 }, { "epoch": 2.434589800443459, "grad_norm": 0.32917505502700806, "learning_rate": 8.563192406781164e-06, "loss": 0.3207606077194214, "step": 1098 }, { "epoch": 2.4390243902439024, "grad_norm": 0.9043774008750916, "learning_rate": 8.53143043698739e-06, "loss": 0.4255558252334595, "step": 1100 }, { "epoch": 2.443458980044346, "grad_norm": 0.5287153124809265, "learning_rate": 8.499701678800891e-06, "loss": 0.6775237917900085, "step": 1102 }, { "epoch": 2.4478935698447892, "grad_norm": 1.211562991142273, "learning_rate": 8.4680065590936e-06, "loss": 0.28972724080085754, "step": 1104 }, { "epoch": 2.4523281596452327, "grad_norm": 0.5662131309509277, "learning_rate": 8.436345504284884e-06, "loss": 0.685287594795227, "step": 1106 }, { "epoch": 2.4567627494456765, "grad_norm": 1.0978025197982788, "learning_rate": 8.404718940335805e-06, "loss": 0.647050142288208, "step": 1108 }, { "epoch": 2.4611973392461195, "grad_norm": 0.48306140303611755, "learning_rate": 8.373127292743392e-06, "loss": 0.7415695190429688, "step": 1110 }, { "epoch": 2.4656319290465634, "grad_norm": 0.4147641360759735, "learning_rate": 8.341570986534926e-06, "loss": 0.47963038086891174, "step": 1112 }, { "epoch": 2.470066518847007, "grad_norm": 0.6168814301490784, "learning_rate": 8.310050446262204e-06, "loss": 0.5705453157424927, "step": 1114 }, { "epoch": 2.47450110864745, "grad_norm": 0.8609782457351685, "learning_rate": 8.278566095995837e-06, "loss": 0.24776363372802734, "step": 1116 }, { "epoch": 2.4789356984478936, "grad_norm": 0.41248947381973267, "learning_rate": 8.247118359319542e-06, "loss": 0.573097825050354, "step": 1118 }, { "epoch": 2.483370288248337, "grad_norm": 0.5210030674934387, "learning_rate": 8.215707659324448e-06, "loss": 0.45975643396377563, "step": 1120 }, { "epoch": 2.4878048780487805, "grad_norm": 0.48813074827194214, "learning_rate": 8.1843344186034e-06, "loss": 0.5684525370597839, "step": 1122 }, { "epoch": 2.492239467849224, "grad_norm": 1.437232494354248, "learning_rate": 8.152999059245273e-06, "loss": 0.6159149408340454, "step": 1124 }, { "epoch": 2.4966740576496673, "grad_norm": 0.6437961459159851, "learning_rate": 8.121702002829291e-06, "loss": 0.6514344811439514, "step": 1126 }, { "epoch": 2.5011086474501107, "grad_norm": 0.4339181184768677, "learning_rate": 8.090443670419368e-06, "loss": 0.3893609642982483, "step": 1128 }, { "epoch": 2.505543237250554, "grad_norm": 0.9250460863113403, "learning_rate": 8.05922448255842e-06, "loss": 0.5106027722358704, "step": 1130 }, { "epoch": 2.5099778270509976, "grad_norm": 0.7213279008865356, "learning_rate": 8.028044859262736e-06, "loss": 0.5997860431671143, "step": 1132 }, { "epoch": 2.5144124168514415, "grad_norm": 0.5925162434577942, "learning_rate": 7.996905220016295e-06, "loss": 0.37115636467933655, "step": 1134 }, { "epoch": 2.5188470066518844, "grad_norm": 0.4195973575115204, "learning_rate": 7.965805983765156e-06, "loss": 0.6658072471618652, "step": 1136 }, { "epoch": 2.5232815964523283, "grad_norm": 0.3894807994365692, "learning_rate": 7.934747568911792e-06, "loss": 0.48177286982536316, "step": 1138 }, { "epoch": 2.5277161862527717, "grad_norm": 0.4482039213180542, "learning_rate": 7.903730393309475e-06, "loss": 0.5770375728607178, "step": 1140 }, { "epoch": 2.532150776053215, "grad_norm": 1.4334046840667725, "learning_rate": 7.872754874256658e-06, "loss": 0.37715059518814087, "step": 1142 }, { "epoch": 2.5365853658536586, "grad_norm": 0.18956467509269714, "learning_rate": 7.841821428491358e-06, "loss": 0.3323401212692261, "step": 1144 }, { "epoch": 2.541019955654102, "grad_norm": 0.9211217761039734, "learning_rate": 7.810930472185542e-06, "loss": 0.7031457424163818, "step": 1146 }, { "epoch": 2.5454545454545454, "grad_norm": 0.5245496034622192, "learning_rate": 7.78008242093953e-06, "loss": 0.6004937887191772, "step": 1148 }, { "epoch": 2.549889135254989, "grad_norm": 0.4028185307979584, "learning_rate": 7.749277689776411e-06, "loss": 0.496783971786499, "step": 1150 }, { "epoch": 2.5543237250554323, "grad_norm": 0.5988771915435791, "learning_rate": 7.718516693136455e-06, "loss": 0.38715416193008423, "step": 1152 }, { "epoch": 2.5587583148558757, "grad_norm": 0.1802368313074112, "learning_rate": 7.687799844871534e-06, "loss": 0.14051398634910583, "step": 1154 }, { "epoch": 2.5631929046563195, "grad_norm": 0.41661515831947327, "learning_rate": 7.657127558239563e-06, "loss": 0.3350878059864044, "step": 1156 }, { "epoch": 2.5676274944567625, "grad_norm": 1.085957646369934, "learning_rate": 7.626500245898927e-06, "loss": 0.3848508596420288, "step": 1158 }, { "epoch": 2.5720620842572064, "grad_norm": 0.8385711908340454, "learning_rate": 7.595918319902939e-06, "loss": 0.26338139176368713, "step": 1160 }, { "epoch": 2.57649667405765, "grad_norm": 0.716660737991333, "learning_rate": 7.565382191694302e-06, "loss": 0.6448018550872803, "step": 1162 }, { "epoch": 2.5809312638580932, "grad_norm": 0.6266429424285889, "learning_rate": 7.53489227209955e-06, "loss": 0.7049829363822937, "step": 1164 }, { "epoch": 2.5853658536585367, "grad_norm": 0.4636557996273041, "learning_rate": 7.50444897132355e-06, "loss": 0.38826262950897217, "step": 1166 }, { "epoch": 2.58980044345898, "grad_norm": 0.44733473658561707, "learning_rate": 7.474052698943961e-06, "loss": 0.5173879265785217, "step": 1168 }, { "epoch": 2.5942350332594235, "grad_norm": 0.5354277491569519, "learning_rate": 7.443703863905738e-06, "loss": 0.5096431374549866, "step": 1170 }, { "epoch": 2.598669623059867, "grad_norm": 1.4914095401763916, "learning_rate": 7.413402874515616e-06, "loss": 0.21273551881313324, "step": 1172 }, { "epoch": 2.6031042128603104, "grad_norm": 0.40523943305015564, "learning_rate": 7.383150138436628e-06, "loss": 0.49439945816993713, "step": 1174 }, { "epoch": 2.6075388026607538, "grad_norm": 0.5631287693977356, "learning_rate": 7.352946062682626e-06, "loss": 0.49207258224487305, "step": 1176 }, { "epoch": 2.611973392461197, "grad_norm": 0.5385340452194214, "learning_rate": 7.32279105361279e-06, "loss": 0.3323192000389099, "step": 1178 }, { "epoch": 2.6164079822616406, "grad_norm": 1.356742024421692, "learning_rate": 7.292685516926161e-06, "loss": 0.5721710324287415, "step": 1180 }, { "epoch": 2.6208425720620845, "grad_norm": 0.4816894829273224, "learning_rate": 7.262629857656198e-06, "loss": 0.5175535082817078, "step": 1182 }, { "epoch": 2.6252771618625275, "grad_norm": 0.4633226990699768, "learning_rate": 7.232624480165318e-06, "loss": 0.6447592973709106, "step": 1184 }, { "epoch": 2.6297117516629713, "grad_norm": 0.6813458800315857, "learning_rate": 7.202669788139456e-06, "loss": 0.5713311433792114, "step": 1186 }, { "epoch": 2.6341463414634148, "grad_norm": 2.180230140686035, "learning_rate": 7.172766184582629e-06, "loss": 0.6713429093360901, "step": 1188 }, { "epoch": 2.638580931263858, "grad_norm": 0.5426626801490784, "learning_rate": 7.142914071811535e-06, "loss": 0.37241318821907043, "step": 1190 }, { "epoch": 2.6430155210643016, "grad_norm": 1.2816245555877686, "learning_rate": 7.113113851450122e-06, "loss": 0.49532002210617065, "step": 1192 }, { "epoch": 2.647450110864745, "grad_norm": 1.509843111038208, "learning_rate": 7.083365924424175e-06, "loss": 0.40875858068466187, "step": 1194 }, { "epoch": 2.6518847006651884, "grad_norm": 0.5089661478996277, "learning_rate": 7.053670690955956e-06, "loss": 0.4947509467601776, "step": 1196 }, { "epoch": 2.656319290465632, "grad_norm": 0.381073921918869, "learning_rate": 7.024028550558781e-06, "loss": 0.2214895784854889, "step": 1198 }, { "epoch": 2.6607538802660753, "grad_norm": 0.521045446395874, "learning_rate": 6.994439902031679e-06, "loss": 0.6109291911125183, "step": 1200 }, { "epoch": 2.6651884700665187, "grad_norm": 1.0478029251098633, "learning_rate": 6.964905143453995e-06, "loss": 0.6086549162864685, "step": 1202 }, { "epoch": 2.6696230598669626, "grad_norm": 0.4481736719608307, "learning_rate": 6.9354246721800685e-06, "loss": 0.29336196184158325, "step": 1204 }, { "epoch": 2.6740576496674056, "grad_norm": 1.675062894821167, "learning_rate": 6.9059988848338466e-06, "loss": 0.48426881432533264, "step": 1206 }, { "epoch": 2.6784922394678494, "grad_norm": 0.8172009587287903, "learning_rate": 6.8766281773035906e-06, "loss": 0.4322719871997833, "step": 1208 }, { "epoch": 2.682926829268293, "grad_norm": 0.5452362298965454, "learning_rate": 6.847312944736524e-06, "loss": 0.3221188187599182, "step": 1210 }, { "epoch": 2.6873614190687363, "grad_norm": 1.4369398355484009, "learning_rate": 6.818053581533512e-06, "loss": 0.20389345288276672, "step": 1212 }, { "epoch": 2.6917960088691797, "grad_norm": 0.5867207050323486, "learning_rate": 6.788850481343782e-06, "loss": 0.42180705070495605, "step": 1214 }, { "epoch": 2.696230598669623, "grad_norm": 2.326925754547119, "learning_rate": 6.759704037059598e-06, "loss": 0.36190155148506165, "step": 1216 }, { "epoch": 2.7006651884700665, "grad_norm": 1.7214257717132568, "learning_rate": 6.7306146408109885e-06, "loss": 0.34991076588630676, "step": 1218 }, { "epoch": 2.70509977827051, "grad_norm": 0.5046329498291016, "learning_rate": 6.701582683960481e-06, "loss": 0.6116279363632202, "step": 1220 }, { "epoch": 2.7095343680709534, "grad_norm": 0.8512217998504639, "learning_rate": 6.672608557097806e-06, "loss": 0.37688618898391724, "step": 1222 }, { "epoch": 2.713968957871397, "grad_norm": 1.6093370914459229, "learning_rate": 6.643692650034684e-06, "loss": 0.7054269909858704, "step": 1224 }, { "epoch": 2.7184035476718402, "grad_norm": 3.110217809677124, "learning_rate": 6.614835351799549e-06, "loss": 0.31694677472114563, "step": 1226 }, { "epoch": 2.7228381374722836, "grad_norm": 0.5730735659599304, "learning_rate": 6.586037050632315e-06, "loss": 0.8013717532157898, "step": 1228 }, { "epoch": 2.7272727272727275, "grad_norm": 0.7116084098815918, "learning_rate": 6.557298133979177e-06, "loss": 0.45755088329315186, "step": 1230 }, { "epoch": 2.7317073170731705, "grad_norm": 0.4136090874671936, "learning_rate": 6.528618988487373e-06, "loss": 0.48779523372650146, "step": 1232 }, { "epoch": 2.7361419068736144, "grad_norm": 0.9168877601623535, "learning_rate": 6.500000000000003e-06, "loss": 0.2947143614292145, "step": 1234 }, { "epoch": 2.740576496674058, "grad_norm": 0.6739610433578491, "learning_rate": 6.471441553550813e-06, "loss": 0.6185624599456787, "step": 1236 }, { "epoch": 2.745011086474501, "grad_norm": 0.5895893573760986, "learning_rate": 6.442944033359042e-06, "loss": 0.35551586747169495, "step": 1238 }, { "epoch": 2.7494456762749446, "grad_norm": 0.37865594029426575, "learning_rate": 6.4145078228242375e-06, "loss": 0.3368171751499176, "step": 1240 }, { "epoch": 2.753880266075388, "grad_norm": 0.45283424854278564, "learning_rate": 6.386133304521094e-06, "loss": 0.5998995304107666, "step": 1242 }, { "epoch": 2.7583148558758315, "grad_norm": 0.7602055668830872, "learning_rate": 6.357820860194321e-06, "loss": 0.7485865354537964, "step": 1244 }, { "epoch": 2.762749445676275, "grad_norm": 0.12720580399036407, "learning_rate": 6.32957087075349e-06, "loss": 0.18481549620628357, "step": 1246 }, { "epoch": 2.7671840354767183, "grad_norm": 1.2511968612670898, "learning_rate": 6.301383716267917e-06, "loss": 0.3667486011981964, "step": 1248 }, { "epoch": 2.7716186252771617, "grad_norm": 0.6795738339424133, "learning_rate": 6.273259775961562e-06, "loss": 0.43524369597435, "step": 1250 }, { "epoch": 2.776053215077605, "grad_norm": 0.4668692946434021, "learning_rate": 6.245199428207898e-06, "loss": 0.7469791173934937, "step": 1252 }, { "epoch": 2.7804878048780486, "grad_norm": 0.4733211100101471, "learning_rate": 6.2172030505248515e-06, "loss": 0.6893079876899719, "step": 1254 }, { "epoch": 2.7849223946784925, "grad_norm": 0.4810378849506378, "learning_rate": 6.189271019569707e-06, "loss": 0.6243588328361511, "step": 1256 }, { "epoch": 2.7893569844789354, "grad_norm": 0.21061930060386658, "learning_rate": 6.161403711134031e-06, "loss": 0.09384872019290924, "step": 1258 }, { "epoch": 2.7937915742793793, "grad_norm": 0.4916951358318329, "learning_rate": 6.133601500138643e-06, "loss": 0.5685229301452637, "step": 1260 }, { "epoch": 2.7982261640798227, "grad_norm": 0.8098857402801514, "learning_rate": 6.1058647606285394e-06, "loss": 0.3363065719604492, "step": 1262 }, { "epoch": 2.802660753880266, "grad_norm": 0.5222221612930298, "learning_rate": 6.078193865767893e-06, "loss": 0.36431118845939636, "step": 1264 }, { "epoch": 2.8070953436807096, "grad_norm": 0.48917877674102783, "learning_rate": 6.050589187835001e-06, "loss": 0.48057618737220764, "step": 1266 }, { "epoch": 2.811529933481153, "grad_norm": 1.3627451658248901, "learning_rate": 6.023051098217307e-06, "loss": 0.4955880343914032, "step": 1268 }, { "epoch": 2.8159645232815964, "grad_norm": 0.5931581854820251, "learning_rate": 5.995579967406379e-06, "loss": 0.5985972881317139, "step": 1270 }, { "epoch": 2.82039911308204, "grad_norm": 1.0736427307128906, "learning_rate": 5.968176164992938e-06, "loss": 0.24213649332523346, "step": 1272 }, { "epoch": 2.8248337028824833, "grad_norm": 0.6388216614723206, "learning_rate": 5.940840059661892e-06, "loss": 0.41631895303726196, "step": 1274 }, { "epoch": 2.8292682926829267, "grad_norm": 0.49787789583206177, "learning_rate": 5.913572019187355e-06, "loss": 0.6338592171669006, "step": 1276 }, { "epoch": 2.8337028824833705, "grad_norm": 0.4130885601043701, "learning_rate": 5.886372410427709e-06, "loss": 0.5558915734291077, "step": 1278 }, { "epoch": 2.8381374722838135, "grad_norm": 0.4531559944152832, "learning_rate": 5.859241599320686e-06, "loss": 0.24562785029411316, "step": 1280 }, { "epoch": 2.8425720620842574, "grad_norm": 1.1224136352539062, "learning_rate": 5.832179950878414e-06, "loss": 0.38200998306274414, "step": 1282 }, { "epoch": 2.847006651884701, "grad_norm": 0.5757291913032532, "learning_rate": 5.805187829182531e-06, "loss": 0.40263280272483826, "step": 1284 }, { "epoch": 2.8514412416851442, "grad_norm": 0.4876343607902527, "learning_rate": 5.778265597379269e-06, "loss": 0.5635562539100647, "step": 1286 }, { "epoch": 2.8558758314855877, "grad_norm": 1.429746150970459, "learning_rate": 5.751413617674584e-06, "loss": 0.13587771356105804, "step": 1288 }, { "epoch": 2.860310421286031, "grad_norm": 0.43107762932777405, "learning_rate": 5.724632251329272e-06, "loss": 0.5738257765769958, "step": 1290 }, { "epoch": 2.8647450110864745, "grad_norm": 1.0720781087875366, "learning_rate": 5.697921858654106e-06, "loss": 0.36557459831237793, "step": 1292 }, { "epoch": 2.869179600886918, "grad_norm": 0.4924733638763428, "learning_rate": 5.671282799005009e-06, "loss": 0.5723231434822083, "step": 1294 }, { "epoch": 2.8736141906873613, "grad_norm": 0.4669732451438904, "learning_rate": 5.644715430778187e-06, "loss": 0.5587807893753052, "step": 1296 }, { "epoch": 2.8780487804878048, "grad_norm": 0.8375265598297119, "learning_rate": 5.6182201114053405e-06, "loss": 0.407155841588974, "step": 1298 }, { "epoch": 2.882483370288248, "grad_norm": 0.6367316246032715, "learning_rate": 5.59179719734883e-06, "loss": 0.581174373626709, "step": 1300 }, { "epoch": 2.8869179600886916, "grad_norm": 1.9464964866638184, "learning_rate": 5.565447044096888e-06, "loss": 0.23274049162864685, "step": 1302 }, { "epoch": 2.8913525498891355, "grad_norm": 0.4807678461074829, "learning_rate": 5.539170006158859e-06, "loss": 0.5287979245185852, "step": 1304 }, { "epoch": 2.8957871396895785, "grad_norm": 0.5676413774490356, "learning_rate": 5.512966437060383e-06, "loss": 0.4669223129749298, "step": 1306 }, { "epoch": 2.9002217294900223, "grad_norm": 0.19804784655570984, "learning_rate": 5.4868366893386795e-06, "loss": 0.1954198181629181, "step": 1308 }, { "epoch": 2.9046563192904657, "grad_norm": 0.5282815098762512, "learning_rate": 5.460781114537794e-06, "loss": 0.3124288320541382, "step": 1310 }, { "epoch": 2.909090909090909, "grad_norm": 0.6704612374305725, "learning_rate": 5.434800063203855e-06, "loss": 0.5746976733207703, "step": 1312 }, { "epoch": 2.9135254988913526, "grad_norm": 0.48029983043670654, "learning_rate": 5.408893884880382e-06, "loss": 0.5503944158554077, "step": 1314 }, { "epoch": 2.917960088691796, "grad_norm": 1.208801031112671, "learning_rate": 5.383062928103551e-06, "loss": 0.4464556872844696, "step": 1316 }, { "epoch": 2.9223946784922394, "grad_norm": 0.5504411458969116, "learning_rate": 5.357307540397541e-06, "loss": 0.6808157563209534, "step": 1318 }, { "epoch": 2.926829268292683, "grad_norm": 0.4721316397190094, "learning_rate": 5.331628068269832e-06, "loss": 0.3994528353214264, "step": 1320 }, { "epoch": 2.9312638580931263, "grad_norm": 0.40078234672546387, "learning_rate": 5.306024857206551e-06, "loss": 0.589479386806488, "step": 1322 }, { "epoch": 2.9356984478935697, "grad_norm": 0.4144805073738098, "learning_rate": 5.28049825166783e-06, "loss": 0.6008284687995911, "step": 1324 }, { "epoch": 2.9401330376940136, "grad_norm": 0.4621680676937103, "learning_rate": 5.255048595083161e-06, "loss": 0.48713505268096924, "step": 1326 }, { "epoch": 2.9445676274944566, "grad_norm": 0.6959161758422852, "learning_rate": 5.229676229846788e-06, "loss": 0.5818562507629395, "step": 1328 }, { "epoch": 2.9490022172949004, "grad_norm": 0.8349772095680237, "learning_rate": 5.204381497313089e-06, "loss": 0.6031002402305603, "step": 1330 }, { "epoch": 2.953436807095344, "grad_norm": 0.5815767645835876, "learning_rate": 5.179164737791984e-06, "loss": 0.6579894423484802, "step": 1332 }, { "epoch": 2.9578713968957873, "grad_norm": 0.5155860781669617, "learning_rate": 5.15402629054437e-06, "loss": 0.3109511137008667, "step": 1334 }, { "epoch": 2.9623059866962307, "grad_norm": 0.5490220189094543, "learning_rate": 5.128966493777544e-06, "loss": 0.5789236426353455, "step": 1336 }, { "epoch": 2.966740576496674, "grad_norm": 0.5740970969200134, "learning_rate": 5.103985684640653e-06, "loss": 0.5203069448471069, "step": 1338 }, { "epoch": 2.9711751662971175, "grad_norm": 0.5606107711791992, "learning_rate": 5.079084199220168e-06, "loss": 0.4374566376209259, "step": 1340 }, { "epoch": 2.975609756097561, "grad_norm": 1.1846078634262085, "learning_rate": 5.0542623725353455e-06, "loss": 0.42820480465888977, "step": 1342 }, { "epoch": 2.9800443458980044, "grad_norm": 0.19243869185447693, "learning_rate": 5.029520538533742e-06, "loss": 0.125463604927063, "step": 1344 }, { "epoch": 2.984478935698448, "grad_norm": 0.4858459532260895, "learning_rate": 5.0048590300867e-06, "loss": 0.37778711318969727, "step": 1346 }, { "epoch": 2.988913525498891, "grad_norm": 0.4838855564594269, "learning_rate": 4.980278178984886e-06, "loss": 0.33112236857414246, "step": 1348 }, { "epoch": 2.9933481152993346, "grad_norm": 1.0332651138305664, "learning_rate": 4.9557783159338134e-06, "loss": 0.28946980834007263, "step": 1350 }, { "epoch": 2.9977827050997785, "grad_norm": 1.0827792882919312, "learning_rate": 4.9313597705494045e-06, "loss": 0.44148802757263184, "step": 1352 }, { "epoch": 3.002217294900222, "grad_norm": 0.3786047399044037, "learning_rate": 4.907022871353554e-06, "loss": 0.42598864436149597, "step": 1354 }, { "epoch": 3.0066518847006654, "grad_norm": 0.35562005639076233, "learning_rate": 4.882767945769696e-06, "loss": 0.1402987688779831, "step": 1356 }, { "epoch": 3.011086474501109, "grad_norm": 1.162191390991211, "learning_rate": 4.858595320118419e-06, "loss": 0.2594584822654724, "step": 1358 }, { "epoch": 3.015521064301552, "grad_norm": 0.3751342296600342, "learning_rate": 4.834505319613061e-06, "loss": 0.3178204894065857, "step": 1360 }, { "epoch": 3.0199556541019956, "grad_norm": 0.3661974370479584, "learning_rate": 4.810498268355337e-06, "loss": 0.2332019954919815, "step": 1362 }, { "epoch": 3.024390243902439, "grad_norm": 0.5547940135002136, "learning_rate": 4.786574489330988e-06, "loss": 0.2809712886810303, "step": 1364 }, { "epoch": 3.0288248337028825, "grad_norm": 0.08006221801042557, "learning_rate": 4.762734304405419e-06, "loss": 0.1403912454843521, "step": 1366 }, { "epoch": 3.033259423503326, "grad_norm": 0.5086005926132202, "learning_rate": 4.738978034319384e-06, "loss": 0.13945481181144714, "step": 1368 }, { "epoch": 3.0376940133037693, "grad_norm": 0.6609373688697815, "learning_rate": 4.715305998684668e-06, "loss": 0.14236144721508026, "step": 1370 }, { "epoch": 3.0421286031042127, "grad_norm": 0.7926512956619263, "learning_rate": 4.691718515979772e-06, "loss": 0.2316332459449768, "step": 1372 }, { "epoch": 3.046563192904656, "grad_norm": 0.6564216613769531, "learning_rate": 4.668215903545652e-06, "loss": 0.1165812611579895, "step": 1374 }, { "epoch": 3.0509977827050996, "grad_norm": 1.1338090896606445, "learning_rate": 4.644798477581427e-06, "loss": 0.13446903228759766, "step": 1376 }, { "epoch": 3.0554323725055434, "grad_norm": 0.34968799352645874, "learning_rate": 4.6214665531401465e-06, "loss": 0.0695309042930603, "step": 1378 }, { "epoch": 3.059866962305987, "grad_norm": 0.15553732216358185, "learning_rate": 4.5982204441245294e-06, "loss": 0.1173941045999527, "step": 1380 }, { "epoch": 3.0643015521064303, "grad_norm": 1.247266411781311, "learning_rate": 4.5750604632827615e-06, "loss": 0.05880206078290939, "step": 1382 }, { "epoch": 3.0687361419068737, "grad_norm": 0.9541630744934082, "learning_rate": 4.551986922204276e-06, "loss": 0.11438459157943726, "step": 1384 }, { "epoch": 3.073170731707317, "grad_norm": 0.11932838708162308, "learning_rate": 4.529000131315559e-06, "loss": 0.05259817838668823, "step": 1386 }, { "epoch": 3.0776053215077606, "grad_norm": 0.3025910258293152, "learning_rate": 4.5061003998759864e-06, "loss": 0.0788898915052414, "step": 1388 }, { "epoch": 3.082039911308204, "grad_norm": 0.41884443163871765, "learning_rate": 4.483288035973647e-06, "loss": 0.18548215925693512, "step": 1390 }, { "epoch": 3.0864745011086474, "grad_norm": 0.69329434633255, "learning_rate": 4.46056334652121e-06, "loss": 0.07898163050413132, "step": 1392 }, { "epoch": 3.090909090909091, "grad_norm": 1.9537714719772339, "learning_rate": 4.43792663725179e-06, "loss": 0.1453198343515396, "step": 1394 }, { "epoch": 3.0953436807095343, "grad_norm": 0.5684086084365845, "learning_rate": 4.415378212714833e-06, "loss": 0.2133360058069229, "step": 1396 }, { "epoch": 3.0997782705099777, "grad_norm": 0.4299287497997284, "learning_rate": 4.392918376272028e-06, "loss": 0.18916372954845428, "step": 1398 }, { "epoch": 3.104212860310421, "grad_norm": 0.2804919481277466, "learning_rate": 4.370547430093213e-06, "loss": 0.15570159256458282, "step": 1400 }, { "epoch": 3.1086474501108645, "grad_norm": 0.8112667798995972, "learning_rate": 4.348265675152312e-06, "loss": 0.05692750960588455, "step": 1402 }, { "epoch": 3.1130820399113084, "grad_norm": 1.0895768404006958, "learning_rate": 4.326073411223299e-06, "loss": 0.072386234998703, "step": 1404 }, { "epoch": 3.117516629711752, "grad_norm": 1.3162689208984375, "learning_rate": 4.303970936876145e-06, "loss": 0.2204161435365677, "step": 1406 }, { "epoch": 3.1219512195121952, "grad_norm": 0.4283730983734131, "learning_rate": 4.281958549472821e-06, "loss": 0.24357332289218903, "step": 1408 }, { "epoch": 3.1263858093126387, "grad_norm": 0.5136526226997375, "learning_rate": 4.2600365451632755e-06, "loss": 0.1705726683139801, "step": 1410 }, { "epoch": 3.130820399113082, "grad_norm": 0.5153740644454956, "learning_rate": 4.238205218881477e-06, "loss": 0.1938788741827011, "step": 1412 }, { "epoch": 3.1352549889135255, "grad_norm": 0.3389737606048584, "learning_rate": 4.216464864341415e-06, "loss": 0.1461533159017563, "step": 1414 }, { "epoch": 3.139689578713969, "grad_norm": 0.24095015227794647, "learning_rate": 4.1948157740331765e-06, "loss": 0.016989566385746002, "step": 1416 }, { "epoch": 3.1441241685144123, "grad_norm": 0.4946073591709137, "learning_rate": 4.173258239218998e-06, "loss": 0.16947562992572784, "step": 1418 }, { "epoch": 3.1485587583148558, "grad_norm": 1.0035178661346436, "learning_rate": 4.151792549929343e-06, "loss": 0.17151474952697754, "step": 1420 }, { "epoch": 3.152993348115299, "grad_norm": 0.925403356552124, "learning_rate": 4.130418994959004e-06, "loss": 0.12084448337554932, "step": 1422 }, { "epoch": 3.1574279379157426, "grad_norm": 0.30737417936325073, "learning_rate": 4.1091378618632276e-06, "loss": 0.03554686903953552, "step": 1424 }, { "epoch": 3.1618625277161865, "grad_norm": 0.9840001463890076, "learning_rate": 4.087949436953822e-06, "loss": 0.17049196362495422, "step": 1426 }, { "epoch": 3.16629711751663, "grad_norm": 1.108886957168579, "learning_rate": 4.066854005295336e-06, "loss": 0.12697622179985046, "step": 1428 }, { "epoch": 3.1707317073170733, "grad_norm": 0.6791403293609619, "learning_rate": 4.045851850701189e-06, "loss": 0.10053610801696777, "step": 1430 }, { "epoch": 3.1751662971175167, "grad_norm": 0.23437856137752533, "learning_rate": 4.024943255729886e-06, "loss": 0.1366463154554367, "step": 1432 }, { "epoch": 3.17960088691796, "grad_norm": 0.5337254405021667, "learning_rate": 4.004128501681197e-06, "loss": 0.1613321751356125, "step": 1434 }, { "epoch": 3.1840354767184036, "grad_norm": 0.6539866924285889, "learning_rate": 3.983407868592367e-06, "loss": 0.03396349772810936, "step": 1436 }, { "epoch": 3.188470066518847, "grad_norm": 0.5891013145446777, "learning_rate": 3.9627816352343714e-06, "loss": 0.1685631275177002, "step": 1438 }, { "epoch": 3.1929046563192904, "grad_norm": 0.8137240409851074, "learning_rate": 3.94225007910814e-06, "loss": 0.16547633707523346, "step": 1440 }, { "epoch": 3.197339246119734, "grad_norm": 0.4780210852622986, "learning_rate": 3.921813476440845e-06, "loss": 0.2140340805053711, "step": 1442 }, { "epoch": 3.2017738359201773, "grad_norm": 0.7639121413230896, "learning_rate": 3.901472102182168e-06, "loss": 0.2164526730775833, "step": 1444 }, { "epoch": 3.2062084257206207, "grad_norm": 0.44395381212234497, "learning_rate": 3.881226230000607e-06, "loss": 0.18533624708652496, "step": 1446 }, { "epoch": 3.210643015521064, "grad_norm": 0.5062630772590637, "learning_rate": 3.861076132279808e-06, "loss": 0.053058087825775146, "step": 1448 }, { "epoch": 3.2150776053215075, "grad_norm": 0.4987446069717407, "learning_rate": 3.8410220801148735e-06, "loss": 0.21477347612380981, "step": 1450 }, { "epoch": 3.2195121951219514, "grad_norm": 4.220211029052734, "learning_rate": 3.821064343308734e-06, "loss": 0.04978083446621895, "step": 1452 }, { "epoch": 3.223946784922395, "grad_norm": 0.555292010307312, "learning_rate": 3.8012031903685174e-06, "loss": 0.19708330929279327, "step": 1454 }, { "epoch": 3.2283813747228383, "grad_norm": 0.9038100838661194, "learning_rate": 3.7814388885019284e-06, "loss": 0.16057579219341278, "step": 1456 }, { "epoch": 3.2328159645232817, "grad_norm": 0.3948892652988434, "learning_rate": 3.7617717036136623e-06, "loss": 0.1567579060792923, "step": 1458 }, { "epoch": 3.237250554323725, "grad_norm": 0.6105815768241882, "learning_rate": 3.7422019003018174e-06, "loss": 0.15115660429000854, "step": 1460 }, { "epoch": 3.2416851441241685, "grad_norm": 0.7068625688552856, "learning_rate": 3.7227297418543464e-06, "loss": 0.17774607241153717, "step": 1462 }, { "epoch": 3.246119733924612, "grad_norm": 1.291515588760376, "learning_rate": 3.7033554902455105e-06, "loss": 0.20271697640419006, "step": 1464 }, { "epoch": 3.2505543237250554, "grad_norm": 0.4515579342842102, "learning_rate": 3.684079406132344e-06, "loss": 0.23176366090774536, "step": 1466 }, { "epoch": 3.254988913525499, "grad_norm": 0.17358291149139404, "learning_rate": 3.6649017488511684e-06, "loss": 0.035076484084129333, "step": 1468 }, { "epoch": 3.259423503325942, "grad_norm": 0.7106318473815918, "learning_rate": 3.6458227764140796e-06, "loss": 0.11743002384901047, "step": 1470 }, { "epoch": 3.2638580931263856, "grad_norm": 0.524408221244812, "learning_rate": 3.626842745505501e-06, "loss": 0.2437806874513626, "step": 1472 }, { "epoch": 3.2682926829268295, "grad_norm": 0.37512272596359253, "learning_rate": 3.607961911478708e-06, "loss": 0.03446941822767258, "step": 1474 }, { "epoch": 3.2727272727272725, "grad_norm": 0.48498690128326416, "learning_rate": 3.5891805283524055e-06, "loss": 0.15878258645534515, "step": 1476 }, { "epoch": 3.2771618625277164, "grad_norm": 0.1239403486251831, "learning_rate": 3.570498848807308e-06, "loss": 0.11845864355564117, "step": 1478 }, { "epoch": 3.2815964523281598, "grad_norm": 0.23787540197372437, "learning_rate": 3.5519171241827445e-06, "loss": 0.13304200768470764, "step": 1480 }, { "epoch": 3.286031042128603, "grad_norm": 0.46581289172172546, "learning_rate": 3.533435604473259e-06, "loss": 0.20721173286437988, "step": 1482 }, { "epoch": 3.2904656319290466, "grad_norm": 0.6229859590530396, "learning_rate": 3.515054538325272e-06, "loss": 0.19322358071804047, "step": 1484 }, { "epoch": 3.29490022172949, "grad_norm": 0.4470021426677704, "learning_rate": 3.496774173033717e-06, "loss": 0.17478328943252563, "step": 1486 }, { "epoch": 3.2993348115299335, "grad_norm": 1.0204616785049438, "learning_rate": 3.478594754538722e-06, "loss": 0.10508938133716583, "step": 1488 }, { "epoch": 3.303769401330377, "grad_norm": 0.4292312264442444, "learning_rate": 3.460516527422298e-06, "loss": 0.05400429666042328, "step": 1490 }, { "epoch": 3.3082039911308203, "grad_norm": 0.514301061630249, "learning_rate": 3.442539734905049e-06, "loss": 0.15547773241996765, "step": 1492 }, { "epoch": 3.3126385809312637, "grad_norm": 0.8231419920921326, "learning_rate": 3.424664618842897e-06, "loss": 0.1262798309326172, "step": 1494 }, { "epoch": 3.317073170731707, "grad_norm": 0.6278258562088013, "learning_rate": 3.4068914197238352e-06, "loss": 0.17141902446746826, "step": 1496 }, { "epoch": 3.3215077605321506, "grad_norm": 0.7143641710281372, "learning_rate": 3.389220376664687e-06, "loss": 0.2325032353401184, "step": 1498 }, { "epoch": 3.3259423503325944, "grad_norm": 0.6291862726211548, "learning_rate": 3.3716517274078842e-06, "loss": 0.1395445019006729, "step": 1500 }, { "epoch": 3.330376940133038, "grad_norm": 1.111968994140625, "learning_rate": 3.354185708318284e-06, "loss": 0.19360409677028656, "step": 1502 }, { "epoch": 3.3348115299334813, "grad_norm": 0.4316374659538269, "learning_rate": 3.3368225543799716e-06, "loss": 0.19091464579105377, "step": 1504 }, { "epoch": 3.3392461197339247, "grad_norm": 0.07719559222459793, "learning_rate": 3.3195624991931074e-06, "loss": 0.0855455994606018, "step": 1506 }, { "epoch": 3.343680709534368, "grad_norm": 0.48246321082115173, "learning_rate": 3.302405774970788e-06, "loss": 0.08791041374206543, "step": 1508 }, { "epoch": 3.3481152993348116, "grad_norm": 0.36730292439460754, "learning_rate": 3.2853526125359105e-06, "loss": 0.12776361405849457, "step": 1510 }, { "epoch": 3.352549889135255, "grad_norm": 0.09562593698501587, "learning_rate": 3.26840324131808e-06, "loss": 0.0983489602804184, "step": 1512 }, { "epoch": 3.3569844789356984, "grad_norm": 0.7086212038993835, "learning_rate": 3.251557889350514e-06, "loss": 0.23420387506484985, "step": 1514 }, { "epoch": 3.361419068736142, "grad_norm": 5.378333568572998, "learning_rate": 3.2348167832669754e-06, "loss": 0.10752184689044952, "step": 1516 }, { "epoch": 3.3658536585365852, "grad_norm": 0.5152938961982727, "learning_rate": 3.218180148298732e-06, "loss": 0.21186313033103943, "step": 1518 }, { "epoch": 3.3702882483370287, "grad_norm": 1.4693471193313599, "learning_rate": 3.201648208271507e-06, "loss": 0.19114084541797638, "step": 1520 }, { "epoch": 3.374722838137472, "grad_norm": 0.12920020520687103, "learning_rate": 3.185221185602497e-06, "loss": 0.12129313498735428, "step": 1522 }, { "epoch": 3.3791574279379155, "grad_norm": 0.8857243061065674, "learning_rate": 3.168899301297347e-06, "loss": 0.21523553133010864, "step": 1524 }, { "epoch": 3.3835920177383594, "grad_norm": 0.7426590919494629, "learning_rate": 3.152682774947202e-06, "loss": 0.1364864557981491, "step": 1526 }, { "epoch": 3.388026607538803, "grad_norm": 0.7999682426452637, "learning_rate": 3.136571824725744e-06, "loss": 0.0897040143609047, "step": 1528 }, { "epoch": 3.3924611973392462, "grad_norm": 0.6461058855056763, "learning_rate": 3.1205666673862484e-06, "loss": 0.09447822719812393, "step": 1530 }, { "epoch": 3.3968957871396896, "grad_norm": 0.3650994300842285, "learning_rate": 3.104667518258688e-06, "loss": 0.041886042803525925, "step": 1532 }, { "epoch": 3.401330376940133, "grad_norm": 1.1809720993041992, "learning_rate": 3.0888745912468123e-06, "loss": 0.13893677294254303, "step": 1534 }, { "epoch": 3.4057649667405765, "grad_norm": 0.5130560398101807, "learning_rate": 3.073188098825285e-06, "loss": 0.19634631276130676, "step": 1536 }, { "epoch": 3.41019955654102, "grad_norm": 0.7646129131317139, "learning_rate": 3.0576082520368265e-06, "loss": 0.11035222560167313, "step": 1538 }, { "epoch": 3.4146341463414633, "grad_norm": 1.119156837463379, "learning_rate": 3.0421352604893602e-06, "loss": 0.23807543516159058, "step": 1540 }, { "epoch": 3.4190687361419068, "grad_norm": 0.4573220908641815, "learning_rate": 3.0267693323532116e-06, "loss": 0.14719665050506592, "step": 1542 }, { "epoch": 3.42350332594235, "grad_norm": 0.683412492275238, "learning_rate": 3.0115106743582922e-06, "loss": 0.21427640318870544, "step": 1544 }, { "epoch": 3.4279379157427936, "grad_norm": 0.5579946637153625, "learning_rate": 2.9963594917913248e-06, "loss": 0.02915109321475029, "step": 1546 }, { "epoch": 3.4323725055432375, "grad_norm": 0.10574361681938171, "learning_rate": 2.981315988493084e-06, "loss": 0.04074406251311302, "step": 1548 }, { "epoch": 3.436807095343681, "grad_norm": 0.366202175617218, "learning_rate": 2.9663803668556424e-06, "loss": 0.22145552933216095, "step": 1550 }, { "epoch": 3.4412416851441243, "grad_norm": 0.5682427287101746, "learning_rate": 2.9515528278196665e-06, "loss": 0.25287312269210815, "step": 1552 }, { "epoch": 3.4456762749445677, "grad_norm": 0.10395639389753342, "learning_rate": 2.936833570871694e-06, "loss": 0.11668358743190765, "step": 1554 }, { "epoch": 3.450110864745011, "grad_norm": 0.631152868270874, "learning_rate": 2.922222794041464e-06, "loss": 0.23132863640785217, "step": 1556 }, { "epoch": 3.4545454545454546, "grad_norm": 0.881669282913208, "learning_rate": 2.907720693899243e-06, "loss": 0.330628901720047, "step": 1558 }, { "epoch": 3.458980044345898, "grad_norm": 0.462612122297287, "learning_rate": 2.8933274655531874e-06, "loss": 0.25399714708328247, "step": 1560 }, { "epoch": 3.4634146341463414, "grad_norm": 0.5779225826263428, "learning_rate": 2.879043302646717e-06, "loss": 0.039646755903959274, "step": 1562 }, { "epoch": 3.467849223946785, "grad_norm": 0.32095006108283997, "learning_rate": 2.8648683973559054e-06, "loss": 0.23187652230262756, "step": 1564 }, { "epoch": 3.4722838137472283, "grad_norm": 0.3223656415939331, "learning_rate": 2.8508029403868962e-06, "loss": 0.09090401232242584, "step": 1566 }, { "epoch": 3.4767184035476717, "grad_norm": 0.5520133376121521, "learning_rate": 2.836847120973345e-06, "loss": 0.15556883811950684, "step": 1568 }, { "epoch": 3.481152993348115, "grad_norm": 0.47338053584098816, "learning_rate": 2.8230011268738593e-06, "loss": 0.09746363013982773, "step": 1570 }, { "epoch": 3.4855875831485585, "grad_norm": 0.1202714741230011, "learning_rate": 2.8092651443694886e-06, "loss": 0.13933829963207245, "step": 1572 }, { "epoch": 3.4900221729490024, "grad_norm": 0.6928906440734863, "learning_rate": 2.795639358261202e-06, "loss": 0.43705928325653076, "step": 1574 }, { "epoch": 3.494456762749446, "grad_norm": 0.22218959033489227, "learning_rate": 2.782123951867415e-06, "loss": 0.12843255698680878, "step": 1576 }, { "epoch": 3.4988913525498893, "grad_norm": 0.4401395618915558, "learning_rate": 2.7687191070215174e-06, "loss": 0.11058890074491501, "step": 1578 }, { "epoch": 3.5033259423503327, "grad_norm": 0.4982577860355377, "learning_rate": 2.755425004069424e-06, "loss": 0.20767910778522491, "step": 1580 }, { "epoch": 3.507760532150776, "grad_norm": 0.5209600925445557, "learning_rate": 2.7422418218671586e-06, "loss": 0.3028036952018738, "step": 1582 }, { "epoch": 3.5121951219512195, "grad_norm": 0.6526494026184082, "learning_rate": 2.7291697377784325e-06, "loss": 0.13182812929153442, "step": 1584 }, { "epoch": 3.516629711751663, "grad_norm": 0.5955665707588196, "learning_rate": 2.7162089276722746e-06, "loss": 0.11612501740455627, "step": 1586 }, { "epoch": 3.5210643015521064, "grad_norm": 0.5240582227706909, "learning_rate": 2.703359565920651e-06, "loss": 0.19106577336788177, "step": 1588 }, { "epoch": 3.52549889135255, "grad_norm": 0.5816933512687683, "learning_rate": 2.6906218253961285e-06, "loss": 0.052692461758852005, "step": 1590 }, { "epoch": 3.529933481152993, "grad_norm": 1.794288992881775, "learning_rate": 2.6779958774695487e-06, "loss": 0.15381264686584473, "step": 1592 }, { "epoch": 3.5343680709534366, "grad_norm": 0.6399196982383728, "learning_rate": 2.665481892007714e-06, "loss": 0.25606346130371094, "step": 1594 }, { "epoch": 3.5388026607538805, "grad_norm": 0.4062730371952057, "learning_rate": 2.6530800373711097e-06, "loss": 0.021856600418686867, "step": 1596 }, { "epoch": 3.5432372505543235, "grad_norm": 0.5443702936172485, "learning_rate": 2.640790480411638e-06, "loss": 0.08779677748680115, "step": 1598 }, { "epoch": 3.5476718403547673, "grad_norm": 1.7016083002090454, "learning_rate": 2.628613386470371e-06, "loss": 0.1265704333782196, "step": 1600 }, { "epoch": 3.5521064301552108, "grad_norm": 0.5498143434524536, "learning_rate": 2.61654891937533e-06, "loss": 0.19086270034313202, "step": 1602 }, { "epoch": 3.556541019955654, "grad_norm": 0.5192769765853882, "learning_rate": 2.6045972414392735e-06, "loss": 0.3860751688480377, "step": 1604 }, { "epoch": 3.5609756097560976, "grad_norm": 1.643974781036377, "learning_rate": 2.5927585134575233e-06, "loss": 0.2832165062427521, "step": 1606 }, { "epoch": 3.565410199556541, "grad_norm": 0.16696669161319733, "learning_rate": 2.581032894705798e-06, "loss": 0.013047085143625736, "step": 1608 }, { "epoch": 3.5698447893569845, "grad_norm": 0.5006920099258423, "learning_rate": 2.5694205429380616e-06, "loss": 0.17075103521347046, "step": 1610 }, { "epoch": 3.574279379157428, "grad_norm": 0.4067634642124176, "learning_rate": 2.5579216143844153e-06, "loss": 0.049309611320495605, "step": 1612 }, { "epoch": 3.5787139689578713, "grad_norm": 0.8766622543334961, "learning_rate": 2.5465362637489847e-06, "loss": 0.1669972687959671, "step": 1614 }, { "epoch": 3.5831485587583147, "grad_norm": 0.7486819624900818, "learning_rate": 2.5352646442078472e-06, "loss": 0.20184892416000366, "step": 1616 }, { "epoch": 3.587583148558758, "grad_norm": 0.6373207569122314, "learning_rate": 2.524106907406959e-06, "loss": 0.1479307860136032, "step": 1618 }, { "epoch": 3.5920177383592016, "grad_norm": 1.1294218301773071, "learning_rate": 2.513063203460127e-06, "loss": 0.15324336290359497, "step": 1620 }, { "epoch": 3.5964523281596454, "grad_norm": 0.4940034747123718, "learning_rate": 2.502133680946985e-06, "loss": 0.260329931974411, "step": 1622 }, { "epoch": 3.6008869179600884, "grad_norm": 0.5072565674781799, "learning_rate": 2.4913184869109925e-06, "loss": 0.14236906170845032, "step": 1624 }, { "epoch": 3.6053215077605323, "grad_norm": 0.14243106544017792, "learning_rate": 2.4806177668574564e-06, "loss": 0.03839609771966934, "step": 1626 }, { "epoch": 3.6097560975609757, "grad_norm": 0.462656706571579, "learning_rate": 2.4700316647515805e-06, "loss": 0.1687300205230713, "step": 1628 }, { "epoch": 3.614190687361419, "grad_norm": 2.395517587661743, "learning_rate": 2.459560323016518e-06, "loss": 0.11912352591753006, "step": 1630 }, { "epoch": 3.6186252771618626, "grad_norm": 0.4201103746891022, "learning_rate": 2.4492038825314637e-06, "loss": 0.148905947804451, "step": 1632 }, { "epoch": 3.623059866962306, "grad_norm": 1.4418302774429321, "learning_rate": 2.438962482629751e-06, "loss": 0.19345171749591827, "step": 1634 }, { "epoch": 3.6274944567627494, "grad_norm": 0.47817596793174744, "learning_rate": 2.42883626109699e-06, "loss": 0.12935222685337067, "step": 1636 }, { "epoch": 3.631929046563193, "grad_norm": 0.543739914894104, "learning_rate": 2.4188253541691973e-06, "loss": 0.1430729478597641, "step": 1638 }, { "epoch": 3.6363636363636362, "grad_norm": 0.43734827637672424, "learning_rate": 2.4089298965309753e-06, "loss": 0.19318100810050964, "step": 1640 }, { "epoch": 3.6407982261640797, "grad_norm": 0.2320224642753601, "learning_rate": 2.399150021313699e-06, "loss": 0.0949181392788887, "step": 1642 }, { "epoch": 3.6452328159645235, "grad_norm": 0.6401042938232422, "learning_rate": 2.389485860093715e-06, "loss": 0.2700011730194092, "step": 1644 }, { "epoch": 3.6496674057649665, "grad_norm": 0.12314002215862274, "learning_rate": 2.3799375428905864e-06, "loss": 0.07954643666744232, "step": 1646 }, { "epoch": 3.6541019955654104, "grad_norm": 0.7883126735687256, "learning_rate": 2.3705051981653315e-06, "loss": 0.07769718766212463, "step": 1648 }, { "epoch": 3.658536585365854, "grad_norm": 0.7627129554748535, "learning_rate": 2.361188952818697e-06, "loss": 0.2676461338996887, "step": 1650 }, { "epoch": 3.662971175166297, "grad_norm": 0.8268294334411621, "learning_rate": 2.3519889321894603e-06, "loss": 0.4033682346343994, "step": 1652 }, { "epoch": 3.6674057649667406, "grad_norm": 2.1596076488494873, "learning_rate": 2.34290526005273e-06, "loss": 0.09330250322818756, "step": 1654 }, { "epoch": 3.671840354767184, "grad_norm": 0.6786802411079407, "learning_rate": 2.3339380586182904e-06, "loss": 0.23048776388168335, "step": 1656 }, { "epoch": 3.6762749445676275, "grad_norm": 0.8763942718505859, "learning_rate": 2.3250874485289545e-06, "loss": 0.13142776489257812, "step": 1658 }, { "epoch": 3.680709534368071, "grad_norm": 0.49550583958625793, "learning_rate": 2.3163535488589363e-06, "loss": 0.17957837879657745, "step": 1660 }, { "epoch": 3.6851441241685143, "grad_norm": 0.08660886436700821, "learning_rate": 2.3077364771122573e-06, "loss": 0.12105847150087357, "step": 1662 }, { "epoch": 3.6895787139689578, "grad_norm": 0.2725079655647278, "learning_rate": 2.299236349221157e-06, "loss": 0.06378458440303802, "step": 1664 }, { "epoch": 3.694013303769401, "grad_norm": 0.40256035327911377, "learning_rate": 2.2908532795445414e-06, "loss": 0.187424436211586, "step": 1666 }, { "epoch": 3.6984478935698446, "grad_norm": 0.4576587975025177, "learning_rate": 2.2825873808664363e-06, "loss": 0.25221118330955505, "step": 1668 }, { "epoch": 3.7028824833702885, "grad_norm": 0.5043409466743469, "learning_rate": 2.2744387643944757e-06, "loss": 0.1796739250421524, "step": 1670 }, { "epoch": 3.7073170731707314, "grad_norm": 0.5289079546928406, "learning_rate": 2.2664075397584066e-06, "loss": 0.15418490767478943, "step": 1672 }, { "epoch": 3.7117516629711753, "grad_norm": 0.5016271471977234, "learning_rate": 2.258493815008605e-06, "loss": 0.23040637373924255, "step": 1674 }, { "epoch": 3.7161862527716187, "grad_norm": 0.5144860744476318, "learning_rate": 2.2506976966146355e-06, "loss": 0.21655163168907166, "step": 1676 }, { "epoch": 3.720620842572062, "grad_norm": 0.5468173027038574, "learning_rate": 2.2430192894638077e-06, "loss": 0.19511225819587708, "step": 1678 }, { "epoch": 3.7250554323725056, "grad_norm": 0.6539567112922668, "learning_rate": 2.235458696859768e-06, "loss": 0.05055548995733261, "step": 1680 }, { "epoch": 3.729490022172949, "grad_norm": 0.5066478848457336, "learning_rate": 2.228016020521116e-06, "loss": 0.17900614440441132, "step": 1682 }, { "epoch": 3.7339246119733924, "grad_norm": 0.5024972558021545, "learning_rate": 2.2206913605800267e-06, "loss": 0.12050139158964157, "step": 1684 }, { "epoch": 3.738359201773836, "grad_norm": 0.5398685932159424, "learning_rate": 2.213484815580911e-06, "loss": 0.12008091807365417, "step": 1686 }, { "epoch": 3.7427937915742793, "grad_norm": 0.10834494233131409, "learning_rate": 2.206396482479084e-06, "loss": 0.03123791143298149, "step": 1688 }, { "epoch": 3.7472283813747227, "grad_norm": 0.6228474974632263, "learning_rate": 2.199426456639465e-06, "loss": 0.22591347992420197, "step": 1690 }, { "epoch": 3.7516629711751666, "grad_norm": 0.8757428526878357, "learning_rate": 2.192574831835291e-06, "loss": 0.1378636211156845, "step": 1692 }, { "epoch": 3.7560975609756095, "grad_norm": 0.5694209933280945, "learning_rate": 2.185841700246857e-06, "loss": 0.24412274360656738, "step": 1694 }, { "epoch": 3.7605321507760534, "grad_norm": 0.494783490896225, "learning_rate": 2.1792271524602786e-06, "loss": 0.23211520910263062, "step": 1696 }, { "epoch": 3.764966740576497, "grad_norm": 0.5232568979263306, "learning_rate": 2.1727312774662656e-06, "loss": 0.12440581619739532, "step": 1698 }, { "epoch": 3.7694013303769403, "grad_norm": 0.4039710462093353, "learning_rate": 2.1663541626589337e-06, "loss": 0.11090154200792313, "step": 1700 }, { "epoch": 3.7738359201773837, "grad_norm": 0.48914211988449097, "learning_rate": 2.1600958938346202e-06, "loss": 0.5025262832641602, "step": 1702 }, { "epoch": 3.778270509977827, "grad_norm": 0.18319113552570343, "learning_rate": 2.153956555190738e-06, "loss": 0.02325468324124813, "step": 1704 }, { "epoch": 3.7827050997782705, "grad_norm": 0.10409087687730789, "learning_rate": 2.147936229324637e-06, "loss": 0.1210860013961792, "step": 1706 }, { "epoch": 3.787139689578714, "grad_norm": 0.5911566615104675, "learning_rate": 2.1420349972324942e-06, "loss": 0.11488822847604752, "step": 1708 }, { "epoch": 3.7915742793791574, "grad_norm": 0.5132036209106445, "learning_rate": 2.1362529383082255e-06, "loss": 0.30858707427978516, "step": 1710 }, { "epoch": 3.796008869179601, "grad_norm": 0.28792333602905273, "learning_rate": 2.1305901303424143e-06, "loss": 0.1212579756975174, "step": 1712 }, { "epoch": 3.800443458980044, "grad_norm": 0.7928282618522644, "learning_rate": 2.1250466495212697e-06, "loss": 0.1450139433145523, "step": 1714 }, { "epoch": 3.8048780487804876, "grad_norm": 2.0321249961853027, "learning_rate": 2.119622570425598e-06, "loss": 0.19779935479164124, "step": 1716 }, { "epoch": 3.8093126385809315, "grad_norm": 0.3824866712093353, "learning_rate": 2.1143179660298e-06, "loss": 0.1265445351600647, "step": 1718 }, { "epoch": 3.8137472283813745, "grad_norm": 0.5937016606330872, "learning_rate": 2.109132907700888e-06, "loss": 0.11517294496297836, "step": 1720 }, { "epoch": 3.8181818181818183, "grad_norm": 0.7648696899414062, "learning_rate": 2.1040674651975297e-06, "loss": 0.21361251175403595, "step": 1722 }, { "epoch": 3.8226164079822618, "grad_norm": 0.8102192282676697, "learning_rate": 2.099121706669106e-06, "loss": 0.24782630801200867, "step": 1724 }, { "epoch": 3.827050997782705, "grad_norm": 0.5768070220947266, "learning_rate": 2.0942956986547953e-06, "loss": 0.3066186010837555, "step": 1726 }, { "epoch": 3.8314855875831486, "grad_norm": 0.4514220356941223, "learning_rate": 2.0895895060826777e-06, "loss": 0.08890463411808014, "step": 1728 }, { "epoch": 3.835920177383592, "grad_norm": 0.387746661901474, "learning_rate": 2.085003192268862e-06, "loss": 0.11902990192174911, "step": 1730 }, { "epoch": 3.8403547671840355, "grad_norm": 0.5934492349624634, "learning_rate": 2.0805368189166347e-06, "loss": 0.26432839035987854, "step": 1732 }, { "epoch": 3.844789356984479, "grad_norm": 0.8290284276008606, "learning_rate": 2.076190446115625e-06, "loss": 0.13800962269306183, "step": 1734 }, { "epoch": 3.8492239467849223, "grad_norm": 0.42487943172454834, "learning_rate": 2.0719641323410084e-06, "loss": 0.1366715282201767, "step": 1736 }, { "epoch": 3.8536585365853657, "grad_norm": 0.5068730711936951, "learning_rate": 2.0678579344527038e-06, "loss": 0.18744944036006927, "step": 1738 }, { "epoch": 3.858093126385809, "grad_norm": 0.45418834686279297, "learning_rate": 2.0638719076946213e-06, "loss": 0.12399666011333466, "step": 1740 }, { "epoch": 3.8625277161862526, "grad_norm": 0.5102381706237793, "learning_rate": 2.060006105693913e-06, "loss": 0.11724897474050522, "step": 1742 }, { "epoch": 3.8669623059866964, "grad_norm": 0.5589990615844727, "learning_rate": 2.056260580460251e-06, "loss": 0.15366147458553314, "step": 1744 }, { "epoch": 3.8713968957871394, "grad_norm": 0.4272408187389374, "learning_rate": 2.052635382385134e-06, "loss": 0.16997916996479034, "step": 1746 }, { "epoch": 3.8758314855875833, "grad_norm": 0.8717123866081238, "learning_rate": 2.0491305602411997e-06, "loss": 0.11671534180641174, "step": 1748 }, { "epoch": 3.8802660753880267, "grad_norm": 0.5559657216072083, "learning_rate": 2.0457461611815782e-06, "loss": 0.15400242805480957, "step": 1750 }, { "epoch": 3.88470066518847, "grad_norm": 0.6432749032974243, "learning_rate": 2.0424822307392493e-06, "loss": 0.18111613392829895, "step": 1752 }, { "epoch": 3.8891352549889135, "grad_norm": 0.5203759074211121, "learning_rate": 2.039338812826436e-06, "loss": 0.17084263265132904, "step": 1754 }, { "epoch": 3.893569844789357, "grad_norm": 1.0190702676773071, "learning_rate": 2.036315949734011e-06, "loss": 0.1340053379535675, "step": 1756 }, { "epoch": 3.8980044345898004, "grad_norm": 2.1606268882751465, "learning_rate": 2.0334136821309286e-06, "loss": 0.23111629486083984, "step": 1758 }, { "epoch": 3.902439024390244, "grad_norm": 0.013512303121387959, "learning_rate": 2.0306320490636767e-06, "loss": 0.04244675859808922, "step": 1760 }, { "epoch": 3.9068736141906872, "grad_norm": 0.33751603960990906, "learning_rate": 2.027971087955753e-06, "loss": 0.050674207508563995, "step": 1762 }, { "epoch": 3.9113082039911307, "grad_norm": 0.043730415403842926, "learning_rate": 2.0254308346071574e-06, "loss": 0.13882163166999817, "step": 1764 }, { "epoch": 3.9157427937915745, "grad_norm": 0.37790897488594055, "learning_rate": 2.023011323193917e-06, "loss": 0.16915282607078552, "step": 1766 }, { "epoch": 3.9201773835920175, "grad_norm": 0.5498037338256836, "learning_rate": 2.020712586267621e-06, "loss": 0.24210064113140106, "step": 1768 }, { "epoch": 3.9246119733924614, "grad_norm": 0.5351380109786987, "learning_rate": 2.018534654754984e-06, "loss": 0.2681524157524109, "step": 1770 }, { "epoch": 3.929046563192905, "grad_norm": 0.7002694606781006, "learning_rate": 2.016477557957432e-06, "loss": 0.0865524411201477, "step": 1772 }, { "epoch": 3.933481152993348, "grad_norm": 1.273296594619751, "learning_rate": 2.0145413235507057e-06, "loss": 0.15231235325336456, "step": 1774 }, { "epoch": 3.9379157427937916, "grad_norm": 0.560060977935791, "learning_rate": 2.0127259775844882e-06, "loss": 0.2978004813194275, "step": 1776 }, { "epoch": 3.942350332594235, "grad_norm": 0.08104455471038818, "learning_rate": 2.0110315444820557e-06, "loss": 0.015620124526321888, "step": 1778 }, { "epoch": 3.9467849223946785, "grad_norm": 1.2214912176132202, "learning_rate": 2.0094580470399507e-06, "loss": 0.08288650959730148, "step": 1780 }, { "epoch": 3.951219512195122, "grad_norm": 0.08862635493278503, "learning_rate": 2.0080055064276703e-06, "loss": 0.11820105463266373, "step": 1782 }, { "epoch": 3.9556541019955653, "grad_norm": 0.6492655277252197, "learning_rate": 2.0066739421873856e-06, "loss": 0.23602721095085144, "step": 1784 }, { "epoch": 3.9600886917960088, "grad_norm": 0.4690077602863312, "learning_rate": 2.0054633722336776e-06, "loss": 0.17881526052951813, "step": 1786 }, { "epoch": 3.964523281596452, "grad_norm": 0.4643179774284363, "learning_rate": 2.0043738128532943e-06, "loss": 0.1461382508277893, "step": 1788 }, { "epoch": 3.9689578713968956, "grad_norm": 0.33378270268440247, "learning_rate": 2.003405278704937e-06, "loss": 0.12822888791561127, "step": 1790 }, { "epoch": 3.9733924611973395, "grad_norm": 0.7190065979957581, "learning_rate": 2.002557782819055e-06, "loss": 0.1802365928888321, "step": 1792 }, { "epoch": 3.9778270509977824, "grad_norm": 0.6157906651496887, "learning_rate": 2.001831336597679e-06, "loss": 0.09615038335323334, "step": 1794 }, { "epoch": 3.9822616407982263, "grad_norm": 0.4767264127731323, "learning_rate": 2.0012259498142596e-06, "loss": 0.13788002729415894, "step": 1796 }, { "epoch": 3.9866962305986697, "grad_norm": 0.499039888381958, "learning_rate": 2.00074163061354e-06, "loss": 0.18137040734291077, "step": 1798 }, { "epoch": 3.991130820399113, "grad_norm": 0.6075534224510193, "learning_rate": 2.000378385511451e-06, "loss": 0.10324703902006149, "step": 1800 }, { "epoch": 3.9955654101995566, "grad_norm": 0.5070518851280212, "learning_rate": 2.000136219395011e-06, "loss": 0.16305242478847504, "step": 1802 }, { "epoch": 4.0, "grad_norm": 0.4194043278694153, "learning_rate": 2.0000151355222728e-06, "loss": 0.06611192226409912, "step": 1804 }, { "epoch": 4.0, "step": 1804, "total_flos": 3.4175049861232067e+18, "train_loss": 0.6816160985415268, "train_runtime": 8301.6433, "train_samples_per_second": 6.519, "train_steps_per_second": 0.217 } ], "logging_steps": 2, "max_steps": 1804, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.4175049861232067e+18, "train_batch_size": 3, "trial_name": null, "trial_params": null }