{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2410651716888577, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002803083391730904, "grad_norm": 5.1484708590919785, "learning_rate": 0.0, "loss": 0.9179, "step": 1 }, { "epoch": 0.005606166783461808, "grad_norm": 5.487034043781978, "learning_rate": 5.58659217877095e-08, "loss": 0.9131, "step": 2 }, { "epoch": 0.008409250175192713, "grad_norm": 4.33614237795095, "learning_rate": 1.11731843575419e-07, "loss": 0.7853, "step": 3 }, { "epoch": 0.011212333566923615, "grad_norm": 4.779222535523081, "learning_rate": 1.6759776536312851e-07, "loss": 0.8746, "step": 4 }, { "epoch": 0.01401541695865452, "grad_norm": 4.7983170759971925, "learning_rate": 2.23463687150838e-07, "loss": 0.842, "step": 5 }, { "epoch": 0.016818500350385426, "grad_norm": 5.257374876703264, "learning_rate": 2.793296089385475e-07, "loss": 0.8968, "step": 6 }, { "epoch": 0.019621583742116328, "grad_norm": 4.504136127398602, "learning_rate": 3.3519553072625703e-07, "loss": 0.8337, "step": 7 }, { "epoch": 0.02242466713384723, "grad_norm": 4.414835579986107, "learning_rate": 3.910614525139665e-07, "loss": 0.8165, "step": 8 }, { "epoch": 0.025227750525578137, "grad_norm": 4.43151857886484, "learning_rate": 4.46927374301676e-07, "loss": 0.8104, "step": 9 }, { "epoch": 0.02803083391730904, "grad_norm": 4.676366358231245, "learning_rate": 5.027932960893855e-07, "loss": 0.864, "step": 10 }, { "epoch": 0.030833917309039945, "grad_norm": 4.0730644153085525, "learning_rate": 5.58659217877095e-07, "loss": 0.7886, "step": 11 }, { "epoch": 0.03363700070077085, "grad_norm": 4.172588296774101, "learning_rate": 6.145251396648045e-07, "loss": 0.837, "step": 12 }, { "epoch": 0.036440084092501754, "grad_norm": 3.9778364933448436, "learning_rate": 6.703910614525141e-07, "loss": 0.7436, "step": 13 }, { "epoch": 0.039243167484232656, "grad_norm": 3.7060565427554772, "learning_rate": 7.262569832402236e-07, "loss": 0.7504, "step": 14 }, { "epoch": 0.04204625087596356, "grad_norm": 3.570615569466669, "learning_rate": 7.82122905027933e-07, "loss": 0.7675, "step": 15 }, { "epoch": 0.04484933426769446, "grad_norm": 4.036222308910076, "learning_rate": 8.379888268156425e-07, "loss": 0.7887, "step": 16 }, { "epoch": 0.04765241765942537, "grad_norm": 4.070011925383957, "learning_rate": 8.93854748603352e-07, "loss": 0.816, "step": 17 }, { "epoch": 0.050455501051156273, "grad_norm": 3.5494513513779853, "learning_rate": 9.497206703910615e-07, "loss": 0.7722, "step": 18 }, { "epoch": 0.053258584442887176, "grad_norm": 3.0005510195801204, "learning_rate": 1.005586592178771e-06, "loss": 0.7587, "step": 19 }, { "epoch": 0.05606166783461808, "grad_norm": 3.0859421992565337, "learning_rate": 1.0614525139664806e-06, "loss": 0.7771, "step": 20 }, { "epoch": 0.05886475122634898, "grad_norm": 2.6475980984097967, "learning_rate": 1.11731843575419e-06, "loss": 0.6854, "step": 21 }, { "epoch": 0.06166783461807989, "grad_norm": 2.6610671622699016, "learning_rate": 1.1731843575418997e-06, "loss": 0.7375, "step": 22 }, { "epoch": 0.0644709180098108, "grad_norm": 2.5679290240076766, "learning_rate": 1.229050279329609e-06, "loss": 0.6947, "step": 23 }, { "epoch": 0.0672740014015417, "grad_norm": 2.3180850072986154, "learning_rate": 1.2849162011173185e-06, "loss": 0.6874, "step": 24 }, { "epoch": 0.0700770847932726, "grad_norm": 2.0906746546884425, "learning_rate": 1.3407821229050281e-06, "loss": 0.7266, "step": 25 }, { "epoch": 0.07288016818500351, "grad_norm": 2.4132207093897358, "learning_rate": 1.3966480446927375e-06, "loss": 0.6971, "step": 26 }, { "epoch": 0.0756832515767344, "grad_norm": 2.227410085533061, "learning_rate": 1.4525139664804472e-06, "loss": 0.7127, "step": 27 }, { "epoch": 0.07848633496846531, "grad_norm": 2.3134005002437656, "learning_rate": 1.5083798882681566e-06, "loss": 0.7078, "step": 28 }, { "epoch": 0.08128941836019622, "grad_norm": 2.2309237756431672, "learning_rate": 1.564245810055866e-06, "loss": 0.6541, "step": 29 }, { "epoch": 0.08409250175192712, "grad_norm": 2.6641061491321403, "learning_rate": 1.6201117318435754e-06, "loss": 0.6975, "step": 30 }, { "epoch": 0.08689558514365803, "grad_norm": 2.079611257901034, "learning_rate": 1.675977653631285e-06, "loss": 0.6206, "step": 31 }, { "epoch": 0.08969866853538892, "grad_norm": 2.2660193487426965, "learning_rate": 1.7318435754189947e-06, "loss": 0.6665, "step": 32 }, { "epoch": 0.09250175192711983, "grad_norm": 2.146259839931881, "learning_rate": 1.787709497206704e-06, "loss": 0.6787, "step": 33 }, { "epoch": 0.09530483531885074, "grad_norm": 1.9839911994049741, "learning_rate": 1.8435754189944135e-06, "loss": 0.6521, "step": 34 }, { "epoch": 0.09810791871058164, "grad_norm": 1.6089250761086926, "learning_rate": 1.899441340782123e-06, "loss": 0.6217, "step": 35 }, { "epoch": 0.10091100210231255, "grad_norm": 1.5876982534765143, "learning_rate": 1.9553072625698325e-06, "loss": 0.5882, "step": 36 }, { "epoch": 0.10371408549404344, "grad_norm": 1.7222150058922074, "learning_rate": 2.011173184357542e-06, "loss": 0.5512, "step": 37 }, { "epoch": 0.10651716888577435, "grad_norm": 1.8050081053129243, "learning_rate": 2.067039106145252e-06, "loss": 0.588, "step": 38 }, { "epoch": 0.10932025227750526, "grad_norm": 1.6434624778264646, "learning_rate": 2.1229050279329612e-06, "loss": 0.5931, "step": 39 }, { "epoch": 0.11212333566923616, "grad_norm": 1.7526009826479017, "learning_rate": 2.1787709497206706e-06, "loss": 0.5785, "step": 40 }, { "epoch": 0.11492641906096707, "grad_norm": 1.6437945906395883, "learning_rate": 2.23463687150838e-06, "loss": 0.5689, "step": 41 }, { "epoch": 0.11772950245269796, "grad_norm": 1.5330527531585365, "learning_rate": 2.2905027932960895e-06, "loss": 0.5748, "step": 42 }, { "epoch": 0.12053258584442887, "grad_norm": 1.5288757695374733, "learning_rate": 2.3463687150837993e-06, "loss": 0.5448, "step": 43 }, { "epoch": 0.12333566923615978, "grad_norm": 1.6436762237674445, "learning_rate": 2.4022346368715087e-06, "loss": 0.5589, "step": 44 }, { "epoch": 0.12613875262789068, "grad_norm": 1.4778751424937657, "learning_rate": 2.458100558659218e-06, "loss": 0.5309, "step": 45 }, { "epoch": 0.1289418360196216, "grad_norm": 1.3893104305753303, "learning_rate": 2.5139664804469276e-06, "loss": 0.5186, "step": 46 }, { "epoch": 0.1317449194113525, "grad_norm": 1.4030470862296944, "learning_rate": 2.569832402234637e-06, "loss": 0.5142, "step": 47 }, { "epoch": 0.1345480028030834, "grad_norm": 1.365418665825913, "learning_rate": 2.6256983240223464e-06, "loss": 0.5474, "step": 48 }, { "epoch": 0.1373510861948143, "grad_norm": 1.322864310730053, "learning_rate": 2.6815642458100562e-06, "loss": 0.5433, "step": 49 }, { "epoch": 0.1401541695865452, "grad_norm": 1.2841278505792295, "learning_rate": 2.7374301675977656e-06, "loss": 0.5461, "step": 50 }, { "epoch": 0.1429572529782761, "grad_norm": 1.4398037033168882, "learning_rate": 2.793296089385475e-06, "loss": 0.5089, "step": 51 }, { "epoch": 0.14576033637000702, "grad_norm": 1.2507634146653344, "learning_rate": 2.8491620111731845e-06, "loss": 0.5085, "step": 52 }, { "epoch": 0.14856341976173792, "grad_norm": 1.216547503273406, "learning_rate": 2.9050279329608943e-06, "loss": 0.5096, "step": 53 }, { "epoch": 0.1513665031534688, "grad_norm": 1.2820613031068602, "learning_rate": 2.9608938547486037e-06, "loss": 0.5577, "step": 54 }, { "epoch": 0.15416958654519972, "grad_norm": 1.2887194937768396, "learning_rate": 3.016759776536313e-06, "loss": 0.4896, "step": 55 }, { "epoch": 0.15697266993693063, "grad_norm": 1.3158880492878813, "learning_rate": 3.0726256983240226e-06, "loss": 0.5328, "step": 56 }, { "epoch": 0.15977575332866154, "grad_norm": 1.291865092954309, "learning_rate": 3.128491620111732e-06, "loss": 0.5188, "step": 57 }, { "epoch": 0.16257883672039244, "grad_norm": 1.389666654490359, "learning_rate": 3.1843575418994414e-06, "loss": 0.5578, "step": 58 }, { "epoch": 0.16538192011212333, "grad_norm": 1.2544095434531006, "learning_rate": 3.240223463687151e-06, "loss": 0.4622, "step": 59 }, { "epoch": 0.16818500350385424, "grad_norm": 1.2954956535487678, "learning_rate": 3.2960893854748607e-06, "loss": 0.5285, "step": 60 }, { "epoch": 0.17098808689558515, "grad_norm": 1.3448826889626828, "learning_rate": 3.35195530726257e-06, "loss": 0.4972, "step": 61 }, { "epoch": 0.17379117028731605, "grad_norm": 1.2888750839140442, "learning_rate": 3.4078212290502795e-06, "loss": 0.4565, "step": 62 }, { "epoch": 0.17659425367904696, "grad_norm": 1.4478169815511575, "learning_rate": 3.4636871508379893e-06, "loss": 0.5086, "step": 63 }, { "epoch": 0.17939733707077785, "grad_norm": 1.2345721438632833, "learning_rate": 3.5195530726256988e-06, "loss": 0.4887, "step": 64 }, { "epoch": 0.18220042046250876, "grad_norm": 1.2750020626675667, "learning_rate": 3.575418994413408e-06, "loss": 0.4556, "step": 65 }, { "epoch": 0.18500350385423966, "grad_norm": 1.1585364434531529, "learning_rate": 3.6312849162011176e-06, "loss": 0.4755, "step": 66 }, { "epoch": 0.18780658724597057, "grad_norm": 1.2251136398194762, "learning_rate": 3.687150837988827e-06, "loss": 0.5002, "step": 67 }, { "epoch": 0.19060967063770148, "grad_norm": 1.2785943600022907, "learning_rate": 3.7430167597765364e-06, "loss": 0.5142, "step": 68 }, { "epoch": 0.19341275402943237, "grad_norm": 1.292978777507132, "learning_rate": 3.798882681564246e-06, "loss": 0.4883, "step": 69 }, { "epoch": 0.19621583742116327, "grad_norm": 1.4760688229840058, "learning_rate": 3.854748603351956e-06, "loss": 0.4798, "step": 70 }, { "epoch": 0.19901892081289418, "grad_norm": 1.3534620335056344, "learning_rate": 3.910614525139665e-06, "loss": 0.478, "step": 71 }, { "epoch": 0.2018220042046251, "grad_norm": 1.3463500240376554, "learning_rate": 3.9664804469273745e-06, "loss": 0.497, "step": 72 }, { "epoch": 0.204625087596356, "grad_norm": 1.5078028142094007, "learning_rate": 4.022346368715084e-06, "loss": 0.4786, "step": 73 }, { "epoch": 0.20742817098808689, "grad_norm": 1.34495483790764, "learning_rate": 4.078212290502794e-06, "loss": 0.4939, "step": 74 }, { "epoch": 0.2102312543798178, "grad_norm": 1.2340956043439963, "learning_rate": 4.134078212290504e-06, "loss": 0.4921, "step": 75 }, { "epoch": 0.2130343377715487, "grad_norm": 1.3085115291626992, "learning_rate": 4.189944134078213e-06, "loss": 0.4754, "step": 76 }, { "epoch": 0.2158374211632796, "grad_norm": 1.252728806989629, "learning_rate": 4.2458100558659224e-06, "loss": 0.5128, "step": 77 }, { "epoch": 0.21864050455501052, "grad_norm": 1.4126957206402742, "learning_rate": 4.301675977653632e-06, "loss": 0.4885, "step": 78 }, { "epoch": 0.2214435879467414, "grad_norm": 1.3208716039081965, "learning_rate": 4.357541899441341e-06, "loss": 0.4786, "step": 79 }, { "epoch": 0.22424667133847231, "grad_norm": 1.2574451697332716, "learning_rate": 4.413407821229051e-06, "loss": 0.474, "step": 80 }, { "epoch": 0.22704975473020322, "grad_norm": 1.287485870277125, "learning_rate": 4.46927374301676e-06, "loss": 0.505, "step": 81 }, { "epoch": 0.22985283812193413, "grad_norm": 1.272270407613237, "learning_rate": 4.5251396648044695e-06, "loss": 0.4595, "step": 82 }, { "epoch": 0.23265592151366504, "grad_norm": 1.469219474277953, "learning_rate": 4.581005586592179e-06, "loss": 0.4652, "step": 83 }, { "epoch": 0.23545900490539592, "grad_norm": 1.2699463875565882, "learning_rate": 4.636871508379888e-06, "loss": 0.5229, "step": 84 }, { "epoch": 0.23826208829712683, "grad_norm": 1.2856158617452633, "learning_rate": 4.692737430167599e-06, "loss": 0.4532, "step": 85 }, { "epoch": 0.24106517168885774, "grad_norm": 1.4817415826333444, "learning_rate": 4.748603351955308e-06, "loss": 0.4771, "step": 86 }, { "epoch": 0.24386825508058865, "grad_norm": 1.305173329779559, "learning_rate": 4.8044692737430175e-06, "loss": 0.4517, "step": 87 }, { "epoch": 0.24667133847231956, "grad_norm": 1.236166182409069, "learning_rate": 4.860335195530727e-06, "loss": 0.4381, "step": 88 }, { "epoch": 0.24947442186405044, "grad_norm": 1.3538165210321431, "learning_rate": 4.916201117318436e-06, "loss": 0.4601, "step": 89 }, { "epoch": 0.25227750525578135, "grad_norm": 1.2959980524853771, "learning_rate": 4.972067039106146e-06, "loss": 0.5043, "step": 90 }, { "epoch": 0.25508058864751226, "grad_norm": 1.257731041700467, "learning_rate": 5.027932960893855e-06, "loss": 0.5042, "step": 91 }, { "epoch": 0.2578836720392432, "grad_norm": 1.1829514019122422, "learning_rate": 5.0837988826815645e-06, "loss": 0.5011, "step": 92 }, { "epoch": 0.2606867554309741, "grad_norm": 1.2039776317025723, "learning_rate": 5.139664804469274e-06, "loss": 0.4459, "step": 93 }, { "epoch": 0.263489838822705, "grad_norm": 1.1537490486646111, "learning_rate": 5.195530726256983e-06, "loss": 0.4135, "step": 94 }, { "epoch": 0.2662929222144359, "grad_norm": 1.1836759088562796, "learning_rate": 5.251396648044693e-06, "loss": 0.4286, "step": 95 }, { "epoch": 0.2690960056061668, "grad_norm": 1.1644038720022742, "learning_rate": 5.307262569832403e-06, "loss": 0.4511, "step": 96 }, { "epoch": 0.27189908899789766, "grad_norm": 1.2841070495233056, "learning_rate": 5.3631284916201125e-06, "loss": 0.4614, "step": 97 }, { "epoch": 0.2747021723896286, "grad_norm": 1.2674957253762746, "learning_rate": 5.418994413407822e-06, "loss": 0.4053, "step": 98 }, { "epoch": 0.2775052557813595, "grad_norm": 1.2564154040578148, "learning_rate": 5.474860335195531e-06, "loss": 0.4754, "step": 99 }, { "epoch": 0.2803083391730904, "grad_norm": 1.2707428474291496, "learning_rate": 5.530726256983241e-06, "loss": 0.4543, "step": 100 }, { "epoch": 0.2831114225648213, "grad_norm": 1.2696450733230285, "learning_rate": 5.58659217877095e-06, "loss": 0.4711, "step": 101 }, { "epoch": 0.2859145059565522, "grad_norm": 1.1385639533491085, "learning_rate": 5.6424581005586595e-06, "loss": 0.5073, "step": 102 }, { "epoch": 0.2887175893482831, "grad_norm": 1.2476843993569997, "learning_rate": 5.698324022346369e-06, "loss": 0.4832, "step": 103 }, { "epoch": 0.29152067274001403, "grad_norm": 1.2700803146631456, "learning_rate": 5.754189944134078e-06, "loss": 0.456, "step": 104 }, { "epoch": 0.29432375613174494, "grad_norm": 1.3063859932211015, "learning_rate": 5.810055865921789e-06, "loss": 0.4663, "step": 105 }, { "epoch": 0.29712683952347585, "grad_norm": 1.37190656711132, "learning_rate": 5.865921787709497e-06, "loss": 0.425, "step": 106 }, { "epoch": 0.2999299229152067, "grad_norm": 1.336749076355929, "learning_rate": 5.9217877094972075e-06, "loss": 0.4532, "step": 107 }, { "epoch": 0.3027330063069376, "grad_norm": 1.3049835257948252, "learning_rate": 5.977653631284917e-06, "loss": 0.4433, "step": 108 }, { "epoch": 0.3055360896986685, "grad_norm": 1.3177076581236578, "learning_rate": 6.033519553072626e-06, "loss": 0.4422, "step": 109 }, { "epoch": 0.30833917309039943, "grad_norm": 1.211428787666853, "learning_rate": 6.089385474860336e-06, "loss": 0.4736, "step": 110 }, { "epoch": 0.31114225648213034, "grad_norm": 1.246339941396602, "learning_rate": 6.145251396648045e-06, "loss": 0.4684, "step": 111 }, { "epoch": 0.31394533987386125, "grad_norm": 1.139096368470202, "learning_rate": 6.2011173184357546e-06, "loss": 0.4711, "step": 112 }, { "epoch": 0.31674842326559216, "grad_norm": 1.1649095863200642, "learning_rate": 6.256983240223464e-06, "loss": 0.4552, "step": 113 }, { "epoch": 0.31955150665732307, "grad_norm": 1.2679588271735107, "learning_rate": 6.312849162011173e-06, "loss": 0.4623, "step": 114 }, { "epoch": 0.322354590049054, "grad_norm": 1.2436538258573628, "learning_rate": 6.368715083798883e-06, "loss": 0.45, "step": 115 }, { "epoch": 0.3251576734407849, "grad_norm": 1.2362511627285122, "learning_rate": 6.424581005586593e-06, "loss": 0.4871, "step": 116 }, { "epoch": 0.32796075683251574, "grad_norm": 1.2278455511243713, "learning_rate": 6.480446927374302e-06, "loss": 0.4304, "step": 117 }, { "epoch": 0.33076384022424665, "grad_norm": 1.2708930790868107, "learning_rate": 6.536312849162012e-06, "loss": 0.4403, "step": 118 }, { "epoch": 0.33356692361597756, "grad_norm": 1.1581710061976354, "learning_rate": 6.592178770949721e-06, "loss": 0.4266, "step": 119 }, { "epoch": 0.33637000700770847, "grad_norm": 1.2609788947789218, "learning_rate": 6.648044692737431e-06, "loss": 0.4579, "step": 120 }, { "epoch": 0.3391730903994394, "grad_norm": 1.30396849435344, "learning_rate": 6.70391061452514e-06, "loss": 0.4427, "step": 121 }, { "epoch": 0.3419761737911703, "grad_norm": 1.2740701750338232, "learning_rate": 6.7597765363128496e-06, "loss": 0.4267, "step": 122 }, { "epoch": 0.3447792571829012, "grad_norm": 1.3545521723600704, "learning_rate": 6.815642458100559e-06, "loss": 0.4572, "step": 123 }, { "epoch": 0.3475823405746321, "grad_norm": 1.2832927748990008, "learning_rate": 6.871508379888268e-06, "loss": 0.4088, "step": 124 }, { "epoch": 0.350385423966363, "grad_norm": 1.184187983962104, "learning_rate": 6.927374301675979e-06, "loss": 0.4178, "step": 125 }, { "epoch": 0.35318850735809393, "grad_norm": 1.2692730688245826, "learning_rate": 6.983240223463687e-06, "loss": 0.4196, "step": 126 }, { "epoch": 0.3559915907498248, "grad_norm": 1.2286033209087615, "learning_rate": 7.0391061452513975e-06, "loss": 0.4623, "step": 127 }, { "epoch": 0.3587946741415557, "grad_norm": 1.3466288536167503, "learning_rate": 7.094972067039106e-06, "loss": 0.4304, "step": 128 }, { "epoch": 0.3615977575332866, "grad_norm": 1.1856599904519147, "learning_rate": 7.150837988826816e-06, "loss": 0.4577, "step": 129 }, { "epoch": 0.3644008409250175, "grad_norm": 1.2878071137137472, "learning_rate": 7.206703910614526e-06, "loss": 0.4512, "step": 130 }, { "epoch": 0.3672039243167484, "grad_norm": 1.2539611050742803, "learning_rate": 7.262569832402235e-06, "loss": 0.4492, "step": 131 }, { "epoch": 0.37000700770847933, "grad_norm": 1.284633807763949, "learning_rate": 7.318435754189945e-06, "loss": 0.4469, "step": 132 }, { "epoch": 0.37281009110021024, "grad_norm": 1.2173213618653451, "learning_rate": 7.374301675977654e-06, "loss": 0.4217, "step": 133 }, { "epoch": 0.37561317449194115, "grad_norm": 1.6356085966747518, "learning_rate": 7.430167597765364e-06, "loss": 0.4674, "step": 134 }, { "epoch": 0.37841625788367206, "grad_norm": 1.3110035885253777, "learning_rate": 7.486033519553073e-06, "loss": 0.4216, "step": 135 }, { "epoch": 0.38121934127540297, "grad_norm": 1.182519047664272, "learning_rate": 7.541899441340783e-06, "loss": 0.4399, "step": 136 }, { "epoch": 0.3840224246671338, "grad_norm": 1.2628129755669273, "learning_rate": 7.597765363128492e-06, "loss": 0.4736, "step": 137 }, { "epoch": 0.38682550805886473, "grad_norm": 1.156134702750007, "learning_rate": 7.653631284916202e-06, "loss": 0.4186, "step": 138 }, { "epoch": 0.38962859145059564, "grad_norm": 1.2403816950592557, "learning_rate": 7.709497206703911e-06, "loss": 0.4262, "step": 139 }, { "epoch": 0.39243167484232655, "grad_norm": 1.2297089863632304, "learning_rate": 7.76536312849162e-06, "loss": 0.4526, "step": 140 }, { "epoch": 0.39523475823405746, "grad_norm": 1.3400924871254594, "learning_rate": 7.82122905027933e-06, "loss": 0.4632, "step": 141 }, { "epoch": 0.39803784162578837, "grad_norm": 1.2753065276019078, "learning_rate": 7.87709497206704e-06, "loss": 0.46, "step": 142 }, { "epoch": 0.4008409250175193, "grad_norm": 1.3918096859402436, "learning_rate": 7.932960893854749e-06, "loss": 0.4265, "step": 143 }, { "epoch": 0.4036440084092502, "grad_norm": 1.4385006652657821, "learning_rate": 7.988826815642458e-06, "loss": 0.4462, "step": 144 }, { "epoch": 0.4064470918009811, "grad_norm": 1.571552869991663, "learning_rate": 8.044692737430168e-06, "loss": 0.4369, "step": 145 }, { "epoch": 0.409250175192712, "grad_norm": 1.337744524737072, "learning_rate": 8.100558659217877e-06, "loss": 0.4211, "step": 146 }, { "epoch": 0.41205325858444286, "grad_norm": 1.2554961285373831, "learning_rate": 8.156424581005588e-06, "loss": 0.4646, "step": 147 }, { "epoch": 0.41485634197617377, "grad_norm": 1.208695228744723, "learning_rate": 8.212290502793296e-06, "loss": 0.4418, "step": 148 }, { "epoch": 0.4176594253679047, "grad_norm": 1.1581362053521904, "learning_rate": 8.268156424581007e-06, "loss": 0.4424, "step": 149 }, { "epoch": 0.4204625087596356, "grad_norm": 1.3537182059958488, "learning_rate": 8.324022346368715e-06, "loss": 0.4466, "step": 150 }, { "epoch": 0.4232655921513665, "grad_norm": 1.3804359968061342, "learning_rate": 8.379888268156426e-06, "loss": 0.42, "step": 151 }, { "epoch": 0.4260686755430974, "grad_norm": 1.3492166236416887, "learning_rate": 8.435754189944135e-06, "loss": 0.3851, "step": 152 }, { "epoch": 0.4288717589348283, "grad_norm": 1.2139526698153973, "learning_rate": 8.491620111731845e-06, "loss": 0.4643, "step": 153 }, { "epoch": 0.4316748423265592, "grad_norm": 1.2990736104285787, "learning_rate": 8.547486033519554e-06, "loss": 0.4666, "step": 154 }, { "epoch": 0.43447792571829014, "grad_norm": 1.29053798784143, "learning_rate": 8.603351955307264e-06, "loss": 0.4384, "step": 155 }, { "epoch": 0.43728100911002105, "grad_norm": 1.4367357751592493, "learning_rate": 8.659217877094973e-06, "loss": 0.4653, "step": 156 }, { "epoch": 0.4400840925017519, "grad_norm": 1.2956449867477386, "learning_rate": 8.715083798882683e-06, "loss": 0.4511, "step": 157 }, { "epoch": 0.4428871758934828, "grad_norm": 1.4106445195453023, "learning_rate": 8.770949720670392e-06, "loss": 0.4456, "step": 158 }, { "epoch": 0.4456902592852137, "grad_norm": 1.2719950562427567, "learning_rate": 8.826815642458101e-06, "loss": 0.4707, "step": 159 }, { "epoch": 0.44849334267694463, "grad_norm": 1.3074151462469454, "learning_rate": 8.88268156424581e-06, "loss": 0.413, "step": 160 }, { "epoch": 0.45129642606867554, "grad_norm": 1.2729626005251127, "learning_rate": 8.93854748603352e-06, "loss": 0.4564, "step": 161 }, { "epoch": 0.45409950946040645, "grad_norm": 1.3933224721084836, "learning_rate": 8.99441340782123e-06, "loss": 0.4752, "step": 162 }, { "epoch": 0.45690259285213736, "grad_norm": 1.2274005354049757, "learning_rate": 9.050279329608939e-06, "loss": 0.4077, "step": 163 }, { "epoch": 0.45970567624386827, "grad_norm": 1.304403404718221, "learning_rate": 9.106145251396648e-06, "loss": 0.4032, "step": 164 }, { "epoch": 0.4625087596355992, "grad_norm": 1.2077276736873666, "learning_rate": 9.162011173184358e-06, "loss": 0.4521, "step": 165 }, { "epoch": 0.4653118430273301, "grad_norm": 1.314785322595634, "learning_rate": 9.217877094972067e-06, "loss": 0.4561, "step": 166 }, { "epoch": 0.46811492641906094, "grad_norm": 1.324805699258867, "learning_rate": 9.273743016759777e-06, "loss": 0.442, "step": 167 }, { "epoch": 0.47091800981079185, "grad_norm": 1.2628735158829716, "learning_rate": 9.329608938547486e-06, "loss": 0.4493, "step": 168 }, { "epoch": 0.47372109320252276, "grad_norm": 1.3524729567879255, "learning_rate": 9.385474860335197e-06, "loss": 0.412, "step": 169 }, { "epoch": 0.47652417659425367, "grad_norm": 1.3838707187120078, "learning_rate": 9.441340782122905e-06, "loss": 0.4248, "step": 170 }, { "epoch": 0.4793272599859846, "grad_norm": 1.3477705759051088, "learning_rate": 9.497206703910616e-06, "loss": 0.4362, "step": 171 }, { "epoch": 0.4821303433777155, "grad_norm": 1.4029804576470706, "learning_rate": 9.553072625698325e-06, "loss": 0.4229, "step": 172 }, { "epoch": 0.4849334267694464, "grad_norm": 1.3212879811094376, "learning_rate": 9.608938547486035e-06, "loss": 0.4462, "step": 173 }, { "epoch": 0.4877365101611773, "grad_norm": 1.2418714279374736, "learning_rate": 9.664804469273744e-06, "loss": 0.4393, "step": 174 }, { "epoch": 0.4905395935529082, "grad_norm": 1.308356659629243, "learning_rate": 9.720670391061454e-06, "loss": 0.4212, "step": 175 }, { "epoch": 0.4933426769446391, "grad_norm": 1.3384997992620349, "learning_rate": 9.776536312849163e-06, "loss": 0.425, "step": 176 }, { "epoch": 0.49614576033637003, "grad_norm": 1.2644912624597537, "learning_rate": 9.832402234636873e-06, "loss": 0.4269, "step": 177 }, { "epoch": 0.4989488437281009, "grad_norm": 1.127396335911068, "learning_rate": 9.888268156424582e-06, "loss": 0.4054, "step": 178 }, { "epoch": 0.5017519271198319, "grad_norm": 1.2928897405382775, "learning_rate": 9.944134078212291e-06, "loss": 0.4246, "step": 179 }, { "epoch": 0.5045550105115627, "grad_norm": 1.2201307841604, "learning_rate": 1e-05, "loss": 0.4229, "step": 180 }, { "epoch": 0.5073580939032937, "grad_norm": 1.1930036804674071, "learning_rate": 9.999990433600054e-06, "loss": 0.4228, "step": 181 }, { "epoch": 0.5101611772950245, "grad_norm": 1.2128301840951208, "learning_rate": 9.999961734436818e-06, "loss": 0.421, "step": 182 }, { "epoch": 0.5129642606867554, "grad_norm": 1.3718335494726508, "learning_rate": 9.999913902620112e-06, "loss": 0.416, "step": 183 }, { "epoch": 0.5157673440784863, "grad_norm": 1.2739769316629392, "learning_rate": 9.99984693833297e-06, "loss": 0.4411, "step": 184 }, { "epoch": 0.5185704274702172, "grad_norm": 1.2217498022218247, "learning_rate": 9.999760841831632e-06, "loss": 0.3921, "step": 185 }, { "epoch": 0.5213735108619482, "grad_norm": 1.2368078852394706, "learning_rate": 9.999655613445552e-06, "loss": 0.4485, "step": 186 }, { "epoch": 0.524176594253679, "grad_norm": 1.2783184977008666, "learning_rate": 9.999531253577393e-06, "loss": 0.4306, "step": 187 }, { "epoch": 0.52697967764541, "grad_norm": 1.2187408632625338, "learning_rate": 9.999387762703025e-06, "loss": 0.447, "step": 188 }, { "epoch": 0.5297827610371408, "grad_norm": 1.2385500306724004, "learning_rate": 9.999225141371528e-06, "loss": 0.4776, "step": 189 }, { "epoch": 0.5325858444288718, "grad_norm": 1.176251336630363, "learning_rate": 9.999043390205176e-06, "loss": 0.3769, "step": 190 }, { "epoch": 0.5353889278206027, "grad_norm": 1.205975361951228, "learning_rate": 9.998842509899456e-06, "loss": 0.3858, "step": 191 }, { "epoch": 0.5381920112123336, "grad_norm": 1.2260710319775825, "learning_rate": 9.998622501223045e-06, "loss": 0.4159, "step": 192 }, { "epoch": 0.5409950946040645, "grad_norm": 1.2624425805658182, "learning_rate": 9.998383365017821e-06, "loss": 0.4543, "step": 193 }, { "epoch": 0.5437981779957953, "grad_norm": 1.3494484879919946, "learning_rate": 9.998125102198855e-06, "loss": 0.4532, "step": 194 }, { "epoch": 0.5466012613875263, "grad_norm": 1.2926626394342269, "learning_rate": 9.9978477137544e-06, "loss": 0.4229, "step": 195 }, { "epoch": 0.5494043447792571, "grad_norm": 1.2615225655334812, "learning_rate": 9.997551200745905e-06, "loss": 0.4234, "step": 196 }, { "epoch": 0.5522074281709881, "grad_norm": 1.2447669303378754, "learning_rate": 9.997235564307992e-06, "loss": 0.4381, "step": 197 }, { "epoch": 0.555010511562719, "grad_norm": 1.3385879305082795, "learning_rate": 9.996900805648462e-06, "loss": 0.4225, "step": 198 }, { "epoch": 0.5578135949544499, "grad_norm": 1.2403486069231195, "learning_rate": 9.996546926048291e-06, "loss": 0.4336, "step": 199 }, { "epoch": 0.5606166783461808, "grad_norm": 1.161836362730849, "learning_rate": 9.99617392686162e-06, "loss": 0.4083, "step": 200 }, { "epoch": 0.5634197617379118, "grad_norm": 1.178827297998461, "learning_rate": 9.995781809515754e-06, "loss": 0.4065, "step": 201 }, { "epoch": 0.5662228451296426, "grad_norm": 1.2949773092417358, "learning_rate": 9.995370575511151e-06, "loss": 0.4337, "step": 202 }, { "epoch": 0.5690259285213735, "grad_norm": 1.2049998818098415, "learning_rate": 9.994940226421423e-06, "loss": 0.4351, "step": 203 }, { "epoch": 0.5718290119131044, "grad_norm": 1.2038562515827964, "learning_rate": 9.994490763893328e-06, "loss": 0.4238, "step": 204 }, { "epoch": 0.5746320953048353, "grad_norm": 1.2682914962745881, "learning_rate": 9.994022189646762e-06, "loss": 0.4974, "step": 205 }, { "epoch": 0.5774351786965662, "grad_norm": 1.247037559522988, "learning_rate": 9.99353450547475e-06, "loss": 0.4922, "step": 206 }, { "epoch": 0.5802382620882971, "grad_norm": 1.3261046076147458, "learning_rate": 9.993027713243444e-06, "loss": 0.4865, "step": 207 }, { "epoch": 0.5830413454800281, "grad_norm": 1.2671942425231029, "learning_rate": 9.992501814892118e-06, "loss": 0.4399, "step": 208 }, { "epoch": 0.5858444288717589, "grad_norm": 1.2209344442747003, "learning_rate": 9.991956812433152e-06, "loss": 0.4551, "step": 209 }, { "epoch": 0.5886475122634899, "grad_norm": 1.3204049859067966, "learning_rate": 9.991392707952032e-06, "loss": 0.4535, "step": 210 }, { "epoch": 0.5914505956552207, "grad_norm": 1.2810704874390295, "learning_rate": 9.990809503607337e-06, "loss": 0.4057, "step": 211 }, { "epoch": 0.5942536790469517, "grad_norm": 1.2683455733657367, "learning_rate": 9.99020720163073e-06, "loss": 0.4438, "step": 212 }, { "epoch": 0.5970567624386826, "grad_norm": 1.1041168309285287, "learning_rate": 9.989585804326963e-06, "loss": 0.3951, "step": 213 }, { "epoch": 0.5998598458304134, "grad_norm": 1.238260130865137, "learning_rate": 9.988945314073842e-06, "loss": 0.43, "step": 214 }, { "epoch": 0.6026629292221444, "grad_norm": 1.2534944948678515, "learning_rate": 9.988285733322246e-06, "loss": 0.4358, "step": 215 }, { "epoch": 0.6054660126138752, "grad_norm": 1.377817255639513, "learning_rate": 9.9876070645961e-06, "loss": 0.4243, "step": 216 }, { "epoch": 0.6082690960056062, "grad_norm": 1.2458026957407808, "learning_rate": 9.986909310492369e-06, "loss": 0.4377, "step": 217 }, { "epoch": 0.611072179397337, "grad_norm": 1.2147593299228276, "learning_rate": 9.98619247368105e-06, "loss": 0.4578, "step": 218 }, { "epoch": 0.613875262789068, "grad_norm": 1.3231829202411483, "learning_rate": 9.985456556905168e-06, "loss": 0.4343, "step": 219 }, { "epoch": 0.6166783461807989, "grad_norm": 1.249603899966305, "learning_rate": 9.984701562980745e-06, "loss": 0.4208, "step": 220 }, { "epoch": 0.6194814295725298, "grad_norm": 1.302692808670413, "learning_rate": 9.983927494796817e-06, "loss": 0.4467, "step": 221 }, { "epoch": 0.6222845129642607, "grad_norm": 1.2344333360492339, "learning_rate": 9.983134355315397e-06, "loss": 0.427, "step": 222 }, { "epoch": 0.6250875963559915, "grad_norm": 1.2271410804803238, "learning_rate": 9.982322147571486e-06, "loss": 0.4295, "step": 223 }, { "epoch": 0.6278906797477225, "grad_norm": 1.2807750184640823, "learning_rate": 9.98149087467304e-06, "loss": 0.4019, "step": 224 }, { "epoch": 0.6306937631394534, "grad_norm": 1.323850652586938, "learning_rate": 9.980640539800978e-06, "loss": 0.4625, "step": 225 }, { "epoch": 0.6334968465311843, "grad_norm": 1.27630164124042, "learning_rate": 9.979771146209159e-06, "loss": 0.4068, "step": 226 }, { "epoch": 0.6362999299229152, "grad_norm": 1.1999461485138891, "learning_rate": 9.978882697224365e-06, "loss": 0.4323, "step": 227 }, { "epoch": 0.6391030133146461, "grad_norm": 1.2581968225422646, "learning_rate": 9.977975196246302e-06, "loss": 0.4339, "step": 228 }, { "epoch": 0.641906096706377, "grad_norm": 1.240275234197154, "learning_rate": 9.977048646747578e-06, "loss": 0.4244, "step": 229 }, { "epoch": 0.644709180098108, "grad_norm": 1.245467039884696, "learning_rate": 9.976103052273689e-06, "loss": 0.4152, "step": 230 }, { "epoch": 0.6475122634898388, "grad_norm": 1.214344966764467, "learning_rate": 9.97513841644301e-06, "loss": 0.4601, "step": 231 }, { "epoch": 0.6503153468815698, "grad_norm": 1.4007471905080375, "learning_rate": 9.974154742946775e-06, "loss": 0.4322, "step": 232 }, { "epoch": 0.6531184302733006, "grad_norm": 1.2215958249221774, "learning_rate": 9.973152035549072e-06, "loss": 0.4433, "step": 233 }, { "epoch": 0.6559215136650315, "grad_norm": 2.120995206727059, "learning_rate": 9.972130298086821e-06, "loss": 0.4457, "step": 234 }, { "epoch": 0.6587245970567625, "grad_norm": 1.253021146434877, "learning_rate": 9.97108953446976e-06, "loss": 0.4384, "step": 235 }, { "epoch": 0.6615276804484933, "grad_norm": 1.154866193185819, "learning_rate": 9.970029748680437e-06, "loss": 0.4186, "step": 236 }, { "epoch": 0.6643307638402243, "grad_norm": 1.1532100653062711, "learning_rate": 9.96895094477418e-06, "loss": 0.4031, "step": 237 }, { "epoch": 0.6671338472319551, "grad_norm": 1.1162555329667243, "learning_rate": 9.967853126879103e-06, "loss": 0.4323, "step": 238 }, { "epoch": 0.6699369306236861, "grad_norm": 1.1151043685218318, "learning_rate": 9.96673629919607e-06, "loss": 0.4191, "step": 239 }, { "epoch": 0.6727400140154169, "grad_norm": 1.1399063080426746, "learning_rate": 9.965600465998686e-06, "loss": 0.3987, "step": 240 }, { "epoch": 0.6755430974071479, "grad_norm": 1.1987807131271386, "learning_rate": 9.964445631633289e-06, "loss": 0.4545, "step": 241 }, { "epoch": 0.6783461807988788, "grad_norm": 1.1401905995341726, "learning_rate": 9.963271800518921e-06, "loss": 0.4524, "step": 242 }, { "epoch": 0.6811492641906096, "grad_norm": 1.1958925095826711, "learning_rate": 9.962078977147315e-06, "loss": 0.4049, "step": 243 }, { "epoch": 0.6839523475823406, "grad_norm": 1.1843051651809535, "learning_rate": 9.960867166082884e-06, "loss": 0.4432, "step": 244 }, { "epoch": 0.6867554309740714, "grad_norm": 1.207650238930726, "learning_rate": 9.959636371962693e-06, "loss": 0.4027, "step": 245 }, { "epoch": 0.6895585143658024, "grad_norm": 1.2451548104875187, "learning_rate": 9.95838659949645e-06, "loss": 0.394, "step": 246 }, { "epoch": 0.6923615977575333, "grad_norm": 1.2128418404191421, "learning_rate": 9.957117853466488e-06, "loss": 0.4291, "step": 247 }, { "epoch": 0.6951646811492642, "grad_norm": 1.2652572678307166, "learning_rate": 9.955830138727736e-06, "loss": 0.3884, "step": 248 }, { "epoch": 0.6979677645409951, "grad_norm": 1.2253466216228863, "learning_rate": 9.95452346020771e-06, "loss": 0.4227, "step": 249 }, { "epoch": 0.700770847932726, "grad_norm": 1.2241766479031995, "learning_rate": 9.9531978229065e-06, "loss": 0.4035, "step": 250 }, { "epoch": 0.7035739313244569, "grad_norm": 1.2815556845680005, "learning_rate": 9.951853231896732e-06, "loss": 0.4127, "step": 251 }, { "epoch": 0.7063770147161879, "grad_norm": 1.2098755581989178, "learning_rate": 9.950489692323564e-06, "loss": 0.3664, "step": 252 }, { "epoch": 0.7091800981079187, "grad_norm": 1.240560733421699, "learning_rate": 9.949107209404664e-06, "loss": 0.4006, "step": 253 }, { "epoch": 0.7119831814996496, "grad_norm": 1.2504378152370468, "learning_rate": 9.947705788430185e-06, "loss": 0.4322, "step": 254 }, { "epoch": 0.7147862648913805, "grad_norm": 1.1186927421611175, "learning_rate": 9.946285434762748e-06, "loss": 0.4353, "step": 255 }, { "epoch": 0.7175893482831114, "grad_norm": 1.1453594474291244, "learning_rate": 9.944846153837423e-06, "loss": 0.4236, "step": 256 }, { "epoch": 0.7203924316748423, "grad_norm": 1.2847311022382035, "learning_rate": 9.943387951161702e-06, "loss": 0.4614, "step": 257 }, { "epoch": 0.7231955150665732, "grad_norm": 1.2992529016358338, "learning_rate": 9.941910832315488e-06, "loss": 0.4381, "step": 258 }, { "epoch": 0.7259985984583042, "grad_norm": 1.2578873200013914, "learning_rate": 9.940414802951065e-06, "loss": 0.419, "step": 259 }, { "epoch": 0.728801681850035, "grad_norm": 1.2120714209492573, "learning_rate": 9.938899868793074e-06, "loss": 0.4052, "step": 260 }, { "epoch": 0.731604765241766, "grad_norm": 1.4598068796114356, "learning_rate": 9.937366035638507e-06, "loss": 0.4364, "step": 261 }, { "epoch": 0.7344078486334968, "grad_norm": 1.1979088951162828, "learning_rate": 9.935813309356666e-06, "loss": 0.424, "step": 262 }, { "epoch": 0.7372109320252278, "grad_norm": 1.2090949870085903, "learning_rate": 9.934241695889152e-06, "loss": 0.3708, "step": 263 }, { "epoch": 0.7400140154169587, "grad_norm": 1.1497339936210509, "learning_rate": 9.93265120124984e-06, "loss": 0.3913, "step": 264 }, { "epoch": 0.7428170988086895, "grad_norm": 1.1332107950494912, "learning_rate": 9.931041831524849e-06, "loss": 0.4215, "step": 265 }, { "epoch": 0.7456201822004205, "grad_norm": 1.3248126918741736, "learning_rate": 9.92941359287253e-06, "loss": 0.4317, "step": 266 }, { "epoch": 0.7484232655921513, "grad_norm": 1.13149487465254, "learning_rate": 9.927766491523438e-06, "loss": 0.4085, "step": 267 }, { "epoch": 0.7512263489838823, "grad_norm": 1.2498640918111141, "learning_rate": 9.926100533780304e-06, "loss": 0.43, "step": 268 }, { "epoch": 0.7540294323756132, "grad_norm": 1.1043646249350867, "learning_rate": 9.924415726018015e-06, "loss": 0.4296, "step": 269 }, { "epoch": 0.7568325157673441, "grad_norm": 1.1939471676982463, "learning_rate": 9.92271207468359e-06, "loss": 0.4259, "step": 270 }, { "epoch": 0.759635599159075, "grad_norm": 1.1734698198260922, "learning_rate": 9.92098958629615e-06, "loss": 0.4315, "step": 271 }, { "epoch": 0.7624386825508059, "grad_norm": 1.2366506225023914, "learning_rate": 9.919248267446904e-06, "loss": 0.444, "step": 272 }, { "epoch": 0.7652417659425368, "grad_norm": 1.2935143937218294, "learning_rate": 9.91748812479911e-06, "loss": 0.4387, "step": 273 }, { "epoch": 0.7680448493342676, "grad_norm": 1.2165170235545248, "learning_rate": 9.915709165088063e-06, "loss": 0.4302, "step": 274 }, { "epoch": 0.7708479327259986, "grad_norm": 1.2403333327332409, "learning_rate": 9.913911395121055e-06, "loss": 0.3891, "step": 275 }, { "epoch": 0.7736510161177295, "grad_norm": 1.1409335992116196, "learning_rate": 9.912094821777362e-06, "loss": 0.4127, "step": 276 }, { "epoch": 0.7764540995094604, "grad_norm": 1.246259341579308, "learning_rate": 9.910259452008213e-06, "loss": 0.4433, "step": 277 }, { "epoch": 0.7792571829011913, "grad_norm": 1.2152200486896416, "learning_rate": 9.908405292836758e-06, "loss": 0.4377, "step": 278 }, { "epoch": 0.7820602662929222, "grad_norm": 1.2701928834270326, "learning_rate": 9.906532351358047e-06, "loss": 0.4105, "step": 279 }, { "epoch": 0.7848633496846531, "grad_norm": 1.2002485609561173, "learning_rate": 9.904640634739007e-06, "loss": 0.4232, "step": 280 }, { "epoch": 0.7876664330763841, "grad_norm": 1.220915641122119, "learning_rate": 9.902730150218403e-06, "loss": 0.3833, "step": 281 }, { "epoch": 0.7904695164681149, "grad_norm": 1.2221671775995764, "learning_rate": 9.900800905106817e-06, "loss": 0.4414, "step": 282 }, { "epoch": 0.7932725998598459, "grad_norm": 1.112496291445257, "learning_rate": 9.898852906786624e-06, "loss": 0.4461, "step": 283 }, { "epoch": 0.7960756832515767, "grad_norm": 1.146299506919545, "learning_rate": 9.896886162711955e-06, "loss": 0.3982, "step": 284 }, { "epoch": 0.7988787666433076, "grad_norm": 1.2228381900025063, "learning_rate": 9.894900680408674e-06, "loss": 0.4267, "step": 285 }, { "epoch": 0.8016818500350386, "grad_norm": 1.1804828603616793, "learning_rate": 9.892896467474348e-06, "loss": 0.397, "step": 286 }, { "epoch": 0.8044849334267694, "grad_norm": 1.1779671579043933, "learning_rate": 9.890873531578218e-06, "loss": 0.4178, "step": 287 }, { "epoch": 0.8072880168185004, "grad_norm": 1.1973271401486334, "learning_rate": 9.888831880461171e-06, "loss": 0.431, "step": 288 }, { "epoch": 0.8100911002102312, "grad_norm": 1.3283745000265912, "learning_rate": 9.886771521935706e-06, "loss": 0.4171, "step": 289 }, { "epoch": 0.8128941836019622, "grad_norm": 1.2927994872402913, "learning_rate": 9.88469246388591e-06, "loss": 0.4244, "step": 290 }, { "epoch": 0.815697266993693, "grad_norm": 1.1224149905731677, "learning_rate": 9.88259471426742e-06, "loss": 0.4008, "step": 291 }, { "epoch": 0.818500350385424, "grad_norm": 1.1423126396133614, "learning_rate": 9.880478281107404e-06, "loss": 0.4376, "step": 292 }, { "epoch": 0.8213034337771549, "grad_norm": 1.1440128493566712, "learning_rate": 9.87834317250452e-06, "loss": 0.392, "step": 293 }, { "epoch": 0.8241065171688857, "grad_norm": 1.2339673417138244, "learning_rate": 9.876189396628889e-06, "loss": 0.4144, "step": 294 }, { "epoch": 0.8269096005606167, "grad_norm": 1.2058763608514713, "learning_rate": 9.874016961722062e-06, "loss": 0.4253, "step": 295 }, { "epoch": 0.8297126839523475, "grad_norm": 1.2173187014135698, "learning_rate": 9.871825876096992e-06, "loss": 0.4006, "step": 296 }, { "epoch": 0.8325157673440785, "grad_norm": 1.168628149123796, "learning_rate": 9.869616148138002e-06, "loss": 0.4251, "step": 297 }, { "epoch": 0.8353188507358094, "grad_norm": 1.1695554670785075, "learning_rate": 9.867387786300743e-06, "loss": 0.4097, "step": 298 }, { "epoch": 0.8381219341275403, "grad_norm": 1.172817055064454, "learning_rate": 9.865140799112183e-06, "loss": 0.4432, "step": 299 }, { "epoch": 0.8409250175192712, "grad_norm": 1.2450897072979896, "learning_rate": 9.862875195170547e-06, "loss": 0.392, "step": 300 }, { "epoch": 0.8437281009110021, "grad_norm": 1.1423510742995624, "learning_rate": 9.860590983145307e-06, "loss": 0.4269, "step": 301 }, { "epoch": 0.846531184302733, "grad_norm": 1.321059116128163, "learning_rate": 9.858288171777137e-06, "loss": 0.4188, "step": 302 }, { "epoch": 0.849334267694464, "grad_norm": 1.1798784843294252, "learning_rate": 9.855966769877882e-06, "loss": 0.3991, "step": 303 }, { "epoch": 0.8521373510861948, "grad_norm": 1.1931905188257432, "learning_rate": 9.853626786330529e-06, "loss": 0.4248, "step": 304 }, { "epoch": 0.8549404344779257, "grad_norm": 1.1213243707873113, "learning_rate": 9.851268230089159e-06, "loss": 0.3863, "step": 305 }, { "epoch": 0.8577435178696566, "grad_norm": 1.1340621760749867, "learning_rate": 9.848891110178936e-06, "loss": 0.3895, "step": 306 }, { "epoch": 0.8605466012613875, "grad_norm": 1.2154043545797661, "learning_rate": 9.846495435696044e-06, "loss": 0.4013, "step": 307 }, { "epoch": 0.8633496846531185, "grad_norm": 1.2117311685748091, "learning_rate": 9.844081215807684e-06, "loss": 0.3986, "step": 308 }, { "epoch": 0.8661527680448493, "grad_norm": 1.1801740574734845, "learning_rate": 9.841648459752006e-06, "loss": 0.4101, "step": 309 }, { "epoch": 0.8689558514365803, "grad_norm": 1.190858126233445, "learning_rate": 9.839197176838102e-06, "loss": 0.4294, "step": 310 }, { "epoch": 0.8717589348283111, "grad_norm": 1.1329469986887595, "learning_rate": 9.836727376445949e-06, "loss": 0.4173, "step": 311 }, { "epoch": 0.8745620182200421, "grad_norm": 1.363901133414959, "learning_rate": 9.834239068026388e-06, "loss": 0.417, "step": 312 }, { "epoch": 0.877365101611773, "grad_norm": 1.2669242824719738, "learning_rate": 9.831732261101081e-06, "loss": 0.4015, "step": 313 }, { "epoch": 0.8801681850035038, "grad_norm": 1.2125343853948405, "learning_rate": 9.829206965262477e-06, "loss": 0.4076, "step": 314 }, { "epoch": 0.8829712683952348, "grad_norm": 1.3037486777718863, "learning_rate": 9.826663190173766e-06, "loss": 0.4103, "step": 315 }, { "epoch": 0.8857743517869656, "grad_norm": 1.1678797734363202, "learning_rate": 9.824100945568862e-06, "loss": 0.4303, "step": 316 }, { "epoch": 0.8885774351786966, "grad_norm": 1.2814171712048255, "learning_rate": 9.821520241252345e-06, "loss": 0.4444, "step": 317 }, { "epoch": 0.8913805185704274, "grad_norm": 1.2662469748298908, "learning_rate": 9.818921087099435e-06, "loss": 0.418, "step": 318 }, { "epoch": 0.8941836019621584, "grad_norm": 1.1798395604362133, "learning_rate": 9.81630349305595e-06, "loss": 0.4006, "step": 319 }, { "epoch": 0.8969866853538893, "grad_norm": 1.2210641823585986, "learning_rate": 9.813667469138273e-06, "loss": 0.4121, "step": 320 }, { "epoch": 0.8997897687456202, "grad_norm": 1.113750506404835, "learning_rate": 9.811013025433306e-06, "loss": 0.4001, "step": 321 }, { "epoch": 0.9025928521373511, "grad_norm": 1.1820454191767555, "learning_rate": 9.80834017209844e-06, "loss": 0.4076, "step": 322 }, { "epoch": 0.905395935529082, "grad_norm": 1.3666718348985063, "learning_rate": 9.805648919361505e-06, "loss": 0.4264, "step": 323 }, { "epoch": 0.9081990189208129, "grad_norm": 1.1007502397580635, "learning_rate": 9.802939277520742e-06, "loss": 0.3926, "step": 324 }, { "epoch": 0.9110021023125437, "grad_norm": 1.146317357704069, "learning_rate": 9.800211256944758e-06, "loss": 0.383, "step": 325 }, { "epoch": 0.9138051857042747, "grad_norm": 1.1540030060149493, "learning_rate": 9.797464868072489e-06, "loss": 0.4081, "step": 326 }, { "epoch": 0.9166082690960056, "grad_norm": 1.1968849969085296, "learning_rate": 9.794700121413153e-06, "loss": 0.4259, "step": 327 }, { "epoch": 0.9194113524877365, "grad_norm": 1.1565356272261325, "learning_rate": 9.791917027546223e-06, "loss": 0.421, "step": 328 }, { "epoch": 0.9222144358794674, "grad_norm": 1.205469308148384, "learning_rate": 9.789115597121372e-06, "loss": 0.3445, "step": 329 }, { "epoch": 0.9250175192711984, "grad_norm": 1.3194233731376825, "learning_rate": 9.786295840858444e-06, "loss": 0.4042, "step": 330 }, { "epoch": 0.9278206026629292, "grad_norm": 1.2302518477644826, "learning_rate": 9.783457769547403e-06, "loss": 0.4077, "step": 331 }, { "epoch": 0.9306236860546602, "grad_norm": 1.173610556015113, "learning_rate": 9.7806013940483e-06, "loss": 0.438, "step": 332 }, { "epoch": 0.933426769446391, "grad_norm": 1.179100651678326, "learning_rate": 9.777726725291227e-06, "loss": 0.3853, "step": 333 }, { "epoch": 0.9362298528381219, "grad_norm": 1.2097175846660944, "learning_rate": 9.774833774276278e-06, "loss": 0.4083, "step": 334 }, { "epoch": 0.9390329362298528, "grad_norm": 1.1234197630642826, "learning_rate": 9.7719225520735e-06, "loss": 0.3823, "step": 335 }, { "epoch": 0.9418360196215837, "grad_norm": 1.2464354979721357, "learning_rate": 9.768993069822862e-06, "loss": 0.4106, "step": 336 }, { "epoch": 0.9446391030133147, "grad_norm": 1.2436160510868548, "learning_rate": 9.766045338734204e-06, "loss": 0.3906, "step": 337 }, { "epoch": 0.9474421864050455, "grad_norm": 1.123928535467421, "learning_rate": 9.763079370087196e-06, "loss": 0.3872, "step": 338 }, { "epoch": 0.9502452697967765, "grad_norm": 1.1822751620037166, "learning_rate": 9.760095175231292e-06, "loss": 0.4356, "step": 339 }, { "epoch": 0.9530483531885073, "grad_norm": 1.1621700619231359, "learning_rate": 9.757092765585695e-06, "loss": 0.4097, "step": 340 }, { "epoch": 0.9558514365802383, "grad_norm": 1.1746658349586656, "learning_rate": 9.754072152639305e-06, "loss": 0.4427, "step": 341 }, { "epoch": 0.9586545199719692, "grad_norm": 1.1182360916017116, "learning_rate": 9.75103334795068e-06, "loss": 0.4284, "step": 342 }, { "epoch": 0.9614576033637001, "grad_norm": 1.1591583547321715, "learning_rate": 9.747976363147985e-06, "loss": 0.4247, "step": 343 }, { "epoch": 0.964260686755431, "grad_norm": 1.1567249929917516, "learning_rate": 9.744901209928959e-06, "loss": 0.3987, "step": 344 }, { "epoch": 0.9670637701471618, "grad_norm": 1.1069495365564859, "learning_rate": 9.741807900060858e-06, "loss": 0.4308, "step": 345 }, { "epoch": 0.9698668535388928, "grad_norm": 1.1817032261442333, "learning_rate": 9.73869644538042e-06, "loss": 0.4497, "step": 346 }, { "epoch": 0.9726699369306236, "grad_norm": 1.05271516179772, "learning_rate": 9.73556685779381e-06, "loss": 0.4119, "step": 347 }, { "epoch": 0.9754730203223546, "grad_norm": 1.170513661266846, "learning_rate": 9.732419149276586e-06, "loss": 0.3896, "step": 348 }, { "epoch": 0.9782761037140855, "grad_norm": 1.1360548736118075, "learning_rate": 9.729253331873643e-06, "loss": 0.3971, "step": 349 }, { "epoch": 0.9810791871058164, "grad_norm": 1.1978579692167597, "learning_rate": 9.726069417699167e-06, "loss": 0.4166, "step": 350 }, { "epoch": 0.9838822704975473, "grad_norm": 1.1945154693243045, "learning_rate": 9.722867418936601e-06, "loss": 0.4263, "step": 351 }, { "epoch": 0.9866853538892782, "grad_norm": 1.166230468871644, "learning_rate": 9.719647347838584e-06, "loss": 0.4042, "step": 352 }, { "epoch": 0.9894884372810091, "grad_norm": 1.1826135025862843, "learning_rate": 9.71640921672691e-06, "loss": 0.3896, "step": 353 }, { "epoch": 0.9922915206727401, "grad_norm": 1.2382476108408331, "learning_rate": 9.713153037992484e-06, "loss": 0.385, "step": 354 }, { "epoch": 0.9950946040644709, "grad_norm": 1.1847256862767788, "learning_rate": 9.709878824095266e-06, "loss": 0.4272, "step": 355 }, { "epoch": 0.9978976874562018, "grad_norm": 1.2348908655062902, "learning_rate": 9.706586587564236e-06, "loss": 0.3994, "step": 356 }, { "epoch": 1.0, "grad_norm": 1.2348908655062902, "learning_rate": 9.703276340997331e-06, "loss": 0.3106, "step": 357 }, { "epoch": 1.002803083391731, "grad_norm": 1.0847506359320522, "learning_rate": 9.699948097061412e-06, "loss": 0.3172, "step": 358 }, { "epoch": 1.0056061667834617, "grad_norm": 1.0314207594332343, "learning_rate": 9.6966018684922e-06, "loss": 0.2897, "step": 359 }, { "epoch": 1.0084092501751927, "grad_norm": 1.0439078471345375, "learning_rate": 9.693237668094242e-06, "loss": 0.328, "step": 360 }, { "epoch": 1.0112123335669236, "grad_norm": 1.0093501975936998, "learning_rate": 9.689855508740852e-06, "loss": 0.3314, "step": 361 }, { "epoch": 1.0140154169586546, "grad_norm": 1.0454941837116838, "learning_rate": 9.686455403374062e-06, "loss": 0.2894, "step": 362 }, { "epoch": 1.0168185003503853, "grad_norm": 1.0985918166666147, "learning_rate": 9.683037365004584e-06, "loss": 0.349, "step": 363 }, { "epoch": 1.0196215837421163, "grad_norm": 1.0912615951275002, "learning_rate": 9.679601406711746e-06, "loss": 0.3347, "step": 364 }, { "epoch": 1.0224246671338473, "grad_norm": 1.1344040429952662, "learning_rate": 9.676147541643449e-06, "loss": 0.2837, "step": 365 }, { "epoch": 1.0252277505255782, "grad_norm": 1.0053485977042733, "learning_rate": 9.67267578301611e-06, "loss": 0.3058, "step": 366 }, { "epoch": 1.028030833917309, "grad_norm": 1.110591276900221, "learning_rate": 9.669186144114627e-06, "loss": 0.3311, "step": 367 }, { "epoch": 1.03083391730904, "grad_norm": 1.0505594139502015, "learning_rate": 9.66567863829231e-06, "loss": 0.2907, "step": 368 }, { "epoch": 1.033637000700771, "grad_norm": 1.085730742136853, "learning_rate": 9.662153278970842e-06, "loss": 0.3222, "step": 369 }, { "epoch": 1.0364400840925017, "grad_norm": 1.1000592830799627, "learning_rate": 9.65861007964022e-06, "loss": 0.36, "step": 370 }, { "epoch": 1.0392431674842326, "grad_norm": 1.0403078396095022, "learning_rate": 9.655049053858709e-06, "loss": 0.3166, "step": 371 }, { "epoch": 1.0420462508759636, "grad_norm": 1.0898946838202692, "learning_rate": 9.65147021525279e-06, "loss": 0.3318, "step": 372 }, { "epoch": 1.0448493342676946, "grad_norm": 1.1636912765172764, "learning_rate": 9.647873577517102e-06, "loss": 0.3112, "step": 373 }, { "epoch": 1.0476524176594253, "grad_norm": 1.062876460848521, "learning_rate": 9.644259154414396e-06, "loss": 0.3213, "step": 374 }, { "epoch": 1.0504555010511563, "grad_norm": 1.0411461676692622, "learning_rate": 9.640626959775475e-06, "loss": 0.2833, "step": 375 }, { "epoch": 1.0532585844428872, "grad_norm": 1.0698719260700786, "learning_rate": 9.636977007499153e-06, "loss": 0.3183, "step": 376 }, { "epoch": 1.056061667834618, "grad_norm": 1.1063438881099272, "learning_rate": 9.633309311552192e-06, "loss": 0.3185, "step": 377 }, { "epoch": 1.058864751226349, "grad_norm": 1.1755296275706542, "learning_rate": 9.62962388596925e-06, "loss": 0.3356, "step": 378 }, { "epoch": 1.06166783461808, "grad_norm": 1.1170189445294432, "learning_rate": 9.625920744852829e-06, "loss": 0.3183, "step": 379 }, { "epoch": 1.0644709180098109, "grad_norm": 1.097121892348547, "learning_rate": 9.622199902373218e-06, "loss": 0.3228, "step": 380 }, { "epoch": 1.0672740014015416, "grad_norm": 1.0270976338759783, "learning_rate": 9.618461372768446e-06, "loss": 0.3238, "step": 381 }, { "epoch": 1.0700770847932726, "grad_norm": 1.014617517007521, "learning_rate": 9.614705170344221e-06, "loss": 0.2889, "step": 382 }, { "epoch": 1.0728801681850035, "grad_norm": 1.0832491882178725, "learning_rate": 9.610931309473875e-06, "loss": 0.3448, "step": 383 }, { "epoch": 1.0756832515767345, "grad_norm": 1.085160625021815, "learning_rate": 9.607139804598316e-06, "loss": 0.3467, "step": 384 }, { "epoch": 1.0784863349684652, "grad_norm": 1.084827294674734, "learning_rate": 9.603330670225964e-06, "loss": 0.3281, "step": 385 }, { "epoch": 1.0812894183601962, "grad_norm": 1.0773604121220748, "learning_rate": 9.599503920932698e-06, "loss": 0.3532, "step": 386 }, { "epoch": 1.0840925017519272, "grad_norm": 1.0235123673940172, "learning_rate": 9.595659571361805e-06, "loss": 0.3166, "step": 387 }, { "epoch": 1.0868955851436581, "grad_norm": 0.9963295027443626, "learning_rate": 9.591797636223921e-06, "loss": 0.3195, "step": 388 }, { "epoch": 1.0896986685353889, "grad_norm": 1.1501677549952232, "learning_rate": 9.587918130296969e-06, "loss": 0.3155, "step": 389 }, { "epoch": 1.0925017519271198, "grad_norm": 1.003967215467938, "learning_rate": 9.584021068426114e-06, "loss": 0.3063, "step": 390 }, { "epoch": 1.0953048353188508, "grad_norm": 1.0108615715138858, "learning_rate": 9.580106465523695e-06, "loss": 0.3574, "step": 391 }, { "epoch": 1.0981079187105816, "grad_norm": 1.1165766187456356, "learning_rate": 9.576174336569177e-06, "loss": 0.2864, "step": 392 }, { "epoch": 1.1009110021023125, "grad_norm": 1.0543544935190767, "learning_rate": 9.572224696609085e-06, "loss": 0.3239, "step": 393 }, { "epoch": 1.1037140854940435, "grad_norm": 1.0377856653383537, "learning_rate": 9.568257560756955e-06, "loss": 0.3123, "step": 394 }, { "epoch": 1.1065171688857744, "grad_norm": 1.1139234753576157, "learning_rate": 9.564272944193269e-06, "loss": 0.3428, "step": 395 }, { "epoch": 1.1093202522775052, "grad_norm": 1.0936556362287269, "learning_rate": 9.560270862165401e-06, "loss": 0.2733, "step": 396 }, { "epoch": 1.1121233356692362, "grad_norm": 1.0370010859408514, "learning_rate": 9.556251329987561e-06, "loss": 0.2821, "step": 397 }, { "epoch": 1.1149264190609671, "grad_norm": 1.0311513839370219, "learning_rate": 9.552214363040725e-06, "loss": 0.3423, "step": 398 }, { "epoch": 1.1177295024526979, "grad_norm": 1.10210044846964, "learning_rate": 9.548159976772593e-06, "loss": 0.3303, "step": 399 }, { "epoch": 1.1205325858444288, "grad_norm": 0.9858827916119725, "learning_rate": 9.544088186697515e-06, "loss": 0.2932, "step": 400 }, { "epoch": 1.1233356692361598, "grad_norm": 0.9983897052826234, "learning_rate": 9.539999008396442e-06, "loss": 0.3458, "step": 401 }, { "epoch": 1.1261387526278908, "grad_norm": 1.02060751773833, "learning_rate": 9.535892457516858e-06, "loss": 0.3236, "step": 402 }, { "epoch": 1.1289418360196215, "grad_norm": 1.030336385659217, "learning_rate": 9.531768549772728e-06, "loss": 0.3017, "step": 403 }, { "epoch": 1.1317449194113525, "grad_norm": 1.0518307790625796, "learning_rate": 9.527627300944434e-06, "loss": 0.3415, "step": 404 }, { "epoch": 1.1345480028030834, "grad_norm": 1.0220350871337325, "learning_rate": 9.523468726878707e-06, "loss": 0.3283, "step": 405 }, { "epoch": 1.1373510861948142, "grad_norm": 1.0047110923356777, "learning_rate": 9.519292843488584e-06, "loss": 0.3306, "step": 406 }, { "epoch": 1.1401541695865451, "grad_norm": 1.08438304004638, "learning_rate": 9.515099666753333e-06, "loss": 0.3116, "step": 407 }, { "epoch": 1.142957252978276, "grad_norm": 1.0793229294918072, "learning_rate": 9.510889212718398e-06, "loss": 0.3662, "step": 408 }, { "epoch": 1.145760336370007, "grad_norm": 1.1062791611794396, "learning_rate": 9.506661497495332e-06, "loss": 0.3316, "step": 409 }, { "epoch": 1.148563419761738, "grad_norm": 1.0827647053862033, "learning_rate": 9.502416537261739e-06, "loss": 0.2952, "step": 410 }, { "epoch": 1.1513665031534688, "grad_norm": 1.0678606153312582, "learning_rate": 9.498154348261217e-06, "loss": 0.3075, "step": 411 }, { "epoch": 1.1541695865451997, "grad_norm": 1.0418705421995285, "learning_rate": 9.493874946803287e-06, "loss": 0.3327, "step": 412 }, { "epoch": 1.1569726699369307, "grad_norm": 1.0096111225194304, "learning_rate": 9.489578349263336e-06, "loss": 0.2977, "step": 413 }, { "epoch": 1.1597757533286615, "grad_norm": 1.1086300275277539, "learning_rate": 9.485264572082551e-06, "loss": 0.31, "step": 414 }, { "epoch": 1.1625788367203924, "grad_norm": 1.0836113044122648, "learning_rate": 9.480933631767858e-06, "loss": 0.2845, "step": 415 }, { "epoch": 1.1653819201121234, "grad_norm": 1.075806150693619, "learning_rate": 9.476585544891862e-06, "loss": 0.2996, "step": 416 }, { "epoch": 1.1681850035038543, "grad_norm": 1.0465845198833996, "learning_rate": 9.472220328092778e-06, "loss": 0.3168, "step": 417 }, { "epoch": 1.170988086895585, "grad_norm": 1.0492451545412518, "learning_rate": 9.467837998074369e-06, "loss": 0.3059, "step": 418 }, { "epoch": 1.173791170287316, "grad_norm": 1.0230461022692288, "learning_rate": 9.463438571605885e-06, "loss": 0.3287, "step": 419 }, { "epoch": 1.176594253679047, "grad_norm": 1.005464439961268, "learning_rate": 9.459022065521994e-06, "loss": 0.3508, "step": 420 }, { "epoch": 1.1793973370707778, "grad_norm": 1.0919205018252067, "learning_rate": 9.45458849672272e-06, "loss": 0.3391, "step": 421 }, { "epoch": 1.1822004204625087, "grad_norm": 1.092549869041747, "learning_rate": 9.450137882173385e-06, "loss": 0.3305, "step": 422 }, { "epoch": 1.1850035038542397, "grad_norm": 1.05626041378706, "learning_rate": 9.445670238904528e-06, "loss": 0.3215, "step": 423 }, { "epoch": 1.1878065872459707, "grad_norm": 1.0293502687515852, "learning_rate": 9.441185584011854e-06, "loss": 0.2953, "step": 424 }, { "epoch": 1.1906096706377014, "grad_norm": 0.9785312880056695, "learning_rate": 9.436683934656165e-06, "loss": 0.3092, "step": 425 }, { "epoch": 1.1934127540294324, "grad_norm": 1.0204104347912326, "learning_rate": 9.432165308063293e-06, "loss": 0.3354, "step": 426 }, { "epoch": 1.1962158374211633, "grad_norm": 1.1553343174766495, "learning_rate": 9.427629721524034e-06, "loss": 0.3125, "step": 427 }, { "epoch": 1.199018920812894, "grad_norm": 1.0599997718566063, "learning_rate": 9.423077192394081e-06, "loss": 0.3172, "step": 428 }, { "epoch": 1.201822004204625, "grad_norm": 1.162318840029123, "learning_rate": 9.418507738093958e-06, "loss": 0.3392, "step": 429 }, { "epoch": 1.204625087596356, "grad_norm": 1.0011365892808832, "learning_rate": 9.413921376108958e-06, "loss": 0.3132, "step": 430 }, { "epoch": 1.207428170988087, "grad_norm": 1.0369608568465343, "learning_rate": 9.409318123989073e-06, "loss": 0.341, "step": 431 }, { "epoch": 1.2102312543798177, "grad_norm": 1.1433348864427362, "learning_rate": 9.404697999348917e-06, "loss": 0.301, "step": 432 }, { "epoch": 1.2130343377715487, "grad_norm": 0.9794101295514953, "learning_rate": 9.40006101986768e-06, "loss": 0.3232, "step": 433 }, { "epoch": 1.2158374211632796, "grad_norm": 1.0367960310279924, "learning_rate": 9.395407203289036e-06, "loss": 0.3038, "step": 434 }, { "epoch": 1.2186405045550106, "grad_norm": 1.0586375505573105, "learning_rate": 9.3907365674211e-06, "loss": 0.3347, "step": 435 }, { "epoch": 1.2214435879467413, "grad_norm": 1.0401963116022124, "learning_rate": 9.386049130136335e-06, "loss": 0.3284, "step": 436 }, { "epoch": 1.2242466713384723, "grad_norm": 1.0717767652915415, "learning_rate": 9.381344909371504e-06, "loss": 0.3243, "step": 437 }, { "epoch": 1.2270497547302033, "grad_norm": 1.1136261306345807, "learning_rate": 9.376623923127588e-06, "loss": 0.2979, "step": 438 }, { "epoch": 1.2298528381219342, "grad_norm": 0.9752498937994857, "learning_rate": 9.371886189469724e-06, "loss": 0.3265, "step": 439 }, { "epoch": 1.232655921513665, "grad_norm": 0.9663441642697678, "learning_rate": 9.367131726527137e-06, "loss": 0.3241, "step": 440 }, { "epoch": 1.235459004905396, "grad_norm": 1.0172564072998984, "learning_rate": 9.362360552493062e-06, "loss": 0.2972, "step": 441 }, { "epoch": 1.238262088297127, "grad_norm": 1.0437288823429967, "learning_rate": 9.35757268562468e-06, "loss": 0.289, "step": 442 }, { "epoch": 1.2410651716888577, "grad_norm": 1.0425647440619517, "learning_rate": 9.352768144243059e-06, "loss": 0.3415, "step": 443 }, { "epoch": 1.2438682550805886, "grad_norm": 1.0884794197548682, "learning_rate": 9.347946946733055e-06, "loss": 0.3181, "step": 444 }, { "epoch": 1.2466713384723196, "grad_norm": 1.1105563286976543, "learning_rate": 9.343109111543275e-06, "loss": 0.3373, "step": 445 }, { "epoch": 1.2494744218640506, "grad_norm": 1.0409391306978304, "learning_rate": 9.338254657185988e-06, "loss": 0.285, "step": 446 }, { "epoch": 1.2522775052557813, "grad_norm": 1.1418504158321083, "learning_rate": 9.333383602237047e-06, "loss": 0.3006, "step": 447 }, { "epoch": 1.2550805886475123, "grad_norm": 1.1141933886489612, "learning_rate": 9.328495965335844e-06, "loss": 0.3074, "step": 448 }, { "epoch": 1.2578836720392432, "grad_norm": 1.0126163466912967, "learning_rate": 9.323591765185208e-06, "loss": 0.3305, "step": 449 }, { "epoch": 1.260686755430974, "grad_norm": 0.99727536700764, "learning_rate": 9.31867102055136e-06, "loss": 0.29, "step": 450 }, { "epoch": 1.263489838822705, "grad_norm": 0.9539744388123551, "learning_rate": 9.31373375026382e-06, "loss": 0.3111, "step": 451 }, { "epoch": 1.266292922214436, "grad_norm": 1.0227036740085016, "learning_rate": 9.308779973215355e-06, "loss": 0.2659, "step": 452 }, { "epoch": 1.2690960056061669, "grad_norm": 1.0842800400398311, "learning_rate": 9.303809708361884e-06, "loss": 0.3205, "step": 453 }, { "epoch": 1.2718990889978976, "grad_norm": 1.1550945457289452, "learning_rate": 9.298822974722425e-06, "loss": 0.3131, "step": 454 }, { "epoch": 1.2747021723896286, "grad_norm": 1.1106936037552206, "learning_rate": 9.293819791379016e-06, "loss": 0.3464, "step": 455 }, { "epoch": 1.2775052557813595, "grad_norm": 1.0204999168425828, "learning_rate": 9.288800177476636e-06, "loss": 0.3334, "step": 456 }, { "epoch": 1.2803083391730903, "grad_norm": 1.101739084994357, "learning_rate": 9.28376415222314e-06, "loss": 0.3527, "step": 457 }, { "epoch": 1.2831114225648212, "grad_norm": 1.0132658206492835, "learning_rate": 9.278711734889178e-06, "loss": 0.3507, "step": 458 }, { "epoch": 1.2859145059565522, "grad_norm": 1.093935580030477, "learning_rate": 9.27364294480813e-06, "loss": 0.3125, "step": 459 }, { "epoch": 1.2887175893482832, "grad_norm": 1.0840732446328496, "learning_rate": 9.268557801376027e-06, "loss": 0.2964, "step": 460 }, { "epoch": 1.2915206727400141, "grad_norm": 1.0652339440301186, "learning_rate": 9.263456324051472e-06, "loss": 0.3346, "step": 461 }, { "epoch": 1.2943237561317449, "grad_norm": 0.9905423959308148, "learning_rate": 9.258338532355575e-06, "loss": 0.2979, "step": 462 }, { "epoch": 1.2971268395234758, "grad_norm": 1.113098382378706, "learning_rate": 9.253204445871874e-06, "loss": 0.3269, "step": 463 }, { "epoch": 1.2999299229152066, "grad_norm": 1.1279986985216184, "learning_rate": 9.248054084246259e-06, "loss": 0.3321, "step": 464 }, { "epoch": 1.3027330063069376, "grad_norm": 1.1171578114970255, "learning_rate": 9.242887467186896e-06, "loss": 0.3099, "step": 465 }, { "epoch": 1.3055360896986685, "grad_norm": 1.0217011590704772, "learning_rate": 9.237704614464157e-06, "loss": 0.3251, "step": 466 }, { "epoch": 1.3083391730903995, "grad_norm": 1.0450128549258906, "learning_rate": 9.232505545910536e-06, "loss": 0.3789, "step": 467 }, { "epoch": 1.3111422564821305, "grad_norm": 1.0600119003347885, "learning_rate": 9.227290281420583e-06, "loss": 0.3291, "step": 468 }, { "epoch": 1.3139453398738612, "grad_norm": 1.0842489244628608, "learning_rate": 9.22205884095082e-06, "loss": 0.303, "step": 469 }, { "epoch": 1.3167484232655922, "grad_norm": 1.0435075367939863, "learning_rate": 9.216811244519667e-06, "loss": 0.3043, "step": 470 }, { "epoch": 1.3195515066573231, "grad_norm": 1.022280858797138, "learning_rate": 9.211547512207366e-06, "loss": 0.3199, "step": 471 }, { "epoch": 1.3223545900490539, "grad_norm": 1.0884951175650506, "learning_rate": 9.206267664155906e-06, "loss": 0.3003, "step": 472 }, { "epoch": 1.3251576734407848, "grad_norm": 1.0283609768763116, "learning_rate": 9.200971720568942e-06, "loss": 0.304, "step": 473 }, { "epoch": 1.3279607568325158, "grad_norm": 1.0934855137470945, "learning_rate": 9.19565970171172e-06, "loss": 0.3331, "step": 474 }, { "epoch": 1.3307638402242468, "grad_norm": 1.03357671176792, "learning_rate": 9.190331627910998e-06, "loss": 0.2869, "step": 475 }, { "epoch": 1.3335669236159775, "grad_norm": 0.9869700427404623, "learning_rate": 9.184987519554969e-06, "loss": 0.2753, "step": 476 }, { "epoch": 1.3363700070077085, "grad_norm": 1.0051093138314826, "learning_rate": 9.179627397093184e-06, "loss": 0.3028, "step": 477 }, { "epoch": 1.3391730903994394, "grad_norm": 1.119221633996362, "learning_rate": 9.174251281036478e-06, "loss": 0.2821, "step": 478 }, { "epoch": 1.3419761737911702, "grad_norm": 1.0416388270107602, "learning_rate": 9.168859191956876e-06, "loss": 0.3033, "step": 479 }, { "epoch": 1.3447792571829011, "grad_norm": 1.014104502114144, "learning_rate": 9.163451150487531e-06, "loss": 0.3162, "step": 480 }, { "epoch": 1.347582340574632, "grad_norm": 0.9819678478497201, "learning_rate": 9.15802717732264e-06, "loss": 0.2956, "step": 481 }, { "epoch": 1.350385423966363, "grad_norm": 0.940075199188654, "learning_rate": 9.152587293217362e-06, "loss": 0.3095, "step": 482 }, { "epoch": 1.353188507358094, "grad_norm": 1.046977799888376, "learning_rate": 9.147131518987738e-06, "loss": 0.3041, "step": 483 }, { "epoch": 1.3559915907498248, "grad_norm": 1.0739950959035311, "learning_rate": 9.141659875510615e-06, "loss": 0.3161, "step": 484 }, { "epoch": 1.3587946741415557, "grad_norm": 1.055258970335336, "learning_rate": 9.136172383723568e-06, "loss": 0.2877, "step": 485 }, { "epoch": 1.3615977575332865, "grad_norm": 0.9942925273922855, "learning_rate": 9.130669064624811e-06, "loss": 0.3394, "step": 486 }, { "epoch": 1.3644008409250175, "grad_norm": 1.0792689049266269, "learning_rate": 9.125149939273123e-06, "loss": 0.3165, "step": 487 }, { "epoch": 1.3672039243167484, "grad_norm": 1.0022733710364253, "learning_rate": 9.119615028787771e-06, "loss": 0.2988, "step": 488 }, { "epoch": 1.3700070077084794, "grad_norm": 1.0365258649982823, "learning_rate": 9.114064354348422e-06, "loss": 0.3183, "step": 489 }, { "epoch": 1.3728100911002104, "grad_norm": 1.040841698853146, "learning_rate": 9.108497937195064e-06, "loss": 0.2916, "step": 490 }, { "epoch": 1.375613174491941, "grad_norm": 0.9702615599424862, "learning_rate": 9.102915798627925e-06, "loss": 0.3183, "step": 491 }, { "epoch": 1.378416257883672, "grad_norm": 1.0254553655980456, "learning_rate": 9.097317960007395e-06, "loss": 0.3106, "step": 492 }, { "epoch": 1.381219341275403, "grad_norm": 1.0330801056438677, "learning_rate": 9.091704442753938e-06, "loss": 0.3493, "step": 493 }, { "epoch": 1.3840224246671338, "grad_norm": 1.0482605604178792, "learning_rate": 9.086075268348014e-06, "loss": 0.3222, "step": 494 }, { "epoch": 1.3868255080588647, "grad_norm": 1.0689177080576502, "learning_rate": 9.080430458329996e-06, "loss": 0.3252, "step": 495 }, { "epoch": 1.3896285914505957, "grad_norm": 0.9952213910620371, "learning_rate": 9.07477003430009e-06, "loss": 0.3777, "step": 496 }, { "epoch": 1.3924316748423267, "grad_norm": 0.9969778598795931, "learning_rate": 9.06909401791825e-06, "loss": 0.3064, "step": 497 }, { "epoch": 1.3952347582340574, "grad_norm": 1.0324696911274618, "learning_rate": 9.063402430904087e-06, "loss": 0.3355, "step": 498 }, { "epoch": 1.3980378416257884, "grad_norm": 1.0311228329364959, "learning_rate": 9.057695295036806e-06, "loss": 0.3329, "step": 499 }, { "epoch": 1.4008409250175193, "grad_norm": 0.9956688549063683, "learning_rate": 9.0519726321551e-06, "loss": 0.3102, "step": 500 }, { "epoch": 1.40364400840925, "grad_norm": 0.9432882129064981, "learning_rate": 9.046234464157087e-06, "loss": 0.3194, "step": 501 }, { "epoch": 1.406447091800981, "grad_norm": 1.0009435461744933, "learning_rate": 9.040480813000205e-06, "loss": 0.3212, "step": 502 }, { "epoch": 1.409250175192712, "grad_norm": 1.0819896746344966, "learning_rate": 9.03471170070115e-06, "loss": 0.3188, "step": 503 }, { "epoch": 1.412053258584443, "grad_norm": 1.0309234137171452, "learning_rate": 9.028927149335773e-06, "loss": 0.3242, "step": 504 }, { "epoch": 1.4148563419761737, "grad_norm": 1.0458803781264536, "learning_rate": 9.02312718103901e-06, "loss": 0.2826, "step": 505 }, { "epoch": 1.4176594253679047, "grad_norm": 1.042415071678378, "learning_rate": 9.017311818004785e-06, "loss": 0.3319, "step": 506 }, { "epoch": 1.4204625087596356, "grad_norm": 0.9484013403771449, "learning_rate": 9.011481082485933e-06, "loss": 0.3451, "step": 507 }, { "epoch": 1.4232655921513664, "grad_norm": 1.1681194445835863, "learning_rate": 9.005634996794117e-06, "loss": 0.3313, "step": 508 }, { "epoch": 1.4260686755430974, "grad_norm": 1.055505140987747, "learning_rate": 8.99977358329973e-06, "loss": 0.3162, "step": 509 }, { "epoch": 1.4288717589348283, "grad_norm": 0.9831333063132164, "learning_rate": 8.993896864431825e-06, "loss": 0.3121, "step": 510 }, { "epoch": 1.4316748423265593, "grad_norm": 0.9791973011046631, "learning_rate": 8.988004862678019e-06, "loss": 0.3051, "step": 511 }, { "epoch": 1.4344779257182902, "grad_norm": 1.026083867106151, "learning_rate": 8.98209760058441e-06, "loss": 0.3508, "step": 512 }, { "epoch": 1.437281009110021, "grad_norm": 1.0447255347040039, "learning_rate": 8.97617510075549e-06, "loss": 0.2768, "step": 513 }, { "epoch": 1.440084092501752, "grad_norm": 1.1380333354753407, "learning_rate": 8.970237385854059e-06, "loss": 0.3334, "step": 514 }, { "epoch": 1.4428871758934827, "grad_norm": 0.9696366640107756, "learning_rate": 8.96428447860114e-06, "loss": 0.3231, "step": 515 }, { "epoch": 1.4456902592852137, "grad_norm": 1.0822835386163632, "learning_rate": 8.95831640177589e-06, "loss": 0.3453, "step": 516 }, { "epoch": 1.4484933426769446, "grad_norm": 1.1510291788031735, "learning_rate": 8.952333178215515e-06, "loss": 0.3118, "step": 517 }, { "epoch": 1.4512964260686756, "grad_norm": 1.0632519616648684, "learning_rate": 8.946334830815176e-06, "loss": 0.2929, "step": 518 }, { "epoch": 1.4540995094604066, "grad_norm": 1.536963560887113, "learning_rate": 8.940321382527908e-06, "loss": 0.329, "step": 519 }, { "epoch": 1.4569025928521373, "grad_norm": 1.1543457182131098, "learning_rate": 8.934292856364535e-06, "loss": 0.3333, "step": 520 }, { "epoch": 1.4597056762438683, "grad_norm": 1.1422839399927136, "learning_rate": 8.928249275393572e-06, "loss": 0.2989, "step": 521 }, { "epoch": 1.4625087596355992, "grad_norm": 1.0323164597196008, "learning_rate": 8.922190662741146e-06, "loss": 0.3427, "step": 522 }, { "epoch": 1.46531184302733, "grad_norm": 1.0171844433621025, "learning_rate": 8.916117041590899e-06, "loss": 0.3306, "step": 523 }, { "epoch": 1.468114926419061, "grad_norm": 1.005207029365479, "learning_rate": 8.910028435183906e-06, "loss": 0.3017, "step": 524 }, { "epoch": 1.470918009810792, "grad_norm": 0.9931572125901683, "learning_rate": 8.903924866818589e-06, "loss": 0.3412, "step": 525 }, { "epoch": 1.4737210932025229, "grad_norm": 1.0116486286896589, "learning_rate": 8.897806359850614e-06, "loss": 0.2799, "step": 526 }, { "epoch": 1.4765241765942536, "grad_norm": 0.9010403515246571, "learning_rate": 8.891672937692818e-06, "loss": 0.3329, "step": 527 }, { "epoch": 1.4793272599859846, "grad_norm": 0.983591867787472, "learning_rate": 8.885524623815107e-06, "loss": 0.3559, "step": 528 }, { "epoch": 1.4821303433777155, "grad_norm": 1.0389716365696682, "learning_rate": 8.879361441744374e-06, "loss": 0.3558, "step": 529 }, { "epoch": 1.4849334267694463, "grad_norm": 0.9273702187776793, "learning_rate": 8.873183415064401e-06, "loss": 0.2942, "step": 530 }, { "epoch": 1.4877365101611773, "grad_norm": 0.9669391636702603, "learning_rate": 8.866990567415785e-06, "loss": 0.3199, "step": 531 }, { "epoch": 1.4905395935529082, "grad_norm": 1.0603730492574703, "learning_rate": 8.860782922495821e-06, "loss": 0.3397, "step": 532 }, { "epoch": 1.4933426769446392, "grad_norm": 1.0224313519746386, "learning_rate": 8.854560504058442e-06, "loss": 0.3142, "step": 533 }, { "epoch": 1.4961457603363701, "grad_norm": 1.0076135234872883, "learning_rate": 8.8483233359141e-06, "loss": 0.3121, "step": 534 }, { "epoch": 1.4989488437281009, "grad_norm": 1.067686299191742, "learning_rate": 8.842071441929695e-06, "loss": 0.3051, "step": 535 }, { "epoch": 1.5017519271198319, "grad_norm": 0.9900504355007723, "learning_rate": 8.835804846028473e-06, "loss": 0.2995, "step": 536 }, { "epoch": 1.5045550105115626, "grad_norm": 1.0611202706352416, "learning_rate": 8.82952357218994e-06, "loss": 0.3311, "step": 537 }, { "epoch": 1.5073580939032936, "grad_norm": 1.0223872953740087, "learning_rate": 8.823227644449767e-06, "loss": 0.3298, "step": 538 }, { "epoch": 1.5101611772950245, "grad_norm": 1.004431585067902, "learning_rate": 8.8169170868997e-06, "loss": 0.3317, "step": 539 }, { "epoch": 1.5129642606867555, "grad_norm": 1.0372525705982982, "learning_rate": 8.810591923687468e-06, "loss": 0.3099, "step": 540 }, { "epoch": 1.5157673440784865, "grad_norm": 0.9763097184040664, "learning_rate": 8.804252179016681e-06, "loss": 0.3518, "step": 541 }, { "epoch": 1.5185704274702172, "grad_norm": 1.0786399759038623, "learning_rate": 8.797897877146757e-06, "loss": 0.351, "step": 542 }, { "epoch": 1.5213735108619482, "grad_norm": 1.0216449815318813, "learning_rate": 8.791529042392813e-06, "loss": 0.2949, "step": 543 }, { "epoch": 1.524176594253679, "grad_norm": 1.005020537619243, "learning_rate": 8.785145699125577e-06, "loss": 0.292, "step": 544 }, { "epoch": 1.5269796776454099, "grad_norm": 1.0117733349440563, "learning_rate": 8.778747871771293e-06, "loss": 0.3386, "step": 545 }, { "epoch": 1.5297827610371408, "grad_norm": 1.0619820560833055, "learning_rate": 8.772335584811631e-06, "loss": 0.2836, "step": 546 }, { "epoch": 1.5325858444288718, "grad_norm": 0.9828529966945531, "learning_rate": 8.765908862783595e-06, "loss": 0.2974, "step": 547 }, { "epoch": 1.5353889278206028, "grad_norm": 1.0825846295263688, "learning_rate": 8.75946773027942e-06, "loss": 0.2855, "step": 548 }, { "epoch": 1.5381920112123337, "grad_norm": 1.0026209804243462, "learning_rate": 8.753012211946485e-06, "loss": 0.29, "step": 549 }, { "epoch": 1.5409950946040645, "grad_norm": 0.9734512557172048, "learning_rate": 8.74654233248722e-06, "loss": 0.2806, "step": 550 }, { "epoch": 1.5437981779957952, "grad_norm": 1.0209335742663634, "learning_rate": 8.740058116659007e-06, "loss": 0.332, "step": 551 }, { "epoch": 1.5466012613875262, "grad_norm": 0.9987360165038864, "learning_rate": 8.733559589274086e-06, "loss": 0.2941, "step": 552 }, { "epoch": 1.5494043447792571, "grad_norm": 0.9709216484754912, "learning_rate": 8.72704677519946e-06, "loss": 0.3052, "step": 553 }, { "epoch": 1.5522074281709881, "grad_norm": 1.0291970398046555, "learning_rate": 8.720519699356804e-06, "loss": 0.3479, "step": 554 }, { "epoch": 1.555010511562719, "grad_norm": 1.042970879067335, "learning_rate": 8.713978386722367e-06, "loss": 0.3302, "step": 555 }, { "epoch": 1.55781359495445, "grad_norm": 0.9212891106054633, "learning_rate": 8.707422862326872e-06, "loss": 0.3301, "step": 556 }, { "epoch": 1.5606166783461808, "grad_norm": 1.0666204145131457, "learning_rate": 8.700853151255427e-06, "loss": 0.3177, "step": 557 }, { "epoch": 1.5634197617379118, "grad_norm": 1.0299268466077998, "learning_rate": 8.694269278647425e-06, "loss": 0.2912, "step": 558 }, { "epoch": 1.5662228451296425, "grad_norm": 0.9977307082011863, "learning_rate": 8.687671269696451e-06, "loss": 0.3274, "step": 559 }, { "epoch": 1.5690259285213735, "grad_norm": 0.9997169102518809, "learning_rate": 8.681059149650181e-06, "loss": 0.2897, "step": 560 }, { "epoch": 1.5718290119131044, "grad_norm": 1.0531067882949778, "learning_rate": 8.674432943810287e-06, "loss": 0.3507, "step": 561 }, { "epoch": 1.5746320953048354, "grad_norm": 0.9875951790749613, "learning_rate": 8.667792677532346e-06, "loss": 0.3176, "step": 562 }, { "epoch": 1.5774351786965664, "grad_norm": 0.988294038093233, "learning_rate": 8.661138376225735e-06, "loss": 0.3233, "step": 563 }, { "epoch": 1.580238262088297, "grad_norm": 0.9840482743038865, "learning_rate": 8.654470065353535e-06, "loss": 0.327, "step": 564 }, { "epoch": 1.583041345480028, "grad_norm": 0.9856704483396864, "learning_rate": 8.647787770432439e-06, "loss": 0.3056, "step": 565 }, { "epoch": 1.5858444288717588, "grad_norm": 0.9806977120576214, "learning_rate": 8.641091517032648e-06, "loss": 0.3596, "step": 566 }, { "epoch": 1.5886475122634898, "grad_norm": 0.9912904769787096, "learning_rate": 8.634381330777778e-06, "loss": 0.3086, "step": 567 }, { "epoch": 1.5914505956552207, "grad_norm": 1.0134362756226047, "learning_rate": 8.627657237344762e-06, "loss": 0.3402, "step": 568 }, { "epoch": 1.5942536790469517, "grad_norm": 1.0485977509584994, "learning_rate": 8.620919262463743e-06, "loss": 0.313, "step": 569 }, { "epoch": 1.5970567624386827, "grad_norm": 0.9780179966189363, "learning_rate": 8.614167431917986e-06, "loss": 0.3101, "step": 570 }, { "epoch": 1.5998598458304134, "grad_norm": 1.0499324110603412, "learning_rate": 8.607401771543778e-06, "loss": 0.3315, "step": 571 }, { "epoch": 1.6026629292221444, "grad_norm": 1.0940218868255043, "learning_rate": 8.600622307230323e-06, "loss": 0.323, "step": 572 }, { "epoch": 1.6054660126138751, "grad_norm": 1.0193645559843643, "learning_rate": 8.593829064919648e-06, "loss": 0.3376, "step": 573 }, { "epoch": 1.608269096005606, "grad_norm": 1.0962814559008822, "learning_rate": 8.5870220706065e-06, "loss": 0.3172, "step": 574 }, { "epoch": 1.611072179397337, "grad_norm": 1.0355134477786425, "learning_rate": 8.580201350338255e-06, "loss": 0.3445, "step": 575 }, { "epoch": 1.613875262789068, "grad_norm": 0.9832928764016196, "learning_rate": 8.573366930214807e-06, "loss": 0.3377, "step": 576 }, { "epoch": 1.616678346180799, "grad_norm": 1.0432431672185147, "learning_rate": 8.566518836388472e-06, "loss": 0.3156, "step": 577 }, { "epoch": 1.61948142957253, "grad_norm": 1.0386229114434877, "learning_rate": 8.559657095063893e-06, "loss": 0.3585, "step": 578 }, { "epoch": 1.6222845129642607, "grad_norm": 1.0190174075590552, "learning_rate": 8.552781732497937e-06, "loss": 0.3226, "step": 579 }, { "epoch": 1.6250875963559914, "grad_norm": 1.0755188869356076, "learning_rate": 8.545892774999589e-06, "loss": 0.2947, "step": 580 }, { "epoch": 1.6278906797477224, "grad_norm": 0.9724040098316253, "learning_rate": 8.538990248929858e-06, "loss": 0.3343, "step": 581 }, { "epoch": 1.6306937631394534, "grad_norm": 1.0380136461497245, "learning_rate": 8.532074180701674e-06, "loss": 0.3153, "step": 582 }, { "epoch": 1.6334968465311843, "grad_norm": 1.0437703219167505, "learning_rate": 8.525144596779788e-06, "loss": 0.3115, "step": 583 }, { "epoch": 1.6362999299229153, "grad_norm": 1.073505032497063, "learning_rate": 8.518201523680668e-06, "loss": 0.3228, "step": 584 }, { "epoch": 1.6391030133146463, "grad_norm": 0.9898757612005716, "learning_rate": 8.5112449879724e-06, "loss": 0.3132, "step": 585 }, { "epoch": 1.641906096706377, "grad_norm": 1.0294276811837257, "learning_rate": 8.504275016274584e-06, "loss": 0.2908, "step": 586 }, { "epoch": 1.644709180098108, "grad_norm": 1.0331323793330456, "learning_rate": 8.497291635258235e-06, "loss": 0.2939, "step": 587 }, { "epoch": 1.6475122634898387, "grad_norm": 1.0336815270279005, "learning_rate": 8.490294871645681e-06, "loss": 0.3085, "step": 588 }, { "epoch": 1.6503153468815697, "grad_norm": 0.9500201552994334, "learning_rate": 8.483284752210457e-06, "loss": 0.3207, "step": 589 }, { "epoch": 1.6531184302733006, "grad_norm": 1.0189139734647936, "learning_rate": 8.476261303777205e-06, "loss": 0.3321, "step": 590 }, { "epoch": 1.6559215136650316, "grad_norm": 1.0321224570797916, "learning_rate": 8.46922455322157e-06, "loss": 0.3169, "step": 591 }, { "epoch": 1.6587245970567626, "grad_norm": 1.0676739360063043, "learning_rate": 8.462174527470102e-06, "loss": 0.3405, "step": 592 }, { "epoch": 1.6615276804484933, "grad_norm": 1.0114685422562408, "learning_rate": 8.455111253500147e-06, "loss": 0.279, "step": 593 }, { "epoch": 1.6643307638402243, "grad_norm": 1.0101245131587206, "learning_rate": 8.448034758339747e-06, "loss": 0.2895, "step": 594 }, { "epoch": 1.667133847231955, "grad_norm": 1.0295531144079708, "learning_rate": 8.440945069067535e-06, "loss": 0.2911, "step": 595 }, { "epoch": 1.669936930623686, "grad_norm": 0.9981860968875388, "learning_rate": 8.433842212812632e-06, "loss": 0.2941, "step": 596 }, { "epoch": 1.672740014015417, "grad_norm": 0.9972808864654994, "learning_rate": 8.426726216754543e-06, "loss": 0.3283, "step": 597 }, { "epoch": 1.675543097407148, "grad_norm": 0.9204669787570922, "learning_rate": 8.419597108123054e-06, "loss": 0.3421, "step": 598 }, { "epoch": 1.6783461807988789, "grad_norm": 1.0403976854928259, "learning_rate": 8.412454914198127e-06, "loss": 0.3166, "step": 599 }, { "epoch": 1.6811492641906096, "grad_norm": 1.0572732959169833, "learning_rate": 8.405299662309794e-06, "loss": 0.3598, "step": 600 }, { "epoch": 1.6839523475823406, "grad_norm": 1.129066341505925, "learning_rate": 8.39813137983806e-06, "loss": 0.3221, "step": 601 }, { "epoch": 1.6867554309740713, "grad_norm": 1.0295275170206426, "learning_rate": 8.390950094212783e-06, "loss": 0.2815, "step": 602 }, { "epoch": 1.6895585143658023, "grad_norm": 0.9730574912477955, "learning_rate": 8.383755832913583e-06, "loss": 0.3538, "step": 603 }, { "epoch": 1.6923615977575333, "grad_norm": 1.1385237539297977, "learning_rate": 8.376548623469737e-06, "loss": 0.3224, "step": 604 }, { "epoch": 1.6951646811492642, "grad_norm": 1.0094609622481272, "learning_rate": 8.369328493460061e-06, "loss": 0.3015, "step": 605 }, { "epoch": 1.6979677645409952, "grad_norm": 1.0913134241157278, "learning_rate": 8.362095470512815e-06, "loss": 0.279, "step": 606 }, { "epoch": 1.7007708479327261, "grad_norm": 1.0647490413873508, "learning_rate": 8.354849582305596e-06, "loss": 0.3016, "step": 607 }, { "epoch": 1.703573931324457, "grad_norm": 1.0816023958456211, "learning_rate": 8.347590856565231e-06, "loss": 0.328, "step": 608 }, { "epoch": 1.7063770147161879, "grad_norm": 0.9665218301407825, "learning_rate": 8.340319321067668e-06, "loss": 0.3222, "step": 609 }, { "epoch": 1.7091800981079186, "grad_norm": 1.114093897425612, "learning_rate": 8.333035003637873e-06, "loss": 0.308, "step": 610 }, { "epoch": 1.7119831814996496, "grad_norm": 0.9870441819524826, "learning_rate": 8.325737932149726e-06, "loss": 0.2778, "step": 611 }, { "epoch": 1.7147862648913805, "grad_norm": 0.9972031988711308, "learning_rate": 8.318428134525906e-06, "loss": 0.3503, "step": 612 }, { "epoch": 1.7175893482831115, "grad_norm": 1.0611971970199314, "learning_rate": 8.311105638737796e-06, "loss": 0.2857, "step": 613 }, { "epoch": 1.7203924316748425, "grad_norm": 1.0049712797899566, "learning_rate": 8.303770472805361e-06, "loss": 0.3047, "step": 614 }, { "epoch": 1.7231955150665732, "grad_norm": 1.077820808212372, "learning_rate": 8.296422664797055e-06, "loss": 0.3514, "step": 615 }, { "epoch": 1.7259985984583042, "grad_norm": 0.9534620185925327, "learning_rate": 8.289062242829707e-06, "loss": 0.3175, "step": 616 }, { "epoch": 1.728801681850035, "grad_norm": 0.9533962250197514, "learning_rate": 8.281689235068411e-06, "loss": 0.3304, "step": 617 }, { "epoch": 1.7316047652417659, "grad_norm": 1.0377944574846643, "learning_rate": 8.274303669726427e-06, "loss": 0.3108, "step": 618 }, { "epoch": 1.7344078486334968, "grad_norm": 0.9776215264248671, "learning_rate": 8.26690557506506e-06, "loss": 0.3226, "step": 619 }, { "epoch": 1.7372109320252278, "grad_norm": 1.071996115988915, "learning_rate": 8.259494979393563e-06, "loss": 0.2754, "step": 620 }, { "epoch": 1.7400140154169588, "grad_norm": 0.9706195771601769, "learning_rate": 8.252071911069029e-06, "loss": 0.3221, "step": 621 }, { "epoch": 1.7428170988086895, "grad_norm": 0.9372850256828986, "learning_rate": 8.244636398496268e-06, "loss": 0.2968, "step": 622 }, { "epoch": 1.7456201822004205, "grad_norm": 0.9645042926256575, "learning_rate": 8.237188470127721e-06, "loss": 0.322, "step": 623 }, { "epoch": 1.7484232655921512, "grad_norm": 0.9650005316732679, "learning_rate": 8.229728154463331e-06, "loss": 0.2816, "step": 624 }, { "epoch": 1.7512263489838822, "grad_norm": 0.9863893918718459, "learning_rate": 8.22225548005044e-06, "loss": 0.3215, "step": 625 }, { "epoch": 1.7540294323756132, "grad_norm": 1.0018570360888792, "learning_rate": 8.214770475483686e-06, "loss": 0.339, "step": 626 }, { "epoch": 1.7568325157673441, "grad_norm": 1.1193797750866719, "learning_rate": 8.207273169404892e-06, "loss": 0.3577, "step": 627 }, { "epoch": 1.759635599159075, "grad_norm": 1.0543465412125153, "learning_rate": 8.199763590502945e-06, "loss": 0.2906, "step": 628 }, { "epoch": 1.762438682550806, "grad_norm": 1.0456949346053654, "learning_rate": 8.1922417675137e-06, "loss": 0.3132, "step": 629 }, { "epoch": 1.7652417659425368, "grad_norm": 1.0462724551977218, "learning_rate": 8.184707729219865e-06, "loss": 0.3245, "step": 630 }, { "epoch": 1.7680448493342675, "grad_norm": 0.9823174814664697, "learning_rate": 8.177161504450887e-06, "loss": 0.3309, "step": 631 }, { "epoch": 1.7708479327259985, "grad_norm": 0.9957506589063193, "learning_rate": 8.169603122082852e-06, "loss": 0.3393, "step": 632 }, { "epoch": 1.7736510161177295, "grad_norm": 1.0179763156590598, "learning_rate": 8.16203261103836e-06, "loss": 0.2781, "step": 633 }, { "epoch": 1.7764540995094604, "grad_norm": 1.0073877575165777, "learning_rate": 8.154450000286425e-06, "loss": 0.32, "step": 634 }, { "epoch": 1.7792571829011914, "grad_norm": 1.1450794851221155, "learning_rate": 8.146855318842366e-06, "loss": 0.3267, "step": 635 }, { "epoch": 1.7820602662929224, "grad_norm": 1.0133968691268989, "learning_rate": 8.139248595767682e-06, "loss": 0.3325, "step": 636 }, { "epoch": 1.784863349684653, "grad_norm": 1.0972907692412197, "learning_rate": 8.131629860169959e-06, "loss": 0.324, "step": 637 }, { "epoch": 1.787666433076384, "grad_norm": 1.0747756547403522, "learning_rate": 8.123999141202743e-06, "loss": 0.3291, "step": 638 }, { "epoch": 1.7904695164681148, "grad_norm": 1.0470139173415747, "learning_rate": 8.11635646806544e-06, "loss": 0.3523, "step": 639 }, { "epoch": 1.7932725998598458, "grad_norm": 0.9883826668641853, "learning_rate": 8.108701870003195e-06, "loss": 0.3225, "step": 640 }, { "epoch": 1.7960756832515767, "grad_norm": 0.9807749134553183, "learning_rate": 8.101035376306789e-06, "loss": 0.3078, "step": 641 }, { "epoch": 1.7988787666433077, "grad_norm": 0.9590271955969843, "learning_rate": 8.093357016312518e-06, "loss": 0.3199, "step": 642 }, { "epoch": 1.8016818500350387, "grad_norm": 0.999075545826115, "learning_rate": 8.085666819402089e-06, "loss": 0.2991, "step": 643 }, { "epoch": 1.8044849334267694, "grad_norm": 0.9834688572176502, "learning_rate": 8.077964815002497e-06, "loss": 0.2773, "step": 644 }, { "epoch": 1.8072880168185004, "grad_norm": 1.003972639907366, "learning_rate": 8.070251032585932e-06, "loss": 0.2935, "step": 645 }, { "epoch": 1.8100911002102311, "grad_norm": 1.0037639414072954, "learning_rate": 8.062525501669638e-06, "loss": 0.3337, "step": 646 }, { "epoch": 1.812894183601962, "grad_norm": 1.0504008521656023, "learning_rate": 8.054788251815824e-06, "loss": 0.3075, "step": 647 }, { "epoch": 1.815697266993693, "grad_norm": 1.041018797822688, "learning_rate": 8.047039312631542e-06, "loss": 0.2941, "step": 648 }, { "epoch": 1.818500350385424, "grad_norm": 1.1476887413595336, "learning_rate": 8.039278713768572e-06, "loss": 0.3008, "step": 649 }, { "epoch": 1.821303433777155, "grad_norm": 1.063731096461173, "learning_rate": 8.031506484923312e-06, "loss": 0.2836, "step": 650 }, { "epoch": 1.8241065171688857, "grad_norm": 1.0609057745391275, "learning_rate": 8.02372265583666e-06, "loss": 0.3083, "step": 651 }, { "epoch": 1.8269096005606167, "grad_norm": 0.9729626141695646, "learning_rate": 8.015927256293906e-06, "loss": 0.3062, "step": 652 }, { "epoch": 1.8297126839523474, "grad_norm": 1.0277640358114108, "learning_rate": 8.008120316124612e-06, "loss": 0.3389, "step": 653 }, { "epoch": 1.8325157673440784, "grad_norm": 1.0700790517166436, "learning_rate": 8.000301865202507e-06, "loss": 0.2827, "step": 654 }, { "epoch": 1.8353188507358094, "grad_norm": 1.0204749289182065, "learning_rate": 7.992471933445359e-06, "loss": 0.3574, "step": 655 }, { "epoch": 1.8381219341275403, "grad_norm": 1.054119271327418, "learning_rate": 7.984630550814872e-06, "loss": 0.3235, "step": 656 }, { "epoch": 1.8409250175192713, "grad_norm": 1.0276559132927785, "learning_rate": 7.976777747316568e-06, "loss": 0.3191, "step": 657 }, { "epoch": 1.8437281009110023, "grad_norm": 0.9749404229198887, "learning_rate": 7.96891355299967e-06, "loss": 0.2777, "step": 658 }, { "epoch": 1.846531184302733, "grad_norm": 0.9135511168776641, "learning_rate": 7.961037997956988e-06, "loss": 0.2897, "step": 659 }, { "epoch": 1.849334267694464, "grad_norm": 1.030736095815461, "learning_rate": 7.953151112324807e-06, "loss": 0.3229, "step": 660 }, { "epoch": 1.8521373510861947, "grad_norm": 1.0021336354889787, "learning_rate": 7.94525292628277e-06, "loss": 0.2957, "step": 661 }, { "epoch": 1.8549404344779257, "grad_norm": 1.111103040839117, "learning_rate": 7.937343470053753e-06, "loss": 0.327, "step": 662 }, { "epoch": 1.8577435178696566, "grad_norm": 1.0988776635638167, "learning_rate": 7.929422773903772e-06, "loss": 0.3038, "step": 663 }, { "epoch": 1.8605466012613876, "grad_norm": 1.0809251731993994, "learning_rate": 7.921490868141843e-06, "loss": 0.3069, "step": 664 }, { "epoch": 1.8633496846531186, "grad_norm": 1.05344570430685, "learning_rate": 7.91354778311988e-06, "loss": 0.3092, "step": 665 }, { "epoch": 1.8661527680448493, "grad_norm": 1.070191527052215, "learning_rate": 7.90559354923257e-06, "loss": 0.3209, "step": 666 }, { "epoch": 1.8689558514365803, "grad_norm": 1.085172191078759, "learning_rate": 7.897628196917273e-06, "loss": 0.298, "step": 667 }, { "epoch": 1.871758934828311, "grad_norm": 1.0239732236288808, "learning_rate": 7.889651756653882e-06, "loss": 0.3235, "step": 668 }, { "epoch": 1.874562018220042, "grad_norm": 1.0072086598087635, "learning_rate": 7.881664258964726e-06, "loss": 0.285, "step": 669 }, { "epoch": 1.877365101611773, "grad_norm": 1.0171350132611843, "learning_rate": 7.873665734414445e-06, "loss": 0.3341, "step": 670 }, { "epoch": 1.880168185003504, "grad_norm": 1.0486133267457405, "learning_rate": 7.865656213609869e-06, "loss": 0.3324, "step": 671 }, { "epoch": 1.8829712683952349, "grad_norm": 1.0417101669974131, "learning_rate": 7.857635727199915e-06, "loss": 0.3424, "step": 672 }, { "epoch": 1.8857743517869656, "grad_norm": 0.9617447446447893, "learning_rate": 7.849604305875452e-06, "loss": 0.2945, "step": 673 }, { "epoch": 1.8885774351786966, "grad_norm": 1.001044918507061, "learning_rate": 7.841561980369197e-06, "loss": 0.294, "step": 674 }, { "epoch": 1.8913805185704273, "grad_norm": 0.938846630312967, "learning_rate": 7.83350878145559e-06, "loss": 0.3342, "step": 675 }, { "epoch": 1.8941836019621583, "grad_norm": 1.0509429693043588, "learning_rate": 7.825444739950678e-06, "loss": 0.3195, "step": 676 }, { "epoch": 1.8969866853538893, "grad_norm": 1.0549270517058966, "learning_rate": 7.817369886712003e-06, "loss": 0.3101, "step": 677 }, { "epoch": 1.8997897687456202, "grad_norm": 0.9941886709286132, "learning_rate": 7.809284252638474e-06, "loss": 0.3119, "step": 678 }, { "epoch": 1.9025928521373512, "grad_norm": 0.9572303093383921, "learning_rate": 7.801187868670253e-06, "loss": 0.3086, "step": 679 }, { "epoch": 1.9053959355290822, "grad_norm": 1.0565296838152023, "learning_rate": 7.79308076578864e-06, "loss": 0.3451, "step": 680 }, { "epoch": 1.908199018920813, "grad_norm": 1.0167744263598684, "learning_rate": 7.784962975015953e-06, "loss": 0.3098, "step": 681 }, { "epoch": 1.9110021023125436, "grad_norm": 0.9452488464744048, "learning_rate": 7.776834527415403e-06, "loss": 0.2989, "step": 682 }, { "epoch": 1.9138051857042746, "grad_norm": 0.9649112314730967, "learning_rate": 7.76869545409098e-06, "loss": 0.3242, "step": 683 }, { "epoch": 1.9166082690960056, "grad_norm": 0.9854323873721721, "learning_rate": 7.76054578618734e-06, "loss": 0.3344, "step": 684 }, { "epoch": 1.9194113524877365, "grad_norm": 1.0405989073796909, "learning_rate": 7.752385554889674e-06, "loss": 0.297, "step": 685 }, { "epoch": 1.9222144358794675, "grad_norm": 0.9433706915766583, "learning_rate": 7.744214791423597e-06, "loss": 0.3116, "step": 686 }, { "epoch": 1.9250175192711985, "grad_norm": 1.0542882712602182, "learning_rate": 7.736033527055025e-06, "loss": 0.2795, "step": 687 }, { "epoch": 1.9278206026629292, "grad_norm": 1.1667611822036257, "learning_rate": 7.727841793090058e-06, "loss": 0.3235, "step": 688 }, { "epoch": 1.9306236860546602, "grad_norm": 1.0443348900152258, "learning_rate": 7.719639620874857e-06, "loss": 0.3042, "step": 689 }, { "epoch": 1.933426769446391, "grad_norm": 1.0629009170054755, "learning_rate": 7.711427041795525e-06, "loss": 0.3191, "step": 690 }, { "epoch": 1.9362298528381219, "grad_norm": 0.995609193672333, "learning_rate": 7.703204087277989e-06, "loss": 0.2706, "step": 691 }, { "epoch": 1.9390329362298528, "grad_norm": 1.0613346871930387, "learning_rate": 7.694970788787877e-06, "loss": 0.3379, "step": 692 }, { "epoch": 1.9418360196215838, "grad_norm": 1.038889453929205, "learning_rate": 7.686727177830403e-06, "loss": 0.3378, "step": 693 }, { "epoch": 1.9446391030133148, "grad_norm": 0.9158531584661774, "learning_rate": 7.678473285950233e-06, "loss": 0.2839, "step": 694 }, { "epoch": 1.9474421864050455, "grad_norm": 0.9600167344012924, "learning_rate": 7.670209144731382e-06, "loss": 0.3405, "step": 695 }, { "epoch": 1.9502452697967765, "grad_norm": 1.0169027673837827, "learning_rate": 7.661934785797083e-06, "loss": 0.3189, "step": 696 }, { "epoch": 1.9530483531885072, "grad_norm": 0.9857662223953864, "learning_rate": 7.653650240809667e-06, "loss": 0.3097, "step": 697 }, { "epoch": 1.9558514365802382, "grad_norm": 1.0340909796071234, "learning_rate": 7.645355541470441e-06, "loss": 0.3514, "step": 698 }, { "epoch": 1.9586545199719692, "grad_norm": 1.0367612546167349, "learning_rate": 7.63705071951957e-06, "loss": 0.2808, "step": 699 }, { "epoch": 1.9614576033637001, "grad_norm": 0.9660509007381327, "learning_rate": 7.628735806735954e-06, "loss": 0.3482, "step": 700 }, { "epoch": 1.964260686755431, "grad_norm": 0.969366457212378, "learning_rate": 7.620410834937102e-06, "loss": 0.2941, "step": 701 }, { "epoch": 1.9670637701471618, "grad_norm": 0.9720365665382235, "learning_rate": 7.612075835979023e-06, "loss": 0.3183, "step": 702 }, { "epoch": 1.9698668535388928, "grad_norm": 0.9372831657252071, "learning_rate": 7.603730841756087e-06, "loss": 0.323, "step": 703 }, { "epoch": 1.9726699369306235, "grad_norm": 0.9036831971891457, "learning_rate": 7.595375884200917e-06, "loss": 0.3093, "step": 704 }, { "epoch": 1.9754730203223545, "grad_norm": 0.9862325513159128, "learning_rate": 7.587010995284259e-06, "loss": 0.3688, "step": 705 }, { "epoch": 1.9782761037140855, "grad_norm": 1.1290794165906493, "learning_rate": 7.57863620701486e-06, "loss": 0.3258, "step": 706 }, { "epoch": 1.9810791871058164, "grad_norm": 1.011312295887435, "learning_rate": 7.57025155143935e-06, "loss": 0.2734, "step": 707 }, { "epoch": 1.9838822704975474, "grad_norm": 0.9830584089932598, "learning_rate": 7.56185706064212e-06, "loss": 0.3022, "step": 708 }, { "epoch": 1.9866853538892784, "grad_norm": 1.0572513469714837, "learning_rate": 7.553452766745186e-06, "loss": 0.3281, "step": 709 }, { "epoch": 1.989488437281009, "grad_norm": 0.9856430690146805, "learning_rate": 7.54503870190809e-06, "loss": 0.3216, "step": 710 }, { "epoch": 1.99229152067274, "grad_norm": 0.9753729640372196, "learning_rate": 7.536614898327752e-06, "loss": 0.311, "step": 711 }, { "epoch": 1.9950946040644708, "grad_norm": 0.9864433750205578, "learning_rate": 7.52818138823836e-06, "loss": 0.2953, "step": 712 }, { "epoch": 1.9978976874562018, "grad_norm": 0.9892478387895355, "learning_rate": 7.519738203911251e-06, "loss": 0.3078, "step": 713 }, { "epoch": 2.0, "grad_norm": 0.9438255779603106, "learning_rate": 7.511285377654771e-06, "loss": 0.2418, "step": 714 }, { "epoch": 2.002803083391731, "grad_norm": 0.9080470878038334, "learning_rate": 7.502822941814171e-06, "loss": 0.2548, "step": 715 }, { "epoch": 2.005606166783462, "grad_norm": 0.8826594015336416, "learning_rate": 7.494350928771466e-06, "loss": 0.2594, "step": 716 }, { "epoch": 2.008409250175193, "grad_norm": 0.8386141382563697, "learning_rate": 7.485869370945325e-06, "loss": 0.2423, "step": 717 }, { "epoch": 2.0112123335669234, "grad_norm": 0.8223334458252292, "learning_rate": 7.477378300790935e-06, "loss": 0.2366, "step": 718 }, { "epoch": 2.0140154169586544, "grad_norm": 0.8888082287599987, "learning_rate": 7.468877750799887e-06, "loss": 0.2125, "step": 719 }, { "epoch": 2.0168185003503853, "grad_norm": 0.9620219450586115, "learning_rate": 7.460367753500045e-06, "loss": 0.2459, "step": 720 }, { "epoch": 2.0196215837421163, "grad_norm": 0.8418731161458017, "learning_rate": 7.451848341455424e-06, "loss": 0.2294, "step": 721 }, { "epoch": 2.0224246671338473, "grad_norm": 0.9676558507429851, "learning_rate": 7.443319547266064e-06, "loss": 0.204, "step": 722 }, { "epoch": 2.0252277505255782, "grad_norm": 0.8911468048586156, "learning_rate": 7.43478140356791e-06, "loss": 0.2358, "step": 723 }, { "epoch": 2.028030833917309, "grad_norm": 0.9789863373908476, "learning_rate": 7.426233943032679e-06, "loss": 0.2105, "step": 724 }, { "epoch": 2.0308339173090397, "grad_norm": 0.8331751325732193, "learning_rate": 7.417677198367742e-06, "loss": 0.2348, "step": 725 }, { "epoch": 2.0336370007007707, "grad_norm": 0.9120461125338731, "learning_rate": 7.409111202315996e-06, "loss": 0.2534, "step": 726 }, { "epoch": 2.0364400840925017, "grad_norm": 0.902479135637207, "learning_rate": 7.400535987655739e-06, "loss": 0.2057, "step": 727 }, { "epoch": 2.0392431674842326, "grad_norm": 0.8439282822189133, "learning_rate": 7.391951587200543e-06, "loss": 0.2225, "step": 728 }, { "epoch": 2.0420462508759636, "grad_norm": 0.8248802433546494, "learning_rate": 7.3833580337991316e-06, "loss": 0.2535, "step": 729 }, { "epoch": 2.0448493342676946, "grad_norm": 0.8873168428680299, "learning_rate": 7.374755360335253e-06, "loss": 0.1747, "step": 730 }, { "epoch": 2.0476524176594255, "grad_norm": 0.83012996391051, "learning_rate": 7.366143599727551e-06, "loss": 0.2454, "step": 731 }, { "epoch": 2.0504555010511565, "grad_norm": 0.8914474878489079, "learning_rate": 7.3575227849294475e-06, "loss": 0.2396, "step": 732 }, { "epoch": 2.053258584442887, "grad_norm": 0.9122725647435613, "learning_rate": 7.3488929489290066e-06, "loss": 0.1932, "step": 733 }, { "epoch": 2.056061667834618, "grad_norm": 0.8529844434099163, "learning_rate": 7.340254124748811e-06, "loss": 0.2123, "step": 734 }, { "epoch": 2.058864751226349, "grad_norm": 0.872863409999642, "learning_rate": 7.331606345445841e-06, "loss": 0.2603, "step": 735 }, { "epoch": 2.06166783461808, "grad_norm": 0.8300659979722327, "learning_rate": 7.322949644111343e-06, "loss": 0.2199, "step": 736 }, { "epoch": 2.064470918009811, "grad_norm": 0.8094207514208382, "learning_rate": 7.314284053870704e-06, "loss": 0.2157, "step": 737 }, { "epoch": 2.067274001401542, "grad_norm": 0.8362024824106278, "learning_rate": 7.305609607883325e-06, "loss": 0.2148, "step": 738 }, { "epoch": 2.070077084793273, "grad_norm": 0.8690662905370502, "learning_rate": 7.296926339342493e-06, "loss": 0.236, "step": 739 }, { "epoch": 2.0728801681850033, "grad_norm": 0.8328393205731416, "learning_rate": 7.288234281475255e-06, "loss": 0.2276, "step": 740 }, { "epoch": 2.0756832515767343, "grad_norm": 0.875039815996908, "learning_rate": 7.279533467542295e-06, "loss": 0.2098, "step": 741 }, { "epoch": 2.0784863349684652, "grad_norm": 0.8813695483618433, "learning_rate": 7.270823930837796e-06, "loss": 0.2298, "step": 742 }, { "epoch": 2.081289418360196, "grad_norm": 0.8829770875487482, "learning_rate": 7.262105704689325e-06, "loss": 0.2129, "step": 743 }, { "epoch": 2.084092501751927, "grad_norm": 0.9148251721146092, "learning_rate": 7.253378822457696e-06, "loss": 0.2545, "step": 744 }, { "epoch": 2.086895585143658, "grad_norm": 0.8517069377242505, "learning_rate": 7.244643317536847e-06, "loss": 0.2087, "step": 745 }, { "epoch": 2.089698668535389, "grad_norm": 0.8335500923445399, "learning_rate": 7.235899223353713e-06, "loss": 0.209, "step": 746 }, { "epoch": 2.0925017519271196, "grad_norm": 0.8362768129075926, "learning_rate": 7.2271465733680935e-06, "loss": 0.2061, "step": 747 }, { "epoch": 2.0953048353188506, "grad_norm": 0.9051045608267754, "learning_rate": 7.21838540107253e-06, "loss": 0.2238, "step": 748 }, { "epoch": 2.0981079187105816, "grad_norm": 0.9254101156511656, "learning_rate": 7.20961573999217e-06, "loss": 0.1949, "step": 749 }, { "epoch": 2.1009110021023125, "grad_norm": 0.8494352768539863, "learning_rate": 7.200837623684654e-06, "loss": 0.2459, "step": 750 }, { "epoch": 2.1037140854940435, "grad_norm": 0.8708647921141418, "learning_rate": 7.192051085739966e-06, "loss": 0.2273, "step": 751 }, { "epoch": 2.1065171688857744, "grad_norm": 0.8469502145093527, "learning_rate": 7.183256159780321e-06, "loss": 0.2094, "step": 752 }, { "epoch": 2.1093202522775054, "grad_norm": 0.9201483952213855, "learning_rate": 7.17445287946003e-06, "loss": 0.205, "step": 753 }, { "epoch": 2.112123335669236, "grad_norm": 0.877283394608266, "learning_rate": 7.1656412784653765e-06, "loss": 0.2104, "step": 754 }, { "epoch": 2.114926419060967, "grad_norm": 0.85682961264913, "learning_rate": 7.1568213905144765e-06, "loss": 0.2305, "step": 755 }, { "epoch": 2.117729502452698, "grad_norm": 0.9068964187625099, "learning_rate": 7.147993249357163e-06, "loss": 0.229, "step": 756 }, { "epoch": 2.120532585844429, "grad_norm": 0.845779591253865, "learning_rate": 7.139156888774845e-06, "loss": 0.2203, "step": 757 }, { "epoch": 2.12333566923616, "grad_norm": 0.8107378653739076, "learning_rate": 7.130312342580389e-06, "loss": 0.2773, "step": 758 }, { "epoch": 2.1261387526278908, "grad_norm": 0.8869784736417072, "learning_rate": 7.121459644617979e-06, "loss": 0.2175, "step": 759 }, { "epoch": 2.1289418360196217, "grad_norm": 0.9196118134762095, "learning_rate": 7.112598828762998e-06, "loss": 0.1877, "step": 760 }, { "epoch": 2.1317449194113527, "grad_norm": 0.8425074607390686, "learning_rate": 7.103729928921886e-06, "loss": 0.2362, "step": 761 }, { "epoch": 2.134548002803083, "grad_norm": 0.8493908080559867, "learning_rate": 7.094852979032021e-06, "loss": 0.2223, "step": 762 }, { "epoch": 2.137351086194814, "grad_norm": 0.8539519141110158, "learning_rate": 7.085968013061585e-06, "loss": 0.1856, "step": 763 }, { "epoch": 2.140154169586545, "grad_norm": 0.8822840272790886, "learning_rate": 7.0770750650094335e-06, "loss": 0.2206, "step": 764 }, { "epoch": 2.142957252978276, "grad_norm": 0.8973299598197616, "learning_rate": 7.068174168904965e-06, "loss": 0.2333, "step": 765 }, { "epoch": 2.145760336370007, "grad_norm": 0.8417160109366958, "learning_rate": 7.059265358807991e-06, "loss": 0.2471, "step": 766 }, { "epoch": 2.148563419761738, "grad_norm": 0.8376755979996072, "learning_rate": 7.050348668808609e-06, "loss": 0.2082, "step": 767 }, { "epoch": 2.151366503153469, "grad_norm": 0.8551391805726851, "learning_rate": 7.041424133027068e-06, "loss": 0.2247, "step": 768 }, { "epoch": 2.1541695865451995, "grad_norm": 0.8439640386477707, "learning_rate": 7.032491785613641e-06, "loss": 0.2429, "step": 769 }, { "epoch": 2.1569726699369305, "grad_norm": 0.9282329146542126, "learning_rate": 7.023551660748489e-06, "loss": 0.2247, "step": 770 }, { "epoch": 2.1597757533286615, "grad_norm": 0.8759587860371314, "learning_rate": 7.014603792641536e-06, "loss": 0.224, "step": 771 }, { "epoch": 2.1625788367203924, "grad_norm": 0.8890636750475975, "learning_rate": 7.005648215532338e-06, "loss": 0.2169, "step": 772 }, { "epoch": 2.1653819201121234, "grad_norm": 0.8072442845315246, "learning_rate": 6.996684963689946e-06, "loss": 0.2254, "step": 773 }, { "epoch": 2.1681850035038543, "grad_norm": 0.8187290548361686, "learning_rate": 6.987714071412781e-06, "loss": 0.1894, "step": 774 }, { "epoch": 2.1709880868955853, "grad_norm": 0.8450241179024212, "learning_rate": 6.9787355730285e-06, "loss": 0.202, "step": 775 }, { "epoch": 2.1737911702873163, "grad_norm": 0.8925844522225541, "learning_rate": 6.969749502893868e-06, "loss": 0.23, "step": 776 }, { "epoch": 2.176594253679047, "grad_norm": 0.8790931855716, "learning_rate": 6.9607558953946175e-06, "loss": 0.2066, "step": 777 }, { "epoch": 2.1793973370707778, "grad_norm": 0.9381548374096674, "learning_rate": 6.9517547849453315e-06, "loss": 0.2706, "step": 778 }, { "epoch": 2.1822004204625087, "grad_norm": 0.9238601216907635, "learning_rate": 6.942746205989295e-06, "loss": 0.2297, "step": 779 }, { "epoch": 2.1850035038542397, "grad_norm": 0.9866325447543289, "learning_rate": 6.933730192998378e-06, "loss": 0.189, "step": 780 }, { "epoch": 2.1878065872459707, "grad_norm": 0.8449034073236847, "learning_rate": 6.924706780472894e-06, "loss": 0.2045, "step": 781 }, { "epoch": 2.1906096706377016, "grad_norm": 0.8553219392885109, "learning_rate": 6.915676002941473e-06, "loss": 0.2285, "step": 782 }, { "epoch": 2.193412754029432, "grad_norm": 0.8794945951009563, "learning_rate": 6.906637894960928e-06, "loss": 0.2156, "step": 783 }, { "epoch": 2.196215837421163, "grad_norm": 0.8987598950398725, "learning_rate": 6.897592491116117e-06, "loss": 0.1859, "step": 784 }, { "epoch": 2.199018920812894, "grad_norm": 0.8509492945576145, "learning_rate": 6.888539826019824e-06, "loss": 0.2158, "step": 785 }, { "epoch": 2.201822004204625, "grad_norm": 0.882168503694161, "learning_rate": 6.879479934312616e-06, "loss": 0.2092, "step": 786 }, { "epoch": 2.204625087596356, "grad_norm": 0.8634636045570749, "learning_rate": 6.8704128506627086e-06, "loss": 0.2266, "step": 787 }, { "epoch": 2.207428170988087, "grad_norm": 0.8518036135167025, "learning_rate": 6.861338609765842e-06, "loss": 0.2571, "step": 788 }, { "epoch": 2.210231254379818, "grad_norm": 0.8405617797518206, "learning_rate": 6.852257246345145e-06, "loss": 0.2266, "step": 789 }, { "epoch": 2.213034337771549, "grad_norm": 0.901508884663155, "learning_rate": 6.843168795150998e-06, "loss": 0.2163, "step": 790 }, { "epoch": 2.2158374211632794, "grad_norm": 0.820196270139301, "learning_rate": 6.834073290960905e-06, "loss": 0.2296, "step": 791 }, { "epoch": 2.2186405045550104, "grad_norm": 0.852940857104472, "learning_rate": 6.82497076857936e-06, "loss": 0.1952, "step": 792 }, { "epoch": 2.2214435879467413, "grad_norm": 0.8318702724851098, "learning_rate": 6.815861262837706e-06, "loss": 0.212, "step": 793 }, { "epoch": 2.2242466713384723, "grad_norm": 1.0371558711867102, "learning_rate": 6.80674480859402e-06, "loss": 0.2053, "step": 794 }, { "epoch": 2.2270497547302033, "grad_norm": 0.9082510123437368, "learning_rate": 6.797621440732955e-06, "loss": 0.2283, "step": 795 }, { "epoch": 2.2298528381219342, "grad_norm": 0.8122480768933811, "learning_rate": 6.788491194165629e-06, "loss": 0.2047, "step": 796 }, { "epoch": 2.232655921513665, "grad_norm": 0.8349666180090791, "learning_rate": 6.779354103829474e-06, "loss": 0.2559, "step": 797 }, { "epoch": 2.2354590049053957, "grad_norm": 0.8941808733581829, "learning_rate": 6.770210204688119e-06, "loss": 0.1728, "step": 798 }, { "epoch": 2.2382620882971267, "grad_norm": 0.8346775956103791, "learning_rate": 6.761059531731241e-06, "loss": 0.2162, "step": 799 }, { "epoch": 2.2410651716888577, "grad_norm": 0.8488565835499525, "learning_rate": 6.751902119974437e-06, "loss": 0.2154, "step": 800 } ], "logging_steps": 1, "max_steps": 1785, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 996094226137088.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }