{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.998016660055534, "eval_steps": 500, "global_step": 1575, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0031733439111463705, "grad_norm": 6.348144432543206, "learning_rate": 5.063291139240507e-07, "loss": 0.8982, "step": 1 }, { "epoch": 0.006346687822292741, "grad_norm": 6.383645545622549, "learning_rate": 1.0126582278481013e-06, "loss": 0.9021, "step": 2 }, { "epoch": 0.009520031733439112, "grad_norm": 6.2539360712589085, "learning_rate": 1.518987341772152e-06, "loss": 0.8937, "step": 3 }, { "epoch": 0.012693375644585482, "grad_norm": 6.225921771088206, "learning_rate": 2.0253164556962026e-06, "loss": 0.8949, "step": 4 }, { "epoch": 0.01586671955573185, "grad_norm": 5.8423675974230775, "learning_rate": 2.5316455696202535e-06, "loss": 0.8753, "step": 5 }, { "epoch": 0.019040063466878223, "grad_norm": 4.757379141562461, "learning_rate": 3.037974683544304e-06, "loss": 0.8416, "step": 6 }, { "epoch": 0.022213407378024592, "grad_norm": 4.272314367321253, "learning_rate": 3.544303797468355e-06, "loss": 0.8311, "step": 7 }, { "epoch": 0.025386751289170964, "grad_norm": 2.4318992207766845, "learning_rate": 4.050632911392405e-06, "loss": 0.7784, "step": 8 }, { "epoch": 0.028560095200317333, "grad_norm": 2.248095466527857, "learning_rate": 4.556962025316456e-06, "loss": 0.7769, "step": 9 }, { "epoch": 0.0317334391114637, "grad_norm": 4.2093273786875836, "learning_rate": 5.063291139240507e-06, "loss": 0.7732, "step": 10 }, { "epoch": 0.03490678302261008, "grad_norm": 4.309584499183112, "learning_rate": 5.569620253164557e-06, "loss": 0.7619, "step": 11 }, { "epoch": 0.038080126933756446, "grad_norm": 4.357671892488039, "learning_rate": 6.075949367088608e-06, "loss": 0.7564, "step": 12 }, { "epoch": 0.041253470844902815, "grad_norm": 3.6078600546011184, "learning_rate": 6.582278481012659e-06, "loss": 0.7009, "step": 13 }, { "epoch": 0.044426814756049184, "grad_norm": 3.651249913873882, "learning_rate": 7.08860759493671e-06, "loss": 0.699, "step": 14 }, { "epoch": 0.04760015866719556, "grad_norm": 2.9003418519249107, "learning_rate": 7.5949367088607605e-06, "loss": 0.6807, "step": 15 }, { "epoch": 0.05077350257834193, "grad_norm": 2.0070454552304744, "learning_rate": 8.10126582278481e-06, "loss": 0.6622, "step": 16 }, { "epoch": 0.0539468464894883, "grad_norm": 1.6956361687151162, "learning_rate": 8.607594936708861e-06, "loss": 0.6405, "step": 17 }, { "epoch": 0.057120190400634666, "grad_norm": 2.212305308576385, "learning_rate": 9.113924050632912e-06, "loss": 0.6294, "step": 18 }, { "epoch": 0.06029353431178104, "grad_norm": 2.2094016296877634, "learning_rate": 9.620253164556963e-06, "loss": 0.6225, "step": 19 }, { "epoch": 0.0634668782229274, "grad_norm": 1.6141377735329752, "learning_rate": 1.0126582278481014e-05, "loss": 0.611, "step": 20 }, { "epoch": 0.06664022213407378, "grad_norm": 1.204912005316192, "learning_rate": 1.0632911392405063e-05, "loss": 0.6073, "step": 21 }, { "epoch": 0.06981356604522015, "grad_norm": 1.3322615853509538, "learning_rate": 1.1139240506329114e-05, "loss": 0.5935, "step": 22 }, { "epoch": 0.07298690995636652, "grad_norm": 1.3306388942331462, "learning_rate": 1.1645569620253165e-05, "loss": 0.5827, "step": 23 }, { "epoch": 0.07616025386751289, "grad_norm": 0.8751536661021161, "learning_rate": 1.2151898734177216e-05, "loss": 0.5805, "step": 24 }, { "epoch": 0.07933359777865927, "grad_norm": 0.9015772829079778, "learning_rate": 1.2658227848101268e-05, "loss": 0.5726, "step": 25 }, { "epoch": 0.08250694168980563, "grad_norm": 0.9818736669411537, "learning_rate": 1.3164556962025317e-05, "loss": 0.5683, "step": 26 }, { "epoch": 0.085680285600952, "grad_norm": 0.6758237512246137, "learning_rate": 1.3670886075949368e-05, "loss": 0.5554, "step": 27 }, { "epoch": 0.08885362951209837, "grad_norm": 0.7798065896044373, "learning_rate": 1.417721518987342e-05, "loss": 0.562, "step": 28 }, { "epoch": 0.09202697342324474, "grad_norm": 0.5943455450936407, "learning_rate": 1.468354430379747e-05, "loss": 0.5516, "step": 29 }, { "epoch": 0.09520031733439112, "grad_norm": 0.5360484272133458, "learning_rate": 1.5189873417721521e-05, "loss": 0.5466, "step": 30 }, { "epoch": 0.09837366124553748, "grad_norm": 0.6427312239326713, "learning_rate": 1.5696202531645572e-05, "loss": 0.5282, "step": 31 }, { "epoch": 0.10154700515668386, "grad_norm": 0.52852465695618, "learning_rate": 1.620253164556962e-05, "loss": 0.5358, "step": 32 }, { "epoch": 0.10472034906783023, "grad_norm": 0.5252420797220524, "learning_rate": 1.6708860759493674e-05, "loss": 0.5292, "step": 33 }, { "epoch": 0.1078936929789766, "grad_norm": 0.6243715389867477, "learning_rate": 1.7215189873417723e-05, "loss": 0.5371, "step": 34 }, { "epoch": 0.11106703689012297, "grad_norm": 0.4003457981794737, "learning_rate": 1.7721518987341772e-05, "loss": 0.5186, "step": 35 }, { "epoch": 0.11424038080126933, "grad_norm": 0.5457989224110974, "learning_rate": 1.8227848101265824e-05, "loss": 0.5223, "step": 36 }, { "epoch": 0.1174137247124157, "grad_norm": 0.3580435227349059, "learning_rate": 1.8734177215189874e-05, "loss": 0.5158, "step": 37 }, { "epoch": 0.12058706862356208, "grad_norm": 0.4663870760426878, "learning_rate": 1.9240506329113926e-05, "loss": 0.518, "step": 38 }, { "epoch": 0.12376041253470844, "grad_norm": 0.4046971344951154, "learning_rate": 1.974683544303798e-05, "loss": 0.5107, "step": 39 }, { "epoch": 0.1269337564458548, "grad_norm": 0.41758513906570793, "learning_rate": 2.0253164556962028e-05, "loss": 0.5124, "step": 40 }, { "epoch": 0.13010710035700118, "grad_norm": 0.4062531142420596, "learning_rate": 2.0759493670886077e-05, "loss": 0.504, "step": 41 }, { "epoch": 0.13328044426814756, "grad_norm": 0.3998294866521708, "learning_rate": 2.1265822784810126e-05, "loss": 0.5025, "step": 42 }, { "epoch": 0.13645378817929393, "grad_norm": 0.3867028742210112, "learning_rate": 2.177215189873418e-05, "loss": 0.5043, "step": 43 }, { "epoch": 0.1396271320904403, "grad_norm": 0.43575380555266197, "learning_rate": 2.2278481012658228e-05, "loss": 0.4972, "step": 44 }, { "epoch": 0.14280047600158668, "grad_norm": 0.6078264125881937, "learning_rate": 2.278481012658228e-05, "loss": 0.4975, "step": 45 }, { "epoch": 0.14597381991273303, "grad_norm": 0.973098853907099, "learning_rate": 2.329113924050633e-05, "loss": 0.4998, "step": 46 }, { "epoch": 0.1491471638238794, "grad_norm": 1.2032473307606681, "learning_rate": 2.379746835443038e-05, "loss": 0.5023, "step": 47 }, { "epoch": 0.15232050773502578, "grad_norm": 0.517082628902644, "learning_rate": 2.430379746835443e-05, "loss": 0.4926, "step": 48 }, { "epoch": 0.15549385164617216, "grad_norm": 0.8487019755795518, "learning_rate": 2.481012658227848e-05, "loss": 0.4981, "step": 49 }, { "epoch": 0.15866719555731854, "grad_norm": 1.0951673394024417, "learning_rate": 2.5316455696202537e-05, "loss": 0.4962, "step": 50 }, { "epoch": 0.16184053946846488, "grad_norm": 0.755728182616416, "learning_rate": 2.5822784810126586e-05, "loss": 0.4962, "step": 51 }, { "epoch": 0.16501388337961126, "grad_norm": 1.3057901440899915, "learning_rate": 2.6329113924050635e-05, "loss": 0.4865, "step": 52 }, { "epoch": 0.16818722729075763, "grad_norm": 0.6650026611670792, "learning_rate": 2.6835443037974687e-05, "loss": 0.4895, "step": 53 }, { "epoch": 0.171360571201904, "grad_norm": 0.7738295996967581, "learning_rate": 2.7341772151898737e-05, "loss": 0.4968, "step": 54 }, { "epoch": 0.1745339151130504, "grad_norm": 0.9000555872432531, "learning_rate": 2.784810126582279e-05, "loss": 0.4909, "step": 55 }, { "epoch": 0.17770725902419673, "grad_norm": 0.8162914872438413, "learning_rate": 2.835443037974684e-05, "loss": 0.4834, "step": 56 }, { "epoch": 0.1808806029353431, "grad_norm": 0.7906164914396787, "learning_rate": 2.8860759493670888e-05, "loss": 0.4865, "step": 57 }, { "epoch": 0.18405394684648949, "grad_norm": 0.9490020845109697, "learning_rate": 2.936708860759494e-05, "loss": 0.4908, "step": 58 }, { "epoch": 0.18722729075763586, "grad_norm": 0.7752131353512624, "learning_rate": 2.987341772151899e-05, "loss": 0.4782, "step": 59 }, { "epoch": 0.19040063466878224, "grad_norm": 0.84561035331472, "learning_rate": 3.0379746835443042e-05, "loss": 0.4703, "step": 60 }, { "epoch": 0.1935739785799286, "grad_norm": 0.5602877470932804, "learning_rate": 3.088607594936709e-05, "loss": 0.4775, "step": 61 }, { "epoch": 0.19674732249107496, "grad_norm": 0.6821307536004709, "learning_rate": 3.1392405063291144e-05, "loss": 0.4773, "step": 62 }, { "epoch": 0.19992066640222134, "grad_norm": 0.717968030563759, "learning_rate": 3.1898734177215196e-05, "loss": 0.4742, "step": 63 }, { "epoch": 0.2030940103133677, "grad_norm": 0.8033651165413181, "learning_rate": 3.240506329113924e-05, "loss": 0.4721, "step": 64 }, { "epoch": 0.2062673542245141, "grad_norm": 0.8192446128586057, "learning_rate": 3.2911392405063295e-05, "loss": 0.4772, "step": 65 }, { "epoch": 0.20944069813566046, "grad_norm": 1.431814362557968, "learning_rate": 3.341772151898735e-05, "loss": 0.4833, "step": 66 }, { "epoch": 0.2126140420468068, "grad_norm": 0.8878416063140385, "learning_rate": 3.392405063291139e-05, "loss": 0.4769, "step": 67 }, { "epoch": 0.2157873859579532, "grad_norm": 0.8851153809304894, "learning_rate": 3.4430379746835445e-05, "loss": 0.4702, "step": 68 }, { "epoch": 0.21896072986909956, "grad_norm": 0.8786193628266902, "learning_rate": 3.49367088607595e-05, "loss": 0.4685, "step": 69 }, { "epoch": 0.22213407378024594, "grad_norm": 0.7405283080781369, "learning_rate": 3.5443037974683544e-05, "loss": 0.4717, "step": 70 }, { "epoch": 0.22530741769139231, "grad_norm": 1.043777209641996, "learning_rate": 3.5949367088607596e-05, "loss": 0.4739, "step": 71 }, { "epoch": 0.22848076160253866, "grad_norm": 1.3637808792967987, "learning_rate": 3.645569620253165e-05, "loss": 0.4609, "step": 72 }, { "epoch": 0.23165410551368504, "grad_norm": 0.5922364571167895, "learning_rate": 3.69620253164557e-05, "loss": 0.4633, "step": 73 }, { "epoch": 0.2348274494248314, "grad_norm": 1.092592419482634, "learning_rate": 3.746835443037975e-05, "loss": 0.4709, "step": 74 }, { "epoch": 0.2380007933359778, "grad_norm": 1.3150716961544304, "learning_rate": 3.79746835443038e-05, "loss": 0.4645, "step": 75 }, { "epoch": 0.24117413724712417, "grad_norm": 0.8868277668057346, "learning_rate": 3.848101265822785e-05, "loss": 0.4701, "step": 76 }, { "epoch": 0.24434748115827054, "grad_norm": 1.0869800415170585, "learning_rate": 3.89873417721519e-05, "loss": 0.4646, "step": 77 }, { "epoch": 0.2475208250694169, "grad_norm": 1.0125334083010975, "learning_rate": 3.949367088607596e-05, "loss": 0.4678, "step": 78 }, { "epoch": 0.2506941689805633, "grad_norm": 1.312025871541637, "learning_rate": 4e-05, "loss": 0.4689, "step": 79 }, { "epoch": 0.2538675128917096, "grad_norm": 0.7660489575183855, "learning_rate": 4.0506329113924056e-05, "loss": 0.4627, "step": 80 }, { "epoch": 0.257040856802856, "grad_norm": 1.0627032211180467, "learning_rate": 4.10126582278481e-05, "loss": 0.4665, "step": 81 }, { "epoch": 0.26021420071400236, "grad_norm": 1.2004403182979329, "learning_rate": 4.1518987341772154e-05, "loss": 0.463, "step": 82 }, { "epoch": 0.26338754462514874, "grad_norm": 0.8697213717507531, "learning_rate": 4.202531645569621e-05, "loss": 0.4639, "step": 83 }, { "epoch": 0.2665608885362951, "grad_norm": 0.7770576209264962, "learning_rate": 4.253164556962025e-05, "loss": 0.4636, "step": 84 }, { "epoch": 0.2697342324474415, "grad_norm": 1.006934185356337, "learning_rate": 4.3037974683544305e-05, "loss": 0.4687, "step": 85 }, { "epoch": 0.27290757635858787, "grad_norm": 1.7918476049624474, "learning_rate": 4.354430379746836e-05, "loss": 0.4775, "step": 86 }, { "epoch": 0.27608092026973424, "grad_norm": 1.0471693596288953, "learning_rate": 4.405063291139241e-05, "loss": 0.4736, "step": 87 }, { "epoch": 0.2792542641808806, "grad_norm": 1.4597496826948222, "learning_rate": 4.4556962025316456e-05, "loss": 0.4786, "step": 88 }, { "epoch": 0.282427608092027, "grad_norm": 1.400079131837375, "learning_rate": 4.506329113924051e-05, "loss": 0.4699, "step": 89 }, { "epoch": 0.28560095200317337, "grad_norm": 1.1646233119330298, "learning_rate": 4.556962025316456e-05, "loss": 0.4721, "step": 90 }, { "epoch": 0.2887742959143197, "grad_norm": 1.5770765835982208, "learning_rate": 4.607594936708861e-05, "loss": 0.4741, "step": 91 }, { "epoch": 0.29194763982546607, "grad_norm": 1.2163067580443867, "learning_rate": 4.658227848101266e-05, "loss": 0.466, "step": 92 }, { "epoch": 0.29512098373661244, "grad_norm": 1.2720825044452424, "learning_rate": 4.708860759493671e-05, "loss": 0.4641, "step": 93 }, { "epoch": 0.2982943276477588, "grad_norm": 1.1086102734888972, "learning_rate": 4.759493670886076e-05, "loss": 0.4689, "step": 94 }, { "epoch": 0.3014676715589052, "grad_norm": 1.1621389294628086, "learning_rate": 4.810126582278481e-05, "loss": 0.46, "step": 95 }, { "epoch": 0.30464101547005157, "grad_norm": 1.023227048678522, "learning_rate": 4.860759493670886e-05, "loss": 0.4664, "step": 96 }, { "epoch": 0.30781435938119794, "grad_norm": 0.6681644577920394, "learning_rate": 4.911392405063292e-05, "loss": 0.46, "step": 97 }, { "epoch": 0.3109877032923443, "grad_norm": 0.8942148240973403, "learning_rate": 4.962025316455696e-05, "loss": 0.4578, "step": 98 }, { "epoch": 0.3141610472034907, "grad_norm": 0.8129981254381207, "learning_rate": 5.012658227848102e-05, "loss": 0.4588, "step": 99 }, { "epoch": 0.31733439111463707, "grad_norm": 0.7399247303363102, "learning_rate": 5.063291139240507e-05, "loss": 0.4684, "step": 100 }, { "epoch": 0.32050773502578345, "grad_norm": 0.9208344037352115, "learning_rate": 5.113924050632911e-05, "loss": 0.4537, "step": 101 }, { "epoch": 0.32368107893692977, "grad_norm": 0.7718334071956167, "learning_rate": 5.164556962025317e-05, "loss": 0.4541, "step": 102 }, { "epoch": 0.32685442284807614, "grad_norm": 0.9193553200855868, "learning_rate": 5.2151898734177224e-05, "loss": 0.4639, "step": 103 }, { "epoch": 0.3300277667592225, "grad_norm": 1.0893805228839906, "learning_rate": 5.265822784810127e-05, "loss": 0.4591, "step": 104 }, { "epoch": 0.3332011106703689, "grad_norm": 1.3296951795184973, "learning_rate": 5.316455696202532e-05, "loss": 0.4609, "step": 105 }, { "epoch": 0.33637445458151527, "grad_norm": 0.749655425862983, "learning_rate": 5.3670886075949375e-05, "loss": 0.4552, "step": 106 }, { "epoch": 0.33954779849266165, "grad_norm": 1.0304107904402815, "learning_rate": 5.417721518987342e-05, "loss": 0.455, "step": 107 }, { "epoch": 0.342721142403808, "grad_norm": 1.259806934599001, "learning_rate": 5.468354430379747e-05, "loss": 0.456, "step": 108 }, { "epoch": 0.3458944863149544, "grad_norm": 0.7667285389018935, "learning_rate": 5.5189873417721526e-05, "loss": 0.446, "step": 109 }, { "epoch": 0.3490678302261008, "grad_norm": 0.8396599205938527, "learning_rate": 5.569620253164558e-05, "loss": 0.444, "step": 110 }, { "epoch": 0.35224117413724715, "grad_norm": 1.0519977391007944, "learning_rate": 5.6202531645569624e-05, "loss": 0.4509, "step": 111 }, { "epoch": 0.35541451804839347, "grad_norm": 1.0013235222472492, "learning_rate": 5.670886075949368e-05, "loss": 0.4486, "step": 112 }, { "epoch": 0.35858786195953984, "grad_norm": 1.0607291724487944, "learning_rate": 5.721518987341773e-05, "loss": 0.4469, "step": 113 }, { "epoch": 0.3617612058706862, "grad_norm": 0.9739072199836216, "learning_rate": 5.7721518987341775e-05, "loss": 0.453, "step": 114 }, { "epoch": 0.3649345497818326, "grad_norm": 0.9878026910095292, "learning_rate": 5.822784810126583e-05, "loss": 0.4504, "step": 115 }, { "epoch": 0.36810789369297897, "grad_norm": 1.0083153796649502, "learning_rate": 5.873417721518988e-05, "loss": 0.4514, "step": 116 }, { "epoch": 0.37128123760412535, "grad_norm": 1.230069181330947, "learning_rate": 5.9240506329113926e-05, "loss": 0.4547, "step": 117 }, { "epoch": 0.3744545815152717, "grad_norm": 0.8910651737656846, "learning_rate": 5.974683544303798e-05, "loss": 0.447, "step": 118 }, { "epoch": 0.3776279254264181, "grad_norm": 1.0595908240339857, "learning_rate": 6.025316455696203e-05, "loss": 0.4466, "step": 119 }, { "epoch": 0.3808012693375645, "grad_norm": 0.8363383114472455, "learning_rate": 6.0759493670886084e-05, "loss": 0.4463, "step": 120 }, { "epoch": 0.38397461324871085, "grad_norm": 0.907469468063755, "learning_rate": 6.126582278481012e-05, "loss": 0.4492, "step": 121 }, { "epoch": 0.3871479571598572, "grad_norm": 1.3417514038396015, "learning_rate": 6.177215189873418e-05, "loss": 0.4572, "step": 122 }, { "epoch": 0.39032130107100355, "grad_norm": 0.6834706943125461, "learning_rate": 6.227848101265824e-05, "loss": 0.4452, "step": 123 }, { "epoch": 0.3934946449821499, "grad_norm": 1.1807244230068368, "learning_rate": 6.278481012658229e-05, "loss": 0.4486, "step": 124 }, { "epoch": 0.3966679888932963, "grad_norm": 0.9150568429572349, "learning_rate": 6.329113924050633e-05, "loss": 0.4475, "step": 125 }, { "epoch": 0.3998413328044427, "grad_norm": 0.9711686157986756, "learning_rate": 6.379746835443039e-05, "loss": 0.4548, "step": 126 }, { "epoch": 0.40301467671558905, "grad_norm": 0.8470526923713568, "learning_rate": 6.430379746835444e-05, "loss": 0.4573, "step": 127 }, { "epoch": 0.4061880206267354, "grad_norm": 1.1867353576740691, "learning_rate": 6.481012658227848e-05, "loss": 0.4524, "step": 128 }, { "epoch": 0.4093613645378818, "grad_norm": 0.9924058775384482, "learning_rate": 6.531645569620254e-05, "loss": 0.4445, "step": 129 }, { "epoch": 0.4125347084490282, "grad_norm": 1.0279428635787766, "learning_rate": 6.582278481012659e-05, "loss": 0.4536, "step": 130 }, { "epoch": 0.41570805236017455, "grad_norm": 1.327497625966449, "learning_rate": 6.632911392405063e-05, "loss": 0.4517, "step": 131 }, { "epoch": 0.4188813962713209, "grad_norm": 0.8047709339837646, "learning_rate": 6.68354430379747e-05, "loss": 0.4507, "step": 132 }, { "epoch": 0.42205474018246725, "grad_norm": 0.6624995618693278, "learning_rate": 6.734177215189874e-05, "loss": 0.4416, "step": 133 }, { "epoch": 0.4252280840936136, "grad_norm": 0.8937479460795225, "learning_rate": 6.784810126582279e-05, "loss": 0.4413, "step": 134 }, { "epoch": 0.42840142800476, "grad_norm": 1.2288491348657344, "learning_rate": 6.835443037974685e-05, "loss": 0.4481, "step": 135 }, { "epoch": 0.4315747719159064, "grad_norm": 0.7888292495946069, "learning_rate": 6.886075949367089e-05, "loss": 0.4434, "step": 136 }, { "epoch": 0.43474811582705275, "grad_norm": 0.8552313093639724, "learning_rate": 6.936708860759494e-05, "loss": 0.4415, "step": 137 }, { "epoch": 0.4379214597381991, "grad_norm": 1.1555499174697232, "learning_rate": 6.9873417721519e-05, "loss": 0.4469, "step": 138 }, { "epoch": 0.4410948036493455, "grad_norm": 0.8107672566991376, "learning_rate": 7.037974683544304e-05, "loss": 0.4391, "step": 139 }, { "epoch": 0.4442681475604919, "grad_norm": 0.7123665736327819, "learning_rate": 7.088607594936709e-05, "loss": 0.44, "step": 140 }, { "epoch": 0.44744149147163825, "grad_norm": 0.7484877480349433, "learning_rate": 7.139240506329115e-05, "loss": 0.4444, "step": 141 }, { "epoch": 0.45061483538278463, "grad_norm": 0.8956448687190801, "learning_rate": 7.189873417721519e-05, "loss": 0.4472, "step": 142 }, { "epoch": 0.453788179293931, "grad_norm": 1.1407806224479462, "learning_rate": 7.240506329113925e-05, "loss": 0.4549, "step": 143 }, { "epoch": 0.4569615232050773, "grad_norm": 0.6987492103428099, "learning_rate": 7.29113924050633e-05, "loss": 0.4376, "step": 144 }, { "epoch": 0.4601348671162237, "grad_norm": 1.0417474479567967, "learning_rate": 7.341772151898734e-05, "loss": 0.4496, "step": 145 }, { "epoch": 0.4633082110273701, "grad_norm": 0.9448597075694767, "learning_rate": 7.39240506329114e-05, "loss": 0.4446, "step": 146 }, { "epoch": 0.46648155493851645, "grad_norm": 0.891015819461564, "learning_rate": 7.443037974683545e-05, "loss": 0.445, "step": 147 }, { "epoch": 0.4696548988496628, "grad_norm": 0.6640649337472588, "learning_rate": 7.49367088607595e-05, "loss": 0.4415, "step": 148 }, { "epoch": 0.4728282427608092, "grad_norm": 0.6255801769841464, "learning_rate": 7.544303797468355e-05, "loss": 0.4401, "step": 149 }, { "epoch": 0.4760015866719556, "grad_norm": 0.6462120499489888, "learning_rate": 7.59493670886076e-05, "loss": 0.4443, "step": 150 }, { "epoch": 0.47917493058310195, "grad_norm": 0.6031979187078812, "learning_rate": 7.645569620253165e-05, "loss": 0.4315, "step": 151 }, { "epoch": 0.48234827449424833, "grad_norm": 0.7422409369854028, "learning_rate": 7.69620253164557e-05, "loss": 0.4411, "step": 152 }, { "epoch": 0.4855216184053947, "grad_norm": 0.6026165860234792, "learning_rate": 7.746835443037976e-05, "loss": 0.4372, "step": 153 }, { "epoch": 0.4886949623165411, "grad_norm": 0.4790898172907393, "learning_rate": 7.79746835443038e-05, "loss": 0.4397, "step": 154 }, { "epoch": 0.4918683062276874, "grad_norm": 0.5637573427525947, "learning_rate": 7.848101265822786e-05, "loss": 0.4364, "step": 155 }, { "epoch": 0.4950416501388338, "grad_norm": 0.7151994774739086, "learning_rate": 7.898734177215191e-05, "loss": 0.4418, "step": 156 }, { "epoch": 0.49821499404998015, "grad_norm": 0.8523966891138255, "learning_rate": 7.949367088607595e-05, "loss": 0.4426, "step": 157 }, { "epoch": 0.5013883379611266, "grad_norm": 1.0199664244681628, "learning_rate": 8e-05, "loss": 0.4396, "step": 158 }, { "epoch": 0.5045616818722729, "grad_norm": 0.9730498513447443, "learning_rate": 7.999990169177323e-05, "loss": 0.4524, "step": 159 }, { "epoch": 0.5077350257834192, "grad_norm": 0.8763687649740503, "learning_rate": 7.99996067675761e-05, "loss": 0.4385, "step": 160 }, { "epoch": 0.5109083696945657, "grad_norm": 0.7439185882238727, "learning_rate": 7.99991152288583e-05, "loss": 0.4426, "step": 161 }, { "epoch": 0.514081713605712, "grad_norm": 0.8277849239630293, "learning_rate": 7.999842707803597e-05, "loss": 0.4336, "step": 162 }, { "epoch": 0.5172550575168584, "grad_norm": 0.6740425300459671, "learning_rate": 7.999754231849163e-05, "loss": 0.4371, "step": 163 }, { "epoch": 0.5204284014280047, "grad_norm": 0.6114082854253373, "learning_rate": 7.999646095457422e-05, "loss": 0.4421, "step": 164 }, { "epoch": 0.5236017453391512, "grad_norm": 0.8489305672656007, "learning_rate": 7.999518299159912e-05, "loss": 0.4366, "step": 165 }, { "epoch": 0.5267750892502975, "grad_norm": 0.7585314746590911, "learning_rate": 7.999370843584805e-05, "loss": 0.432, "step": 166 }, { "epoch": 0.5299484331614439, "grad_norm": 0.9398955324495176, "learning_rate": 7.999203729456902e-05, "loss": 0.4361, "step": 167 }, { "epoch": 0.5331217770725902, "grad_norm": 1.7667814883362698, "learning_rate": 7.99901695759764e-05, "loss": 0.4484, "step": 168 }, { "epoch": 0.5362951209837367, "grad_norm": 0.5640015415710142, "learning_rate": 7.99881052892508e-05, "loss": 0.4347, "step": 169 }, { "epoch": 0.539468464894883, "grad_norm": 1.512073308742083, "learning_rate": 7.998584444453901e-05, "loss": 0.4454, "step": 170 }, { "epoch": 0.5426418088060293, "grad_norm": 0.9689824722762622, "learning_rate": 7.998338705295406e-05, "loss": 0.4386, "step": 171 }, { "epoch": 0.5458151527171757, "grad_norm": 0.7646001072380342, "learning_rate": 7.9980733126575e-05, "loss": 0.4358, "step": 172 }, { "epoch": 0.548988496628322, "grad_norm": 0.9621346516255858, "learning_rate": 7.997788267844699e-05, "loss": 0.4438, "step": 173 }, { "epoch": 0.5521618405394685, "grad_norm": 0.8313101918847394, "learning_rate": 7.997483572258112e-05, "loss": 0.4402, "step": 174 }, { "epoch": 0.5553351844506148, "grad_norm": 0.6402057742934584, "learning_rate": 7.997159227395449e-05, "loss": 0.4289, "step": 175 }, { "epoch": 0.5585085283617612, "grad_norm": 0.5861524014663406, "learning_rate": 7.996815234850994e-05, "loss": 0.4258, "step": 176 }, { "epoch": 0.5616818722729076, "grad_norm": 0.49147871141369276, "learning_rate": 7.996451596315613e-05, "loss": 0.4284, "step": 177 }, { "epoch": 0.564855216184054, "grad_norm": 0.5385589588185274, "learning_rate": 7.99606831357674e-05, "loss": 0.4326, "step": 178 }, { "epoch": 0.5680285600952003, "grad_norm": 0.5020636674847232, "learning_rate": 7.995665388518366e-05, "loss": 0.4275, "step": 179 }, { "epoch": 0.5712019040063467, "grad_norm": 0.34677645292080367, "learning_rate": 7.995242823121035e-05, "loss": 0.4313, "step": 180 }, { "epoch": 0.5743752479174931, "grad_norm": 0.41481246440728614, "learning_rate": 7.994800619461826e-05, "loss": 0.428, "step": 181 }, { "epoch": 0.5775485918286394, "grad_norm": 0.4728147459852834, "learning_rate": 7.994338779714356e-05, "loss": 0.429, "step": 182 }, { "epoch": 0.5807219357397858, "grad_norm": 0.34714272709199906, "learning_rate": 7.993857306148757e-05, "loss": 0.42, "step": 183 }, { "epoch": 0.5838952796509321, "grad_norm": 0.33307391540828185, "learning_rate": 7.993356201131667e-05, "loss": 0.4306, "step": 184 }, { "epoch": 0.5870686235620786, "grad_norm": 0.4557580388879463, "learning_rate": 7.992835467126226e-05, "loss": 0.4336, "step": 185 }, { "epoch": 0.5902419674732249, "grad_norm": 0.3413621320410112, "learning_rate": 7.992295106692053e-05, "loss": 0.4244, "step": 186 }, { "epoch": 0.5934153113843713, "grad_norm": 0.30301313621067855, "learning_rate": 7.991735122485244e-05, "loss": 0.4264, "step": 187 }, { "epoch": 0.5965886552955176, "grad_norm": 0.27873012092060434, "learning_rate": 7.991155517258351e-05, "loss": 0.4254, "step": 188 }, { "epoch": 0.5997619992066641, "grad_norm": 0.3234296671716833, "learning_rate": 7.990556293860373e-05, "loss": 0.4226, "step": 189 }, { "epoch": 0.6029353431178104, "grad_norm": 0.26061056621175, "learning_rate": 7.989937455236738e-05, "loss": 0.4212, "step": 190 }, { "epoch": 0.6061086870289568, "grad_norm": 0.32864221465315796, "learning_rate": 7.989299004429294e-05, "loss": 0.4226, "step": 191 }, { "epoch": 0.6092820309401031, "grad_norm": 0.4444198405279817, "learning_rate": 7.988640944576287e-05, "loss": 0.4266, "step": 192 }, { "epoch": 0.6124553748512495, "grad_norm": 0.5842630238421751, "learning_rate": 7.987963278912353e-05, "loss": 0.4239, "step": 193 }, { "epoch": 0.6156287187623959, "grad_norm": 0.8349781810783966, "learning_rate": 7.9872660107685e-05, "loss": 0.4249, "step": 194 }, { "epoch": 0.6188020626735422, "grad_norm": 1.167134998920243, "learning_rate": 7.986549143572085e-05, "loss": 0.4274, "step": 195 }, { "epoch": 0.6219754065846886, "grad_norm": 0.9339211226210348, "learning_rate": 7.985812680846804e-05, "loss": 0.4259, "step": 196 }, { "epoch": 0.625148750495835, "grad_norm": 0.7637681964537499, "learning_rate": 7.985056626212678e-05, "loss": 0.4293, "step": 197 }, { "epoch": 0.6283220944069814, "grad_norm": 0.758578666942809, "learning_rate": 7.984280983386022e-05, "loss": 0.4268, "step": 198 }, { "epoch": 0.6314954383181277, "grad_norm": 0.7219115681675803, "learning_rate": 7.983485756179443e-05, "loss": 0.4284, "step": 199 }, { "epoch": 0.6346687822292741, "grad_norm": 0.7774185389424639, "learning_rate": 7.98267094850181e-05, "loss": 0.4357, "step": 200 }, { "epoch": 0.6378421261404205, "grad_norm": 0.8684313423895228, "learning_rate": 7.981836564358235e-05, "loss": 0.4364, "step": 201 }, { "epoch": 0.6410154700515669, "grad_norm": 0.9490290231535304, "learning_rate": 7.980982607850062e-05, "loss": 0.4316, "step": 202 }, { "epoch": 0.6441888139627132, "grad_norm": 1.1158847171655581, "learning_rate": 7.980109083174838e-05, "loss": 0.4326, "step": 203 }, { "epoch": 0.6473621578738595, "grad_norm": 0.5821381704555008, "learning_rate": 7.979215994626295e-05, "loss": 0.4248, "step": 204 }, { "epoch": 0.650535501785006, "grad_norm": 0.6159942615763982, "learning_rate": 7.97830334659433e-05, "loss": 0.4228, "step": 205 }, { "epoch": 0.6537088456961523, "grad_norm": 0.9854660383074767, "learning_rate": 7.977371143564986e-05, "loss": 0.4319, "step": 206 }, { "epoch": 0.6568821896072987, "grad_norm": 0.6753873782895369, "learning_rate": 7.976419390120422e-05, "loss": 0.4257, "step": 207 }, { "epoch": 0.660055533518445, "grad_norm": 0.4578459840507763, "learning_rate": 7.9754480909389e-05, "loss": 0.4305, "step": 208 }, { "epoch": 0.6632288774295915, "grad_norm": 0.7788354439242892, "learning_rate": 7.974457250794752e-05, "loss": 0.4294, "step": 209 }, { "epoch": 0.6664022213407378, "grad_norm": 0.5659494748709242, "learning_rate": 7.973446874558367e-05, "loss": 0.4244, "step": 210 }, { "epoch": 0.6695755652518842, "grad_norm": 0.37038793237805434, "learning_rate": 7.97241696719616e-05, "loss": 0.4259, "step": 211 }, { "epoch": 0.6727489091630305, "grad_norm": 0.5598930828252042, "learning_rate": 7.971367533770548e-05, "loss": 0.424, "step": 212 }, { "epoch": 0.6759222530741769, "grad_norm": 0.38017398481352066, "learning_rate": 7.97029857943993e-05, "loss": 0.4259, "step": 213 }, { "epoch": 0.6790955969853233, "grad_norm": 0.34969847402690163, "learning_rate": 7.969210109458653e-05, "loss": 0.4224, "step": 214 }, { "epoch": 0.6822689408964696, "grad_norm": 0.4326242238353279, "learning_rate": 7.968102129176998e-05, "loss": 0.4217, "step": 215 }, { "epoch": 0.685442284807616, "grad_norm": 0.3678356901641005, "learning_rate": 7.966974644041142e-05, "loss": 0.4268, "step": 216 }, { "epoch": 0.6886156287187624, "grad_norm": 0.3291254839559168, "learning_rate": 7.965827659593138e-05, "loss": 0.4206, "step": 217 }, { "epoch": 0.6917889726299088, "grad_norm": 0.3567104966136532, "learning_rate": 7.964661181470887e-05, "loss": 0.4191, "step": 218 }, { "epoch": 0.6949623165410551, "grad_norm": 0.3146162751453091, "learning_rate": 7.96347521540811e-05, "loss": 0.4133, "step": 219 }, { "epoch": 0.6981356604522015, "grad_norm": 0.25903345117835813, "learning_rate": 7.962269767234315e-05, "loss": 0.4241, "step": 220 }, { "epoch": 0.7013090043633479, "grad_norm": 0.3697676932101927, "learning_rate": 7.96104484287478e-05, "loss": 0.4248, "step": 221 }, { "epoch": 0.7044823482744943, "grad_norm": 0.4012189220411039, "learning_rate": 7.959800448350507e-05, "loss": 0.4188, "step": 222 }, { "epoch": 0.7076556921856406, "grad_norm": 0.43083222853217606, "learning_rate": 7.95853658977821e-05, "loss": 0.4184, "step": 223 }, { "epoch": 0.7108290360967869, "grad_norm": 0.41949845884539716, "learning_rate": 7.957253273370275e-05, "loss": 0.4168, "step": 224 }, { "epoch": 0.7140023800079334, "grad_norm": 0.460716406317274, "learning_rate": 7.955950505434725e-05, "loss": 0.4193, "step": 225 }, { "epoch": 0.7171757239190797, "grad_norm": 0.6387648521603024, "learning_rate": 7.954628292375207e-05, "loss": 0.428, "step": 226 }, { "epoch": 0.7203490678302261, "grad_norm": 0.7552539307927338, "learning_rate": 7.953286640690936e-05, "loss": 0.4243, "step": 227 }, { "epoch": 0.7235224117413724, "grad_norm": 0.6681939851633835, "learning_rate": 7.951925556976686e-05, "loss": 0.4256, "step": 228 }, { "epoch": 0.7266957556525189, "grad_norm": 0.6068582323945105, "learning_rate": 7.950545047922741e-05, "loss": 0.4231, "step": 229 }, { "epoch": 0.7298690995636652, "grad_norm": 0.5668354270139984, "learning_rate": 7.949145120314871e-05, "loss": 0.4216, "step": 230 }, { "epoch": 0.7330424434748116, "grad_norm": 0.5694091160000379, "learning_rate": 7.947725781034299e-05, "loss": 0.4173, "step": 231 }, { "epoch": 0.7362157873859579, "grad_norm": 0.546717014419698, "learning_rate": 7.946287037057657e-05, "loss": 0.4181, "step": 232 }, { "epoch": 0.7393891312971044, "grad_norm": 0.3630503498245246, "learning_rate": 7.944828895456968e-05, "loss": 0.4137, "step": 233 }, { "epoch": 0.7425624752082507, "grad_norm": 0.43965037772138893, "learning_rate": 7.943351363399593e-05, "loss": 0.423, "step": 234 }, { "epoch": 0.745735819119397, "grad_norm": 0.6391260482000344, "learning_rate": 7.941854448148212e-05, "loss": 0.4161, "step": 235 }, { "epoch": 0.7489091630305434, "grad_norm": 0.5110774940868824, "learning_rate": 7.94033815706078e-05, "loss": 0.4128, "step": 236 }, { "epoch": 0.7520825069416898, "grad_norm": 0.3565529564135717, "learning_rate": 7.938802497590491e-05, "loss": 0.4138, "step": 237 }, { "epoch": 0.7552558508528362, "grad_norm": 0.4657810664776175, "learning_rate": 7.937247477285743e-05, "loss": 0.4161, "step": 238 }, { "epoch": 0.7584291947639825, "grad_norm": 0.4101562014538087, "learning_rate": 7.935673103790101e-05, "loss": 0.4198, "step": 239 }, { "epoch": 0.761602538675129, "grad_norm": 0.3474690454226505, "learning_rate": 7.934079384842255e-05, "loss": 0.4182, "step": 240 }, { "epoch": 0.7647758825862753, "grad_norm": 0.501425607063933, "learning_rate": 7.932466328275994e-05, "loss": 0.4154, "step": 241 }, { "epoch": 0.7679492264974217, "grad_norm": 0.5505691214538797, "learning_rate": 7.93083394202015e-05, "loss": 0.4192, "step": 242 }, { "epoch": 0.771122570408568, "grad_norm": 0.48063376141835473, "learning_rate": 7.929182234098576e-05, "loss": 0.4142, "step": 243 }, { "epoch": 0.7742959143197145, "grad_norm": 0.4777993396793008, "learning_rate": 7.927511212630096e-05, "loss": 0.4166, "step": 244 }, { "epoch": 0.7774692582308608, "grad_norm": 0.7002665548552994, "learning_rate": 7.925820885828468e-05, "loss": 0.4222, "step": 245 }, { "epoch": 0.7806426021420071, "grad_norm": 0.9739111639000991, "learning_rate": 7.924111262002338e-05, "loss": 0.4252, "step": 246 }, { "epoch": 0.7838159460531535, "grad_norm": 1.0430449575283998, "learning_rate": 7.922382349555218e-05, "loss": 0.4252, "step": 247 }, { "epoch": 0.7869892899642998, "grad_norm": 0.7337059616611151, "learning_rate": 7.92063415698542e-05, "loss": 0.4144, "step": 248 }, { "epoch": 0.7901626338754463, "grad_norm": 0.5385536889326978, "learning_rate": 7.918866692886031e-05, "loss": 0.4226, "step": 249 }, { "epoch": 0.7933359777865926, "grad_norm": 0.7647340729347486, "learning_rate": 7.917079965944862e-05, "loss": 0.4201, "step": 250 }, { "epoch": 0.796509321697739, "grad_norm": 0.8450817800646696, "learning_rate": 7.915273984944412e-05, "loss": 0.4224, "step": 251 }, { "epoch": 0.7996826656088853, "grad_norm": 0.4920042288499847, "learning_rate": 7.913448758761821e-05, "loss": 0.4146, "step": 252 }, { "epoch": 0.8028560095200318, "grad_norm": 0.39445337326690816, "learning_rate": 7.911604296368826e-05, "loss": 0.4157, "step": 253 }, { "epoch": 0.8060293534311781, "grad_norm": 0.5853009915458679, "learning_rate": 7.909740606831719e-05, "loss": 0.417, "step": 254 }, { "epoch": 0.8092026973423245, "grad_norm": 0.4974087565357452, "learning_rate": 7.907857699311299e-05, "loss": 0.4158, "step": 255 }, { "epoch": 0.8123760412534708, "grad_norm": 0.35737310825619095, "learning_rate": 7.905955583062833e-05, "loss": 0.4231, "step": 256 }, { "epoch": 0.8155493851646172, "grad_norm": 0.3107479347233733, "learning_rate": 7.904034267436004e-05, "loss": 0.4091, "step": 257 }, { "epoch": 0.8187227290757636, "grad_norm": 0.3938008913327802, "learning_rate": 7.902093761874867e-05, "loss": 0.4184, "step": 258 }, { "epoch": 0.8218960729869099, "grad_norm": 0.36143088089529846, "learning_rate": 7.900134075917807e-05, "loss": 0.4188, "step": 259 }, { "epoch": 0.8250694168980564, "grad_norm": 0.27530043552201605, "learning_rate": 7.898155219197488e-05, "loss": 0.409, "step": 260 }, { "epoch": 0.8282427608092027, "grad_norm": 0.30129724789128026, "learning_rate": 7.896157201440801e-05, "loss": 0.4114, "step": 261 }, { "epoch": 0.8314161047203491, "grad_norm": 0.3380484464752199, "learning_rate": 7.894140032468828e-05, "loss": 0.413, "step": 262 }, { "epoch": 0.8345894486314954, "grad_norm": 0.3928098788344459, "learning_rate": 7.892103722196782e-05, "loss": 0.4138, "step": 263 }, { "epoch": 0.8377627925426419, "grad_norm": 0.36724870276604, "learning_rate": 7.890048280633967e-05, "loss": 0.4084, "step": 264 }, { "epoch": 0.8409361364537882, "grad_norm": 0.35512151600153574, "learning_rate": 7.887973717883725e-05, "loss": 0.4101, "step": 265 }, { "epoch": 0.8441094803649345, "grad_norm": 0.32038340926763376, "learning_rate": 7.885880044143382e-05, "loss": 0.4149, "step": 266 }, { "epoch": 0.8472828242760809, "grad_norm": 0.385102685025381, "learning_rate": 7.883767269704209e-05, "loss": 0.4141, "step": 267 }, { "epoch": 0.8504561681872272, "grad_norm": 0.41759937563314564, "learning_rate": 7.88163540495136e-05, "loss": 0.4168, "step": 268 }, { "epoch": 0.8536295120983737, "grad_norm": 0.47749975244001913, "learning_rate": 7.879484460363825e-05, "loss": 0.4164, "step": 269 }, { "epoch": 0.85680285600952, "grad_norm": 0.49048299669029655, "learning_rate": 7.877314446514385e-05, "loss": 0.4102, "step": 270 }, { "epoch": 0.8599761999206664, "grad_norm": 0.5033680425309713, "learning_rate": 7.87512537406955e-05, "loss": 0.4168, "step": 271 }, { "epoch": 0.8631495438318127, "grad_norm": 0.632095279857067, "learning_rate": 7.87291725378951e-05, "loss": 0.4192, "step": 272 }, { "epoch": 0.8663228877429592, "grad_norm": 0.7890821846662114, "learning_rate": 7.870690096528084e-05, "loss": 0.4104, "step": 273 }, { "epoch": 0.8694962316541055, "grad_norm": 0.8461099097389182, "learning_rate": 7.868443913232669e-05, "loss": 0.4166, "step": 274 }, { "epoch": 0.8726695755652519, "grad_norm": 0.7484663137119127, "learning_rate": 7.866178714944178e-05, "loss": 0.4141, "step": 275 }, { "epoch": 0.8758429194763983, "grad_norm": 0.45570202002824084, "learning_rate": 7.863894512796992e-05, "loss": 0.4123, "step": 276 }, { "epoch": 0.8790162633875446, "grad_norm": 0.4294812192002864, "learning_rate": 7.861591318018904e-05, "loss": 0.4144, "step": 277 }, { "epoch": 0.882189607298691, "grad_norm": 0.58391480443298, "learning_rate": 7.859269141931065e-05, "loss": 0.4131, "step": 278 }, { "epoch": 0.8853629512098373, "grad_norm": 0.47580822422237723, "learning_rate": 7.856927995947925e-05, "loss": 0.4086, "step": 279 }, { "epoch": 0.8885362951209838, "grad_norm": 0.38129055085982533, "learning_rate": 7.854567891577179e-05, "loss": 0.4184, "step": 280 }, { "epoch": 0.8917096390321301, "grad_norm": 0.43052433481163255, "learning_rate": 7.852188840419711e-05, "loss": 0.4096, "step": 281 }, { "epoch": 0.8948829829432765, "grad_norm": 0.4433184262594291, "learning_rate": 7.849790854169536e-05, "loss": 0.4104, "step": 282 }, { "epoch": 0.8980563268544228, "grad_norm": 0.4435382933392679, "learning_rate": 7.847373944613745e-05, "loss": 0.4059, "step": 283 }, { "epoch": 0.9012296707655693, "grad_norm": 0.40447203187517, "learning_rate": 7.844938123632439e-05, "loss": 0.4094, "step": 284 }, { "epoch": 0.9044030146767156, "grad_norm": 0.34449120043205606, "learning_rate": 7.842483403198683e-05, "loss": 0.408, "step": 285 }, { "epoch": 0.907576358587862, "grad_norm": 0.37462807346909405, "learning_rate": 7.840009795378436e-05, "loss": 0.4146, "step": 286 }, { "epoch": 0.9107497024990083, "grad_norm": 0.4704923604854831, "learning_rate": 7.837517312330498e-05, "loss": 0.4122, "step": 287 }, { "epoch": 0.9139230464101546, "grad_norm": 0.5097972249194679, "learning_rate": 7.83500596630645e-05, "loss": 0.416, "step": 288 }, { "epoch": 0.9170963903213011, "grad_norm": 0.5351289342983636, "learning_rate": 7.832475769650588e-05, "loss": 0.415, "step": 289 }, { "epoch": 0.9202697342324474, "grad_norm": 0.4674732233010469, "learning_rate": 7.829926734799872e-05, "loss": 0.4107, "step": 290 }, { "epoch": 0.9234430781435938, "grad_norm": 0.4263783054265977, "learning_rate": 7.827358874283855e-05, "loss": 0.4131, "step": 291 }, { "epoch": 0.9266164220547402, "grad_norm": 0.5102719397399545, "learning_rate": 7.824772200724629e-05, "loss": 0.4096, "step": 292 }, { "epoch": 0.9297897659658866, "grad_norm": 0.5129309020648893, "learning_rate": 7.822166726836758e-05, "loss": 0.4077, "step": 293 }, { "epoch": 0.9329631098770329, "grad_norm": 0.45936698859229286, "learning_rate": 7.819542465427217e-05, "loss": 0.4072, "step": 294 }, { "epoch": 0.9361364537881793, "grad_norm": 0.35187396508942126, "learning_rate": 7.816899429395332e-05, "loss": 0.41, "step": 295 }, { "epoch": 0.9393097976993257, "grad_norm": 0.3773900389073242, "learning_rate": 7.814237631732711e-05, "loss": 0.4097, "step": 296 }, { "epoch": 0.9424831416104721, "grad_norm": 0.48196894071104524, "learning_rate": 7.811557085523187e-05, "loss": 0.4125, "step": 297 }, { "epoch": 0.9456564855216184, "grad_norm": 0.43803354037854536, "learning_rate": 7.808857803942741e-05, "loss": 0.4047, "step": 298 }, { "epoch": 0.9488298294327647, "grad_norm": 0.36020876635054705, "learning_rate": 7.80613980025946e-05, "loss": 0.4005, "step": 299 }, { "epoch": 0.9520031733439112, "grad_norm": 0.2741750720795513, "learning_rate": 7.803403087833444e-05, "loss": 0.4023, "step": 300 }, { "epoch": 0.9551765172550575, "grad_norm": 0.25720288277565834, "learning_rate": 7.800647680116764e-05, "loss": 0.4041, "step": 301 }, { "epoch": 0.9583498611662039, "grad_norm": 0.3359257955372094, "learning_rate": 7.797873590653381e-05, "loss": 0.3991, "step": 302 }, { "epoch": 0.9615232050773502, "grad_norm": 0.37500047533306236, "learning_rate": 7.795080833079084e-05, "loss": 0.4075, "step": 303 }, { "epoch": 0.9646965489884967, "grad_norm": 0.4231851966084708, "learning_rate": 7.792269421121429e-05, "loss": 0.408, "step": 304 }, { "epoch": 0.967869892899643, "grad_norm": 0.46773700878625746, "learning_rate": 7.78943936859966e-05, "loss": 0.406, "step": 305 }, { "epoch": 0.9710432368107894, "grad_norm": 0.48276037296021734, "learning_rate": 7.78659068942465e-05, "loss": 0.4113, "step": 306 }, { "epoch": 0.9742165807219357, "grad_norm": 0.4520209571383191, "learning_rate": 7.783723397598829e-05, "loss": 0.418, "step": 307 }, { "epoch": 0.9773899246330822, "grad_norm": 0.4761928527698079, "learning_rate": 7.780837507216114e-05, "loss": 0.406, "step": 308 }, { "epoch": 0.9805632685442285, "grad_norm": 0.6222583620657315, "learning_rate": 7.777933032461845e-05, "loss": 0.4115, "step": 309 }, { "epoch": 0.9837366124553748, "grad_norm": 0.6260831309463507, "learning_rate": 7.775009987612711e-05, "loss": 0.4079, "step": 310 }, { "epoch": 0.9869099563665212, "grad_norm": 0.44371491044635886, "learning_rate": 7.772068387036677e-05, "loss": 0.4057, "step": 311 }, { "epoch": 0.9900833002776676, "grad_norm": 0.3096918566187467, "learning_rate": 7.769108245192922e-05, "loss": 0.4134, "step": 312 }, { "epoch": 0.993256644188814, "grad_norm": 0.497339716362162, "learning_rate": 7.766129576631759e-05, "loss": 0.41, "step": 313 }, { "epoch": 0.9964299880999603, "grad_norm": 0.5309378071519795, "learning_rate": 7.763132395994572e-05, "loss": 0.4023, "step": 314 }, { "epoch": 0.9996033320111067, "grad_norm": 0.44514647594528395, "learning_rate": 7.760116718013735e-05, "loss": 0.4079, "step": 315 }, { "epoch": 1.0027766759222532, "grad_norm": 0.7972378708928999, "learning_rate": 7.757082557512545e-05, "loss": 0.7527, "step": 316 }, { "epoch": 1.0059500198333995, "grad_norm": 1.9850051036948724, "learning_rate": 7.75402992940515e-05, "loss": 0.4284, "step": 317 }, { "epoch": 1.0091233637445458, "grad_norm": 0.5262986098049574, "learning_rate": 7.750958848696473e-05, "loss": 0.3962, "step": 318 }, { "epoch": 1.0122967076556921, "grad_norm": 1.5250119834033022, "learning_rate": 7.747869330482137e-05, "loss": 0.4258, "step": 319 }, { "epoch": 1.0154700515668384, "grad_norm": 0.5638471189301302, "learning_rate": 7.744761389948397e-05, "loss": 0.4036, "step": 320 }, { "epoch": 1.018643395477985, "grad_norm": 0.9646189002979927, "learning_rate": 7.741635042372059e-05, "loss": 0.418, "step": 321 }, { "epoch": 1.0218167393891313, "grad_norm": 0.9961382446454835, "learning_rate": 7.738490303120407e-05, "loss": 0.4059, "step": 322 }, { "epoch": 1.0249900833002776, "grad_norm": 0.7909414038458441, "learning_rate": 7.735327187651127e-05, "loss": 0.4079, "step": 323 }, { "epoch": 1.028163427211424, "grad_norm": 0.5245790298223422, "learning_rate": 7.732145711512234e-05, "loss": 0.4063, "step": 324 }, { "epoch": 1.0313367711225705, "grad_norm": 0.7541383994484598, "learning_rate": 7.728945890341991e-05, "loss": 0.4016, "step": 325 }, { "epoch": 1.0345101150337168, "grad_norm": 0.484806781405723, "learning_rate": 7.725727739868837e-05, "loss": 0.3989, "step": 326 }, { "epoch": 1.0376834589448631, "grad_norm": 0.5797811834279861, "learning_rate": 7.722491275911302e-05, "loss": 0.4013, "step": 327 }, { "epoch": 1.0408568028560095, "grad_norm": 0.45973779856894975, "learning_rate": 7.71923651437794e-05, "loss": 0.4022, "step": 328 }, { "epoch": 1.044030146767156, "grad_norm": 0.4205253473707581, "learning_rate": 7.715963471267243e-05, "loss": 0.394, "step": 329 }, { "epoch": 1.0472034906783023, "grad_norm": 0.3671719170264264, "learning_rate": 7.712672162667563e-05, "loss": 0.3953, "step": 330 }, { "epoch": 1.0503768345894486, "grad_norm": 0.3923903715008762, "learning_rate": 7.709362604757037e-05, "loss": 0.3941, "step": 331 }, { "epoch": 1.053550178500595, "grad_norm": 0.38611037315155544, "learning_rate": 7.706034813803501e-05, "loss": 0.3967, "step": 332 }, { "epoch": 1.0567235224117413, "grad_norm": 0.3205248790594937, "learning_rate": 7.702688806164419e-05, "loss": 0.3978, "step": 333 }, { "epoch": 1.0598968663228878, "grad_norm": 0.3851041881561772, "learning_rate": 7.699324598286794e-05, "loss": 0.3969, "step": 334 }, { "epoch": 1.0630702102340341, "grad_norm": 0.3048551045247859, "learning_rate": 7.69594220670709e-05, "loss": 0.3981, "step": 335 }, { "epoch": 1.0662435541451805, "grad_norm": 0.30649990794634424, "learning_rate": 7.692541648051156e-05, "loss": 0.3883, "step": 336 }, { "epoch": 1.0694168980563268, "grad_norm": 0.28629978941116185, "learning_rate": 7.689122939034135e-05, "loss": 0.391, "step": 337 }, { "epoch": 1.0725902419674733, "grad_norm": 0.2815013785922316, "learning_rate": 7.685686096460387e-05, "loss": 0.3869, "step": 338 }, { "epoch": 1.0757635858786196, "grad_norm": 0.2953463475399726, "learning_rate": 7.682231137223409e-05, "loss": 0.3913, "step": 339 }, { "epoch": 1.078936929789766, "grad_norm": 0.2366855254680076, "learning_rate": 7.678758078305745e-05, "loss": 0.3883, "step": 340 }, { "epoch": 1.0821102737009123, "grad_norm": 0.22104183899331858, "learning_rate": 7.67526693677891e-05, "loss": 0.3926, "step": 341 }, { "epoch": 1.0852836176120586, "grad_norm": 0.22652465216086823, "learning_rate": 7.671757729803299e-05, "loss": 0.3831, "step": 342 }, { "epoch": 1.0884569615232051, "grad_norm": 0.2160165389313366, "learning_rate": 7.668230474628108e-05, "loss": 0.3921, "step": 343 }, { "epoch": 1.0916303054343515, "grad_norm": 0.2107397076821849, "learning_rate": 7.664685188591246e-05, "loss": 0.3855, "step": 344 }, { "epoch": 1.0948036493454978, "grad_norm": 0.24162805284413108, "learning_rate": 7.661121889119257e-05, "loss": 0.3864, "step": 345 }, { "epoch": 1.097976993256644, "grad_norm": 0.20058743270523813, "learning_rate": 7.657540593727218e-05, "loss": 0.3898, "step": 346 }, { "epoch": 1.1011503371677906, "grad_norm": 0.19024320800558892, "learning_rate": 7.653941320018672e-05, "loss": 0.3881, "step": 347 }, { "epoch": 1.104323681078937, "grad_norm": 0.19348467821715107, "learning_rate": 7.650324085685528e-05, "loss": 0.3861, "step": 348 }, { "epoch": 1.1074970249900833, "grad_norm": 0.17855591849282623, "learning_rate": 7.646688908507983e-05, "loss": 0.3879, "step": 349 }, { "epoch": 1.1106703689012296, "grad_norm": 0.17423718420205644, "learning_rate": 7.643035806354427e-05, "loss": 0.393, "step": 350 }, { "epoch": 1.113843712812376, "grad_norm": 0.17435999115090034, "learning_rate": 7.639364797181359e-05, "loss": 0.3923, "step": 351 }, { "epoch": 1.1170170567235225, "grad_norm": 0.1709673536337813, "learning_rate": 7.6356758990333e-05, "loss": 0.3906, "step": 352 }, { "epoch": 1.1201904006346688, "grad_norm": 0.17317855376977867, "learning_rate": 7.6319691300427e-05, "loss": 0.3888, "step": 353 }, { "epoch": 1.1233637445458151, "grad_norm": 0.18174880626145437, "learning_rate": 7.628244508429856e-05, "loss": 0.3865, "step": 354 }, { "epoch": 1.1265370884569614, "grad_norm": 0.20331689410780052, "learning_rate": 7.624502052502814e-05, "loss": 0.3874, "step": 355 }, { "epoch": 1.129710432368108, "grad_norm": 0.23136698366970124, "learning_rate": 7.620741780657284e-05, "loss": 0.3862, "step": 356 }, { "epoch": 1.1328837762792543, "grad_norm": 0.2578608983871263, "learning_rate": 7.61696371137655e-05, "loss": 0.3885, "step": 357 }, { "epoch": 1.1360571201904006, "grad_norm": 0.31713285899714155, "learning_rate": 7.613167863231376e-05, "loss": 0.3858, "step": 358 }, { "epoch": 1.139230464101547, "grad_norm": 0.4030624344185451, "learning_rate": 7.609354254879916e-05, "loss": 0.3894, "step": 359 }, { "epoch": 1.1424038080126935, "grad_norm": 0.5246514767856207, "learning_rate": 7.605522905067626e-05, "loss": 0.3852, "step": 360 }, { "epoch": 1.1455771519238398, "grad_norm": 0.5257615152434831, "learning_rate": 7.601673832627162e-05, "loss": 0.3892, "step": 361 }, { "epoch": 1.1487504958349861, "grad_norm": 0.47164366725819173, "learning_rate": 7.597807056478304e-05, "loss": 0.3941, "step": 362 }, { "epoch": 1.1519238397461324, "grad_norm": 0.40276010096670556, "learning_rate": 7.593922595627843e-05, "loss": 0.3895, "step": 363 }, { "epoch": 1.1550971836572788, "grad_norm": 0.3335379837317923, "learning_rate": 7.590020469169505e-05, "loss": 0.3868, "step": 364 }, { "epoch": 1.1582705275684253, "grad_norm": 0.389480610213482, "learning_rate": 7.586100696283845e-05, "loss": 0.3888, "step": 365 }, { "epoch": 1.1614438714795716, "grad_norm": 0.40641502830320464, "learning_rate": 7.582163296238158e-05, "loss": 0.3886, "step": 366 }, { "epoch": 1.164617215390718, "grad_norm": 0.34031545798126167, "learning_rate": 7.578208288386386e-05, "loss": 0.3888, "step": 367 }, { "epoch": 1.1677905593018643, "grad_norm": 0.3375442621679508, "learning_rate": 7.574235692169021e-05, "loss": 0.3901, "step": 368 }, { "epoch": 1.1709639032130108, "grad_norm": 0.34505650189241893, "learning_rate": 7.570245527113004e-05, "loss": 0.3875, "step": 369 }, { "epoch": 1.1741372471241571, "grad_norm": 0.32855600104681076, "learning_rate": 7.566237812831641e-05, "loss": 0.3808, "step": 370 }, { "epoch": 1.1773105910353034, "grad_norm": 0.33479682880517575, "learning_rate": 7.562212569024494e-05, "loss": 0.3831, "step": 371 }, { "epoch": 1.1804839349464498, "grad_norm": 0.3323687537285638, "learning_rate": 7.558169815477293e-05, "loss": 0.3763, "step": 372 }, { "epoch": 1.1836572788575963, "grad_norm": 0.29597912982148444, "learning_rate": 7.554109572061835e-05, "loss": 0.384, "step": 373 }, { "epoch": 1.1868306227687426, "grad_norm": 0.2608818292389468, "learning_rate": 7.550031858735885e-05, "loss": 0.3844, "step": 374 }, { "epoch": 1.190003966679889, "grad_norm": 0.2362089436062283, "learning_rate": 7.545936695543084e-05, "loss": 0.3847, "step": 375 }, { "epoch": 1.1931773105910353, "grad_norm": 0.27249864452918143, "learning_rate": 7.541824102612839e-05, "loss": 0.3843, "step": 376 }, { "epoch": 1.1963506545021816, "grad_norm": 0.291889265790341, "learning_rate": 7.537694100160242e-05, "loss": 0.385, "step": 377 }, { "epoch": 1.1995239984133281, "grad_norm": 0.26769841877142225, "learning_rate": 7.533546708485949e-05, "loss": 0.3857, "step": 378 }, { "epoch": 1.2026973423244744, "grad_norm": 0.25310229652456356, "learning_rate": 7.529381947976097e-05, "loss": 0.3842, "step": 379 }, { "epoch": 1.2058706862356208, "grad_norm": 0.3794658972915025, "learning_rate": 7.525199839102198e-05, "loss": 0.3853, "step": 380 }, { "epoch": 1.209044030146767, "grad_norm": 0.4433997626787792, "learning_rate": 7.521000402421039e-05, "loss": 0.3871, "step": 381 }, { "epoch": 1.2122173740579134, "grad_norm": 0.44670046763987153, "learning_rate": 7.516783658574575e-05, "loss": 0.38, "step": 382 }, { "epoch": 1.21539071796906, "grad_norm": 0.4470855938681001, "learning_rate": 7.51254962828984e-05, "loss": 0.3883, "step": 383 }, { "epoch": 1.2185640618802063, "grad_norm": 0.37472305499391584, "learning_rate": 7.508298332378832e-05, "loss": 0.3874, "step": 384 }, { "epoch": 1.2217374057913526, "grad_norm": 0.2675107029023701, "learning_rate": 7.504029791738419e-05, "loss": 0.3797, "step": 385 }, { "epoch": 1.224910749702499, "grad_norm": 0.2692926766700756, "learning_rate": 7.499744027350236e-05, "loss": 0.3877, "step": 386 }, { "epoch": 1.2280840936136455, "grad_norm": 0.35704547610000964, "learning_rate": 7.495441060280577e-05, "loss": 0.3847, "step": 387 }, { "epoch": 1.2312574375247918, "grad_norm": 0.393025585260643, "learning_rate": 7.491120911680295e-05, "loss": 0.3833, "step": 388 }, { "epoch": 1.234430781435938, "grad_norm": 0.34992195520115954, "learning_rate": 7.486783602784697e-05, "loss": 0.3804, "step": 389 }, { "epoch": 1.2376041253470844, "grad_norm": 0.26770410504282594, "learning_rate": 7.48242915491344e-05, "loss": 0.3792, "step": 390 }, { "epoch": 1.240777469258231, "grad_norm": 0.21538249724047004, "learning_rate": 7.478057589470429e-05, "loss": 0.3823, "step": 391 }, { "epoch": 1.2439508131693773, "grad_norm": 0.24012637317741398, "learning_rate": 7.473668927943703e-05, "loss": 0.3898, "step": 392 }, { "epoch": 1.2471241570805236, "grad_norm": 0.21927612950882752, "learning_rate": 7.469263191905342e-05, "loss": 0.3832, "step": 393 }, { "epoch": 1.25029750099167, "grad_norm": 0.22562831710853817, "learning_rate": 7.464840403011348e-05, "loss": 0.3781, "step": 394 }, { "epoch": 1.2534708449028162, "grad_norm": 0.25428834981926673, "learning_rate": 7.460400583001549e-05, "loss": 0.3812, "step": 395 }, { "epoch": 1.2566441888139628, "grad_norm": 0.2702984486835505, "learning_rate": 7.455943753699485e-05, "loss": 0.3866, "step": 396 }, { "epoch": 1.259817532725109, "grad_norm": 0.24121323703671108, "learning_rate": 7.451469937012308e-05, "loss": 0.3873, "step": 397 }, { "epoch": 1.2629908766362554, "grad_norm": 0.18653342749296287, "learning_rate": 7.446979154930664e-05, "loss": 0.3825, "step": 398 }, { "epoch": 1.2661642205474017, "grad_norm": 0.16332575184703055, "learning_rate": 7.4424714295286e-05, "loss": 0.3815, "step": 399 }, { "epoch": 1.269337564458548, "grad_norm": 0.25403145188276544, "learning_rate": 7.437946782963434e-05, "loss": 0.383, "step": 400 }, { "epoch": 1.2725109083696946, "grad_norm": 0.3444161535033614, "learning_rate": 7.433405237475668e-05, "loss": 0.3813, "step": 401 }, { "epoch": 1.275684252280841, "grad_norm": 0.3519022102988297, "learning_rate": 7.428846815388867e-05, "loss": 0.3855, "step": 402 }, { "epoch": 1.2788575961919872, "grad_norm": 0.33077524829421917, "learning_rate": 7.424271539109548e-05, "loss": 0.3821, "step": 403 }, { "epoch": 1.2820309401031338, "grad_norm": 0.3162039421386741, "learning_rate": 7.419679431127078e-05, "loss": 0.3851, "step": 404 }, { "epoch": 1.28520428401428, "grad_norm": 0.32717321840623165, "learning_rate": 7.415070514013554e-05, "loss": 0.3835, "step": 405 }, { "epoch": 1.2883776279254264, "grad_norm": 0.3260212879527716, "learning_rate": 7.410444810423703e-05, "loss": 0.3851, "step": 406 }, { "epoch": 1.2915509718365727, "grad_norm": 0.28141296423668416, "learning_rate": 7.405802343094761e-05, "loss": 0.384, "step": 407 }, { "epoch": 1.294724315747719, "grad_norm": 0.29242681028873885, "learning_rate": 7.401143134846361e-05, "loss": 0.3845, "step": 408 }, { "epoch": 1.2978976596588656, "grad_norm": 0.3089571656384374, "learning_rate": 7.396467208580431e-05, "loss": 0.3876, "step": 409 }, { "epoch": 1.301071003570012, "grad_norm": 0.2727831374787424, "learning_rate": 7.39177458728107e-05, "loss": 0.3853, "step": 410 }, { "epoch": 1.3042443474811583, "grad_norm": 0.24218021592794953, "learning_rate": 7.387065294014444e-05, "loss": 0.3854, "step": 411 }, { "epoch": 1.3074176913923046, "grad_norm": 0.26309492335546847, "learning_rate": 7.382339351928664e-05, "loss": 0.3831, "step": 412 }, { "epoch": 1.310591035303451, "grad_norm": 0.2590023202596478, "learning_rate": 7.377596784253682e-05, "loss": 0.3874, "step": 413 }, { "epoch": 1.3137643792145974, "grad_norm": 0.22355363656378588, "learning_rate": 7.372837614301167e-05, "loss": 0.3794, "step": 414 }, { "epoch": 1.3169377231257438, "grad_norm": 0.23277360549451911, "learning_rate": 7.368061865464398e-05, "loss": 0.3823, "step": 415 }, { "epoch": 1.32011106703689, "grad_norm": 0.29759664882744374, "learning_rate": 7.363269561218144e-05, "loss": 0.3854, "step": 416 }, { "epoch": 1.3232844109480366, "grad_norm": 0.37555739230090074, "learning_rate": 7.358460725118553e-05, "loss": 0.3796, "step": 417 }, { "epoch": 1.326457754859183, "grad_norm": 0.4302124224232923, "learning_rate": 7.353635380803031e-05, "loss": 0.3887, "step": 418 }, { "epoch": 1.3296310987703293, "grad_norm": 0.4773944462131208, "learning_rate": 7.348793551990132e-05, "loss": 0.3814, "step": 419 }, { "epoch": 1.3328044426814756, "grad_norm": 0.5403161096787458, "learning_rate": 7.343935262479433e-05, "loss": 0.3823, "step": 420 }, { "epoch": 1.335977786592622, "grad_norm": 0.5311329635820989, "learning_rate": 7.33906053615143e-05, "loss": 0.3863, "step": 421 }, { "epoch": 1.3391511305037684, "grad_norm": 0.44347216349851915, "learning_rate": 7.334169396967403e-05, "loss": 0.3811, "step": 422 }, { "epoch": 1.3423244744149148, "grad_norm": 0.4038867029004121, "learning_rate": 7.329261868969318e-05, "loss": 0.3786, "step": 423 }, { "epoch": 1.345497818326061, "grad_norm": 0.42166277469573826, "learning_rate": 7.324337976279688e-05, "loss": 0.386, "step": 424 }, { "epoch": 1.3486711622372074, "grad_norm": 0.4142405224624998, "learning_rate": 7.319397743101478e-05, "loss": 0.3824, "step": 425 }, { "epoch": 1.3518445061483537, "grad_norm": 0.36700141049097496, "learning_rate": 7.31444119371796e-05, "loss": 0.3829, "step": 426 }, { "epoch": 1.3550178500595003, "grad_norm": 0.3135329163503587, "learning_rate": 7.309468352492616e-05, "loss": 0.3824, "step": 427 }, { "epoch": 1.3581911939706466, "grad_norm": 0.2823327229949492, "learning_rate": 7.304479243869007e-05, "loss": 0.376, "step": 428 }, { "epoch": 1.361364537881793, "grad_norm": 0.30150602495208256, "learning_rate": 7.299473892370651e-05, "loss": 0.3792, "step": 429 }, { "epoch": 1.3645378817929394, "grad_norm": 0.2849241799804419, "learning_rate": 7.294452322600912e-05, "loss": 0.374, "step": 430 }, { "epoch": 1.3677112257040855, "grad_norm": 0.23906164695179707, "learning_rate": 7.289414559242871e-05, "loss": 0.3841, "step": 431 }, { "epoch": 1.370884569615232, "grad_norm": 0.27790217133033873, "learning_rate": 7.284360627059205e-05, "loss": 0.3894, "step": 432 }, { "epoch": 1.3740579135263784, "grad_norm": 0.27965618860067953, "learning_rate": 7.279290550892071e-05, "loss": 0.3856, "step": 433 }, { "epoch": 1.3772312574375247, "grad_norm": 0.2596553182069096, "learning_rate": 7.274204355662981e-05, "loss": 0.3835, "step": 434 }, { "epoch": 1.3804046013486713, "grad_norm": 0.22914836462750776, "learning_rate": 7.269102066372672e-05, "loss": 0.3801, "step": 435 }, { "epoch": 1.3835779452598176, "grad_norm": 0.24961150632312626, "learning_rate": 7.263983708100998e-05, "loss": 0.3792, "step": 436 }, { "epoch": 1.386751289170964, "grad_norm": 0.2799354977533222, "learning_rate": 7.258849306006796e-05, "loss": 0.3785, "step": 437 }, { "epoch": 1.3899246330821102, "grad_norm": 0.2609555149345694, "learning_rate": 7.253698885327761e-05, "loss": 0.3877, "step": 438 }, { "epoch": 1.3930979769932565, "grad_norm": 0.25476756664542266, "learning_rate": 7.24853247138033e-05, "loss": 0.3865, "step": 439 }, { "epoch": 1.396271320904403, "grad_norm": 0.2853597140584723, "learning_rate": 7.243350089559555e-05, "loss": 0.3835, "step": 440 }, { "epoch": 1.3994446648155494, "grad_norm": 0.3037545404410833, "learning_rate": 7.238151765338974e-05, "loss": 0.3755, "step": 441 }, { "epoch": 1.4026180087266957, "grad_norm": 0.32448084195253646, "learning_rate": 7.232937524270486e-05, "loss": 0.3824, "step": 442 }, { "epoch": 1.405791352637842, "grad_norm": 0.3389903691526235, "learning_rate": 7.227707391984233e-05, "loss": 0.3851, "step": 443 }, { "epoch": 1.4089646965489884, "grad_norm": 0.32339033942705137, "learning_rate": 7.222461394188467e-05, "loss": 0.3809, "step": 444 }, { "epoch": 1.412138040460135, "grad_norm": 0.3004560485468966, "learning_rate": 7.217199556669423e-05, "loss": 0.3753, "step": 445 }, { "epoch": 1.4153113843712812, "grad_norm": 0.2963352362165301, "learning_rate": 7.211921905291198e-05, "loss": 0.3845, "step": 446 }, { "epoch": 1.4184847282824276, "grad_norm": 0.3394987023293836, "learning_rate": 7.20662846599562e-05, "loss": 0.3829, "step": 447 }, { "epoch": 1.421658072193574, "grad_norm": 0.3691233795245846, "learning_rate": 7.201319264802118e-05, "loss": 0.3746, "step": 448 }, { "epoch": 1.4248314161047204, "grad_norm": 0.419112740006911, "learning_rate": 7.195994327807603e-05, "loss": 0.3889, "step": 449 }, { "epoch": 1.4280047600158667, "grad_norm": 0.4313621841160134, "learning_rate": 7.19065368118633e-05, "loss": 0.3821, "step": 450 }, { "epoch": 1.431178103927013, "grad_norm": 0.3566308145795978, "learning_rate": 7.185297351189771e-05, "loss": 0.3844, "step": 451 }, { "epoch": 1.4343514478381594, "grad_norm": 0.3637230498145468, "learning_rate": 7.179925364146496e-05, "loss": 0.3765, "step": 452 }, { "epoch": 1.437524791749306, "grad_norm": 0.3468382349782321, "learning_rate": 7.174537746462027e-05, "loss": 0.3744, "step": 453 }, { "epoch": 1.4406981356604522, "grad_norm": 0.3131084073191254, "learning_rate": 7.169134524618723e-05, "loss": 0.3814, "step": 454 }, { "epoch": 1.4438714795715986, "grad_norm": 0.23863373901146623, "learning_rate": 7.163715725175641e-05, "loss": 0.3848, "step": 455 }, { "epoch": 1.4470448234827449, "grad_norm": 0.25976033784025593, "learning_rate": 7.15828137476841e-05, "loss": 0.3836, "step": 456 }, { "epoch": 1.4502181673938912, "grad_norm": 0.3648800662769983, "learning_rate": 7.152831500109096e-05, "loss": 0.3848, "step": 457 }, { "epoch": 1.4533915113050377, "grad_norm": 0.3566172184643027, "learning_rate": 7.14736612798608e-05, "loss": 0.382, "step": 458 }, { "epoch": 1.456564855216184, "grad_norm": 0.3135261324431755, "learning_rate": 7.141885285263906e-05, "loss": 0.3741, "step": 459 }, { "epoch": 1.4597381991273304, "grad_norm": 0.3588285727185198, "learning_rate": 7.136388998883176e-05, "loss": 0.3836, "step": 460 }, { "epoch": 1.462911543038477, "grad_norm": 0.4144697912185411, "learning_rate": 7.130877295860396e-05, "loss": 0.3814, "step": 461 }, { "epoch": 1.4660848869496232, "grad_norm": 0.27886664253240845, "learning_rate": 7.125350203287856e-05, "loss": 0.3793, "step": 462 }, { "epoch": 1.4692582308607696, "grad_norm": 0.3160540382900367, "learning_rate": 7.119807748333488e-05, "loss": 0.382, "step": 463 }, { "epoch": 1.4724315747719159, "grad_norm": 0.43245832112598737, "learning_rate": 7.114249958240736e-05, "loss": 0.3792, "step": 464 }, { "epoch": 1.4756049186830622, "grad_norm": 0.3929381559573092, "learning_rate": 7.108676860328429e-05, "loss": 0.3813, "step": 465 }, { "epoch": 1.4787782625942087, "grad_norm": 0.36518334446855616, "learning_rate": 7.103088481990631e-05, "loss": 0.3794, "step": 466 }, { "epoch": 1.481951606505355, "grad_norm": 0.4062717060922552, "learning_rate": 7.097484850696523e-05, "loss": 0.3788, "step": 467 }, { "epoch": 1.4851249504165014, "grad_norm": 0.3985440491593577, "learning_rate": 7.091865993990257e-05, "loss": 0.3839, "step": 468 }, { "epoch": 1.4882982943276477, "grad_norm": 0.2478605758310385, "learning_rate": 7.086231939490825e-05, "loss": 0.3822, "step": 469 }, { "epoch": 1.491471638238794, "grad_norm": 0.2198881914089004, "learning_rate": 7.080582714891922e-05, "loss": 0.3844, "step": 470 }, { "epoch": 1.4946449821499406, "grad_norm": 0.30913344433263185, "learning_rate": 7.074918347961812e-05, "loss": 0.383, "step": 471 }, { "epoch": 1.497818326061087, "grad_norm": 0.3416286926665111, "learning_rate": 7.069238866543186e-05, "loss": 0.3836, "step": 472 }, { "epoch": 1.5009916699722332, "grad_norm": 0.26203884165321145, "learning_rate": 7.063544298553036e-05, "loss": 0.3857, "step": 473 }, { "epoch": 1.5041650138833798, "grad_norm": 0.2277269534373725, "learning_rate": 7.0578346719825e-05, "loss": 0.3768, "step": 474 }, { "epoch": 1.5073383577945259, "grad_norm": 0.29105785846246257, "learning_rate": 7.052110014896745e-05, "loss": 0.383, "step": 475 }, { "epoch": 1.5105117017056724, "grad_norm": 0.2943191343272462, "learning_rate": 7.046370355434814e-05, "loss": 0.3824, "step": 476 }, { "epoch": 1.5136850456168187, "grad_norm": 0.25257147532228097, "learning_rate": 7.040615721809495e-05, "loss": 0.3832, "step": 477 }, { "epoch": 1.516858389527965, "grad_norm": 0.227234673966253, "learning_rate": 7.03484614230718e-05, "loss": 0.3744, "step": 478 }, { "epoch": 1.5200317334391116, "grad_norm": 0.24823996008997273, "learning_rate": 7.029061645287724e-05, "loss": 0.3796, "step": 479 }, { "epoch": 1.5232050773502577, "grad_norm": 0.2871638979837815, "learning_rate": 7.023262259184309e-05, "loss": 0.381, "step": 480 }, { "epoch": 1.5263784212614042, "grad_norm": 0.3204999675625282, "learning_rate": 7.017448012503306e-05, "loss": 0.3798, "step": 481 }, { "epoch": 1.5295517651725505, "grad_norm": 0.3133074262497558, "learning_rate": 7.011618933824124e-05, "loss": 0.3811, "step": 482 }, { "epoch": 1.5327251090836969, "grad_norm": 0.2699740914992895, "learning_rate": 7.005775051799088e-05, "loss": 0.3764, "step": 483 }, { "epoch": 1.5358984529948434, "grad_norm": 0.26103326951221123, "learning_rate": 6.999916395153279e-05, "loss": 0.3839, "step": 484 }, { "epoch": 1.5390717969059897, "grad_norm": 0.319889184006099, "learning_rate": 6.994042992684406e-05, "loss": 0.3807, "step": 485 }, { "epoch": 1.542245140817136, "grad_norm": 0.3366203795427369, "learning_rate": 6.988154873262655e-05, "loss": 0.3859, "step": 486 }, { "epoch": 1.5454184847282826, "grad_norm": 0.31334787713113593, "learning_rate": 6.982252065830557e-05, "loss": 0.3801, "step": 487 }, { "epoch": 1.5485918286394287, "grad_norm": 0.2612084722641158, "learning_rate": 6.976334599402838e-05, "loss": 0.3818, "step": 488 }, { "epoch": 1.5517651725505752, "grad_norm": 0.23800865263666302, "learning_rate": 6.970402503066281e-05, "loss": 0.3796, "step": 489 }, { "epoch": 1.5549385164617215, "grad_norm": 0.20619756878025533, "learning_rate": 6.96445580597958e-05, "loss": 0.3784, "step": 490 }, { "epoch": 1.5581118603728679, "grad_norm": 0.22823026276130054, "learning_rate": 6.958494537373194e-05, "loss": 0.3898, "step": 491 }, { "epoch": 1.5612852042840144, "grad_norm": 0.2320286176349148, "learning_rate": 6.952518726549212e-05, "loss": 0.3776, "step": 492 }, { "epoch": 1.5644585481951605, "grad_norm": 0.18333626484330504, "learning_rate": 6.946528402881204e-05, "loss": 0.3768, "step": 493 }, { "epoch": 1.567631892106307, "grad_norm": 0.23594130345003322, "learning_rate": 6.940523595814073e-05, "loss": 0.3813, "step": 494 }, { "epoch": 1.5708052360174534, "grad_norm": 0.28075187874529217, "learning_rate": 6.934504334863915e-05, "loss": 0.3831, "step": 495 }, { "epoch": 1.5739785799285997, "grad_norm": 0.3164356465842641, "learning_rate": 6.928470649617876e-05, "loss": 0.3828, "step": 496 }, { "epoch": 1.5771519238397462, "grad_norm": 0.29758964129105847, "learning_rate": 6.922422569733998e-05, "loss": 0.3784, "step": 497 }, { "epoch": 1.5803252677508925, "grad_norm": 0.2444339210595453, "learning_rate": 6.916360124941084e-05, "loss": 0.3834, "step": 498 }, { "epoch": 1.5834986116620389, "grad_norm": 0.20948868463222103, "learning_rate": 6.910283345038542e-05, "loss": 0.3717, "step": 499 }, { "epoch": 1.5866719555731852, "grad_norm": 0.2172132590543899, "learning_rate": 6.904192259896247e-05, "loss": 0.3725, "step": 500 }, { "epoch": 1.5898452994843315, "grad_norm": 0.21384474550035673, "learning_rate": 6.898086899454387e-05, "loss": 0.3813, "step": 501 }, { "epoch": 1.593018643395478, "grad_norm": 0.1765294609272397, "learning_rate": 6.891967293723318e-05, "loss": 0.3754, "step": 502 }, { "epoch": 1.5961919873066244, "grad_norm": 0.19836511713366814, "learning_rate": 6.885833472783422e-05, "loss": 0.3789, "step": 503 }, { "epoch": 1.5993653312177707, "grad_norm": 0.24256580925484045, "learning_rate": 6.879685466784951e-05, "loss": 0.3786, "step": 504 }, { "epoch": 1.6025386751289172, "grad_norm": 0.2332208270352636, "learning_rate": 6.873523305947883e-05, "loss": 0.378, "step": 505 }, { "epoch": 1.6057120190400633, "grad_norm": 0.2380059856826742, "learning_rate": 6.867347020561774e-05, "loss": 0.382, "step": 506 }, { "epoch": 1.6088853629512099, "grad_norm": 0.24371122406892495, "learning_rate": 6.861156640985607e-05, "loss": 0.3813, "step": 507 }, { "epoch": 1.6120587068623562, "grad_norm": 0.22447844663148647, "learning_rate": 6.854952197647643e-05, "loss": 0.3762, "step": 508 }, { "epoch": 1.6152320507735025, "grad_norm": 0.19727536404273158, "learning_rate": 6.848733721045275e-05, "loss": 0.3732, "step": 509 }, { "epoch": 1.618405394684649, "grad_norm": 0.21660640742566128, "learning_rate": 6.842501241744873e-05, "loss": 0.3786, "step": 510 }, { "epoch": 1.6215787385957952, "grad_norm": 0.2567200817107154, "learning_rate": 6.836254790381635e-05, "loss": 0.3763, "step": 511 }, { "epoch": 1.6247520825069417, "grad_norm": 0.2867034859585703, "learning_rate": 6.829994397659439e-05, "loss": 0.3747, "step": 512 }, { "epoch": 1.627925426418088, "grad_norm": 0.3529920349888837, "learning_rate": 6.823720094350691e-05, "loss": 0.3801, "step": 513 }, { "epoch": 1.6310987703292343, "grad_norm": 0.393616670612644, "learning_rate": 6.817431911296174e-05, "loss": 0.3763, "step": 514 }, { "epoch": 1.6342721142403809, "grad_norm": 0.37774302228207385, "learning_rate": 6.811129879404892e-05, "loss": 0.374, "step": 515 }, { "epoch": 1.6374454581515272, "grad_norm": 0.3864964477468986, "learning_rate": 6.804814029653926e-05, "loss": 0.3791, "step": 516 }, { "epoch": 1.6406188020626735, "grad_norm": 0.39159495583279336, "learning_rate": 6.798484393088273e-05, "loss": 0.3829, "step": 517 }, { "epoch": 1.64379214597382, "grad_norm": 0.39302151980304456, "learning_rate": 6.792141000820703e-05, "loss": 0.3824, "step": 518 }, { "epoch": 1.6469654898849662, "grad_norm": 0.36360896642596874, "learning_rate": 6.785783884031596e-05, "loss": 0.3761, "step": 519 }, { "epoch": 1.6501388337961127, "grad_norm": 0.3241622597538385, "learning_rate": 6.779413073968798e-05, "loss": 0.3781, "step": 520 }, { "epoch": 1.653312177707259, "grad_norm": 0.29559003085786345, "learning_rate": 6.77302860194746e-05, "loss": 0.3767, "step": 521 }, { "epoch": 1.6564855216184053, "grad_norm": 0.30944837922176927, "learning_rate": 6.766630499349888e-05, "loss": 0.3747, "step": 522 }, { "epoch": 1.6596588655295519, "grad_norm": 0.3338849982759374, "learning_rate": 6.760218797625389e-05, "loss": 0.3778, "step": 523 }, { "epoch": 1.662832209440698, "grad_norm": 0.305222030262685, "learning_rate": 6.753793528290112e-05, "loss": 0.3861, "step": 524 }, { "epoch": 1.6660055533518445, "grad_norm": 0.23959583080697613, "learning_rate": 6.747354722926903e-05, "loss": 0.3845, "step": 525 }, { "epoch": 1.6691788972629908, "grad_norm": 0.28220363781779934, "learning_rate": 6.740902413185133e-05, "loss": 0.3788, "step": 526 }, { "epoch": 1.6723522411741372, "grad_norm": 0.2787836540873191, "learning_rate": 6.734436630780565e-05, "loss": 0.379, "step": 527 }, { "epoch": 1.6755255850852837, "grad_norm": 0.24464321106241743, "learning_rate": 6.727957407495174e-05, "loss": 0.3745, "step": 528 }, { "epoch": 1.67869892899643, "grad_norm": 0.2605023337214791, "learning_rate": 6.721464775177009e-05, "loss": 0.3742, "step": 529 }, { "epoch": 1.6818722729075763, "grad_norm": 0.257073037128768, "learning_rate": 6.71495876574003e-05, "loss": 0.3696, "step": 530 }, { "epoch": 1.685045616818723, "grad_norm": 0.22905434546418404, "learning_rate": 6.708439411163948e-05, "loss": 0.3744, "step": 531 }, { "epoch": 1.688218960729869, "grad_norm": 0.18138198252013102, "learning_rate": 6.701906743494075e-05, "loss": 0.3822, "step": 532 }, { "epoch": 1.6913923046410155, "grad_norm": 0.21324403956364202, "learning_rate": 6.695360794841156e-05, "loss": 0.374, "step": 533 }, { "epoch": 1.6945656485521619, "grad_norm": 0.2589520378305425, "learning_rate": 6.688801597381223e-05, "loss": 0.3766, "step": 534 }, { "epoch": 1.6977389924633082, "grad_norm": 0.2739815290314333, "learning_rate": 6.68222918335543e-05, "loss": 0.3787, "step": 535 }, { "epoch": 1.7009123363744547, "grad_norm": 0.2622879792779608, "learning_rate": 6.675643585069894e-05, "loss": 0.3762, "step": 536 }, { "epoch": 1.7040856802856008, "grad_norm": 0.3267913358280892, "learning_rate": 6.669044834895541e-05, "loss": 0.3785, "step": 537 }, { "epoch": 1.7072590241967474, "grad_norm": 0.32086484201627036, "learning_rate": 6.662432965267944e-05, "loss": 0.3761, "step": 538 }, { "epoch": 1.7104323681078937, "grad_norm": 0.23790201809058262, "learning_rate": 6.655808008687156e-05, "loss": 0.3779, "step": 539 }, { "epoch": 1.71360571201904, "grad_norm": 0.20836977293389647, "learning_rate": 6.649169997717571e-05, "loss": 0.3776, "step": 540 }, { "epoch": 1.7167790559301865, "grad_norm": 0.2053799652061652, "learning_rate": 6.642518964987739e-05, "loss": 0.3846, "step": 541 }, { "epoch": 1.7199523998413329, "grad_norm": 0.21743619572422304, "learning_rate": 6.635854943190221e-05, "loss": 0.3818, "step": 542 }, { "epoch": 1.7231257437524792, "grad_norm": 0.2330911970783851, "learning_rate": 6.629177965081428e-05, "loss": 0.3728, "step": 543 }, { "epoch": 1.7262990876636255, "grad_norm": 0.2971530159306548, "learning_rate": 6.622488063481454e-05, "loss": 0.385, "step": 544 }, { "epoch": 1.7294724315747718, "grad_norm": 0.34910792893542064, "learning_rate": 6.615785271273913e-05, "loss": 0.375, "step": 545 }, { "epoch": 1.7326457754859184, "grad_norm": 0.3668253793899281, "learning_rate": 6.609069621405791e-05, "loss": 0.3785, "step": 546 }, { "epoch": 1.7358191193970647, "grad_norm": 0.2748830512102228, "learning_rate": 6.602341146887267e-05, "loss": 0.3825, "step": 547 }, { "epoch": 1.738992463308211, "grad_norm": 0.20904532284444974, "learning_rate": 6.595599880791562e-05, "loss": 0.3825, "step": 548 }, { "epoch": 1.7421658072193575, "grad_norm": 0.23319626735985258, "learning_rate": 6.58884585625477e-05, "loss": 0.3817, "step": 549 }, { "epoch": 1.7453391511305036, "grad_norm": 0.3101304312167264, "learning_rate": 6.582079106475702e-05, "loss": 0.3783, "step": 550 }, { "epoch": 1.7485124950416502, "grad_norm": 0.40115725345017383, "learning_rate": 6.575299664715714e-05, "loss": 0.3753, "step": 551 }, { "epoch": 1.7516858389527965, "grad_norm": 0.3987859292934531, "learning_rate": 6.568507564298553e-05, "loss": 0.3855, "step": 552 }, { "epoch": 1.7548591828639428, "grad_norm": 0.3456599898982391, "learning_rate": 6.561702838610186e-05, "loss": 0.376, "step": 553 }, { "epoch": 1.7580325267750894, "grad_norm": 0.2417290147953845, "learning_rate": 6.55488552109864e-05, "loss": 0.3776, "step": 554 }, { "epoch": 1.7612058706862355, "grad_norm": 0.20640270006206152, "learning_rate": 6.548055645273831e-05, "loss": 0.3803, "step": 555 }, { "epoch": 1.764379214597382, "grad_norm": 0.2232956579416985, "learning_rate": 6.541213244707412e-05, "loss": 0.3752, "step": 556 }, { "epoch": 1.7675525585085283, "grad_norm": 0.2615817344479701, "learning_rate": 6.534358353032593e-05, "loss": 0.375, "step": 557 }, { "epoch": 1.7707259024196746, "grad_norm": 0.254734079264447, "learning_rate": 6.52749100394399e-05, "loss": 0.3751, "step": 558 }, { "epoch": 1.7738992463308212, "grad_norm": 0.21642500358045758, "learning_rate": 6.520611231197446e-05, "loss": 0.3845, "step": 559 }, { "epoch": 1.7770725902419675, "grad_norm": 0.19329608647091265, "learning_rate": 6.513719068609874e-05, "loss": 0.3789, "step": 560 }, { "epoch": 1.7802459341531138, "grad_norm": 0.2530286985366438, "learning_rate": 6.506814550059091e-05, "loss": 0.377, "step": 561 }, { "epoch": 1.7834192780642604, "grad_norm": 0.2896046782801796, "learning_rate": 6.499897709483641e-05, "loss": 0.375, "step": 562 }, { "epoch": 1.7865926219754065, "grad_norm": 0.2712317131665261, "learning_rate": 6.492968580882644e-05, "loss": 0.3776, "step": 563 }, { "epoch": 1.789765965886553, "grad_norm": 0.2314286827879497, "learning_rate": 6.486027198315617e-05, "loss": 0.3794, "step": 564 }, { "epoch": 1.7929393097976993, "grad_norm": 0.21626322109134916, "learning_rate": 6.479073595902309e-05, "loss": 0.3774, "step": 565 }, { "epoch": 1.7961126537088457, "grad_norm": 0.19733332972369577, "learning_rate": 6.472107807822538e-05, "loss": 0.3708, "step": 566 }, { "epoch": 1.7992859976199922, "grad_norm": 0.18353210245394538, "learning_rate": 6.465129868316016e-05, "loss": 0.3743, "step": 567 }, { "epoch": 1.8024593415311383, "grad_norm": 0.21893010037488972, "learning_rate": 6.458139811682188e-05, "loss": 0.3752, "step": 568 }, { "epoch": 1.8056326854422848, "grad_norm": 0.2628816377905819, "learning_rate": 6.451137672280056e-05, "loss": 0.3731, "step": 569 }, { "epoch": 1.8088060293534312, "grad_norm": 0.2548108269109919, "learning_rate": 6.444123484528015e-05, "loss": 0.3755, "step": 570 }, { "epoch": 1.8119793732645775, "grad_norm": 0.2260163520548028, "learning_rate": 6.437097282903685e-05, "loss": 0.3773, "step": 571 }, { "epoch": 1.815152717175724, "grad_norm": 0.19255377161005122, "learning_rate": 6.430059101943736e-05, "loss": 0.3788, "step": 572 }, { "epoch": 1.8183260610868703, "grad_norm": 0.16745873690738822, "learning_rate": 6.423008976243722e-05, "loss": 0.3742, "step": 573 }, { "epoch": 1.8214994049980167, "grad_norm": 0.24311299014307378, "learning_rate": 6.415946940457911e-05, "loss": 0.3781, "step": 574 }, { "epoch": 1.824672748909163, "grad_norm": 0.2692540724073406, "learning_rate": 6.408873029299115e-05, "loss": 0.3758, "step": 575 }, { "epoch": 1.8278460928203093, "grad_norm": 0.25293407337643353, "learning_rate": 6.401787277538515e-05, "loss": 0.3745, "step": 576 }, { "epoch": 1.8310194367314558, "grad_norm": 0.21866660096950907, "learning_rate": 6.394689720005499e-05, "loss": 0.3692, "step": 577 }, { "epoch": 1.8341927806426022, "grad_norm": 0.21682910277524756, "learning_rate": 6.387580391587477e-05, "loss": 0.3758, "step": 578 }, { "epoch": 1.8373661245537485, "grad_norm": 0.25230011318318546, "learning_rate": 6.380459327229727e-05, "loss": 0.3783, "step": 579 }, { "epoch": 1.840539468464895, "grad_norm": 0.26756389255355956, "learning_rate": 6.373326561935207e-05, "loss": 0.3723, "step": 580 }, { "epoch": 1.8437128123760411, "grad_norm": 0.27082347370882653, "learning_rate": 6.366182130764392e-05, "loss": 0.3695, "step": 581 }, { "epoch": 1.8468861562871877, "grad_norm": 0.2981854551164244, "learning_rate": 6.359026068835101e-05, "loss": 0.3725, "step": 582 }, { "epoch": 1.850059500198334, "grad_norm": 0.30431911893180386, "learning_rate": 6.351858411322324e-05, "loss": 0.3754, "step": 583 }, { "epoch": 1.8532328441094803, "grad_norm": 0.29505742082933084, "learning_rate": 6.344679193458043e-05, "loss": 0.374, "step": 584 }, { "epoch": 1.8564061880206268, "grad_norm": 0.2999630911197104, "learning_rate": 6.337488450531068e-05, "loss": 0.371, "step": 585 }, { "epoch": 1.859579531931773, "grad_norm": 0.35422358202016324, "learning_rate": 6.330286217886857e-05, "loss": 0.3689, "step": 586 }, { "epoch": 1.8627528758429195, "grad_norm": 0.31054325154643925, "learning_rate": 6.323072530927349e-05, "loss": 0.3803, "step": 587 }, { "epoch": 1.8659262197540658, "grad_norm": 0.27551184384415855, "learning_rate": 6.31584742511078e-05, "loss": 0.3731, "step": 588 }, { "epoch": 1.8690995636652121, "grad_norm": 0.31879930611418805, "learning_rate": 6.308610935951516e-05, "loss": 0.3767, "step": 589 }, { "epoch": 1.8722729075763587, "grad_norm": 0.33037474746040296, "learning_rate": 6.301363099019881e-05, "loss": 0.3751, "step": 590 }, { "epoch": 1.875446251487505, "grad_norm": 0.2595876949175527, "learning_rate": 6.294103949941975e-05, "loss": 0.3722, "step": 591 }, { "epoch": 1.8786195953986513, "grad_norm": 0.26859094455834825, "learning_rate": 6.2868335243995e-05, "loss": 0.3727, "step": 592 }, { "epoch": 1.8817929393097979, "grad_norm": 0.3602873489651573, "learning_rate": 6.279551858129588e-05, "loss": 0.371, "step": 593 }, { "epoch": 1.884966283220944, "grad_norm": 0.4148929616678669, "learning_rate": 6.272258986924624e-05, "loss": 0.3726, "step": 594 }, { "epoch": 1.8881396271320905, "grad_norm": 0.3239892420647555, "learning_rate": 6.26495494663207e-05, "loss": 0.3754, "step": 595 }, { "epoch": 1.8913129710432368, "grad_norm": 0.2219707121481662, "learning_rate": 6.257639773154288e-05, "loss": 0.3763, "step": 596 }, { "epoch": 1.8944863149543831, "grad_norm": 0.24701125744993022, "learning_rate": 6.250313502448368e-05, "loss": 0.3769, "step": 597 }, { "epoch": 1.8976596588655297, "grad_norm": 0.2508614116325903, "learning_rate": 6.24297617052594e-05, "loss": 0.3797, "step": 598 }, { "epoch": 1.9008330027766758, "grad_norm": 0.24307046170154817, "learning_rate": 6.23562781345301e-05, "loss": 0.3768, "step": 599 }, { "epoch": 1.9040063466878223, "grad_norm": 0.23596073863704914, "learning_rate": 6.228268467349776e-05, "loss": 0.3764, "step": 600 }, { "epoch": 1.9071796905989686, "grad_norm": 0.287641306711991, "learning_rate": 6.22089816839045e-05, "loss": 0.3679, "step": 601 }, { "epoch": 1.910353034510115, "grad_norm": 0.3350000639906152, "learning_rate": 6.213516952803084e-05, "loss": 0.3749, "step": 602 }, { "epoch": 1.9135263784212615, "grad_norm": 0.2953436732358185, "learning_rate": 6.20612485686939e-05, "loss": 0.3731, "step": 603 }, { "epoch": 1.9166997223324078, "grad_norm": 0.26307310235068826, "learning_rate": 6.198721916924559e-05, "loss": 0.3744, "step": 604 }, { "epoch": 1.9198730662435541, "grad_norm": 0.21992342429818196, "learning_rate": 6.191308169357084e-05, "loss": 0.3776, "step": 605 }, { "epoch": 1.9230464101547007, "grad_norm": 0.2096604115734754, "learning_rate": 6.183883650608588e-05, "loss": 0.3772, "step": 606 }, { "epoch": 1.9262197540658468, "grad_norm": 0.20221136579674542, "learning_rate": 6.176448397173632e-05, "loss": 0.3725, "step": 607 }, { "epoch": 1.9293930979769933, "grad_norm": 0.2152338983820113, "learning_rate": 6.169002445599544e-05, "loss": 0.3677, "step": 608 }, { "epoch": 1.9325664418881396, "grad_norm": 0.22320752106286315, "learning_rate": 6.161545832486242e-05, "loss": 0.3756, "step": 609 }, { "epoch": 1.935739785799286, "grad_norm": 0.2253156227252389, "learning_rate": 6.154078594486045e-05, "loss": 0.3714, "step": 610 }, { "epoch": 1.9389131297104325, "grad_norm": 0.2060476121512058, "learning_rate": 6.146600768303498e-05, "loss": 0.3765, "step": 611 }, { "epoch": 1.9420864736215786, "grad_norm": 0.25181408887727724, "learning_rate": 6.139112390695195e-05, "loss": 0.3736, "step": 612 }, { "epoch": 1.9452598175327251, "grad_norm": 0.28050692887038614, "learning_rate": 6.13161349846959e-05, "loss": 0.3711, "step": 613 }, { "epoch": 1.9484331614438715, "grad_norm": 0.2323287898084542, "learning_rate": 6.124104128486824e-05, "loss": 0.3787, "step": 614 }, { "epoch": 1.9516065053550178, "grad_norm": 0.22252533467543772, "learning_rate": 6.11658431765854e-05, "loss": 0.3761, "step": 615 }, { "epoch": 1.9547798492661643, "grad_norm": 0.23979064807450512, "learning_rate": 6.109054102947701e-05, "loss": 0.3768, "step": 616 }, { "epoch": 1.9579531931773104, "grad_norm": 0.20369908867353018, "learning_rate": 6.101513521368409e-05, "loss": 0.3717, "step": 617 }, { "epoch": 1.961126537088457, "grad_norm": 0.21979116733171586, "learning_rate": 6.0939626099857256e-05, "loss": 0.3719, "step": 618 }, { "epoch": 1.9642998809996033, "grad_norm": 0.2359782325683911, "learning_rate": 6.086401405915485e-05, "loss": 0.3697, "step": 619 }, { "epoch": 1.9674732249107496, "grad_norm": 0.22186897194718339, "learning_rate": 6.0788299463241146e-05, "loss": 0.3714, "step": 620 }, { "epoch": 1.9706465688218961, "grad_norm": 0.19286916683029354, "learning_rate": 6.071248268428455e-05, "loss": 0.3694, "step": 621 }, { "epoch": 1.9738199127330425, "grad_norm": 0.19038716732119104, "learning_rate": 6.06365640949557e-05, "loss": 0.3689, "step": 622 }, { "epoch": 1.9769932566441888, "grad_norm": 0.21853406474562687, "learning_rate": 6.0560544068425704e-05, "loss": 0.3713, "step": 623 }, { "epoch": 1.9801666005553353, "grad_norm": 0.2519707220662586, "learning_rate": 6.048442297836424e-05, "loss": 0.3745, "step": 624 }, { "epoch": 1.9833399444664814, "grad_norm": 0.27348429383893025, "learning_rate": 6.040820119893781e-05, "loss": 0.3726, "step": 625 }, { "epoch": 1.986513288377628, "grad_norm": 0.25727914228364634, "learning_rate": 6.033187910480779e-05, "loss": 0.3737, "step": 626 }, { "epoch": 1.9896866322887743, "grad_norm": 0.19830437617712135, "learning_rate": 6.025545707112868e-05, "loss": 0.3714, "step": 627 }, { "epoch": 1.9928599761999206, "grad_norm": 0.17578894688384492, "learning_rate": 6.017893547354618e-05, "loss": 0.3701, "step": 628 }, { "epoch": 1.9960333201110672, "grad_norm": 0.28565651904457573, "learning_rate": 6.0102314688195466e-05, "loss": 0.3707, "step": 629 }, { "epoch": 1.9992066640222133, "grad_norm": 0.3512726249295466, "learning_rate": 6.002559509169917e-05, "loss": 0.374, "step": 630 }, { "epoch": 2.00238000793336, "grad_norm": 0.5082104882029274, "learning_rate": 5.994877706116571e-05, "loss": 0.6682, "step": 631 }, { "epoch": 2.0055533518445063, "grad_norm": 0.7958584718546614, "learning_rate": 5.9871860974187266e-05, "loss": 0.3515, "step": 632 }, { "epoch": 2.0087266957556524, "grad_norm": 1.2309453398579198, "learning_rate": 5.979484720883806e-05, "loss": 0.3691, "step": 633 }, { "epoch": 2.011900039666799, "grad_norm": 0.6258361012105962, "learning_rate": 5.971773614367244e-05, "loss": 0.3592, "step": 634 }, { "epoch": 2.015073383577945, "grad_norm": 0.7180639249872159, "learning_rate": 5.964052815772298e-05, "loss": 0.3567, "step": 635 }, { "epoch": 2.0182467274890916, "grad_norm": 0.9313882834628479, "learning_rate": 5.9563223630498714e-05, "loss": 0.3601, "step": 636 }, { "epoch": 2.021420071400238, "grad_norm": 0.7992267279088702, "learning_rate": 5.9485822941983185e-05, "loss": 0.3532, "step": 637 }, { "epoch": 2.0245934153113843, "grad_norm": 0.507785688644202, "learning_rate": 5.940832647263262e-05, "loss": 0.3522, "step": 638 }, { "epoch": 2.027766759222531, "grad_norm": 0.578349620722966, "learning_rate": 5.933073460337404e-05, "loss": 0.347, "step": 639 }, { "epoch": 2.030940103133677, "grad_norm": 0.5232367250977141, "learning_rate": 5.9253047715603384e-05, "loss": 0.36, "step": 640 }, { "epoch": 2.0341134470448234, "grad_norm": 0.48100594573201455, "learning_rate": 5.917526619118368e-05, "loss": 0.3458, "step": 641 }, { "epoch": 2.03728679095597, "grad_norm": 0.3770908533781567, "learning_rate": 5.909739041244311e-05, "loss": 0.3491, "step": 642 }, { "epoch": 2.040460134867116, "grad_norm": 0.43489851021818743, "learning_rate": 5.9019420762173156e-05, "loss": 0.35, "step": 643 }, { "epoch": 2.0436334787782626, "grad_norm": 0.34486032651494825, "learning_rate": 5.894135762362673e-05, "loss": 0.3504, "step": 644 }, { "epoch": 2.046806822689409, "grad_norm": 0.3111171335430375, "learning_rate": 5.8863201380516255e-05, "loss": 0.3518, "step": 645 }, { "epoch": 2.0499801666005553, "grad_norm": 0.35964655450159183, "learning_rate": 5.8784952417011826e-05, "loss": 0.3492, "step": 646 }, { "epoch": 2.053153510511702, "grad_norm": 0.2793218850087469, "learning_rate": 5.8706611117739275e-05, "loss": 0.345, "step": 647 }, { "epoch": 2.056326854422848, "grad_norm": 0.2630228702707809, "learning_rate": 5.862817786777832e-05, "loss": 0.3527, "step": 648 }, { "epoch": 2.0595001983339944, "grad_norm": 0.287600094925337, "learning_rate": 5.854965305266065e-05, "loss": 0.3472, "step": 649 }, { "epoch": 2.062673542245141, "grad_norm": 0.23415839473852326, "learning_rate": 5.8471037058368035e-05, "loss": 0.3472, "step": 650 }, { "epoch": 2.065846886156287, "grad_norm": 0.2480609831522893, "learning_rate": 5.839233027133041e-05, "loss": 0.347, "step": 651 }, { "epoch": 2.0690202300674336, "grad_norm": 0.24277577378910167, "learning_rate": 5.8313533078424016e-05, "loss": 0.3502, "step": 652 }, { "epoch": 2.0721935739785797, "grad_norm": 0.22888858417929306, "learning_rate": 5.823464586696947e-05, "loss": 0.3398, "step": 653 }, { "epoch": 2.0753669178897263, "grad_norm": 0.22777692875348907, "learning_rate": 5.8155669024729864e-05, "loss": 0.3502, "step": 654 }, { "epoch": 2.078540261800873, "grad_norm": 0.21302274202843294, "learning_rate": 5.807660293990887e-05, "loss": 0.3467, "step": 655 }, { "epoch": 2.081713605712019, "grad_norm": 0.17698466873204569, "learning_rate": 5.79974480011488e-05, "loss": 0.3468, "step": 656 }, { "epoch": 2.0848869496231655, "grad_norm": 0.18921626118682697, "learning_rate": 5.7918204597528755e-05, "loss": 0.3401, "step": 657 }, { "epoch": 2.088060293534312, "grad_norm": 0.19642637234378701, "learning_rate": 5.7838873118562656e-05, "loss": 0.3467, "step": 658 }, { "epoch": 2.091233637445458, "grad_norm": 0.13768919840127494, "learning_rate": 5.775945395419736e-05, "loss": 0.3461, "step": 659 }, { "epoch": 2.0944069813566046, "grad_norm": 0.18625327191674967, "learning_rate": 5.7679947494810707e-05, "loss": 0.3445, "step": 660 }, { "epoch": 2.0975803252677507, "grad_norm": 0.17171644648932974, "learning_rate": 5.760035413120967e-05, "loss": 0.3481, "step": 661 }, { "epoch": 2.1007536691788973, "grad_norm": 0.181873538605243, "learning_rate": 5.752067425462835e-05, "loss": 0.348, "step": 662 }, { "epoch": 2.103927013090044, "grad_norm": 0.19042134398803523, "learning_rate": 5.744090825672615e-05, "loss": 0.3447, "step": 663 }, { "epoch": 2.10710035700119, "grad_norm": 0.160202051805729, "learning_rate": 5.7361056529585736e-05, "loss": 0.3508, "step": 664 }, { "epoch": 2.1102737009123365, "grad_norm": 0.1568895178074113, "learning_rate": 5.728111946571119e-05, "loss": 0.3517, "step": 665 }, { "epoch": 2.1134470448234826, "grad_norm": 0.16324890485854626, "learning_rate": 5.720109745802607e-05, "loss": 0.3464, "step": 666 }, { "epoch": 2.116620388734629, "grad_norm": 0.1457391254676544, "learning_rate": 5.712099089987146e-05, "loss": 0.3475, "step": 667 }, { "epoch": 2.1197937326457756, "grad_norm": 0.12805660018258297, "learning_rate": 5.704080018500405e-05, "loss": 0.3531, "step": 668 }, { "epoch": 2.1229670765569217, "grad_norm": 0.14439965378280822, "learning_rate": 5.696052570759418e-05, "loss": 0.3395, "step": 669 }, { "epoch": 2.1261404204680683, "grad_norm": 0.14704244500302163, "learning_rate": 5.6880167862223915e-05, "loss": 0.3456, "step": 670 }, { "epoch": 2.129313764379215, "grad_norm": 0.1650010303974902, "learning_rate": 5.6799727043885155e-05, "loss": 0.3472, "step": 671 }, { "epoch": 2.132487108290361, "grad_norm": 0.13673543011389194, "learning_rate": 5.671920364797758e-05, "loss": 0.3446, "step": 672 }, { "epoch": 2.1356604522015075, "grad_norm": 0.1524951210391999, "learning_rate": 5.66385980703068e-05, "loss": 0.3409, "step": 673 }, { "epoch": 2.1388337961126536, "grad_norm": 0.1651899044016252, "learning_rate": 5.655791070708242e-05, "loss": 0.3458, "step": 674 }, { "epoch": 2.1420071400238, "grad_norm": 0.18171434979507048, "learning_rate": 5.647714195491599e-05, "loss": 0.3467, "step": 675 }, { "epoch": 2.1451804839349466, "grad_norm": 0.1895404480414663, "learning_rate": 5.6396292210819154e-05, "loss": 0.3458, "step": 676 }, { "epoch": 2.1483538278460927, "grad_norm": 0.1977999320741365, "learning_rate": 5.6315361872201663e-05, "loss": 0.3439, "step": 677 }, { "epoch": 2.1515271717572393, "grad_norm": 0.1709340652432958, "learning_rate": 5.6234351336869425e-05, "loss": 0.3435, "step": 678 }, { "epoch": 2.1547005156683854, "grad_norm": 0.19031692182789184, "learning_rate": 5.6153261003022556e-05, "loss": 0.3449, "step": 679 }, { "epoch": 2.157873859579532, "grad_norm": 0.16069939440920603, "learning_rate": 5.607209126925337e-05, "loss": 0.3537, "step": 680 }, { "epoch": 2.1610472034906785, "grad_norm": 0.1433199497560158, "learning_rate": 5.599084253454452e-05, "loss": 0.3393, "step": 681 }, { "epoch": 2.1642205474018246, "grad_norm": 0.14956082186311095, "learning_rate": 5.5909515198266965e-05, "loss": 0.342, "step": 682 }, { "epoch": 2.167393891312971, "grad_norm": 0.1544287677040818, "learning_rate": 5.582810966017799e-05, "loss": 0.3503, "step": 683 }, { "epoch": 2.170567235224117, "grad_norm": 0.15319232608750777, "learning_rate": 5.574662632041932e-05, "loss": 0.3467, "step": 684 }, { "epoch": 2.1737405791352638, "grad_norm": 0.14670339038716207, "learning_rate": 5.566506557951508e-05, "loss": 0.3422, "step": 685 }, { "epoch": 2.1769139230464103, "grad_norm": 0.16667697899341136, "learning_rate": 5.558342783836987e-05, "loss": 0.3424, "step": 686 }, { "epoch": 2.1800872669575564, "grad_norm": 0.1975464831082886, "learning_rate": 5.550171349826675e-05, "loss": 0.3461, "step": 687 }, { "epoch": 2.183260610868703, "grad_norm": 0.17888577322944074, "learning_rate": 5.5419922960865334e-05, "loss": 0.3469, "step": 688 }, { "epoch": 2.186433954779849, "grad_norm": 0.17541637626798942, "learning_rate": 5.533805662819975e-05, "loss": 0.3505, "step": 689 }, { "epoch": 2.1896072986909956, "grad_norm": 0.2198583155605273, "learning_rate": 5.5256114902676675e-05, "loss": 0.3447, "step": 690 }, { "epoch": 2.192780642602142, "grad_norm": 0.22694834599020175, "learning_rate": 5.517409818707343e-05, "loss": 0.3504, "step": 691 }, { "epoch": 2.195953986513288, "grad_norm": 0.159759032699931, "learning_rate": 5.50920068845359e-05, "loss": 0.3429, "step": 692 }, { "epoch": 2.1991273304244348, "grad_norm": 0.11626634052913302, "learning_rate": 5.500984139857659e-05, "loss": 0.3465, "step": 693 }, { "epoch": 2.2023006743355813, "grad_norm": 0.16029340678767326, "learning_rate": 5.492760213307268e-05, "loss": 0.3493, "step": 694 }, { "epoch": 2.2054740182467274, "grad_norm": 0.20138781571087522, "learning_rate": 5.484528949226397e-05, "loss": 0.3448, "step": 695 }, { "epoch": 2.208647362157874, "grad_norm": 0.20673832786589078, "learning_rate": 5.4762903880750956e-05, "loss": 0.3472, "step": 696 }, { "epoch": 2.21182070606902, "grad_norm": 0.1945972361781099, "learning_rate": 5.468044570349282e-05, "loss": 0.3452, "step": 697 }, { "epoch": 2.2149940499801666, "grad_norm": 0.15260971345475988, "learning_rate": 5.45979153658054e-05, "loss": 0.3444, "step": 698 }, { "epoch": 2.218167393891313, "grad_norm": 0.16140310676647643, "learning_rate": 5.451531327335927e-05, "loss": 0.3464, "step": 699 }, { "epoch": 2.221340737802459, "grad_norm": 0.1710826897248202, "learning_rate": 5.4432639832177675e-05, "loss": 0.3477, "step": 700 }, { "epoch": 2.2245140817136058, "grad_norm": 0.1907595410644264, "learning_rate": 5.43498954486346e-05, "loss": 0.3399, "step": 701 }, { "epoch": 2.227687425624752, "grad_norm": 0.16532722473360295, "learning_rate": 5.426708052945272e-05, "loss": 0.3505, "step": 702 }, { "epoch": 2.2308607695358984, "grad_norm": 0.1536194473143472, "learning_rate": 5.4184195481701425e-05, "loss": 0.3474, "step": 703 }, { "epoch": 2.234034113447045, "grad_norm": 0.1520117940400569, "learning_rate": 5.4101240712794826e-05, "loss": 0.3437, "step": 704 }, { "epoch": 2.237207457358191, "grad_norm": 0.16336250647219758, "learning_rate": 5.401821663048974e-05, "loss": 0.3484, "step": 705 }, { "epoch": 2.2403808012693376, "grad_norm": 0.18048860842589537, "learning_rate": 5.393512364288366e-05, "loss": 0.3461, "step": 706 }, { "epoch": 2.243554145180484, "grad_norm": 0.2077517477600907, "learning_rate": 5.3851962158412835e-05, "loss": 0.3469, "step": 707 }, { "epoch": 2.2467274890916302, "grad_norm": 0.23694455387298174, "learning_rate": 5.3768732585850135e-05, "loss": 0.3458, "step": 708 }, { "epoch": 2.2499008330027768, "grad_norm": 0.2221303640440427, "learning_rate": 5.3685435334303144e-05, "loss": 0.3475, "step": 709 }, { "epoch": 2.253074176913923, "grad_norm": 0.15740414776377862, "learning_rate": 5.360207081321215e-05, "loss": 0.3514, "step": 710 }, { "epoch": 2.2562475208250694, "grad_norm": 0.14133878333017164, "learning_rate": 5.351863943234803e-05, "loss": 0.3501, "step": 711 }, { "epoch": 2.259420864736216, "grad_norm": 0.23287734656674552, "learning_rate": 5.343514160181037e-05, "loss": 0.3476, "step": 712 }, { "epoch": 2.262594208647362, "grad_norm": 0.2747098009784896, "learning_rate": 5.3351577732025324e-05, "loss": 0.3467, "step": 713 }, { "epoch": 2.2657675525585086, "grad_norm": 0.2544286593636421, "learning_rate": 5.3267948233743705e-05, "loss": 0.3474, "step": 714 }, { "epoch": 2.2689408964696547, "grad_norm": 0.23428301254687212, "learning_rate": 5.318425351803889e-05, "loss": 0.3436, "step": 715 }, { "epoch": 2.2721142403808012, "grad_norm": 0.19972273517158506, "learning_rate": 5.310049399630483e-05, "loss": 0.3428, "step": 716 }, { "epoch": 2.2752875842919478, "grad_norm": 0.17312802850984527, "learning_rate": 5.301667008025404e-05, "loss": 0.3507, "step": 717 }, { "epoch": 2.278460928203094, "grad_norm": 0.1884344961517114, "learning_rate": 5.293278218191553e-05, "loss": 0.344, "step": 718 }, { "epoch": 2.2816342721142404, "grad_norm": 0.16251980995579765, "learning_rate": 5.2848830713632844e-05, "loss": 0.3524, "step": 719 }, { "epoch": 2.284807616025387, "grad_norm": 0.13519307906275285, "learning_rate": 5.2764816088061974e-05, "loss": 0.3518, "step": 720 }, { "epoch": 2.287980959936533, "grad_norm": 0.15371744727582462, "learning_rate": 5.2680738718169355e-05, "loss": 0.3412, "step": 721 }, { "epoch": 2.2911543038476796, "grad_norm": 0.14184577228089562, "learning_rate": 5.2596599017229864e-05, "loss": 0.3436, "step": 722 }, { "epoch": 2.2943276477588257, "grad_norm": 0.1369004603255518, "learning_rate": 5.251239739882472e-05, "loss": 0.3446, "step": 723 }, { "epoch": 2.2975009916699722, "grad_norm": 0.15048696535358383, "learning_rate": 5.2428134276839525e-05, "loss": 0.3442, "step": 724 }, { "epoch": 2.300674335581119, "grad_norm": 0.14200830231004927, "learning_rate": 5.234381006546219e-05, "loss": 0.3446, "step": 725 }, { "epoch": 2.303847679492265, "grad_norm": 0.16320009543918237, "learning_rate": 5.2259425179180873e-05, "loss": 0.3443, "step": 726 }, { "epoch": 2.3070210234034114, "grad_norm": 0.15997774955995675, "learning_rate": 5.217498003278204e-05, "loss": 0.346, "step": 727 }, { "epoch": 2.3101943673145575, "grad_norm": 0.17017241599015379, "learning_rate": 5.209047504134828e-05, "loss": 0.3436, "step": 728 }, { "epoch": 2.313367711225704, "grad_norm": 0.13479061683911572, "learning_rate": 5.200591062025641e-05, "loss": 0.346, "step": 729 }, { "epoch": 2.3165410551368506, "grad_norm": 0.12285044545606259, "learning_rate": 5.192128718517535e-05, "loss": 0.3383, "step": 730 }, { "epoch": 2.3197143990479967, "grad_norm": 0.1359930168100327, "learning_rate": 5.1836605152064076e-05, "loss": 0.3515, "step": 731 }, { "epoch": 2.3228877429591432, "grad_norm": 0.14719463738675423, "learning_rate": 5.175186493716963e-05, "loss": 0.3451, "step": 732 }, { "epoch": 2.32606108687029, "grad_norm": 0.1286663460165603, "learning_rate": 5.1667066957025e-05, "loss": 0.3512, "step": 733 }, { "epoch": 2.329234430781436, "grad_norm": 0.10867634667327339, "learning_rate": 5.158221162844717e-05, "loss": 0.3439, "step": 734 }, { "epoch": 2.3324077746925824, "grad_norm": 0.1591243176843703, "learning_rate": 5.1497299368534965e-05, "loss": 0.3511, "step": 735 }, { "epoch": 2.3355811186037285, "grad_norm": 0.12942608362042907, "learning_rate": 5.1412330594667075e-05, "loss": 0.3434, "step": 736 }, { "epoch": 2.338754462514875, "grad_norm": 0.10948773984311001, "learning_rate": 5.132730572449997e-05, "loss": 0.3524, "step": 737 }, { "epoch": 2.3419278064260216, "grad_norm": 0.11783857681490648, "learning_rate": 5.124222517596586e-05, "loss": 0.349, "step": 738 }, { "epoch": 2.3451011503371677, "grad_norm": 0.13439082971050517, "learning_rate": 5.115708936727065e-05, "loss": 0.3405, "step": 739 }, { "epoch": 2.3482744942483142, "grad_norm": 0.11812799919852607, "learning_rate": 5.1071898716891853e-05, "loss": 0.3461, "step": 740 }, { "epoch": 2.3514478381594603, "grad_norm": 0.11652694906685064, "learning_rate": 5.098665364357656e-05, "loss": 0.3426, "step": 741 }, { "epoch": 2.354621182070607, "grad_norm": 0.139468896494999, "learning_rate": 5.0901354566339355e-05, "loss": 0.3426, "step": 742 }, { "epoch": 2.3577945259817534, "grad_norm": 0.13901937078693805, "learning_rate": 5.0816001904460316e-05, "loss": 0.3453, "step": 743 }, { "epoch": 2.3609678698928995, "grad_norm": 0.1469507185762338, "learning_rate": 5.073059607748287e-05, "loss": 0.3438, "step": 744 }, { "epoch": 2.364141213804046, "grad_norm": 0.12953053633992156, "learning_rate": 5.064513750521179e-05, "loss": 0.348, "step": 745 }, { "epoch": 2.3673145577151926, "grad_norm": 0.15355601770329005, "learning_rate": 5.05596266077111e-05, "loss": 0.3487, "step": 746 }, { "epoch": 2.3704879016263387, "grad_norm": 0.17548971874850716, "learning_rate": 5.047406380530205e-05, "loss": 0.3435, "step": 747 }, { "epoch": 2.3736612455374853, "grad_norm": 0.1336318519267535, "learning_rate": 5.038844951856101e-05, "loss": 0.3472, "step": 748 }, { "epoch": 2.3768345894486314, "grad_norm": 0.15489849858402407, "learning_rate": 5.0302784168317405e-05, "loss": 0.3507, "step": 749 }, { "epoch": 2.380007933359778, "grad_norm": 0.15730467807777396, "learning_rate": 5.021706817565168e-05, "loss": 0.3494, "step": 750 }, { "epoch": 2.383181277270924, "grad_norm": 0.14142472540150533, "learning_rate": 5.013130196189319e-05, "loss": 0.3508, "step": 751 }, { "epoch": 2.3863546211820705, "grad_norm": 0.1309447338438405, "learning_rate": 5.004548594861815e-05, "loss": 0.3414, "step": 752 }, { "epoch": 2.389527965093217, "grad_norm": 0.13689757661127885, "learning_rate": 4.995962055764758e-05, "loss": 0.3462, "step": 753 }, { "epoch": 2.392701309004363, "grad_norm": 0.15155743192346277, "learning_rate": 4.987370621104518e-05, "loss": 0.3502, "step": 754 }, { "epoch": 2.3958746529155097, "grad_norm": 0.1771759302131552, "learning_rate": 4.978774333111532e-05, "loss": 0.3415, "step": 755 }, { "epoch": 2.3990479968266563, "grad_norm": 0.14297561112928286, "learning_rate": 4.97017323404009e-05, "loss": 0.3445, "step": 756 }, { "epoch": 2.4022213407378024, "grad_norm": 0.1359038159105317, "learning_rate": 4.9615673661681314e-05, "loss": 0.3431, "step": 757 }, { "epoch": 2.405394684648949, "grad_norm": 0.14972172604238929, "learning_rate": 4.952956771797039e-05, "loss": 0.3449, "step": 758 }, { "epoch": 2.4085680285600954, "grad_norm": 0.14098278608768156, "learning_rate": 4.9443414932514245e-05, "loss": 0.3446, "step": 759 }, { "epoch": 2.4117413724712415, "grad_norm": 0.1300591686668367, "learning_rate": 4.935721572878927e-05, "loss": 0.3469, "step": 760 }, { "epoch": 2.414914716382388, "grad_norm": 0.13525638291526695, "learning_rate": 4.9270970530499995e-05, "loss": 0.3457, "step": 761 }, { "epoch": 2.418088060293534, "grad_norm": 0.13706454820860725, "learning_rate": 4.918467976157704e-05, "loss": 0.3493, "step": 762 }, { "epoch": 2.4212614042046807, "grad_norm": 0.1396470750888246, "learning_rate": 4.909834384617505e-05, "loss": 0.3466, "step": 763 }, { "epoch": 2.424434748115827, "grad_norm": 0.1579884171451332, "learning_rate": 4.901196320867054e-05, "loss": 0.3434, "step": 764 }, { "epoch": 2.4276080920269734, "grad_norm": 0.14762421037735238, "learning_rate": 4.89255382736599e-05, "loss": 0.3461, "step": 765 }, { "epoch": 2.43078143593812, "grad_norm": 0.1484907586908843, "learning_rate": 4.883906946595721e-05, "loss": 0.3429, "step": 766 }, { "epoch": 2.433954779849266, "grad_norm": 0.15347798353398318, "learning_rate": 4.875255721059223e-05, "loss": 0.3471, "step": 767 }, { "epoch": 2.4371281237604125, "grad_norm": 0.1291098504201812, "learning_rate": 4.8666001932808315e-05, "loss": 0.3492, "step": 768 }, { "epoch": 2.440301467671559, "grad_norm": 0.1277300904749933, "learning_rate": 4.857940405806022e-05, "loss": 0.3431, "step": 769 }, { "epoch": 2.443474811582705, "grad_norm": 0.12939888924222556, "learning_rate": 4.8492764012012146e-05, "loss": 0.3467, "step": 770 }, { "epoch": 2.4466481554938517, "grad_norm": 0.1224426894168111, "learning_rate": 4.840608222053553e-05, "loss": 0.3417, "step": 771 }, { "epoch": 2.449821499404998, "grad_norm": 0.1497720241182753, "learning_rate": 4.831935910970706e-05, "loss": 0.3444, "step": 772 }, { "epoch": 2.4529948433161444, "grad_norm": 0.11319605561124714, "learning_rate": 4.8232595105806486e-05, "loss": 0.3445, "step": 773 }, { "epoch": 2.456168187227291, "grad_norm": 0.1345350003328084, "learning_rate": 4.814579063531458e-05, "loss": 0.3453, "step": 774 }, { "epoch": 2.459341531138437, "grad_norm": 0.14108407549438753, "learning_rate": 4.8058946124911014e-05, "loss": 0.3456, "step": 775 }, { "epoch": 2.4625148750495836, "grad_norm": 0.13444987730516147, "learning_rate": 4.797206200147229e-05, "loss": 0.3444, "step": 776 }, { "epoch": 2.4656882189607296, "grad_norm": 0.1375640631845117, "learning_rate": 4.78851386920696e-05, "loss": 0.3446, "step": 777 }, { "epoch": 2.468861562871876, "grad_norm": 0.14096157244272645, "learning_rate": 4.779817662396679e-05, "loss": 0.3478, "step": 778 }, { "epoch": 2.4720349067830227, "grad_norm": 0.12488925047337064, "learning_rate": 4.771117622461816e-05, "loss": 0.3413, "step": 779 }, { "epoch": 2.475208250694169, "grad_norm": 0.13287851067977652, "learning_rate": 4.7624137921666475e-05, "loss": 0.3489, "step": 780 }, { "epoch": 2.4783815946053154, "grad_norm": 0.13197704125308232, "learning_rate": 4.753706214294082e-05, "loss": 0.3455, "step": 781 }, { "epoch": 2.481554938516462, "grad_norm": 0.13265503243246032, "learning_rate": 4.7449949316454425e-05, "loss": 0.3439, "step": 782 }, { "epoch": 2.484728282427608, "grad_norm": 0.130704370235696, "learning_rate": 4.736279987040269e-05, "loss": 0.3457, "step": 783 }, { "epoch": 2.4879016263387546, "grad_norm": 0.14148325746554727, "learning_rate": 4.727561423316099e-05, "loss": 0.3463, "step": 784 }, { "epoch": 2.4910749702499007, "grad_norm": 0.13608094038136562, "learning_rate": 4.7188392833282575e-05, "loss": 0.3484, "step": 785 }, { "epoch": 2.494248314161047, "grad_norm": 0.13874721828210285, "learning_rate": 4.710113609949653e-05, "loss": 0.346, "step": 786 }, { "epoch": 2.4974216580721937, "grad_norm": 0.14977483842518804, "learning_rate": 4.701384446070557e-05, "loss": 0.3424, "step": 787 }, { "epoch": 2.50059500198334, "grad_norm": 0.12967959257083173, "learning_rate": 4.6926518345984026e-05, "loss": 0.3406, "step": 788 }, { "epoch": 2.5037683458944864, "grad_norm": 0.12801141105870723, "learning_rate": 4.683915818457566e-05, "loss": 0.3396, "step": 789 }, { "epoch": 2.5069416898056325, "grad_norm": 0.13327811790487024, "learning_rate": 4.67517644058916e-05, "loss": 0.3403, "step": 790 }, { "epoch": 2.510115033716779, "grad_norm": 0.16238667666380158, "learning_rate": 4.6664337439508226e-05, "loss": 0.3411, "step": 791 }, { "epoch": 2.5132883776279256, "grad_norm": 0.16494439303835634, "learning_rate": 4.657687771516503e-05, "loss": 0.3409, "step": 792 }, { "epoch": 2.5164617215390717, "grad_norm": 0.14403868840938389, "learning_rate": 4.6489385662762544e-05, "loss": 0.3439, "step": 793 }, { "epoch": 2.519635065450218, "grad_norm": 0.13948670219193632, "learning_rate": 4.640186171236018e-05, "loss": 0.3455, "step": 794 }, { "epoch": 2.5228084093613647, "grad_norm": 0.16277210257572064, "learning_rate": 4.6314306294174164e-05, "loss": 0.3467, "step": 795 }, { "epoch": 2.525981753272511, "grad_norm": 0.17170615713941978, "learning_rate": 4.622671983857539e-05, "loss": 0.3447, "step": 796 }, { "epoch": 2.5291550971836574, "grad_norm": 0.15234975751861424, "learning_rate": 4.6139102776087316e-05, "loss": 0.3468, "step": 797 }, { "epoch": 2.5323284410948035, "grad_norm": 0.11279769192482242, "learning_rate": 4.605145553738385e-05, "loss": 0.3452, "step": 798 }, { "epoch": 2.53550178500595, "grad_norm": 0.13961162984914652, "learning_rate": 4.5963778553287215e-05, "loss": 0.3396, "step": 799 }, { "epoch": 2.538675128917096, "grad_norm": 0.15754039363591266, "learning_rate": 4.587607225476585e-05, "loss": 0.3464, "step": 800 }, { "epoch": 2.5418484728282427, "grad_norm": 0.15400642858424615, "learning_rate": 4.57883370729323e-05, "loss": 0.3429, "step": 801 }, { "epoch": 2.545021816739389, "grad_norm": 0.13587095295609775, "learning_rate": 4.570057343904107e-05, "loss": 0.3444, "step": 802 }, { "epoch": 2.5481951606505353, "grad_norm": 0.1661934512206222, "learning_rate": 4.5612781784486516e-05, "loss": 0.3456, "step": 803 }, { "epoch": 2.551368504561682, "grad_norm": 0.1818284189001732, "learning_rate": 4.5524962540800726e-05, "loss": 0.3454, "step": 804 }, { "epoch": 2.5545418484728284, "grad_norm": 0.134272995751436, "learning_rate": 4.5437116139651416e-05, "loss": 0.3458, "step": 805 }, { "epoch": 2.5577151923839745, "grad_norm": 0.11098594445466681, "learning_rate": 4.5349243012839787e-05, "loss": 0.3428, "step": 806 }, { "epoch": 2.560888536295121, "grad_norm": 0.1383868261385255, "learning_rate": 4.52613435922984e-05, "loss": 0.3433, "step": 807 }, { "epoch": 2.5640618802062676, "grad_norm": 0.13958525369734295, "learning_rate": 4.517341831008906e-05, "loss": 0.343, "step": 808 }, { "epoch": 2.5672352241174137, "grad_norm": 0.14299669733617631, "learning_rate": 4.5085467598400687e-05, "loss": 0.3481, "step": 809 }, { "epoch": 2.57040856802856, "grad_norm": 0.16300391548684726, "learning_rate": 4.499749188954721e-05, "loss": 0.3442, "step": 810 }, { "epoch": 2.5735819119397063, "grad_norm": 0.15939899931970958, "learning_rate": 4.490949161596545e-05, "loss": 0.346, "step": 811 }, { "epoch": 2.576755255850853, "grad_norm": 0.11711293584718259, "learning_rate": 4.4821467210212924e-05, "loss": 0.3394, "step": 812 }, { "epoch": 2.579928599761999, "grad_norm": 0.12318146826993423, "learning_rate": 4.473341910496579e-05, "loss": 0.3391, "step": 813 }, { "epoch": 2.5831019436731455, "grad_norm": 0.12321952201871014, "learning_rate": 4.464534773301674e-05, "loss": 0.3424, "step": 814 }, { "epoch": 2.586275287584292, "grad_norm": 0.12259038741152185, "learning_rate": 4.455725352727276e-05, "loss": 0.346, "step": 815 }, { "epoch": 2.589448631495438, "grad_norm": 0.14443376513578737, "learning_rate": 4.446913692075311e-05, "loss": 0.3436, "step": 816 }, { "epoch": 2.5926219754065847, "grad_norm": 0.1381230568223312, "learning_rate": 4.438099834658716e-05, "loss": 0.3444, "step": 817 }, { "epoch": 2.595795319317731, "grad_norm": 0.12800321400776432, "learning_rate": 4.429283823801227e-05, "loss": 0.3423, "step": 818 }, { "epoch": 2.5989686632288773, "grad_norm": 0.14739193433103212, "learning_rate": 4.420465702837162e-05, "loss": 0.346, "step": 819 }, { "epoch": 2.602142007140024, "grad_norm": 0.14778467346501864, "learning_rate": 4.4116455151112135e-05, "loss": 0.3481, "step": 820 }, { "epoch": 2.6053153510511704, "grad_norm": 0.11699717837600367, "learning_rate": 4.4028233039782336e-05, "loss": 0.3468, "step": 821 }, { "epoch": 2.6084886949623165, "grad_norm": 0.12377912947544831, "learning_rate": 4.393999112803017e-05, "loss": 0.3422, "step": 822 }, { "epoch": 2.611662038873463, "grad_norm": 0.11993635420501132, "learning_rate": 4.385172984960093e-05, "loss": 0.3479, "step": 823 }, { "epoch": 2.614835382784609, "grad_norm": 0.1198156205225748, "learning_rate": 4.3763449638335124e-05, "loss": 0.3486, "step": 824 }, { "epoch": 2.6180087266957557, "grad_norm": 0.14812630286596426, "learning_rate": 4.367515092816628e-05, "loss": 0.3494, "step": 825 }, { "epoch": 2.621182070606902, "grad_norm": 0.11969830749952089, "learning_rate": 4.3586834153118905e-05, "loss": 0.343, "step": 826 }, { "epoch": 2.6243554145180483, "grad_norm": 0.1330251306537408, "learning_rate": 4.3498499747306243e-05, "loss": 0.342, "step": 827 }, { "epoch": 2.627528758429195, "grad_norm": 0.12546162647602022, "learning_rate": 4.3410148144928256e-05, "loss": 0.3425, "step": 828 }, { "epoch": 2.630702102340341, "grad_norm": 0.13410489232741177, "learning_rate": 4.332177978026943e-05, "loss": 0.3483, "step": 829 }, { "epoch": 2.6338754462514875, "grad_norm": 0.1232508805552152, "learning_rate": 4.3233395087696585e-05, "loss": 0.346, "step": 830 }, { "epoch": 2.637048790162634, "grad_norm": 0.12167537812092089, "learning_rate": 4.314499450165688e-05, "loss": 0.3409, "step": 831 }, { "epoch": 2.64022213407378, "grad_norm": 0.13645509206436196, "learning_rate": 4.305657845667553e-05, "loss": 0.3462, "step": 832 }, { "epoch": 2.6433954779849267, "grad_norm": 0.10599165500884121, "learning_rate": 4.296814738735376e-05, "loss": 0.3464, "step": 833 }, { "epoch": 2.6465688218960732, "grad_norm": 0.13223375794191417, "learning_rate": 4.2879701728366686e-05, "loss": 0.3438, "step": 834 }, { "epoch": 2.6497421658072193, "grad_norm": 0.12369661834770018, "learning_rate": 4.2791241914461076e-05, "loss": 0.348, "step": 835 }, { "epoch": 2.652915509718366, "grad_norm": 0.1225955125555118, "learning_rate": 4.270276838045331e-05, "loss": 0.3396, "step": 836 }, { "epoch": 2.656088853629512, "grad_norm": 0.11973226274198374, "learning_rate": 4.26142815612272e-05, "loss": 0.3473, "step": 837 }, { "epoch": 2.6592621975406585, "grad_norm": 0.1302252203210119, "learning_rate": 4.252578189173186e-05, "loss": 0.3421, "step": 838 }, { "epoch": 2.6624355414518046, "grad_norm": 0.13620263914799938, "learning_rate": 4.2437269806979574e-05, "loss": 0.3447, "step": 839 }, { "epoch": 2.665608885362951, "grad_norm": 0.12216233976282596, "learning_rate": 4.234874574204364e-05, "loss": 0.3481, "step": 840 }, { "epoch": 2.6687822292740977, "grad_norm": 0.14511224880697332, "learning_rate": 4.226021013205626e-05, "loss": 0.3433, "step": 841 }, { "epoch": 2.671955573185244, "grad_norm": 0.13167003525580787, "learning_rate": 4.217166341220635e-05, "loss": 0.3462, "step": 842 }, { "epoch": 2.6751289170963903, "grad_norm": 0.11233528339177638, "learning_rate": 4.208310601773749e-05, "loss": 0.3422, "step": 843 }, { "epoch": 2.678302261007537, "grad_norm": 0.116009578862862, "learning_rate": 4.1994538383945686e-05, "loss": 0.3382, "step": 844 }, { "epoch": 2.681475604918683, "grad_norm": 0.14193128052476364, "learning_rate": 4.190596094617729e-05, "loss": 0.3477, "step": 845 }, { "epoch": 2.6846489488298295, "grad_norm": 0.14084760732606688, "learning_rate": 4.1817374139826857e-05, "loss": 0.3416, "step": 846 }, { "epoch": 2.687822292740976, "grad_norm": 0.1182925424974134, "learning_rate": 4.172877840033496e-05, "loss": 0.346, "step": 847 }, { "epoch": 2.690995636652122, "grad_norm": 0.14243194869284295, "learning_rate": 4.164017416318611e-05, "loss": 0.3319, "step": 848 }, { "epoch": 2.6941689805632687, "grad_norm": 0.14660981394007142, "learning_rate": 4.155156186390659e-05, "loss": 0.3454, "step": 849 }, { "epoch": 2.697342324474415, "grad_norm": 0.12282621458948739, "learning_rate": 4.1462941938062295e-05, "loss": 0.3425, "step": 850 }, { "epoch": 2.7005156683855613, "grad_norm": 0.11456728642253448, "learning_rate": 4.137431482125659e-05, "loss": 0.3437, "step": 851 }, { "epoch": 2.7036890122967074, "grad_norm": 0.13185395211536752, "learning_rate": 4.128568094912825e-05, "loss": 0.3437, "step": 852 }, { "epoch": 2.706862356207854, "grad_norm": 0.11115691532463522, "learning_rate": 4.1197040757349175e-05, "loss": 0.35, "step": 853 }, { "epoch": 2.7100357001190005, "grad_norm": 0.12965739444200763, "learning_rate": 4.1108394681622406e-05, "loss": 0.3469, "step": 854 }, { "epoch": 2.7132090440301466, "grad_norm": 0.12606196792345453, "learning_rate": 4.101974315767984e-05, "loss": 0.3424, "step": 855 }, { "epoch": 2.716382387941293, "grad_norm": 0.13377181792852472, "learning_rate": 4.0931086621280195e-05, "loss": 0.3449, "step": 856 }, { "epoch": 2.7195557318524397, "grad_norm": 0.1083820821550372, "learning_rate": 4.0842425508206814e-05, "loss": 0.3422, "step": 857 }, { "epoch": 2.722729075763586, "grad_norm": 0.14051865023509388, "learning_rate": 4.075376025426553e-05, "loss": 0.3432, "step": 858 }, { "epoch": 2.7259024196747323, "grad_norm": 0.1446305958755852, "learning_rate": 4.0665091295282557e-05, "loss": 0.3421, "step": 859 }, { "epoch": 2.729075763585879, "grad_norm": 0.1185199085913208, "learning_rate": 4.0576419067102294e-05, "loss": 0.3425, "step": 860 }, { "epoch": 2.732249107497025, "grad_norm": 0.12047038069530473, "learning_rate": 4.04877440055852e-05, "loss": 0.3433, "step": 861 }, { "epoch": 2.735422451408171, "grad_norm": 0.12298317824143638, "learning_rate": 4.03990665466057e-05, "loss": 0.3419, "step": 862 }, { "epoch": 2.7385957953193176, "grad_norm": 0.1134666458695627, "learning_rate": 4.0310387126049965e-05, "loss": 0.3449, "step": 863 }, { "epoch": 2.741769139230464, "grad_norm": 0.1151216459243458, "learning_rate": 4.022170617981383e-05, "loss": 0.3487, "step": 864 }, { "epoch": 2.7449424831416103, "grad_norm": 0.12438855242604728, "learning_rate": 4.013302414380062e-05, "loss": 0.3433, "step": 865 }, { "epoch": 2.748115827052757, "grad_norm": 0.12462238298496425, "learning_rate": 4.004434145391903e-05, "loss": 0.3379, "step": 866 }, { "epoch": 2.7512891709639034, "grad_norm": 0.12043847383946583, "learning_rate": 3.9955658546080975e-05, "loss": 0.3418, "step": 867 }, { "epoch": 2.7544625148750495, "grad_norm": 0.12670455035239242, "learning_rate": 3.9866975856199376e-05, "loss": 0.3374, "step": 868 }, { "epoch": 2.757635858786196, "grad_norm": 0.13869220804125282, "learning_rate": 3.9778293820186176e-05, "loss": 0.344, "step": 869 }, { "epoch": 2.7608092026973425, "grad_norm": 0.12510091538071094, "learning_rate": 3.968961287395004e-05, "loss": 0.3464, "step": 870 }, { "epoch": 2.7639825466084886, "grad_norm": 0.11788439172448362, "learning_rate": 3.960093345339432e-05, "loss": 0.3415, "step": 871 }, { "epoch": 2.767155890519635, "grad_norm": 0.13570817794277007, "learning_rate": 3.9512255994414804e-05, "loss": 0.3415, "step": 872 }, { "epoch": 2.7703292344307813, "grad_norm": 0.13940993558742085, "learning_rate": 3.9423580932897726e-05, "loss": 0.3423, "step": 873 }, { "epoch": 2.773502578341928, "grad_norm": 0.10631207090804463, "learning_rate": 3.933490870471745e-05, "loss": 0.3442, "step": 874 }, { "epoch": 2.776675922253074, "grad_norm": 0.1503520815021571, "learning_rate": 3.924623974573448e-05, "loss": 0.3416, "step": 875 }, { "epoch": 2.7798492661642205, "grad_norm": 0.12632156433331662, "learning_rate": 3.9157574491793185e-05, "loss": 0.3371, "step": 876 }, { "epoch": 2.783022610075367, "grad_norm": 0.13734903086765304, "learning_rate": 3.906891337871982e-05, "loss": 0.3491, "step": 877 }, { "epoch": 2.786195953986513, "grad_norm": 0.15639396884236917, "learning_rate": 3.898025684232016e-05, "loss": 0.3472, "step": 878 }, { "epoch": 2.7893692978976596, "grad_norm": 0.12483944951607047, "learning_rate": 3.889160531837761e-05, "loss": 0.3381, "step": 879 }, { "epoch": 2.792542641808806, "grad_norm": 0.13742280075928823, "learning_rate": 3.8802959242650825e-05, "loss": 0.3411, "step": 880 }, { "epoch": 2.7957159857199523, "grad_norm": 0.12266800245109839, "learning_rate": 3.8714319050871764e-05, "loss": 0.3414, "step": 881 }, { "epoch": 2.798889329631099, "grad_norm": 0.12861032154624427, "learning_rate": 3.862568517874341e-05, "loss": 0.3392, "step": 882 }, { "epoch": 2.8020626735422454, "grad_norm": 0.11470183191097062, "learning_rate": 3.8537058061937725e-05, "loss": 0.343, "step": 883 }, { "epoch": 2.8052360174533915, "grad_norm": 0.12702748667879213, "learning_rate": 3.8448438136093414e-05, "loss": 0.3453, "step": 884 }, { "epoch": 2.808409361364538, "grad_norm": 0.11316231412291199, "learning_rate": 3.8359825836813895e-05, "loss": 0.3446, "step": 885 }, { "epoch": 2.811582705275684, "grad_norm": 0.12175654245533254, "learning_rate": 3.827122159966504e-05, "loss": 0.3464, "step": 886 }, { "epoch": 2.8147560491868306, "grad_norm": 0.12805730071128243, "learning_rate": 3.818262586017315e-05, "loss": 0.3421, "step": 887 }, { "epoch": 2.8179293930979767, "grad_norm": 0.13404348902216565, "learning_rate": 3.8094039053822715e-05, "loss": 0.3448, "step": 888 }, { "epoch": 2.8211027370091233, "grad_norm": 0.12244995804431573, "learning_rate": 3.800546161605433e-05, "loss": 0.3408, "step": 889 }, { "epoch": 2.82427608092027, "grad_norm": 0.11165322485961218, "learning_rate": 3.791689398226252e-05, "loss": 0.3448, "step": 890 }, { "epoch": 2.827449424831416, "grad_norm": 0.12392250807575451, "learning_rate": 3.7828336587793665e-05, "loss": 0.339, "step": 891 }, { "epoch": 2.8306227687425625, "grad_norm": 0.14372027694900424, "learning_rate": 3.773978986794376e-05, "loss": 0.3471, "step": 892 }, { "epoch": 2.833796112653709, "grad_norm": 0.11903041102933404, "learning_rate": 3.765125425795637e-05, "loss": 0.3445, "step": 893 }, { "epoch": 2.836969456564855, "grad_norm": 0.11348910315003628, "learning_rate": 3.7562730193020425e-05, "loss": 0.34, "step": 894 }, { "epoch": 2.8401428004760016, "grad_norm": 0.12038861322563689, "learning_rate": 3.747421810826815e-05, "loss": 0.3389, "step": 895 }, { "epoch": 2.843316144387148, "grad_norm": 0.1097622809874721, "learning_rate": 3.73857184387728e-05, "loss": 0.3471, "step": 896 }, { "epoch": 2.8464894882982943, "grad_norm": 0.11347921572685421, "learning_rate": 3.7297231619546695e-05, "loss": 0.3443, "step": 897 }, { "epoch": 2.849662832209441, "grad_norm": 0.12369160424846627, "learning_rate": 3.7208758085538924e-05, "loss": 0.3432, "step": 898 }, { "epoch": 2.852836176120587, "grad_norm": 0.12357156761028097, "learning_rate": 3.712029827163332e-05, "loss": 0.3414, "step": 899 }, { "epoch": 2.8560095200317335, "grad_norm": 0.11376937375822221, "learning_rate": 3.703185261264624e-05, "loss": 0.3401, "step": 900 }, { "epoch": 2.8591828639428796, "grad_norm": 0.12349127066450852, "learning_rate": 3.694342154332449e-05, "loss": 0.3422, "step": 901 }, { "epoch": 2.862356207854026, "grad_norm": 0.1034961716279667, "learning_rate": 3.685500549834314e-05, "loss": 0.3434, "step": 902 }, { "epoch": 2.8655295517651727, "grad_norm": 0.12269370497851256, "learning_rate": 3.676660491230343e-05, "loss": 0.3383, "step": 903 }, { "epoch": 2.8687028956763188, "grad_norm": 0.13382966320810855, "learning_rate": 3.667822021973058e-05, "loss": 0.3435, "step": 904 }, { "epoch": 2.8718762395874653, "grad_norm": 0.11277938663333244, "learning_rate": 3.658985185507175e-05, "loss": 0.3406, "step": 905 }, { "epoch": 2.875049583498612, "grad_norm": 0.1143319724175766, "learning_rate": 3.6501500252693756e-05, "loss": 0.3376, "step": 906 }, { "epoch": 2.878222927409758, "grad_norm": 0.11689144421922465, "learning_rate": 3.641316584688111e-05, "loss": 0.3453, "step": 907 }, { "epoch": 2.8813962713209045, "grad_norm": 0.11809016351008388, "learning_rate": 3.632484907183372e-05, "loss": 0.3481, "step": 908 }, { "epoch": 2.884569615232051, "grad_norm": 0.1276259830733137, "learning_rate": 3.623655036166489e-05, "loss": 0.3421, "step": 909 }, { "epoch": 2.887742959143197, "grad_norm": 0.11884475840914586, "learning_rate": 3.6148270150399074e-05, "loss": 0.3446, "step": 910 }, { "epoch": 2.8909163030543437, "grad_norm": 0.11087658030941172, "learning_rate": 3.606000887196985e-05, "loss": 0.3439, "step": 911 }, { "epoch": 2.8940896469654898, "grad_norm": 0.10980416285438169, "learning_rate": 3.597176696021767e-05, "loss": 0.3372, "step": 912 }, { "epoch": 2.8972629908766363, "grad_norm": 0.11601487167763035, "learning_rate": 3.588354484888787e-05, "loss": 0.3459, "step": 913 }, { "epoch": 2.9004363347877824, "grad_norm": 0.11368911677025695, "learning_rate": 3.579534297162838e-05, "loss": 0.3438, "step": 914 }, { "epoch": 2.903609678698929, "grad_norm": 0.10395283389340351, "learning_rate": 3.5707161761987745e-05, "loss": 0.338, "step": 915 }, { "epoch": 2.9067830226100755, "grad_norm": 0.11161882010475302, "learning_rate": 3.561900165341284e-05, "loss": 0.3408, "step": 916 }, { "epoch": 2.9099563665212216, "grad_norm": 0.0997500952290493, "learning_rate": 3.55308630792469e-05, "loss": 0.3421, "step": 917 }, { "epoch": 2.913129710432368, "grad_norm": 0.1339571225523946, "learning_rate": 3.544274647272725e-05, "loss": 0.3442, "step": 918 }, { "epoch": 2.9163030543435147, "grad_norm": 0.11617441220009064, "learning_rate": 3.535465226698327e-05, "loss": 0.343, "step": 919 }, { "epoch": 2.9194763982546608, "grad_norm": 0.1410191016195631, "learning_rate": 3.526658089503421e-05, "loss": 0.3395, "step": 920 }, { "epoch": 2.9226497421658073, "grad_norm": 0.13899939104575385, "learning_rate": 3.517853278978708e-05, "loss": 0.3418, "step": 921 }, { "epoch": 2.925823086076954, "grad_norm": 0.1448300244004051, "learning_rate": 3.5090508384034554e-05, "loss": 0.3434, "step": 922 }, { "epoch": 2.9289964299881, "grad_norm": 0.14071342868222997, "learning_rate": 3.5002508110452796e-05, "loss": 0.3463, "step": 923 }, { "epoch": 2.9321697738992465, "grad_norm": 0.13757441367946524, "learning_rate": 3.491453240159932e-05, "loss": 0.3428, "step": 924 }, { "epoch": 2.9353431178103926, "grad_norm": 0.12380157230283552, "learning_rate": 3.4826581689910956e-05, "loss": 0.3425, "step": 925 }, { "epoch": 2.938516461721539, "grad_norm": 0.14057723094623828, "learning_rate": 3.473865640770161e-05, "loss": 0.3411, "step": 926 }, { "epoch": 2.9416898056326852, "grad_norm": 0.11650354025690383, "learning_rate": 3.465075698716022e-05, "loss": 0.3365, "step": 927 }, { "epoch": 2.9448631495438318, "grad_norm": 0.12980737726549346, "learning_rate": 3.4562883860348584e-05, "loss": 0.3399, "step": 928 }, { "epoch": 2.9480364934549783, "grad_norm": 0.1134055745141929, "learning_rate": 3.447503745919929e-05, "loss": 0.3414, "step": 929 }, { "epoch": 2.9512098373661244, "grad_norm": 0.1201253976384782, "learning_rate": 3.43872182155135e-05, "loss": 0.3457, "step": 930 }, { "epoch": 2.954383181277271, "grad_norm": 0.11035272651701446, "learning_rate": 3.429942656095895e-05, "loss": 0.3458, "step": 931 }, { "epoch": 2.9575565251884175, "grad_norm": 0.11154353610592178, "learning_rate": 3.4211662927067694e-05, "loss": 0.3439, "step": 932 }, { "epoch": 2.9607298690995636, "grad_norm": 0.11422392455235472, "learning_rate": 3.412392774523416e-05, "loss": 0.3382, "step": 933 }, { "epoch": 2.96390321301071, "grad_norm": 0.10421558917351098, "learning_rate": 3.4036221446712785e-05, "loss": 0.3442, "step": 934 }, { "epoch": 2.9670765569218567, "grad_norm": 0.10191858553645955, "learning_rate": 3.3948544462616154e-05, "loss": 0.3472, "step": 935 }, { "epoch": 2.9702499008330028, "grad_norm": 0.10534746393290977, "learning_rate": 3.386089722391268e-05, "loss": 0.3426, "step": 936 }, { "epoch": 2.973423244744149, "grad_norm": 0.10261252386705168, "learning_rate": 3.3773280161424614e-05, "loss": 0.3433, "step": 937 }, { "epoch": 2.9765965886552954, "grad_norm": 0.1170256748314465, "learning_rate": 3.368569370582584e-05, "loss": 0.3394, "step": 938 }, { "epoch": 2.979769932566442, "grad_norm": 0.11126882366159503, "learning_rate": 3.359813828763983e-05, "loss": 0.3397, "step": 939 }, { "epoch": 2.982943276477588, "grad_norm": 0.1198757045135952, "learning_rate": 3.351061433723746e-05, "loss": 0.3438, "step": 940 }, { "epoch": 2.9861166203887346, "grad_norm": 0.10112115474813273, "learning_rate": 3.3423122284834976e-05, "loss": 0.3342, "step": 941 }, { "epoch": 2.989289964299881, "grad_norm": 0.11249349782419689, "learning_rate": 3.3335662560491773e-05, "loss": 0.3429, "step": 942 }, { "epoch": 2.9924633082110272, "grad_norm": 0.1001185547565032, "learning_rate": 3.3248235594108415e-05, "loss": 0.3337, "step": 943 }, { "epoch": 2.995636652122174, "grad_norm": 0.11017756335045858, "learning_rate": 3.316084181542434e-05, "loss": 0.3375, "step": 944 }, { "epoch": 2.9988099960333203, "grad_norm": 0.12823056575818978, "learning_rate": 3.307348165401598e-05, "loss": 0.3306, "step": 945 }, { "epoch": 3.0019833399444664, "grad_norm": 0.27281229082142194, "learning_rate": 3.2986155539294435e-05, "loss": 0.6218, "step": 946 }, { "epoch": 3.005156683855613, "grad_norm": 0.2100816419248202, "learning_rate": 3.2898863900503484e-05, "loss": 0.3147, "step": 947 }, { "epoch": 3.008330027766759, "grad_norm": 0.17105137980537694, "learning_rate": 3.281160716671743e-05, "loss": 0.3116, "step": 948 }, { "epoch": 3.0115033716779056, "grad_norm": 0.22415298912899825, "learning_rate": 3.2724385766839026e-05, "loss": 0.3194, "step": 949 }, { "epoch": 3.014676715589052, "grad_norm": 0.19583525189357293, "learning_rate": 3.263720012959732e-05, "loss": 0.3173, "step": 950 }, { "epoch": 3.0178500595001982, "grad_norm": 0.15512832486443295, "learning_rate": 3.255005068354559e-05, "loss": 0.3157, "step": 951 }, { "epoch": 3.021023403411345, "grad_norm": 0.16721068805673456, "learning_rate": 3.2462937857059187e-05, "loss": 0.3178, "step": 952 }, { "epoch": 3.024196747322491, "grad_norm": 0.17107719512173727, "learning_rate": 3.237586207833353e-05, "loss": 0.3119, "step": 953 }, { "epoch": 3.0273700912336374, "grad_norm": 0.1585477770306446, "learning_rate": 3.2288823775381845e-05, "loss": 0.3134, "step": 954 }, { "epoch": 3.030543435144784, "grad_norm": 0.16343916704345743, "learning_rate": 3.2201823376033226e-05, "loss": 0.3132, "step": 955 }, { "epoch": 3.03371677905593, "grad_norm": 0.13291081320812426, "learning_rate": 3.21148613079304e-05, "loss": 0.3155, "step": 956 }, { "epoch": 3.0368901229670766, "grad_norm": 0.1495696649421576, "learning_rate": 3.202793799852772e-05, "loss": 0.3139, "step": 957 }, { "epoch": 3.040063466878223, "grad_norm": 0.14466866199473793, "learning_rate": 3.194105387508899e-05, "loss": 0.3134, "step": 958 }, { "epoch": 3.0432368107893693, "grad_norm": 0.1361091421412387, "learning_rate": 3.1854209364685436e-05, "loss": 0.3082, "step": 959 }, { "epoch": 3.046410154700516, "grad_norm": 0.13367023176470594, "learning_rate": 3.176740489419352e-05, "loss": 0.3126, "step": 960 }, { "epoch": 3.049583498611662, "grad_norm": 0.13627283601328063, "learning_rate": 3.168064089029296e-05, "loss": 0.3134, "step": 961 }, { "epoch": 3.0527568425228084, "grad_norm": 0.13148674412233574, "learning_rate": 3.159391777946447e-05, "loss": 0.3162, "step": 962 }, { "epoch": 3.055930186433955, "grad_norm": 0.12808707534046343, "learning_rate": 3.150723598798787e-05, "loss": 0.3118, "step": 963 }, { "epoch": 3.059103530345101, "grad_norm": 0.13273988457362224, "learning_rate": 3.1420595941939786e-05, "loss": 0.3133, "step": 964 }, { "epoch": 3.0622768742562476, "grad_norm": 0.11421318045881329, "learning_rate": 3.13339980671917e-05, "loss": 0.3098, "step": 965 }, { "epoch": 3.0654502181673937, "grad_norm": 0.13703482566876224, "learning_rate": 3.124744278940777e-05, "loss": 0.3195, "step": 966 }, { "epoch": 3.0686235620785403, "grad_norm": 0.11316298092381129, "learning_rate": 3.1160930534042805e-05, "loss": 0.3172, "step": 967 }, { "epoch": 3.071796905989687, "grad_norm": 0.1189425742935073, "learning_rate": 3.107446172634012e-05, "loss": 0.3108, "step": 968 }, { "epoch": 3.074970249900833, "grad_norm": 0.11756376777246882, "learning_rate": 3.098803679132947e-05, "loss": 0.3179, "step": 969 }, { "epoch": 3.0781435938119794, "grad_norm": 0.1143776389372859, "learning_rate": 3.090165615382496e-05, "loss": 0.3102, "step": 970 }, { "epoch": 3.0813169377231255, "grad_norm": 0.12423804582324503, "learning_rate": 3.0815320238422974e-05, "loss": 0.3115, "step": 971 }, { "epoch": 3.084490281634272, "grad_norm": 0.13996003835367157, "learning_rate": 3.072902946950001e-05, "loss": 0.3192, "step": 972 }, { "epoch": 3.0876636255454186, "grad_norm": 0.11255035117655975, "learning_rate": 3.0642784271210734e-05, "loss": 0.309, "step": 973 }, { "epoch": 3.0908369694565647, "grad_norm": 0.12699451919385885, "learning_rate": 3.055658506748575e-05, "loss": 0.3134, "step": 974 }, { "epoch": 3.0940103133677113, "grad_norm": 0.11635660673053044, "learning_rate": 3.0470432282029614e-05, "loss": 0.3167, "step": 975 }, { "epoch": 3.097183657278858, "grad_norm": 0.10873609232252231, "learning_rate": 3.0384326338318685e-05, "loss": 0.3133, "step": 976 }, { "epoch": 3.100357001190004, "grad_norm": 0.11807314933760477, "learning_rate": 3.0298267659599115e-05, "loss": 0.312, "step": 977 }, { "epoch": 3.1035303451011504, "grad_norm": 0.11916222364590734, "learning_rate": 3.0212256668884695e-05, "loss": 0.3078, "step": 978 }, { "epoch": 3.1067036890122965, "grad_norm": 0.12272695797856463, "learning_rate": 3.0126293788954833e-05, "loss": 0.3122, "step": 979 }, { "epoch": 3.109877032923443, "grad_norm": 0.11540153822434568, "learning_rate": 3.004037944235242e-05, "loss": 0.3136, "step": 980 }, { "epoch": 3.1130503768345896, "grad_norm": 0.11618456648740005, "learning_rate": 2.9954514051381863e-05, "loss": 0.3166, "step": 981 }, { "epoch": 3.1162237207457357, "grad_norm": 0.11244633521843918, "learning_rate": 2.9868698038106815e-05, "loss": 0.3148, "step": 982 }, { "epoch": 3.1193970646568823, "grad_norm": 0.10920070004246661, "learning_rate": 2.9782931824348328e-05, "loss": 0.3088, "step": 983 }, { "epoch": 3.1225704085680284, "grad_norm": 0.12174658516975441, "learning_rate": 2.9697215831682595e-05, "loss": 0.3165, "step": 984 }, { "epoch": 3.125743752479175, "grad_norm": 0.10643018736017142, "learning_rate": 2.9611550481439e-05, "loss": 0.3145, "step": 985 }, { "epoch": 3.1289170963903215, "grad_norm": 0.10684450615726702, "learning_rate": 2.952593619469795e-05, "loss": 0.3092, "step": 986 }, { "epoch": 3.1320904403014675, "grad_norm": 0.10941148930845793, "learning_rate": 2.9440373392288913e-05, "loss": 0.3138, "step": 987 }, { "epoch": 3.135263784212614, "grad_norm": 0.10382261849589255, "learning_rate": 2.9354862494788223e-05, "loss": 0.3119, "step": 988 }, { "epoch": 3.13843712812376, "grad_norm": 0.10958378594318682, "learning_rate": 2.9269403922517145e-05, "loss": 0.3099, "step": 989 }, { "epoch": 3.1416104720349067, "grad_norm": 0.12998774556538992, "learning_rate": 2.9183998095539684e-05, "loss": 0.3129, "step": 990 }, { "epoch": 3.1447838159460533, "grad_norm": 0.10561418907091862, "learning_rate": 2.9098645433660655e-05, "loss": 0.3128, "step": 991 }, { "epoch": 3.1479571598571994, "grad_norm": 0.12592324141074962, "learning_rate": 2.9013346356423446e-05, "loss": 0.317, "step": 992 }, { "epoch": 3.151130503768346, "grad_norm": 0.12513756646057633, "learning_rate": 2.8928101283108153e-05, "loss": 0.314, "step": 993 }, { "epoch": 3.1543038476794925, "grad_norm": 0.14282969282839109, "learning_rate": 2.884291063272935e-05, "loss": 0.3109, "step": 994 }, { "epoch": 3.1574771915906386, "grad_norm": 0.1199544503476591, "learning_rate": 2.8757774824034146e-05, "loss": 0.3119, "step": 995 }, { "epoch": 3.160650535501785, "grad_norm": 0.11681362905939408, "learning_rate": 2.867269427550004e-05, "loss": 0.3065, "step": 996 }, { "epoch": 3.163823879412931, "grad_norm": 0.11355969210006382, "learning_rate": 2.8587669405332942e-05, "loss": 0.3134, "step": 997 }, { "epoch": 3.1669972233240777, "grad_norm": 0.11125944189799743, "learning_rate": 2.850270063146505e-05, "loss": 0.3123, "step": 998 }, { "epoch": 3.1701705672352243, "grad_norm": 0.10903398382765295, "learning_rate": 2.8417788371552847e-05, "loss": 0.3137, "step": 999 }, { "epoch": 3.1733439111463704, "grad_norm": 0.11735770219676825, "learning_rate": 2.8332933042974997e-05, "loss": 0.3143, "step": 1000 }, { "epoch": 3.176517255057517, "grad_norm": 0.09886653400775318, "learning_rate": 2.824813506283038e-05, "loss": 0.312, "step": 1001 }, { "epoch": 3.179690598968663, "grad_norm": 0.10362486903153832, "learning_rate": 2.8163394847935924e-05, "loss": 0.3063, "step": 1002 }, { "epoch": 3.1828639428798096, "grad_norm": 0.11188362720582715, "learning_rate": 2.8078712814824657e-05, "loss": 0.3125, "step": 1003 }, { "epoch": 3.186037286790956, "grad_norm": 0.09559025289026829, "learning_rate": 2.799408937974359e-05, "loss": 0.3121, "step": 1004 }, { "epoch": 3.189210630702102, "grad_norm": 0.10812265121811047, "learning_rate": 2.790952495865173e-05, "loss": 0.3106, "step": 1005 }, { "epoch": 3.1923839746132487, "grad_norm": 0.1121565061938828, "learning_rate": 2.7825019967217975e-05, "loss": 0.3189, "step": 1006 }, { "epoch": 3.1955573185243953, "grad_norm": 0.1004631775145022, "learning_rate": 2.7740574820819133e-05, "loss": 0.3137, "step": 1007 }, { "epoch": 3.1987306624355414, "grad_norm": 0.12433594061957719, "learning_rate": 2.7656189934537815e-05, "loss": 0.3081, "step": 1008 }, { "epoch": 3.201904006346688, "grad_norm": 0.0976644148795563, "learning_rate": 2.7571865723160484e-05, "loss": 0.3108, "step": 1009 }, { "epoch": 3.205077350257834, "grad_norm": 0.12033671452278129, "learning_rate": 2.7487602601175274e-05, "loss": 0.3128, "step": 1010 }, { "epoch": 3.2082506941689806, "grad_norm": 0.11018748833970334, "learning_rate": 2.7403400982770142e-05, "loss": 0.3151, "step": 1011 }, { "epoch": 3.211424038080127, "grad_norm": 0.11256643868056433, "learning_rate": 2.731926128183064e-05, "loss": 0.314, "step": 1012 }, { "epoch": 3.214597381991273, "grad_norm": 0.11109067168322144, "learning_rate": 2.7235183911938033e-05, "loss": 0.3138, "step": 1013 }, { "epoch": 3.2177707259024197, "grad_norm": 0.11490231986287124, "learning_rate": 2.715116928636716e-05, "loss": 0.3131, "step": 1014 }, { "epoch": 3.220944069813566, "grad_norm": 0.1101964461135929, "learning_rate": 2.7067217818084475e-05, "loss": 0.3145, "step": 1015 }, { "epoch": 3.2241174137247124, "grad_norm": 0.10280292922760781, "learning_rate": 2.6983329919745968e-05, "loss": 0.3117, "step": 1016 }, { "epoch": 3.227290757635859, "grad_norm": 0.10260424316634227, "learning_rate": 2.689950600369518e-05, "loss": 0.3188, "step": 1017 }, { "epoch": 3.230464101547005, "grad_norm": 0.09915026576929485, "learning_rate": 2.681574648196111e-05, "loss": 0.3128, "step": 1018 }, { "epoch": 3.2336374454581516, "grad_norm": 0.09342686268392349, "learning_rate": 2.673205176625631e-05, "loss": 0.3127, "step": 1019 }, { "epoch": 3.236810789369298, "grad_norm": 0.10375205809388013, "learning_rate": 2.664842226797468e-05, "loss": 0.3125, "step": 1020 }, { "epoch": 3.239984133280444, "grad_norm": 0.09480673127381069, "learning_rate": 2.656485839818964e-05, "loss": 0.3103, "step": 1021 }, { "epoch": 3.2431574771915908, "grad_norm": 0.0967591082180364, "learning_rate": 2.648136056765197e-05, "loss": 0.3158, "step": 1022 }, { "epoch": 3.246330821102737, "grad_norm": 0.10704831819192964, "learning_rate": 2.639792918678786e-05, "loss": 0.3165, "step": 1023 }, { "epoch": 3.2495041650138834, "grad_norm": 0.10372920487733647, "learning_rate": 2.6314564665696853e-05, "loss": 0.314, "step": 1024 }, { "epoch": 3.25267750892503, "grad_norm": 0.10195810032807612, "learning_rate": 2.6231267414149882e-05, "loss": 0.3115, "step": 1025 }, { "epoch": 3.255850852836176, "grad_norm": 0.10118352358935338, "learning_rate": 2.6148037841587178e-05, "loss": 0.3124, "step": 1026 }, { "epoch": 3.2590241967473226, "grad_norm": 0.0960144819596772, "learning_rate": 2.606487635711634e-05, "loss": 0.3133, "step": 1027 }, { "epoch": 3.2621975406584687, "grad_norm": 0.10341496259177649, "learning_rate": 2.5981783369510262e-05, "loss": 0.3133, "step": 1028 }, { "epoch": 3.265370884569615, "grad_norm": 0.09978273187676878, "learning_rate": 2.589875928720518e-05, "loss": 0.3092, "step": 1029 }, { "epoch": 3.2685442284807618, "grad_norm": 0.10892472596771698, "learning_rate": 2.5815804518298575e-05, "loss": 0.3098, "step": 1030 }, { "epoch": 3.271717572391908, "grad_norm": 0.10275518766631388, "learning_rate": 2.5732919470547295e-05, "loss": 0.3083, "step": 1031 }, { "epoch": 3.2748909163030544, "grad_norm": 0.10103072194259327, "learning_rate": 2.5650104551365412e-05, "loss": 0.3108, "step": 1032 }, { "epoch": 3.278064260214201, "grad_norm": 0.10324262880702959, "learning_rate": 2.556736016782234e-05, "loss": 0.3122, "step": 1033 }, { "epoch": 3.281237604125347, "grad_norm": 0.09926981679742919, "learning_rate": 2.5484686726640744e-05, "loss": 0.3107, "step": 1034 }, { "epoch": 3.2844109480364936, "grad_norm": 0.10130743878380367, "learning_rate": 2.540208463419462e-05, "loss": 0.3074, "step": 1035 }, { "epoch": 3.2875842919476397, "grad_norm": 0.10557324952221134, "learning_rate": 2.5319554296507188e-05, "loss": 0.3131, "step": 1036 }, { "epoch": 3.2907576358587862, "grad_norm": 0.10083350399120836, "learning_rate": 2.5237096119249058e-05, "loss": 0.3204, "step": 1037 }, { "epoch": 3.2939309797699323, "grad_norm": 0.0986145583348089, "learning_rate": 2.5154710507736037e-05, "loss": 0.3079, "step": 1038 }, { "epoch": 3.297104323681079, "grad_norm": 0.11022375139763324, "learning_rate": 2.5072397866927335e-05, "loss": 0.317, "step": 1039 }, { "epoch": 3.3002776675922254, "grad_norm": 0.1125140361024117, "learning_rate": 2.4990158601423417e-05, "loss": 0.317, "step": 1040 }, { "epoch": 3.3034510115033715, "grad_norm": 0.11021070433389958, "learning_rate": 2.4907993115464116e-05, "loss": 0.3113, "step": 1041 }, { "epoch": 3.306624355414518, "grad_norm": 0.10634404742096815, "learning_rate": 2.4825901812926574e-05, "loss": 0.3158, "step": 1042 }, { "epoch": 3.3097976993256646, "grad_norm": 0.11699549952140445, "learning_rate": 2.474388509732333e-05, "loss": 0.3111, "step": 1043 }, { "epoch": 3.3129710432368107, "grad_norm": 0.0980534798385101, "learning_rate": 2.466194337180027e-05, "loss": 0.3143, "step": 1044 }, { "epoch": 3.3161443871479572, "grad_norm": 0.1194870796488275, "learning_rate": 2.4580077039134683e-05, "loss": 0.316, "step": 1045 }, { "epoch": 3.3193177310591038, "grad_norm": 0.10484409113962986, "learning_rate": 2.449828650173325e-05, "loss": 0.3135, "step": 1046 }, { "epoch": 3.32249107497025, "grad_norm": 0.10357015345939864, "learning_rate": 2.441657216163015e-05, "loss": 0.3109, "step": 1047 }, { "epoch": 3.3256644188813964, "grad_norm": 0.11296601395220406, "learning_rate": 2.433493442048492e-05, "loss": 0.3133, "step": 1048 }, { "epoch": 3.3288377627925425, "grad_norm": 0.09961102835039787, "learning_rate": 2.4253373679580686e-05, "loss": 0.3158, "step": 1049 }, { "epoch": 3.332011106703689, "grad_norm": 0.10181823522886456, "learning_rate": 2.4171890339822013e-05, "loss": 0.3116, "step": 1050 }, { "epoch": 3.335184450614835, "grad_norm": 0.10979798113117561, "learning_rate": 2.409048480173305e-05, "loss": 0.3162, "step": 1051 }, { "epoch": 3.3383577945259817, "grad_norm": 0.094779351238252, "learning_rate": 2.400915746545548e-05, "loss": 0.315, "step": 1052 }, { "epoch": 3.3415311384371282, "grad_norm": 0.10148023343814727, "learning_rate": 2.392790873074664e-05, "loss": 0.3146, "step": 1053 }, { "epoch": 3.3447044823482743, "grad_norm": 0.09289268477261449, "learning_rate": 2.384673899697746e-05, "loss": 0.3125, "step": 1054 }, { "epoch": 3.347877826259421, "grad_norm": 0.09684287864696564, "learning_rate": 2.376564866313058e-05, "loss": 0.3104, "step": 1055 }, { "epoch": 3.3510511701705674, "grad_norm": 0.09643736175489405, "learning_rate": 2.3684638127798336e-05, "loss": 0.3178, "step": 1056 }, { "epoch": 3.3542245140817135, "grad_norm": 0.10635216596639196, "learning_rate": 2.3603707789180863e-05, "loss": 0.3136, "step": 1057 }, { "epoch": 3.35739785799286, "grad_norm": 0.09676123073144416, "learning_rate": 2.3522858045084016e-05, "loss": 0.3122, "step": 1058 }, { "epoch": 3.360571201904006, "grad_norm": 0.10713967084993489, "learning_rate": 2.344208929291759e-05, "loss": 0.3141, "step": 1059 }, { "epoch": 3.3637445458151527, "grad_norm": 0.09842859790655265, "learning_rate": 2.3361401929693194e-05, "loss": 0.3192, "step": 1060 }, { "epoch": 3.3669178897262992, "grad_norm": 0.0960717662909176, "learning_rate": 2.3280796352022442e-05, "loss": 0.3053, "step": 1061 }, { "epoch": 3.3700912336374453, "grad_norm": 0.10290198906684156, "learning_rate": 2.3200272956114845e-05, "loss": 0.3115, "step": 1062 }, { "epoch": 3.373264577548592, "grad_norm": 0.09091365277942409, "learning_rate": 2.3119832137776088e-05, "loss": 0.3148, "step": 1063 }, { "epoch": 3.376437921459738, "grad_norm": 0.10196717070207367, "learning_rate": 2.3039474292405834e-05, "loss": 0.3111, "step": 1064 }, { "epoch": 3.3796112653708845, "grad_norm": 0.09586271821215989, "learning_rate": 2.295919981499596e-05, "loss": 0.3181, "step": 1065 }, { "epoch": 3.382784609282031, "grad_norm": 0.09899945933563967, "learning_rate": 2.287900910012854e-05, "loss": 0.3162, "step": 1066 }, { "epoch": 3.385957953193177, "grad_norm": 0.10322988072215468, "learning_rate": 2.2798902541973945e-05, "loss": 0.3153, "step": 1067 }, { "epoch": 3.3891312971043237, "grad_norm": 0.09259998496704394, "learning_rate": 2.2718880534288826e-05, "loss": 0.3121, "step": 1068 }, { "epoch": 3.3923046410154702, "grad_norm": 0.10569429128390048, "learning_rate": 2.2638943470414274e-05, "loss": 0.3125, "step": 1069 }, { "epoch": 3.3954779849266163, "grad_norm": 0.09760048377336895, "learning_rate": 2.2559091743273855e-05, "loss": 0.3127, "step": 1070 }, { "epoch": 3.398651328837763, "grad_norm": 0.10906819806313078, "learning_rate": 2.2479325745371662e-05, "loss": 0.3135, "step": 1071 }, { "epoch": 3.401824672748909, "grad_norm": 0.09977494398591062, "learning_rate": 2.239964586879033e-05, "loss": 0.312, "step": 1072 }, { "epoch": 3.4049980166600555, "grad_norm": 0.10832374432497219, "learning_rate": 2.2320052505189307e-05, "loss": 0.3181, "step": 1073 }, { "epoch": 3.408171360571202, "grad_norm": 0.1022824580898682, "learning_rate": 2.2240546045802657e-05, "loss": 0.3224, "step": 1074 }, { "epoch": 3.411344704482348, "grad_norm": 0.10034777595087839, "learning_rate": 2.216112688143735e-05, "loss": 0.3125, "step": 1075 }, { "epoch": 3.4145180483934947, "grad_norm": 0.10030841402583066, "learning_rate": 2.2081795402471248e-05, "loss": 0.3084, "step": 1076 }, { "epoch": 3.417691392304641, "grad_norm": 0.09762803350713443, "learning_rate": 2.2002551998851214e-05, "loss": 0.3093, "step": 1077 }, { "epoch": 3.4208647362157873, "grad_norm": 0.1009931981677632, "learning_rate": 2.192339706009115e-05, "loss": 0.3142, "step": 1078 }, { "epoch": 3.424038080126934, "grad_norm": 0.10087359052596663, "learning_rate": 2.1844330975270146e-05, "loss": 0.3116, "step": 1079 }, { "epoch": 3.42721142403808, "grad_norm": 0.09669963400697884, "learning_rate": 2.1765354133030537e-05, "loss": 0.3093, "step": 1080 }, { "epoch": 3.4303847679492265, "grad_norm": 0.10318895349642157, "learning_rate": 2.1686466921576e-05, "loss": 0.3139, "step": 1081 }, { "epoch": 3.433558111860373, "grad_norm": 0.100273860386507, "learning_rate": 2.1607669728669595e-05, "loss": 0.3147, "step": 1082 }, { "epoch": 3.436731455771519, "grad_norm": 0.10871940977569985, "learning_rate": 2.152896294163198e-05, "loss": 0.3195, "step": 1083 }, { "epoch": 3.4399047996826657, "grad_norm": 0.09861412764646266, "learning_rate": 2.1450346947339354e-05, "loss": 0.3099, "step": 1084 }, { "epoch": 3.443078143593812, "grad_norm": 0.1036174572244448, "learning_rate": 2.137182213222168e-05, "loss": 0.3141, "step": 1085 }, { "epoch": 3.4462514875049584, "grad_norm": 0.10204915828850486, "learning_rate": 2.1293388882260725e-05, "loss": 0.3098, "step": 1086 }, { "epoch": 3.449424831416105, "grad_norm": 0.09799894678131774, "learning_rate": 2.1215047582988195e-05, "loss": 0.3103, "step": 1087 }, { "epoch": 3.452598175327251, "grad_norm": 0.09504536792403677, "learning_rate": 2.113679861948376e-05, "loss": 0.3127, "step": 1088 }, { "epoch": 3.4557715192383975, "grad_norm": 0.09764248583285513, "learning_rate": 2.1058642376373283e-05, "loss": 0.3164, "step": 1089 }, { "epoch": 3.4589448631495436, "grad_norm": 0.0906376025849311, "learning_rate": 2.098057923782685e-05, "loss": 0.3152, "step": 1090 }, { "epoch": 3.46211820706069, "grad_norm": 0.09359589566718937, "learning_rate": 2.0902609587556896e-05, "loss": 0.3117, "step": 1091 }, { "epoch": 3.4652915509718367, "grad_norm": 0.09548394143187244, "learning_rate": 2.082473380881632e-05, "loss": 0.3115, "step": 1092 }, { "epoch": 3.468464894882983, "grad_norm": 0.09338528887344875, "learning_rate": 2.074695228439663e-05, "loss": 0.3114, "step": 1093 }, { "epoch": 3.4716382387941294, "grad_norm": 0.09884602966359947, "learning_rate": 2.066926539662598e-05, "loss": 0.3145, "step": 1094 }, { "epoch": 3.474811582705276, "grad_norm": 0.09230713872679078, "learning_rate": 2.0591673527367386e-05, "loss": 0.31, "step": 1095 }, { "epoch": 3.477984926616422, "grad_norm": 0.09898085658702097, "learning_rate": 2.051417705801681e-05, "loss": 0.3106, "step": 1096 }, { "epoch": 3.4811582705275685, "grad_norm": 0.08889141823884165, "learning_rate": 2.04367763695013e-05, "loss": 0.3096, "step": 1097 }, { "epoch": 3.4843316144387146, "grad_norm": 0.10424786455484286, "learning_rate": 2.0359471842277014e-05, "loss": 0.3125, "step": 1098 }, { "epoch": 3.487504958349861, "grad_norm": 0.09373054000252577, "learning_rate": 2.0282263856327575e-05, "loss": 0.3095, "step": 1099 }, { "epoch": 3.4906783022610077, "grad_norm": 0.09743243651722891, "learning_rate": 2.0205152791161942e-05, "loss": 0.3138, "step": 1100 }, { "epoch": 3.493851646172154, "grad_norm": 0.09903803728619201, "learning_rate": 2.0128139025812737e-05, "loss": 0.3117, "step": 1101 }, { "epoch": 3.4970249900833004, "grad_norm": 0.09797020262865515, "learning_rate": 2.00512229388343e-05, "loss": 0.3083, "step": 1102 }, { "epoch": 3.5001983339944465, "grad_norm": 0.10766478943557996, "learning_rate": 1.9974404908300837e-05, "loss": 0.3139, "step": 1103 }, { "epoch": 3.503371677905593, "grad_norm": 0.09779359977262553, "learning_rate": 1.9897685311804547e-05, "loss": 0.3106, "step": 1104 }, { "epoch": 3.5065450218167395, "grad_norm": 0.09387274555393152, "learning_rate": 1.982106452645382e-05, "loss": 0.3105, "step": 1105 }, { "epoch": 3.5097183657278856, "grad_norm": 0.10203359221398621, "learning_rate": 1.9744542928871335e-05, "loss": 0.3131, "step": 1106 }, { "epoch": 3.512891709639032, "grad_norm": 0.0940533035661825, "learning_rate": 1.966812089519223e-05, "loss": 0.3105, "step": 1107 }, { "epoch": 3.5160650535501787, "grad_norm": 0.11077496419812648, "learning_rate": 1.959179880106219e-05, "loss": 0.3124, "step": 1108 }, { "epoch": 3.519238397461325, "grad_norm": 0.08886436421169637, "learning_rate": 1.9515577021635766e-05, "loss": 0.3133, "step": 1109 }, { "epoch": 3.5224117413724714, "grad_norm": 0.1058427262771896, "learning_rate": 1.9439455931574306e-05, "loss": 0.3188, "step": 1110 }, { "epoch": 3.5255850852836175, "grad_norm": 0.0943621814009501, "learning_rate": 1.9363435905044303e-05, "loss": 0.3115, "step": 1111 }, { "epoch": 3.528758429194764, "grad_norm": 0.10860537135562053, "learning_rate": 1.9287517315715455e-05, "loss": 0.3109, "step": 1112 }, { "epoch": 3.53193177310591, "grad_norm": 0.08975638711859225, "learning_rate": 1.9211700536758867e-05, "loss": 0.3147, "step": 1113 }, { "epoch": 3.5351051170170567, "grad_norm": 0.09431070654205716, "learning_rate": 1.9135985940845167e-05, "loss": 0.3061, "step": 1114 }, { "epoch": 3.538278460928203, "grad_norm": 0.09176047436666471, "learning_rate": 1.9060373900142758e-05, "loss": 0.3039, "step": 1115 }, { "epoch": 3.5414518048393493, "grad_norm": 0.09566553815030118, "learning_rate": 1.898486478631591e-05, "loss": 0.3096, "step": 1116 }, { "epoch": 3.544625148750496, "grad_norm": 0.09398037913009233, "learning_rate": 1.8909458970523003e-05, "loss": 0.3117, "step": 1117 }, { "epoch": 3.5477984926616424, "grad_norm": 0.09297530409510038, "learning_rate": 1.8834156823414592e-05, "loss": 0.3143, "step": 1118 }, { "epoch": 3.5509718365727885, "grad_norm": 0.09616519717958327, "learning_rate": 1.8758958715131763e-05, "loss": 0.3134, "step": 1119 }, { "epoch": 3.554145180483935, "grad_norm": 0.09595619987699143, "learning_rate": 1.8683865015304107e-05, "loss": 0.3167, "step": 1120 }, { "epoch": 3.5573185243950816, "grad_norm": 0.09428857006752461, "learning_rate": 1.860887609304806e-05, "loss": 0.32, "step": 1121 }, { "epoch": 3.5604918683062277, "grad_norm": 0.09288431779361411, "learning_rate": 1.853399231696502e-05, "loss": 0.3105, "step": 1122 }, { "epoch": 3.563665212217374, "grad_norm": 0.09478695944187812, "learning_rate": 1.845921405513957e-05, "loss": 0.3136, "step": 1123 }, { "epoch": 3.5668385561285203, "grad_norm": 0.09259086063066573, "learning_rate": 1.838454167513759e-05, "loss": 0.3082, "step": 1124 }, { "epoch": 3.570011900039667, "grad_norm": 0.09731407575423262, "learning_rate": 1.8309975544004563e-05, "loss": 0.3097, "step": 1125 }, { "epoch": 3.573185243950813, "grad_norm": 0.08908940261693532, "learning_rate": 1.8235516028263693e-05, "loss": 0.3102, "step": 1126 }, { "epoch": 3.5763585878619595, "grad_norm": 0.0994114922720132, "learning_rate": 1.8161163493914138e-05, "loss": 0.3127, "step": 1127 }, { "epoch": 3.579531931773106, "grad_norm": 0.09284958185097307, "learning_rate": 1.808691830642915e-05, "loss": 0.306, "step": 1128 }, { "epoch": 3.582705275684252, "grad_norm": 0.09838434020935204, "learning_rate": 1.8012780830754428e-05, "loss": 0.3142, "step": 1129 }, { "epoch": 3.5858786195953987, "grad_norm": 0.0957701843609928, "learning_rate": 1.7938751431306108e-05, "loss": 0.3059, "step": 1130 }, { "epoch": 3.589051963506545, "grad_norm": 0.10452299557787663, "learning_rate": 1.7864830471969158e-05, "loss": 0.3149, "step": 1131 }, { "epoch": 3.5922253074176913, "grad_norm": 0.09852568670954905, "learning_rate": 1.77910183160955e-05, "loss": 0.318, "step": 1132 }, { "epoch": 3.595398651328838, "grad_norm": 0.08964997440395585, "learning_rate": 1.771731532650226e-05, "loss": 0.3132, "step": 1133 }, { "epoch": 3.5985719952399844, "grad_norm": 0.10225504250213735, "learning_rate": 1.7643721865469913e-05, "loss": 0.3125, "step": 1134 }, { "epoch": 3.6017453391511305, "grad_norm": 0.09337353095498398, "learning_rate": 1.757023829474061e-05, "loss": 0.3096, "step": 1135 }, { "epoch": 3.604918683062277, "grad_norm": 0.09729026605466504, "learning_rate": 1.7496864975516326e-05, "loss": 0.3143, "step": 1136 }, { "epoch": 3.608092026973423, "grad_norm": 0.09648684875445444, "learning_rate": 1.7423602268457124e-05, "loss": 0.3176, "step": 1137 }, { "epoch": 3.6112653708845697, "grad_norm": 0.0891461740774676, "learning_rate": 1.7350450533679298e-05, "loss": 0.3076, "step": 1138 }, { "epoch": 3.6144387147957158, "grad_norm": 0.09136934909265221, "learning_rate": 1.7277410130753775e-05, "loss": 0.3113, "step": 1139 }, { "epoch": 3.6176120587068623, "grad_norm": 0.09282078764180805, "learning_rate": 1.7204481418704136e-05, "loss": 0.3088, "step": 1140 }, { "epoch": 3.620785402618009, "grad_norm": 0.08738434824899044, "learning_rate": 1.7131664756005012e-05, "loss": 0.3112, "step": 1141 }, { "epoch": 3.623958746529155, "grad_norm": 0.09634650667327464, "learning_rate": 1.705896050058025e-05, "loss": 0.3159, "step": 1142 }, { "epoch": 3.6271320904403015, "grad_norm": 0.09161796695924315, "learning_rate": 1.698636900980119e-05, "loss": 0.3087, "step": 1143 }, { "epoch": 3.630305434351448, "grad_norm": 0.08736032109536832, "learning_rate": 1.6913890640484844e-05, "loss": 0.3137, "step": 1144 }, { "epoch": 3.633478778262594, "grad_norm": 0.09745072572329685, "learning_rate": 1.6841525748892216e-05, "loss": 0.3128, "step": 1145 }, { "epoch": 3.6366521221737407, "grad_norm": 0.08662042757890395, "learning_rate": 1.6769274690726523e-05, "loss": 0.3131, "step": 1146 }, { "epoch": 3.639825466084887, "grad_norm": 0.09312666065952754, "learning_rate": 1.6697137821131443e-05, "loss": 0.3086, "step": 1147 }, { "epoch": 3.6429988099960333, "grad_norm": 0.10075435906006584, "learning_rate": 1.6625115494689327e-05, "loss": 0.3144, "step": 1148 }, { "epoch": 3.6461721539071794, "grad_norm": 0.08168920500142361, "learning_rate": 1.6553208065419585e-05, "loss": 0.3067, "step": 1149 }, { "epoch": 3.649345497818326, "grad_norm": 0.09473700986154443, "learning_rate": 1.648141588677677e-05, "loss": 0.3086, "step": 1150 }, { "epoch": 3.6525188417294725, "grad_norm": 0.09995319497546384, "learning_rate": 1.6409739311648985e-05, "loss": 0.3153, "step": 1151 }, { "epoch": 3.6556921856406186, "grad_norm": 0.08763944506301463, "learning_rate": 1.633817869235608e-05, "loss": 0.3109, "step": 1152 }, { "epoch": 3.658865529551765, "grad_norm": 0.09881602368631207, "learning_rate": 1.626673438064795e-05, "loss": 0.321, "step": 1153 }, { "epoch": 3.6620388734629117, "grad_norm": 0.09238325300250558, "learning_rate": 1.6195406727702746e-05, "loss": 0.3176, "step": 1154 }, { "epoch": 3.665212217374058, "grad_norm": 0.09179321263341135, "learning_rate": 1.6124196084125235e-05, "loss": 0.3149, "step": 1155 }, { "epoch": 3.6683855612852043, "grad_norm": 0.09632257418168569, "learning_rate": 1.6053102799945026e-05, "loss": 0.3152, "step": 1156 }, { "epoch": 3.671558905196351, "grad_norm": 0.10636674102365737, "learning_rate": 1.5982127224614867e-05, "loss": 0.3183, "step": 1157 }, { "epoch": 3.674732249107497, "grad_norm": 0.0872410805303305, "learning_rate": 1.5911269707008857e-05, "loss": 0.3114, "step": 1158 }, { "epoch": 3.6779055930186435, "grad_norm": 0.09875023791232529, "learning_rate": 1.5840530595420903e-05, "loss": 0.3061, "step": 1159 }, { "epoch": 3.6810789369297896, "grad_norm": 0.09455931044972642, "learning_rate": 1.5769910237562798e-05, "loss": 0.3069, "step": 1160 }, { "epoch": 3.684252280840936, "grad_norm": 0.08864062113346818, "learning_rate": 1.5699408980562653e-05, "loss": 0.3091, "step": 1161 }, { "epoch": 3.6874256247520822, "grad_norm": 0.0957648926194275, "learning_rate": 1.562902717096316e-05, "loss": 0.3105, "step": 1162 }, { "epoch": 3.690598968663229, "grad_norm": 0.08822389225988175, "learning_rate": 1.5558765154719867e-05, "loss": 0.312, "step": 1163 }, { "epoch": 3.6937723125743753, "grad_norm": 0.09144318120715113, "learning_rate": 1.5488623277199463e-05, "loss": 0.3133, "step": 1164 }, { "epoch": 3.6969456564855214, "grad_norm": 0.08207249013636533, "learning_rate": 1.5418601883178138e-05, "loss": 0.3093, "step": 1165 }, { "epoch": 3.700119000396668, "grad_norm": 0.08659915132311609, "learning_rate": 1.5348701316839844e-05, "loss": 0.3118, "step": 1166 }, { "epoch": 3.7032923443078145, "grad_norm": 0.0829362247581557, "learning_rate": 1.5278921921774624e-05, "loss": 0.3101, "step": 1167 }, { "epoch": 3.7064656882189606, "grad_norm": 0.08830303321169561, "learning_rate": 1.5209264040976911e-05, "loss": 0.3121, "step": 1168 }, { "epoch": 3.709639032130107, "grad_norm": 0.08371651283489175, "learning_rate": 1.5139728016843846e-05, "loss": 0.3121, "step": 1169 }, { "epoch": 3.7128123760412537, "grad_norm": 0.09394104777007455, "learning_rate": 1.507031419117357e-05, "loss": 0.3128, "step": 1170 }, { "epoch": 3.7159857199524, "grad_norm": 0.08410977786060891, "learning_rate": 1.5001022905163596e-05, "loss": 0.3085, "step": 1171 }, { "epoch": 3.7191590638635463, "grad_norm": 0.09126456097238246, "learning_rate": 1.49318544994091e-05, "loss": 0.3134, "step": 1172 }, { "epoch": 3.7223324077746924, "grad_norm": 0.08545709007632069, "learning_rate": 1.4862809313901268e-05, "loss": 0.3154, "step": 1173 }, { "epoch": 3.725505751685839, "grad_norm": 0.08780137354204715, "learning_rate": 1.4793887688025534e-05, "loss": 0.311, "step": 1174 }, { "epoch": 3.728679095596985, "grad_norm": 0.09036873460150745, "learning_rate": 1.4725089960560106e-05, "loss": 0.3092, "step": 1175 }, { "epoch": 3.7318524395081316, "grad_norm": 0.0906949944979453, "learning_rate": 1.4656416469674067e-05, "loss": 0.3087, "step": 1176 }, { "epoch": 3.735025783419278, "grad_norm": 0.08524397104099314, "learning_rate": 1.4587867552925886e-05, "loss": 0.3097, "step": 1177 }, { "epoch": 3.7381991273304243, "grad_norm": 0.0831715872678655, "learning_rate": 1.4519443547261692e-05, "loss": 0.3091, "step": 1178 }, { "epoch": 3.741372471241571, "grad_norm": 0.08787932200139537, "learning_rate": 1.445114478901362e-05, "loss": 0.3104, "step": 1179 }, { "epoch": 3.7445458151527173, "grad_norm": 0.09013424171138545, "learning_rate": 1.4382971613898145e-05, "loss": 0.3133, "step": 1180 }, { "epoch": 3.7477191590638634, "grad_norm": 0.08872383417712887, "learning_rate": 1.431492435701447e-05, "loss": 0.3103, "step": 1181 }, { "epoch": 3.75089250297501, "grad_norm": 0.0964431058984415, "learning_rate": 1.424700335284286e-05, "loss": 0.3142, "step": 1182 }, { "epoch": 3.7540658468861565, "grad_norm": 0.08591965622221728, "learning_rate": 1.4179208935243e-05, "loss": 0.3075, "step": 1183 }, { "epoch": 3.7572391907973026, "grad_norm": 0.08719874856870091, "learning_rate": 1.4111541437452294e-05, "loss": 0.3111, "step": 1184 }, { "epoch": 3.760412534708449, "grad_norm": 0.09819733047781874, "learning_rate": 1.4044001192084391e-05, "loss": 0.3099, "step": 1185 }, { "epoch": 3.7635858786195953, "grad_norm": 0.0836739072391128, "learning_rate": 1.3976588531127334e-05, "loss": 0.3115, "step": 1186 }, { "epoch": 3.766759222530742, "grad_norm": 0.09633657842787988, "learning_rate": 1.3909303785942089e-05, "loss": 0.3154, "step": 1187 }, { "epoch": 3.769932566441888, "grad_norm": 0.08398570887749328, "learning_rate": 1.3842147287260863e-05, "loss": 0.3114, "step": 1188 }, { "epoch": 3.7731059103530344, "grad_norm": 0.08881126144850926, "learning_rate": 1.3775119365185484e-05, "loss": 0.3135, "step": 1189 }, { "epoch": 3.776279254264181, "grad_norm": 0.087196679652796, "learning_rate": 1.3708220349185731e-05, "loss": 0.3096, "step": 1190 }, { "epoch": 3.779452598175327, "grad_norm": 0.0818848922205358, "learning_rate": 1.3641450568097794e-05, "loss": 0.3072, "step": 1191 }, { "epoch": 3.7826259420864736, "grad_norm": 0.09311600595200806, "learning_rate": 1.3574810350122625e-05, "loss": 0.3104, "step": 1192 }, { "epoch": 3.78579928599762, "grad_norm": 0.0838102155532246, "learning_rate": 1.350830002282431e-05, "loss": 0.3128, "step": 1193 }, { "epoch": 3.7889726299087663, "grad_norm": 0.08273644119209583, "learning_rate": 1.344191991312843e-05, "loss": 0.3099, "step": 1194 }, { "epoch": 3.792145973819913, "grad_norm": 0.08673064009083564, "learning_rate": 1.3375670347320577e-05, "loss": 0.3108, "step": 1195 }, { "epoch": 3.7953193177310593, "grad_norm": 0.08342237037133293, "learning_rate": 1.330955165104459e-05, "loss": 0.3074, "step": 1196 }, { "epoch": 3.7984926616422054, "grad_norm": 0.08544215414242184, "learning_rate": 1.3243564149301058e-05, "loss": 0.3092, "step": 1197 }, { "epoch": 3.801666005553352, "grad_norm": 0.07858190033954376, "learning_rate": 1.3177708166445702e-05, "loss": 0.3085, "step": 1198 }, { "epoch": 3.804839349464498, "grad_norm": 0.0853891038054064, "learning_rate": 1.311198402618778e-05, "loss": 0.307, "step": 1199 }, { "epoch": 3.8080126933756446, "grad_norm": 0.07989799358178139, "learning_rate": 1.3046392051588454e-05, "loss": 0.3128, "step": 1200 }, { "epoch": 3.8111860372867907, "grad_norm": 0.08828442224939745, "learning_rate": 1.2980932565059261e-05, "loss": 0.3148, "step": 1201 }, { "epoch": 3.8143593811979373, "grad_norm": 0.08068725629005853, "learning_rate": 1.2915605888360516e-05, "loss": 0.3177, "step": 1202 }, { "epoch": 3.817532725109084, "grad_norm": 0.08257356138701971, "learning_rate": 1.2850412342599712e-05, "loss": 0.3119, "step": 1203 }, { "epoch": 3.82070606902023, "grad_norm": 0.0866333410804087, "learning_rate": 1.2785352248229907e-05, "loss": 0.3098, "step": 1204 }, { "epoch": 3.8238794129313765, "grad_norm": 0.08166886458082445, "learning_rate": 1.2720425925048274e-05, "loss": 0.3125, "step": 1205 }, { "epoch": 3.827052756842523, "grad_norm": 0.08920230072749204, "learning_rate": 1.2655633692194367e-05, "loss": 0.3106, "step": 1206 }, { "epoch": 3.830226100753669, "grad_norm": 0.08897293804748853, "learning_rate": 1.259097586814867e-05, "loss": 0.3146, "step": 1207 }, { "epoch": 3.8333994446648156, "grad_norm": 0.08018138419305465, "learning_rate": 1.2526452770730986e-05, "loss": 0.3138, "step": 1208 }, { "epoch": 3.836572788575962, "grad_norm": 0.08612406310222656, "learning_rate": 1.246206471709889e-05, "loss": 0.3125, "step": 1209 }, { "epoch": 3.8397461324871083, "grad_norm": 0.08560142904793838, "learning_rate": 1.2397812023746124e-05, "loss": 0.3098, "step": 1210 }, { "epoch": 3.842919476398255, "grad_norm": 0.08086605996564016, "learning_rate": 1.2333695006501127e-05, "loss": 0.3099, "step": 1211 }, { "epoch": 3.846092820309401, "grad_norm": 0.08291451210131907, "learning_rate": 1.2269713980525407e-05, "loss": 0.3057, "step": 1212 }, { "epoch": 3.8492661642205475, "grad_norm": 0.08303662204470749, "learning_rate": 1.2205869260312034e-05, "loss": 0.3133, "step": 1213 }, { "epoch": 3.8524395081316936, "grad_norm": 0.08226024846397162, "learning_rate": 1.2142161159684034e-05, "loss": 0.3158, "step": 1214 }, { "epoch": 3.85561285204284, "grad_norm": 0.08613484735188547, "learning_rate": 1.2078589991792983e-05, "loss": 0.3118, "step": 1215 }, { "epoch": 3.8587861959539866, "grad_norm": 0.0844370020344609, "learning_rate": 1.2015156069117278e-05, "loss": 0.311, "step": 1216 }, { "epoch": 3.8619595398651327, "grad_norm": 0.08157413068957893, "learning_rate": 1.195185970346075e-05, "loss": 0.3118, "step": 1217 }, { "epoch": 3.8651328837762793, "grad_norm": 0.0809161660461467, "learning_rate": 1.1888701205951084e-05, "loss": 0.3074, "step": 1218 }, { "epoch": 3.868306227687426, "grad_norm": 0.08569592239748124, "learning_rate": 1.1825680887038274e-05, "loss": 0.3136, "step": 1219 }, { "epoch": 3.871479571598572, "grad_norm": 0.08144000303149429, "learning_rate": 1.1762799056493095e-05, "loss": 0.3117, "step": 1220 }, { "epoch": 3.8746529155097185, "grad_norm": 0.08023713096676821, "learning_rate": 1.1700056023405622e-05, "loss": 0.3085, "step": 1221 }, { "epoch": 3.877826259420865, "grad_norm": 0.08533264894900448, "learning_rate": 1.1637452096183663e-05, "loss": 0.3112, "step": 1222 }, { "epoch": 3.880999603332011, "grad_norm": 0.08294444189988791, "learning_rate": 1.1574987582551293e-05, "loss": 0.312, "step": 1223 }, { "epoch": 3.884172947243157, "grad_norm": 0.07997988212163627, "learning_rate": 1.1512662789547249e-05, "loss": 0.3102, "step": 1224 }, { "epoch": 3.8873462911543037, "grad_norm": 0.08618908178309605, "learning_rate": 1.1450478023523575e-05, "loss": 0.3104, "step": 1225 }, { "epoch": 3.8905196350654503, "grad_norm": 0.08855469703733576, "learning_rate": 1.1388433590143939e-05, "loss": 0.3099, "step": 1226 }, { "epoch": 3.8936929789765964, "grad_norm": 0.08401029593007889, "learning_rate": 1.1326529794382264e-05, "loss": 0.3064, "step": 1227 }, { "epoch": 3.896866322887743, "grad_norm": 0.08447977219178371, "learning_rate": 1.1264766940521171e-05, "loss": 0.3144, "step": 1228 }, { "epoch": 3.9000396667988895, "grad_norm": 0.08858538731350699, "learning_rate": 1.1203145332150505e-05, "loss": 0.3121, "step": 1229 }, { "epoch": 3.9032130107100356, "grad_norm": 0.0909725806781119, "learning_rate": 1.1141665272165789e-05, "loss": 0.3109, "step": 1230 }, { "epoch": 3.906386354621182, "grad_norm": 0.08293088335470532, "learning_rate": 1.1080327062766827e-05, "loss": 0.3125, "step": 1231 }, { "epoch": 3.9095596985323287, "grad_norm": 0.08569314487810697, "learning_rate": 1.1019131005456143e-05, "loss": 0.3083, "step": 1232 }, { "epoch": 3.9127330424434748, "grad_norm": 0.08315219817463836, "learning_rate": 1.0958077401037542e-05, "loss": 0.313, "step": 1233 }, { "epoch": 3.9159063863546213, "grad_norm": 0.08276037981985424, "learning_rate": 1.0897166549614573e-05, "loss": 0.314, "step": 1234 }, { "epoch": 3.9190797302657674, "grad_norm": 0.08774104502041317, "learning_rate": 1.0836398750589172e-05, "loss": 0.3071, "step": 1235 }, { "epoch": 3.922253074176914, "grad_norm": 0.08859302501482878, "learning_rate": 1.0775774302660027e-05, "loss": 0.3148, "step": 1236 }, { "epoch": 3.92542641808806, "grad_norm": 0.08667466575751302, "learning_rate": 1.0715293503821256e-05, "loss": 0.309, "step": 1237 }, { "epoch": 3.9285997619992066, "grad_norm": 0.08724856226631149, "learning_rate": 1.0654956651360857e-05, "loss": 0.3147, "step": 1238 }, { "epoch": 3.931773105910353, "grad_norm": 0.08515851001054704, "learning_rate": 1.0594764041859293e-05, "loss": 0.312, "step": 1239 }, { "epoch": 3.934946449821499, "grad_norm": 0.08664134089549802, "learning_rate": 1.0534715971187976e-05, "loss": 0.3065, "step": 1240 }, { "epoch": 3.9381197937326458, "grad_norm": 0.07937577751856649, "learning_rate": 1.0474812734507886e-05, "loss": 0.3079, "step": 1241 }, { "epoch": 3.9412931376437923, "grad_norm": 0.08269171013905462, "learning_rate": 1.041505462626807e-05, "loss": 0.3078, "step": 1242 }, { "epoch": 3.9444664815549384, "grad_norm": 0.08011960205561683, "learning_rate": 1.0355441940204215e-05, "loss": 0.3066, "step": 1243 }, { "epoch": 3.947639825466085, "grad_norm": 0.07819385992242202, "learning_rate": 1.0295974969337186e-05, "loss": 0.3128, "step": 1244 }, { "epoch": 3.9508131693772315, "grad_norm": 0.08250283712608554, "learning_rate": 1.0236654005971625e-05, "loss": 0.3181, "step": 1245 }, { "epoch": 3.9539865132883776, "grad_norm": 0.6384901300040913, "learning_rate": 1.017747934169444e-05, "loss": 0.3072, "step": 1246 }, { "epoch": 3.957159857199524, "grad_norm": 0.08198945954442947, "learning_rate": 1.0118451267373462e-05, "loss": 0.3088, "step": 1247 }, { "epoch": 3.96033320111067, "grad_norm": 0.08781474473700562, "learning_rate": 1.0059570073155953e-05, "loss": 0.3119, "step": 1248 }, { "epoch": 3.9635065450218168, "grad_norm": 0.08246668180469044, "learning_rate": 1.0000836048467221e-05, "loss": 0.313, "step": 1249 }, { "epoch": 3.966679888932963, "grad_norm": 0.08889236389023081, "learning_rate": 9.942249482009117e-06, "loss": 0.3148, "step": 1250 }, { "epoch": 3.9698532328441094, "grad_norm": 0.08718334142633781, "learning_rate": 9.88381066175876e-06, "loss": 0.3098, "step": 1251 }, { "epoch": 3.973026576755256, "grad_norm": 0.08701871231451917, "learning_rate": 9.825519874966952e-06, "loss": 0.3132, "step": 1252 }, { "epoch": 3.976199920666402, "grad_norm": 0.08864822845899448, "learning_rate": 9.767377408156906e-06, "loss": 0.3141, "step": 1253 }, { "epoch": 3.9793732645775486, "grad_norm": 0.08251988049787, "learning_rate": 9.709383547122764e-06, "loss": 0.3106, "step": 1254 }, { "epoch": 3.982546608488695, "grad_norm": 0.0829769224836818, "learning_rate": 9.651538576928213e-06, "loss": 0.3147, "step": 1255 }, { "epoch": 3.9857199523998412, "grad_norm": 0.08191823227244799, "learning_rate": 9.59384278190505e-06, "loss": 0.3068, "step": 1256 }, { "epoch": 3.9888932963109878, "grad_norm": 0.08231464325661744, "learning_rate": 9.53629644565186e-06, "loss": 0.311, "step": 1257 }, { "epoch": 3.9920666402221343, "grad_norm": 0.07952802959102342, "learning_rate": 9.478899851032554e-06, "loss": 0.3092, "step": 1258 }, { "epoch": 3.9952399841332804, "grad_norm": 0.07741615193611691, "learning_rate": 9.421653280175014e-06, "loss": 0.3043, "step": 1259 }, { "epoch": 3.998413328044427, "grad_norm": 0.0792129238358985, "learning_rate": 9.364557014469651e-06, "loss": 0.3106, "step": 1260 }, { "epoch": 4.0015866719555735, "grad_norm": 0.2372544363090627, "learning_rate": 9.307611334568137e-06, "loss": 0.5642, "step": 1261 }, { "epoch": 4.00476001586672, "grad_norm": 0.12928047394977374, "learning_rate": 9.250816520381884e-06, "loss": 0.2916, "step": 1262 }, { "epoch": 4.007933359777866, "grad_norm": 0.1547813755040527, "learning_rate": 9.194172851080778e-06, "loss": 0.2904, "step": 1263 }, { "epoch": 4.011106703689013, "grad_norm": 0.12678189534036333, "learning_rate": 9.137680605091753e-06, "loss": 0.2871, "step": 1264 }, { "epoch": 4.014280047600159, "grad_norm": 0.13337837439164088, "learning_rate": 9.081340060097443e-06, "loss": 0.286, "step": 1265 }, { "epoch": 4.017453391511305, "grad_norm": 0.1460976490959922, "learning_rate": 9.025151493034779e-06, "loss": 0.288, "step": 1266 }, { "epoch": 4.020626735422451, "grad_norm": 0.11381826357536112, "learning_rate": 8.969115180093699e-06, "loss": 0.2894, "step": 1267 }, { "epoch": 4.023800079333598, "grad_norm": 0.1292486747466223, "learning_rate": 8.91323139671572e-06, "loss": 0.2931, "step": 1268 }, { "epoch": 4.026973423244744, "grad_norm": 0.1243312232469112, "learning_rate": 8.857500417592648e-06, "loss": 0.2908, "step": 1269 }, { "epoch": 4.03014676715589, "grad_norm": 0.10431292634709807, "learning_rate": 8.801922516665127e-06, "loss": 0.285, "step": 1270 }, { "epoch": 4.033320111067037, "grad_norm": 0.11511588958038933, "learning_rate": 8.746497967121445e-06, "loss": 0.288, "step": 1271 }, { "epoch": 4.036493454978183, "grad_norm": 0.09400500609718773, "learning_rate": 8.69122704139604e-06, "loss": 0.2906, "step": 1272 }, { "epoch": 4.039666798889329, "grad_norm": 0.10377314324554619, "learning_rate": 8.636110011168246e-06, "loss": 0.2847, "step": 1273 }, { "epoch": 4.042840142800476, "grad_norm": 0.09999767041994467, "learning_rate": 8.58114714736094e-06, "loss": 0.2892, "step": 1274 }, { "epoch": 4.046013486711622, "grad_norm": 0.0879076384755368, "learning_rate": 8.526338720139225e-06, "loss": 0.2866, "step": 1275 }, { "epoch": 4.0491868306227685, "grad_norm": 0.10595158327559934, "learning_rate": 8.471684998909033e-06, "loss": 0.2856, "step": 1276 }, { "epoch": 4.0523601745339155, "grad_norm": 0.09994029668981493, "learning_rate": 8.417186252315904e-06, "loss": 0.2862, "step": 1277 }, { "epoch": 4.055533518445062, "grad_norm": 0.08748710664677468, "learning_rate": 8.362842748243593e-06, "loss": 0.2888, "step": 1278 }, { "epoch": 4.058706862356208, "grad_norm": 0.1057862773429233, "learning_rate": 8.308654753812785e-06, "loss": 0.2922, "step": 1279 }, { "epoch": 4.061880206267354, "grad_norm": 0.10805451904620589, "learning_rate": 8.254622535379733e-06, "loss": 0.2933, "step": 1280 }, { "epoch": 4.065053550178501, "grad_norm": 0.08529498056433478, "learning_rate": 8.200746358535054e-06, "loss": 0.289, "step": 1281 }, { "epoch": 4.068226894089647, "grad_norm": 0.10025625808348602, "learning_rate": 8.147026488102288e-06, "loss": 0.2861, "step": 1282 }, { "epoch": 4.071400238000793, "grad_norm": 0.10466816524732167, "learning_rate": 8.093463188136712e-06, "loss": 0.2879, "step": 1283 }, { "epoch": 4.07457358191194, "grad_norm": 0.09159217995048068, "learning_rate": 8.04005672192397e-06, "loss": 0.2911, "step": 1284 }, { "epoch": 4.077746925823086, "grad_norm": 0.09573656877621275, "learning_rate": 7.986807351978827e-06, "loss": 0.2858, "step": 1285 }, { "epoch": 4.080920269734232, "grad_norm": 0.09231500426704371, "learning_rate": 7.933715340043822e-06, "loss": 0.2906, "step": 1286 }, { "epoch": 4.084093613645379, "grad_norm": 0.09110127087440445, "learning_rate": 7.880780947088031e-06, "loss": 0.2888, "step": 1287 }, { "epoch": 4.087266957556525, "grad_norm": 0.09178186024677654, "learning_rate": 7.82800443330578e-06, "loss": 0.2845, "step": 1288 }, { "epoch": 4.090440301467671, "grad_norm": 0.08699713547366167, "learning_rate": 7.77538605811535e-06, "loss": 0.2949, "step": 1289 }, { "epoch": 4.093613645378818, "grad_norm": 0.08651443976613882, "learning_rate": 7.722926080157673e-06, "loss": 0.2915, "step": 1290 }, { "epoch": 4.096786989289964, "grad_norm": 0.08839398111073136, "learning_rate": 7.670624757295151e-06, "loss": 0.2895, "step": 1291 }, { "epoch": 4.0999603332011105, "grad_norm": 0.07840294963123258, "learning_rate": 7.618482346610276e-06, "loss": 0.2842, "step": 1292 }, { "epoch": 4.103133677112257, "grad_norm": 0.08242092139423192, "learning_rate": 7.566499104404452e-06, "loss": 0.2848, "step": 1293 }, { "epoch": 4.106307021023404, "grad_norm": 0.08497119568891674, "learning_rate": 7.514675286196698e-06, "loss": 0.2914, "step": 1294 }, { "epoch": 4.10948036493455, "grad_norm": 0.0810225885617694, "learning_rate": 7.46301114672241e-06, "loss": 0.2872, "step": 1295 }, { "epoch": 4.112653708845696, "grad_norm": 0.08718653499899484, "learning_rate": 7.411506939932058e-06, "loss": 0.2875, "step": 1296 }, { "epoch": 4.115827052756843, "grad_norm": 0.08535338762467477, "learning_rate": 7.360162918990021e-06, "loss": 0.2846, "step": 1297 }, { "epoch": 4.119000396667989, "grad_norm": 0.07685735750488655, "learning_rate": 7.308979336273281e-06, "loss": 0.2843, "step": 1298 }, { "epoch": 4.122173740579135, "grad_norm": 0.08263564119903752, "learning_rate": 7.257956443370209e-06, "loss": 0.2905, "step": 1299 }, { "epoch": 4.125347084490282, "grad_norm": 0.08566614638966435, "learning_rate": 7.20709449107928e-06, "loss": 0.289, "step": 1300 }, { "epoch": 4.128520428401428, "grad_norm": 0.08073179020805406, "learning_rate": 7.156393729407956e-06, "loss": 0.285, "step": 1301 }, { "epoch": 4.131693772312574, "grad_norm": 0.08607718507088445, "learning_rate": 7.1058544075712995e-06, "loss": 0.2958, "step": 1302 }, { "epoch": 4.134867116223721, "grad_norm": 0.0773081314831222, "learning_rate": 7.055476773990881e-06, "loss": 0.2829, "step": 1303 }, { "epoch": 4.138040460134867, "grad_norm": 0.08091752523907358, "learning_rate": 7.0052610762934905e-06, "loss": 0.2864, "step": 1304 }, { "epoch": 4.141213804046013, "grad_norm": 0.08363422913570277, "learning_rate": 6.955207561309944e-06, "loss": 0.2949, "step": 1305 }, { "epoch": 4.1443871479571595, "grad_norm": 0.07855930876619456, "learning_rate": 6.905316475073842e-06, "loss": 0.2921, "step": 1306 }, { "epoch": 4.147560491868306, "grad_norm": 0.07964756493675755, "learning_rate": 6.855588062820407e-06, "loss": 0.2895, "step": 1307 }, { "epoch": 4.1507338357794525, "grad_norm": 0.0788278091150506, "learning_rate": 6.806022568985233e-06, "loss": 0.2864, "step": 1308 }, { "epoch": 4.153907179690599, "grad_norm": 0.08049149360175006, "learning_rate": 6.756620237203124e-06, "loss": 0.2825, "step": 1309 }, { "epoch": 4.157080523601746, "grad_norm": 0.080868151969321, "learning_rate": 6.707381310306833e-06, "loss": 0.2892, "step": 1310 }, { "epoch": 4.160253867512892, "grad_norm": 0.07798552901381123, "learning_rate": 6.658306030325978e-06, "loss": 0.2865, "step": 1311 }, { "epoch": 4.163427211424038, "grad_norm": 0.0805335375512832, "learning_rate": 6.60939463848572e-06, "loss": 0.2911, "step": 1312 }, { "epoch": 4.166600555335185, "grad_norm": 0.07518550741681629, "learning_rate": 6.560647375205676e-06, "loss": 0.2882, "step": 1313 }, { "epoch": 4.169773899246331, "grad_norm": 0.0781854434454258, "learning_rate": 6.512064480098694e-06, "loss": 0.2891, "step": 1314 }, { "epoch": 4.172947243157477, "grad_norm": 0.07987762578525559, "learning_rate": 6.4636461919697034e-06, "loss": 0.2856, "step": 1315 }, { "epoch": 4.176120587068624, "grad_norm": 0.07606785506312735, "learning_rate": 6.4153927488144865e-06, "loss": 0.2845, "step": 1316 }, { "epoch": 4.17929393097977, "grad_norm": 0.07637479917962597, "learning_rate": 6.367304387818567e-06, "loss": 0.277, "step": 1317 }, { "epoch": 4.182467274890916, "grad_norm": 0.08032228596283143, "learning_rate": 6.31938134535603e-06, "loss": 0.2869, "step": 1318 }, { "epoch": 4.185640618802062, "grad_norm": 0.07742967651427472, "learning_rate": 6.271623856988336e-06, "loss": 0.2841, "step": 1319 }, { "epoch": 4.188813962713209, "grad_norm": 0.0808988691015848, "learning_rate": 6.224032157463184e-06, "loss": 0.2906, "step": 1320 }, { "epoch": 4.191987306624355, "grad_norm": 0.07749448537178391, "learning_rate": 6.176606480713365e-06, "loss": 0.2898, "step": 1321 }, { "epoch": 4.1951606505355015, "grad_norm": 0.07984359673939721, "learning_rate": 6.129347059855572e-06, "loss": 0.2883, "step": 1322 }, { "epoch": 4.1983339944466485, "grad_norm": 0.07874063845144524, "learning_rate": 6.082254127189302e-06, "loss": 0.2863, "step": 1323 }, { "epoch": 4.2015073383577946, "grad_norm": 0.07786902211545077, "learning_rate": 6.035327914195694e-06, "loss": 0.2911, "step": 1324 }, { "epoch": 4.204680682268941, "grad_norm": 0.08187795024389079, "learning_rate": 5.988568651536399e-06, "loss": 0.2928, "step": 1325 }, { "epoch": 4.207854026180088, "grad_norm": 0.07811082790053991, "learning_rate": 5.941976569052394e-06, "loss": 0.2855, "step": 1326 }, { "epoch": 4.211027370091234, "grad_norm": 0.0781331548320789, "learning_rate": 5.895551895762968e-06, "loss": 0.2873, "step": 1327 }, { "epoch": 4.21420071400238, "grad_norm": 0.08057455261417476, "learning_rate": 5.849294859864456e-06, "loss": 0.2862, "step": 1328 }, { "epoch": 4.217374057913526, "grad_norm": 0.08046289285253863, "learning_rate": 5.8032056887292345e-06, "loss": 0.2926, "step": 1329 }, { "epoch": 4.220547401824673, "grad_norm": 0.07901814464604347, "learning_rate": 5.757284608904528e-06, "loss": 0.2927, "step": 1330 }, { "epoch": 4.223720745735819, "grad_norm": 0.08241125484928784, "learning_rate": 5.711531846111351e-06, "loss": 0.2919, "step": 1331 }, { "epoch": 4.226894089646965, "grad_norm": 0.07943725546881736, "learning_rate": 5.6659476252433285e-06, "loss": 0.2858, "step": 1332 }, { "epoch": 4.230067433558112, "grad_norm": 0.07690103398901868, "learning_rate": 5.620532170365667e-06, "loss": 0.288, "step": 1333 }, { "epoch": 4.233240777469258, "grad_norm": 0.07982460591594524, "learning_rate": 5.5752857047140086e-06, "loss": 0.292, "step": 1334 }, { "epoch": 4.236414121380404, "grad_norm": 0.07809749969035885, "learning_rate": 5.530208450693355e-06, "loss": 0.2895, "step": 1335 }, { "epoch": 4.239587465291551, "grad_norm": 0.07723500752452818, "learning_rate": 5.48530062987692e-06, "loss": 0.2924, "step": 1336 }, { "epoch": 4.242760809202697, "grad_norm": 0.07756528859698934, "learning_rate": 5.440562463005154e-06, "loss": 0.2923, "step": 1337 }, { "epoch": 4.2459341531138435, "grad_norm": 0.07872620120384079, "learning_rate": 5.395994169984522e-06, "loss": 0.2858, "step": 1338 }, { "epoch": 4.2491074970249905, "grad_norm": 0.07841592614920617, "learning_rate": 5.351595969886529e-06, "loss": 0.2948, "step": 1339 }, { "epoch": 4.252280840936137, "grad_norm": 0.07403115107349581, "learning_rate": 5.307368080946584e-06, "loss": 0.2853, "step": 1340 }, { "epoch": 4.255454184847283, "grad_norm": 0.07885594656165801, "learning_rate": 5.263310720562973e-06, "loss": 0.2929, "step": 1341 }, { "epoch": 4.25862752875843, "grad_norm": 0.07590630094007132, "learning_rate": 5.219424105295719e-06, "loss": 0.2859, "step": 1342 }, { "epoch": 4.261800872669576, "grad_norm": 0.07840740302270648, "learning_rate": 5.175708450865595e-06, "loss": 0.2858, "step": 1343 }, { "epoch": 4.264974216580722, "grad_norm": 0.07673578995611278, "learning_rate": 5.1321639721530325e-06, "loss": 0.2889, "step": 1344 }, { "epoch": 4.268147560491868, "grad_norm": 0.07951575168272773, "learning_rate": 5.088790883197061e-06, "loss": 0.2896, "step": 1345 }, { "epoch": 4.271320904403015, "grad_norm": 0.08030218736730213, "learning_rate": 5.045589397194231e-06, "loss": 0.2865, "step": 1346 }, { "epoch": 4.274494248314161, "grad_norm": 0.07842325630849278, "learning_rate": 5.0025597264976446e-06, "loss": 0.2858, "step": 1347 }, { "epoch": 4.277667592225307, "grad_norm": 0.07773496721087116, "learning_rate": 4.9597020826158114e-06, "loss": 0.2901, "step": 1348 }, { "epoch": 4.280840936136454, "grad_norm": 0.07839435165688115, "learning_rate": 4.917016676211686e-06, "loss": 0.2952, "step": 1349 }, { "epoch": 4.2840142800476, "grad_norm": 0.08007319222215062, "learning_rate": 4.8745037171016045e-06, "loss": 0.2895, "step": 1350 }, { "epoch": 4.287187623958746, "grad_norm": 0.0802943162822994, "learning_rate": 4.832163414254254e-06, "loss": 0.2854, "step": 1351 }, { "epoch": 4.290360967869893, "grad_norm": 0.07670078924225301, "learning_rate": 4.789995975789619e-06, "loss": 0.2866, "step": 1352 }, { "epoch": 4.293534311781039, "grad_norm": 0.07571900940799264, "learning_rate": 4.748001608978015e-06, "loss": 0.2864, "step": 1353 }, { "epoch": 4.2967076556921855, "grad_norm": 0.07887124085195305, "learning_rate": 4.70618052023903e-06, "loss": 0.2861, "step": 1354 }, { "epoch": 4.299880999603332, "grad_norm": 0.07844552010826854, "learning_rate": 4.664532915140525e-06, "loss": 0.2861, "step": 1355 }, { "epoch": 4.303054343514479, "grad_norm": 0.0758283591701092, "learning_rate": 4.623058998397585e-06, "loss": 0.2827, "step": 1356 }, { "epoch": 4.306227687425625, "grad_norm": 0.07567360663444131, "learning_rate": 4.581758973871609e-06, "loss": 0.2852, "step": 1357 }, { "epoch": 4.309401031336771, "grad_norm": 0.07698958638459642, "learning_rate": 4.540633044569172e-06, "loss": 0.2838, "step": 1358 }, { "epoch": 4.312574375247918, "grad_norm": 0.07772937829274414, "learning_rate": 4.499681412641148e-06, "loss": 0.2895, "step": 1359 }, { "epoch": 4.315747719159064, "grad_norm": 0.07637398650887665, "learning_rate": 4.4589042793816525e-06, "loss": 0.2921, "step": 1360 }, { "epoch": 4.31892106307021, "grad_norm": 0.07633260293292246, "learning_rate": 4.418301845227073e-06, "loss": 0.2829, "step": 1361 }, { "epoch": 4.322094406981357, "grad_norm": 0.07860481054083629, "learning_rate": 4.377874309755065e-06, "loss": 0.2866, "step": 1362 }, { "epoch": 4.325267750892503, "grad_norm": 0.0762763983665852, "learning_rate": 4.337621871683597e-06, "loss": 0.2833, "step": 1363 }, { "epoch": 4.328441094803649, "grad_norm": 0.0766144117665982, "learning_rate": 4.297544728869958e-06, "loss": 0.2901, "step": 1364 }, { "epoch": 4.331614438714796, "grad_norm": 0.07937899829074807, "learning_rate": 4.257643078309808e-06, "loss": 0.2924, "step": 1365 }, { "epoch": 4.334787782625942, "grad_norm": 0.07527289810281566, "learning_rate": 4.2179171161361365e-06, "loss": 0.2848, "step": 1366 }, { "epoch": 4.337961126537088, "grad_norm": 0.07688154141825307, "learning_rate": 4.178367037618429e-06, "loss": 0.2895, "step": 1367 }, { "epoch": 4.341134470448234, "grad_norm": 0.07790856052535607, "learning_rate": 4.138993037161565e-06, "loss": 0.293, "step": 1368 }, { "epoch": 4.344307814359381, "grad_norm": 0.0782212752825844, "learning_rate": 4.099795308304954e-06, "loss": 0.2873, "step": 1369 }, { "epoch": 4.3474811582705275, "grad_norm": 0.07864542088050423, "learning_rate": 4.060774043721565e-06, "loss": 0.2895, "step": 1370 }, { "epoch": 4.350654502181674, "grad_norm": 0.07539046652733071, "learning_rate": 4.0219294352169714e-06, "loss": 0.2928, "step": 1371 }, { "epoch": 4.353827846092821, "grad_norm": 0.07802077980079632, "learning_rate": 3.983261673728378e-06, "loss": 0.2907, "step": 1372 }, { "epoch": 4.357001190003967, "grad_norm": 0.07914754106412068, "learning_rate": 3.94477094932376e-06, "loss": 0.2897, "step": 1373 }, { "epoch": 4.360174533915113, "grad_norm": 0.07815313150549123, "learning_rate": 3.906457451200845e-06, "loss": 0.2823, "step": 1374 }, { "epoch": 4.36334787782626, "grad_norm": 0.0783643686014188, "learning_rate": 3.8683213676862585e-06, "loss": 0.2902, "step": 1375 }, { "epoch": 4.366521221737406, "grad_norm": 0.07899805915070553, "learning_rate": 3.830362886234502e-06, "loss": 0.2915, "step": 1376 }, { "epoch": 4.369694565648552, "grad_norm": 0.07616018845948855, "learning_rate": 3.7925821934271655e-06, "loss": 0.2853, "step": 1377 }, { "epoch": 4.372867909559698, "grad_norm": 0.07604900073546127, "learning_rate": 3.7549794749718673e-06, "loss": 0.2918, "step": 1378 }, { "epoch": 4.376041253470845, "grad_norm": 0.07611014329101679, "learning_rate": 3.717554915701449e-06, "loss": 0.2833, "step": 1379 }, { "epoch": 4.379214597381991, "grad_norm": 0.07836395731025556, "learning_rate": 3.680308699573005e-06, "loss": 0.2879, "step": 1380 }, { "epoch": 4.382387941293137, "grad_norm": 0.07492430726816589, "learning_rate": 3.64324100966702e-06, "loss": 0.2806, "step": 1381 }, { "epoch": 4.385561285204284, "grad_norm": 0.07741032733958537, "learning_rate": 3.606352028186426e-06, "loss": 0.2897, "step": 1382 }, { "epoch": 4.38873462911543, "grad_norm": 0.07451962544381002, "learning_rate": 3.5696419364557433e-06, "loss": 0.2893, "step": 1383 }, { "epoch": 4.391907973026576, "grad_norm": 0.08019767472904286, "learning_rate": 3.533110914920177e-06, "loss": 0.2884, "step": 1384 }, { "epoch": 4.395081316937723, "grad_norm": 0.07731877454550821, "learning_rate": 3.4967591431447256e-06, "loss": 0.2902, "step": 1385 }, { "epoch": 4.3982546608488695, "grad_norm": 0.08113895307948948, "learning_rate": 3.460586799813288e-06, "loss": 0.2906, "step": 1386 }, { "epoch": 4.401428004760016, "grad_norm": 0.07677456138444168, "learning_rate": 3.4245940627278284e-06, "loss": 0.2902, "step": 1387 }, { "epoch": 4.404601348671163, "grad_norm": 0.07648163374651913, "learning_rate": 3.388781108807444e-06, "loss": 0.2879, "step": 1388 }, { "epoch": 4.407774692582309, "grad_norm": 0.0758938270041518, "learning_rate": 3.3531481140875345e-06, "loss": 0.2934, "step": 1389 }, { "epoch": 4.410948036493455, "grad_norm": 0.07252099034254757, "learning_rate": 3.317695253718931e-06, "loss": 0.2855, "step": 1390 }, { "epoch": 4.414121380404602, "grad_norm": 0.08007509732355841, "learning_rate": 3.2824227019670272e-06, "loss": 0.29, "step": 1391 }, { "epoch": 4.417294724315748, "grad_norm": 0.07359637512325612, "learning_rate": 3.2473306322109078e-06, "loss": 0.2846, "step": 1392 }, { "epoch": 4.420468068226894, "grad_norm": 0.07691247352169074, "learning_rate": 3.2124192169425573e-06, "loss": 0.2938, "step": 1393 }, { "epoch": 4.42364141213804, "grad_norm": 0.07370622338875793, "learning_rate": 3.177688627765916e-06, "loss": 0.2838, "step": 1394 }, { "epoch": 4.426814756049187, "grad_norm": 0.0756129323864927, "learning_rate": 3.143139035396128e-06, "loss": 0.292, "step": 1395 }, { "epoch": 4.429988099960333, "grad_norm": 0.0775966898230028, "learning_rate": 3.1087706096586535e-06, "loss": 0.2929, "step": 1396 }, { "epoch": 4.433161443871479, "grad_norm": 0.07808391129944373, "learning_rate": 3.0745835194884435e-06, "loss": 0.2897, "step": 1397 }, { "epoch": 4.436334787782626, "grad_norm": 0.07349160355298512, "learning_rate": 3.0405779329290987e-06, "loss": 0.2821, "step": 1398 }, { "epoch": 4.439508131693772, "grad_norm": 0.07471990105942067, "learning_rate": 3.0067540171320674e-06, "loss": 0.2868, "step": 1399 }, { "epoch": 4.442681475604918, "grad_norm": 0.07304458565439569, "learning_rate": 2.973111938355815e-06, "loss": 0.289, "step": 1400 }, { "epoch": 4.445854819516065, "grad_norm": 0.07542736354862137, "learning_rate": 2.9396518619650007e-06, "loss": 0.2837, "step": 1401 }, { "epoch": 4.4490281634272115, "grad_norm": 0.07580500670249878, "learning_rate": 2.90637395242964e-06, "loss": 0.2886, "step": 1402 }, { "epoch": 4.452201507338358, "grad_norm": 0.07483645701648972, "learning_rate": 2.8732783733243754e-06, "loss": 0.2883, "step": 1403 }, { "epoch": 4.455374851249504, "grad_norm": 0.0734355618152173, "learning_rate": 2.8403652873275755e-06, "loss": 0.2896, "step": 1404 }, { "epoch": 4.458548195160651, "grad_norm": 0.07743307777656919, "learning_rate": 2.8076348562206024e-06, "loss": 0.2945, "step": 1405 }, { "epoch": 4.461721539071797, "grad_norm": 0.07285494607732518, "learning_rate": 2.7750872408869843e-06, "loss": 0.2823, "step": 1406 }, { "epoch": 4.464894882982943, "grad_norm": 0.07654578475317778, "learning_rate": 2.7427226013116448e-06, "loss": 0.2893, "step": 1407 }, { "epoch": 4.46806822689409, "grad_norm": 0.07415127461281228, "learning_rate": 2.7105410965800928e-06, "loss": 0.2881, "step": 1408 }, { "epoch": 4.471241570805236, "grad_norm": 0.07473740527863117, "learning_rate": 2.678542884877664e-06, "loss": 0.2861, "step": 1409 }, { "epoch": 4.474414914716382, "grad_norm": 0.07455517075536294, "learning_rate": 2.646728123488731e-06, "loss": 0.2868, "step": 1410 }, { "epoch": 4.477588258627529, "grad_norm": 0.07402864039008861, "learning_rate": 2.6150969687959426e-06, "loss": 0.2891, "step": 1411 }, { "epoch": 4.480761602538675, "grad_norm": 0.07506645131586324, "learning_rate": 2.5836495762794078e-06, "loss": 0.2875, "step": 1412 }, { "epoch": 4.483934946449821, "grad_norm": 0.07687328694069057, "learning_rate": 2.552386100516033e-06, "loss": 0.2889, "step": 1413 }, { "epoch": 4.487108290360968, "grad_norm": 0.07292287204353926, "learning_rate": 2.521306695178636e-06, "loss": 0.2862, "step": 1414 }, { "epoch": 4.490281634272114, "grad_norm": 0.07378533131418447, "learning_rate": 2.4904115130352845e-06, "loss": 0.2858, "step": 1415 }, { "epoch": 4.4934549781832605, "grad_norm": 0.07434204604179016, "learning_rate": 2.459700705948507e-06, "loss": 0.2855, "step": 1416 }, { "epoch": 4.496628322094407, "grad_norm": 0.07431002205125276, "learning_rate": 2.429174424874563e-06, "loss": 0.2816, "step": 1417 }, { "epoch": 4.4998016660055535, "grad_norm": 0.07821248980927963, "learning_rate": 2.398832819862662e-06, "loss": 0.2884, "step": 1418 }, { "epoch": 4.5029750099167, "grad_norm": 0.07520177957755667, "learning_rate": 2.3686760400542853e-06, "loss": 0.2912, "step": 1419 }, { "epoch": 4.506148353827846, "grad_norm": 0.07186434245067173, "learning_rate": 2.3387042336824097e-06, "loss": 0.287, "step": 1420 }, { "epoch": 4.509321697738993, "grad_norm": 0.07671516440501819, "learning_rate": 2.3089175480707926e-06, "loss": 0.2904, "step": 1421 }, { "epoch": 4.512495041650139, "grad_norm": 0.07403348893774686, "learning_rate": 2.2793161296332354e-06, "loss": 0.2848, "step": 1422 }, { "epoch": 4.515668385561285, "grad_norm": 0.07485509243857365, "learning_rate": 2.2499001238729034e-06, "loss": 0.2889, "step": 1423 }, { "epoch": 4.518841729472432, "grad_norm": 0.07341240180495677, "learning_rate": 2.2206696753815527e-06, "loss": 0.2876, "step": 1424 }, { "epoch": 4.522015073383578, "grad_norm": 0.07545046460513817, "learning_rate": 2.191624927838865e-06, "loss": 0.2803, "step": 1425 }, { "epoch": 4.525188417294724, "grad_norm": 0.0729065721268795, "learning_rate": 2.1627660240117177e-06, "loss": 0.2907, "step": 1426 }, { "epoch": 4.52836176120587, "grad_norm": 0.0739827535708724, "learning_rate": 2.134093105753503e-06, "loss": 0.2875, "step": 1427 }, { "epoch": 4.531535105117017, "grad_norm": 0.0737465566063914, "learning_rate": 2.1056063140034013e-06, "loss": 0.2886, "step": 1428 }, { "epoch": 4.534708449028163, "grad_norm": 0.07481103353206092, "learning_rate": 2.0773057887857105e-06, "loss": 0.2859, "step": 1429 }, { "epoch": 4.537881792939309, "grad_norm": 0.07443664446659384, "learning_rate": 2.049191669209156e-06, "loss": 0.2877, "step": 1430 }, { "epoch": 4.541055136850456, "grad_norm": 0.07271868744467536, "learning_rate": 2.0212640934662087e-06, "loss": 0.2808, "step": 1431 }, { "epoch": 4.5442284807616025, "grad_norm": 0.0731348085785222, "learning_rate": 1.99352319883237e-06, "loss": 0.2859, "step": 1432 }, { "epoch": 4.547401824672749, "grad_norm": 0.0728659918076379, "learning_rate": 1.9659691216655697e-06, "loss": 0.2817, "step": 1433 }, { "epoch": 4.5505751685838955, "grad_norm": 0.07052892858321479, "learning_rate": 1.9386019974054182e-06, "loss": 0.2845, "step": 1434 }, { "epoch": 4.553748512495042, "grad_norm": 0.07162865810925105, "learning_rate": 1.9114219605725905e-06, "loss": 0.2852, "step": 1435 }, { "epoch": 4.556921856406188, "grad_norm": 0.07216631582905737, "learning_rate": 1.8844291447681496e-06, "loss": 0.2893, "step": 1436 }, { "epoch": 4.560095200317335, "grad_norm": 0.07454657617595882, "learning_rate": 1.857623682672891e-06, "loss": 0.2884, "step": 1437 }, { "epoch": 4.563268544228481, "grad_norm": 0.07192684325273382, "learning_rate": 1.8310057060466845e-06, "loss": 0.2921, "step": 1438 }, { "epoch": 4.566441888139627, "grad_norm": 0.07346684054783836, "learning_rate": 1.8045753457278303e-06, "loss": 0.287, "step": 1439 }, { "epoch": 4.569615232050774, "grad_norm": 0.07368537492116049, "learning_rate": 1.7783327316324238e-06, "loss": 0.2874, "step": 1440 }, { "epoch": 4.57278857596192, "grad_norm": 0.07300900030526943, "learning_rate": 1.752277992753717e-06, "loss": 0.2827, "step": 1441 }, { "epoch": 4.575961919873066, "grad_norm": 0.0735865281943053, "learning_rate": 1.726411257161451e-06, "loss": 0.2902, "step": 1442 }, { "epoch": 4.579135263784213, "grad_norm": 0.07218495117782045, "learning_rate": 1.700732652001289e-06, "loss": 0.2878, "step": 1443 }, { "epoch": 4.582308607695359, "grad_norm": 0.07237820497971356, "learning_rate": 1.6752423034941223e-06, "loss": 0.2914, "step": 1444 }, { "epoch": 4.585481951606505, "grad_norm": 0.07169874424676832, "learning_rate": 1.6499403369355115e-06, "loss": 0.2906, "step": 1445 }, { "epoch": 4.588655295517651, "grad_norm": 0.07237942647540188, "learning_rate": 1.6248268766950204e-06, "loss": 0.2871, "step": 1446 }, { "epoch": 4.591828639428798, "grad_norm": 0.07206522725627988, "learning_rate": 1.5999020462156511e-06, "loss": 0.2872, "step": 1447 }, { "epoch": 4.5950019833399445, "grad_norm": 0.07232160129557513, "learning_rate": 1.5751659680131792e-06, "loss": 0.2859, "step": 1448 }, { "epoch": 4.598175327251091, "grad_norm": 0.07224139682967294, "learning_rate": 1.5506187636756109e-06, "loss": 0.2899, "step": 1449 }, { "epoch": 4.601348671162238, "grad_norm": 0.07350195322568023, "learning_rate": 1.5262605538625574e-06, "loss": 0.2888, "step": 1450 }, { "epoch": 4.604522015073384, "grad_norm": 0.0724134729249542, "learning_rate": 1.5020914583046398e-06, "loss": 0.2872, "step": 1451 }, { "epoch": 4.60769535898453, "grad_norm": 0.07146812102253265, "learning_rate": 1.4781115958028935e-06, "loss": 0.2863, "step": 1452 }, { "epoch": 4.610868702895676, "grad_norm": 0.07365565598868938, "learning_rate": 1.4543210842282229e-06, "loss": 0.2891, "step": 1453 }, { "epoch": 4.614042046806823, "grad_norm": 0.07266133951722602, "learning_rate": 1.4307200405207656e-06, "loss": 0.2875, "step": 1454 }, { "epoch": 4.617215390717969, "grad_norm": 0.07255421353863033, "learning_rate": 1.4073085806893593e-06, "loss": 0.2933, "step": 1455 }, { "epoch": 4.620388734629115, "grad_norm": 0.0738467244304907, "learning_rate": 1.3840868198109658e-06, "loss": 0.2853, "step": 1456 }, { "epoch": 4.623562078540262, "grad_norm": 0.07193567501371838, "learning_rate": 1.3610548720300965e-06, "loss": 0.2846, "step": 1457 }, { "epoch": 4.626735422451408, "grad_norm": 0.07215628995818985, "learning_rate": 1.3382128505582315e-06, "loss": 0.2907, "step": 1458 }, { "epoch": 4.629908766362554, "grad_norm": 0.07250278706923594, "learning_rate": 1.315560867673318e-06, "loss": 0.2892, "step": 1459 }, { "epoch": 4.633082110273701, "grad_norm": 0.07078718377132749, "learning_rate": 1.2930990347191607e-06, "loss": 0.2834, "step": 1460 }, { "epoch": 4.636255454184847, "grad_norm": 0.07349937695742143, "learning_rate": 1.2708274621049134e-06, "loss": 0.2861, "step": 1461 }, { "epoch": 4.639428798095993, "grad_norm": 0.07204842558684392, "learning_rate": 1.2487462593045075e-06, "loss": 0.2904, "step": 1462 }, { "epoch": 4.64260214200714, "grad_norm": 0.0721734105457553, "learning_rate": 1.2268555348561529e-06, "loss": 0.2835, "step": 1463 }, { "epoch": 4.6457754859182865, "grad_norm": 0.0720032080278003, "learning_rate": 1.20515539636175e-06, "loss": 0.2872, "step": 1464 }, { "epoch": 4.648948829829433, "grad_norm": 0.07173144513365656, "learning_rate": 1.183645950486416e-06, "loss": 0.2903, "step": 1465 }, { "epoch": 4.65212217374058, "grad_norm": 0.07173493810236914, "learning_rate": 1.1623273029579195e-06, "loss": 0.293, "step": 1466 }, { "epoch": 4.655295517651726, "grad_norm": 0.07082496375705209, "learning_rate": 1.141199558566184e-06, "loss": 0.286, "step": 1467 }, { "epoch": 4.658468861562872, "grad_norm": 0.07192243350970179, "learning_rate": 1.1202628211627587e-06, "loss": 0.2865, "step": 1468 }, { "epoch": 4.661642205474018, "grad_norm": 0.07242387413423897, "learning_rate": 1.099517193660331e-06, "loss": 0.2934, "step": 1469 }, { "epoch": 4.664815549385165, "grad_norm": 0.0733740588484503, "learning_rate": 1.0789627780321888e-06, "loss": 0.2905, "step": 1470 }, { "epoch": 4.667988893296311, "grad_norm": 0.07206002165507971, "learning_rate": 1.058599675311731e-06, "loss": 0.287, "step": 1471 }, { "epoch": 4.671162237207457, "grad_norm": 0.07127887265831032, "learning_rate": 1.0384279855919944e-06, "loss": 0.2896, "step": 1472 }, { "epoch": 4.674335581118604, "grad_norm": 0.07358907596433474, "learning_rate": 1.0184478080251315e-06, "loss": 0.2879, "step": 1473 }, { "epoch": 4.67750892502975, "grad_norm": 0.0714280248007389, "learning_rate": 9.986592408219286e-07, "loss": 0.2868, "step": 1474 }, { "epoch": 4.680682268940896, "grad_norm": 0.0711643498792862, "learning_rate": 9.7906238125133e-07, "loss": 0.2877, "step": 1475 }, { "epoch": 4.683855612852043, "grad_norm": 0.07071027647717443, "learning_rate": 9.59657325639971e-07, "loss": 0.2875, "step": 1476 }, { "epoch": 4.687028956763189, "grad_norm": 0.07098768877099393, "learning_rate": 9.404441693716771e-07, "loss": 0.2889, "step": 1477 }, { "epoch": 4.690202300674335, "grad_norm": 0.0713859105686287, "learning_rate": 9.2142300688701e-07, "loss": 0.292, "step": 1478 }, { "epoch": 4.6933756445854815, "grad_norm": 0.07126866567296886, "learning_rate": 9.025939316828203e-07, "loss": 0.2846, "step": 1479 }, { "epoch": 4.6965489884966285, "grad_norm": 0.0723870642231539, "learning_rate": 8.839570363117445e-07, "loss": 0.288, "step": 1480 }, { "epoch": 4.699722332407775, "grad_norm": 0.07141709931411543, "learning_rate": 8.655124123817926e-07, "loss": 0.2851, "step": 1481 }, { "epoch": 4.702895676318921, "grad_norm": 0.07173481548872705, "learning_rate": 8.47260150555882e-07, "loss": 0.2869, "step": 1482 }, { "epoch": 4.706069020230068, "grad_norm": 0.07243043839490769, "learning_rate": 8.292003405513882e-07, "loss": 0.2863, "step": 1483 }, { "epoch": 4.709242364141214, "grad_norm": 0.07029557557454981, "learning_rate": 8.113330711397016e-07, "loss": 0.2873, "step": 1484 }, { "epoch": 4.71241570805236, "grad_norm": 0.0704646381143378, "learning_rate": 7.936584301458006e-07, "loss": 0.2889, "step": 1485 }, { "epoch": 4.715589051963507, "grad_norm": 0.07189911994697401, "learning_rate": 7.761765044478209e-07, "loss": 0.2844, "step": 1486 }, { "epoch": 4.718762395874653, "grad_norm": 0.07145859830072916, "learning_rate": 7.588873799766161e-07, "loss": 0.29, "step": 1487 }, { "epoch": 4.721935739785799, "grad_norm": 0.0711567541075634, "learning_rate": 7.417911417153401e-07, "loss": 0.2919, "step": 1488 }, { "epoch": 4.725109083696946, "grad_norm": 0.07204569153095995, "learning_rate": 7.248878736990428e-07, "loss": 0.2877, "step": 1489 }, { "epoch": 4.728282427608092, "grad_norm": 0.07125020942303784, "learning_rate": 7.081776590142352e-07, "loss": 0.2858, "step": 1490 }, { "epoch": 4.731455771519238, "grad_norm": 0.06994584906572929, "learning_rate": 6.916605797984987e-07, "loss": 0.2807, "step": 1491 }, { "epoch": 4.734629115430385, "grad_norm": 0.0715883706850393, "learning_rate": 6.753367172400716e-07, "loss": 0.2859, "step": 1492 }, { "epoch": 4.737802459341531, "grad_norm": 0.07097376897268061, "learning_rate": 6.59206151577454e-07, "loss": 0.2831, "step": 1493 }, { "epoch": 4.740975803252677, "grad_norm": 0.07016656413072693, "learning_rate": 6.432689620990084e-07, "loss": 0.2814, "step": 1494 }, { "epoch": 4.7441491471638235, "grad_norm": 0.07157617127762622, "learning_rate": 6.275252271425736e-07, "loss": 0.2838, "step": 1495 }, { "epoch": 4.7473224910749705, "grad_norm": 0.07273178665389977, "learning_rate": 6.119750240950906e-07, "loss": 0.2892, "step": 1496 }, { "epoch": 4.750495834986117, "grad_norm": 0.07183066506951129, "learning_rate": 5.96618429392204e-07, "loss": 0.2838, "step": 1497 }, { "epoch": 4.753669178897263, "grad_norm": 0.07085345548841952, "learning_rate": 5.814555185178838e-07, "loss": 0.2924, "step": 1498 }, { "epoch": 4.75684252280841, "grad_norm": 0.071668882520716, "learning_rate": 5.664863660040843e-07, "loss": 0.2882, "step": 1499 }, { "epoch": 4.760015866719556, "grad_norm": 0.07232430109380229, "learning_rate": 5.517110454303387e-07, "loss": 0.2922, "step": 1500 }, { "epoch": 4.763189210630702, "grad_norm": 0.07157437165230283, "learning_rate": 5.371296294234318e-07, "loss": 0.2818, "step": 1501 }, { "epoch": 4.766362554541848, "grad_norm": 0.07138983987689239, "learning_rate": 5.227421896570217e-07, "loss": 0.2896, "step": 1502 }, { "epoch": 4.769535898452995, "grad_norm": 0.0728983951265643, "learning_rate": 5.085487968512892e-07, "loss": 0.2837, "step": 1503 }, { "epoch": 4.772709242364141, "grad_norm": 0.07175621867152891, "learning_rate": 4.945495207725958e-07, "loss": 0.2838, "step": 1504 }, { "epoch": 4.775882586275287, "grad_norm": 0.07008101989246392, "learning_rate": 4.807444302331509e-07, "loss": 0.2891, "step": 1505 }, { "epoch": 4.779055930186434, "grad_norm": 0.07114350131264044, "learning_rate": 4.671335930906429e-07, "loss": 0.2884, "step": 1506 }, { "epoch": 4.78222927409758, "grad_norm": 0.0712547000013449, "learning_rate": 4.537170762479459e-07, "loss": 0.2865, "step": 1507 }, { "epoch": 4.785402618008726, "grad_norm": 0.07044621429669759, "learning_rate": 4.404949456527474e-07, "loss": 0.2803, "step": 1508 }, { "epoch": 4.788575961919873, "grad_norm": 0.07132128750657112, "learning_rate": 4.274672662972679e-07, "loss": 0.2904, "step": 1509 }, { "epoch": 4.791749305831019, "grad_norm": 0.07164963300558302, "learning_rate": 4.146341022179057e-07, "loss": 0.2908, "step": 1510 }, { "epoch": 4.7949226497421655, "grad_norm": 0.07165377120863803, "learning_rate": 4.019955164949352e-07, "loss": 0.2874, "step": 1511 }, { "epoch": 4.7980959936533125, "grad_norm": 0.07017965132137843, "learning_rate": 3.8955157125221356e-07, "loss": 0.2849, "step": 1512 }, { "epoch": 4.801269337564459, "grad_norm": 0.0706363848720524, "learning_rate": 3.7730232765685213e-07, "loss": 0.2861, "step": 1513 }, { "epoch": 4.804442681475605, "grad_norm": 0.07329406027888734, "learning_rate": 3.6524784591891013e-07, "loss": 0.2873, "step": 1514 }, { "epoch": 4.807616025386752, "grad_norm": 0.06880746155049348, "learning_rate": 3.5338818529113253e-07, "loss": 0.2869, "step": 1515 }, { "epoch": 4.810789369297898, "grad_norm": 0.07176951769611108, "learning_rate": 3.417234040686257e-07, "loss": 0.2935, "step": 1516 }, { "epoch": 4.813962713209044, "grad_norm": 0.07173903151971806, "learning_rate": 3.302535595885914e-07, "loss": 0.292, "step": 1517 }, { "epoch": 4.817136057120191, "grad_norm": 0.06981608234215353, "learning_rate": 3.1897870823002883e-07, "loss": 0.2875, "step": 1518 }, { "epoch": 4.820309401031337, "grad_norm": 0.07079666453671589, "learning_rate": 3.078989054134729e-07, "loss": 0.2852, "step": 1519 }, { "epoch": 4.823482744942483, "grad_norm": 0.07073176594785087, "learning_rate": 2.9701420560071417e-07, "loss": 0.2905, "step": 1520 }, { "epoch": 4.826656088853629, "grad_norm": 0.06983353352491713, "learning_rate": 2.863246622945237e-07, "loss": 0.2851, "step": 1521 }, { "epoch": 4.829829432764776, "grad_norm": 0.07067428777959145, "learning_rate": 2.758303280384045e-07, "loss": 0.2873, "step": 1522 }, { "epoch": 4.833002776675922, "grad_norm": 0.07102800424361236, "learning_rate": 2.6553125441633355e-07, "loss": 0.2871, "step": 1523 }, { "epoch": 4.836176120587068, "grad_norm": 0.07060381894814903, "learning_rate": 2.5542749205248683e-07, "loss": 0.285, "step": 1524 }, { "epoch": 4.839349464498215, "grad_norm": 0.07059262948871846, "learning_rate": 2.4551909061101275e-07, "loss": 0.2838, "step": 1525 }, { "epoch": 4.8425228084093614, "grad_norm": 0.0700522721522073, "learning_rate": 2.3580609879578332e-07, "loss": 0.2913, "step": 1526 }, { "epoch": 4.8456961523205075, "grad_norm": 0.07311091919771467, "learning_rate": 2.2628856435015e-07, "loss": 0.2903, "step": 1527 }, { "epoch": 4.848869496231654, "grad_norm": 0.06916920833217112, "learning_rate": 2.1696653405670398e-07, "loss": 0.2831, "step": 1528 }, { "epoch": 4.852042840142801, "grad_norm": 0.0709586661157069, "learning_rate": 2.0784005373706729e-07, "loss": 0.2892, "step": 1529 }, { "epoch": 4.855216184053947, "grad_norm": 0.06984666564128461, "learning_rate": 1.9890916825163086e-07, "loss": 0.2895, "step": 1530 }, { "epoch": 4.858389527965093, "grad_norm": 0.07227174002725144, "learning_rate": 1.9017392149938585e-07, "loss": 0.2856, "step": 1531 }, { "epoch": 4.86156287187624, "grad_norm": 0.06981243660335461, "learning_rate": 1.816343564176526e-07, "loss": 0.2841, "step": 1532 }, { "epoch": 4.864736215787386, "grad_norm": 0.06955676961459405, "learning_rate": 1.7329051498191196e-07, "loss": 0.2853, "step": 1533 }, { "epoch": 4.867909559698532, "grad_norm": 0.07044007567746591, "learning_rate": 1.6514243820556996e-07, "loss": 0.2892, "step": 1534 }, { "epoch": 4.871082903609679, "grad_norm": 0.07011876795238156, "learning_rate": 1.5719016613978012e-07, "loss": 0.2888, "step": 1535 }, { "epoch": 4.874256247520825, "grad_norm": 0.0711153254470578, "learning_rate": 1.4943373787323468e-07, "loss": 0.2875, "step": 1536 }, { "epoch": 4.877429591431971, "grad_norm": 0.07033022450802037, "learning_rate": 1.4187319153196488e-07, "loss": 0.2937, "step": 1537 }, { "epoch": 4.880602935343118, "grad_norm": 0.07169239899295766, "learning_rate": 1.3450856427916325e-07, "loss": 0.2911, "step": 1538 }, { "epoch": 4.883776279254264, "grad_norm": 0.07061272545363809, "learning_rate": 1.2733989231500597e-07, "loss": 0.2852, "step": 1539 }, { "epoch": 4.88694962316541, "grad_norm": 0.07142348008808704, "learning_rate": 1.2036721087646642e-07, "loss": 0.2881, "step": 1540 }, { "epoch": 4.890122967076557, "grad_norm": 0.07491018067145877, "learning_rate": 1.1359055423713295e-07, "loss": 0.2883, "step": 1541 }, { "epoch": 4.8932963109877035, "grad_norm": 0.07043352846998585, "learning_rate": 1.070099557070714e-07, "loss": 0.2884, "step": 1542 }, { "epoch": 4.8964696548988496, "grad_norm": 0.0703268500097442, "learning_rate": 1.0062544763262516e-07, "loss": 0.2886, "step": 1543 }, { "epoch": 4.899642998809996, "grad_norm": 0.06967989026912552, "learning_rate": 9.44370613962775e-08, "loss": 0.2875, "step": 1544 }, { "epoch": 4.902816342721143, "grad_norm": 0.07091190304068178, "learning_rate": 8.844482741649174e-08, "loss": 0.2846, "step": 1545 }, { "epoch": 4.905989686632289, "grad_norm": 0.0704157347762079, "learning_rate": 8.264877514756465e-08, "loss": 0.2915, "step": 1546 }, { "epoch": 4.909163030543435, "grad_norm": 0.07197642287777141, "learning_rate": 7.704893307947547e-08, "loss": 0.2886, "step": 1547 }, { "epoch": 4.912336374454582, "grad_norm": 0.06954677224905088, "learning_rate": 7.164532873775276e-08, "loss": 0.288, "step": 1548 }, { "epoch": 4.915509718365728, "grad_norm": 0.07029671235552006, "learning_rate": 6.643798868333662e-08, "loss": 0.2868, "step": 1549 }, { "epoch": 4.918683062276874, "grad_norm": 0.07101459809224268, "learning_rate": 6.142693851244109e-08, "loss": 0.2871, "step": 1550 }, { "epoch": 4.921856406188021, "grad_norm": 0.07028381337765902, "learning_rate": 5.661220285643865e-08, "loss": 0.2923, "step": 1551 }, { "epoch": 4.925029750099167, "grad_norm": 0.07105984885088223, "learning_rate": 5.199380538174037e-08, "loss": 0.2935, "step": 1552 }, { "epoch": 4.928203094010313, "grad_norm": 0.07233897145123062, "learning_rate": 4.7571768789667075e-08, "loss": 0.2875, "step": 1553 }, { "epoch": 4.931376437921459, "grad_norm": 0.0703092103811166, "learning_rate": 4.3346114816347207e-08, "loss": 0.2858, "step": 1554 }, { "epoch": 4.934549781832606, "grad_norm": 0.0706968242740516, "learning_rate": 3.931686423261027e-08, "loss": 0.2872, "step": 1555 }, { "epoch": 4.937723125743752, "grad_norm": 0.069673162302404, "learning_rate": 3.5484036843875804e-08, "loss": 0.2882, "step": 1556 }, { "epoch": 4.9408964696548985, "grad_norm": 0.06968426384379744, "learning_rate": 3.1847651490068964e-08, "loss": 0.2855, "step": 1557 }, { "epoch": 4.9440698135660455, "grad_norm": 0.07194410996026296, "learning_rate": 2.8407726045522886e-08, "loss": 0.2928, "step": 1558 }, { "epoch": 4.947243157477192, "grad_norm": 0.06910814568161171, "learning_rate": 2.5164277418880945e-08, "loss": 0.2853, "step": 1559 }, { "epoch": 4.950416501388338, "grad_norm": 0.07039708426854659, "learning_rate": 2.2117321553030146e-08, "loss": 0.2896, "step": 1560 }, { "epoch": 4.953589845299485, "grad_norm": 0.07008182260440685, "learning_rate": 1.9266873425012323e-08, "loss": 0.2892, "step": 1561 }, { "epoch": 4.956763189210631, "grad_norm": 0.06932142266239405, "learning_rate": 1.6612947045953064e-08, "loss": 0.2852, "step": 1562 }, { "epoch": 4.959936533121777, "grad_norm": 0.06970140934176305, "learning_rate": 1.4155555460990677e-08, "loss": 0.2893, "step": 1563 }, { "epoch": 4.963109877032924, "grad_norm": 0.07142752676643119, "learning_rate": 1.1894710749214e-08, "loss": 0.2883, "step": 1564 }, { "epoch": 4.96628322094407, "grad_norm": 0.07011403382472035, "learning_rate": 9.83042402360912e-09, "loss": 0.2877, "step": 1565 }, { "epoch": 4.969456564855216, "grad_norm": 0.06985655852080626, "learning_rate": 7.962705430988315e-09, "loss": 0.2908, "step": 1566 }, { "epoch": 4.972629908766363, "grad_norm": 0.07205354220071218, "learning_rate": 6.291564151963414e-09, "loss": 0.2885, "step": 1567 }, { "epoch": 4.975803252677509, "grad_norm": 0.06900070418548365, "learning_rate": 4.817008400879175e-09, "loss": 0.2877, "step": 1568 }, { "epoch": 4.978976596588655, "grad_norm": 0.07029900380853947, "learning_rate": 3.539045425777765e-09, "loss": 0.2868, "step": 1569 }, { "epoch": 4.982149940499801, "grad_norm": 0.070893441701857, "learning_rate": 2.4576815083809933e-09, "loss": 0.2859, "step": 1570 }, { "epoch": 4.985323284410948, "grad_norm": 0.07118447299049073, "learning_rate": 1.572921964032581e-09, "loss": 0.2897, "step": 1571 }, { "epoch": 4.988496628322094, "grad_norm": 0.07138726064563218, "learning_rate": 8.847711416937188e-10, "loss": 0.2912, "step": 1572 }, { "epoch": 4.9916699722332405, "grad_norm": 0.06968196998745611, "learning_rate": 3.9323242390754137e-10, "loss": 0.2841, "step": 1573 }, { "epoch": 4.9948433161443875, "grad_norm": 0.07155054271656291, "learning_rate": 9.830822678136288e-11, "loss": 0.2906, "step": 1574 }, { "epoch": 4.998016660055534, "grad_norm": 0.07077675462284239, "learning_rate": 0.0, "loss": 0.2858, "step": 1575 }, { "epoch": 4.998016660055534, "step": 1575, "total_flos": 3.782923812788083e+19, "train_loss": 0.3593273034549895, "train_runtime": 93234.1699, "train_samples_per_second": 8.651, "train_steps_per_second": 0.017 } ], "logging_steps": 1.0, "max_steps": 1575, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.782923812788083e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }