{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5343, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005614823133071309, "grad_norm": 5.768040657043457, "learning_rate": 1.8691588785046733e-08, "loss": 0.7992, "step": 1 }, { "epoch": 0.0011229646266142617, "grad_norm": 5.75449800491333, "learning_rate": 3.7383177570093465e-08, "loss": 0.834, "step": 2 }, { "epoch": 0.0016844469399213925, "grad_norm": 5.702184677124023, "learning_rate": 5.6074766355140185e-08, "loss": 0.8137, "step": 3 }, { "epoch": 0.0022459292532285235, "grad_norm": 5.802980422973633, "learning_rate": 7.476635514018693e-08, "loss": 0.8502, "step": 4 }, { "epoch": 0.002807411566535654, "grad_norm": 6.078833103179932, "learning_rate": 9.345794392523364e-08, "loss": 0.8318, "step": 5 }, { "epoch": 0.003368893879842785, "grad_norm": 5.713484287261963, "learning_rate": 1.1214953271028037e-07, "loss": 0.8517, "step": 6 }, { "epoch": 0.0039303761931499155, "grad_norm": 5.884383678436279, "learning_rate": 1.3084112149532712e-07, "loss": 0.8606, "step": 7 }, { "epoch": 0.004491858506457047, "grad_norm": 6.062474727630615, "learning_rate": 1.4953271028037386e-07, "loss": 0.8518, "step": 8 }, { "epoch": 0.0050533408197641775, "grad_norm": 6.19154167175293, "learning_rate": 1.6822429906542057e-07, "loss": 0.908, "step": 9 }, { "epoch": 0.005614823133071308, "grad_norm": 6.009279251098633, "learning_rate": 1.8691588785046729e-07, "loss": 0.9012, "step": 10 }, { "epoch": 0.006176305446378439, "grad_norm": 5.894038200378418, "learning_rate": 2.0560747663551403e-07, "loss": 0.8552, "step": 11 }, { "epoch": 0.00673778775968557, "grad_norm": 5.978625774383545, "learning_rate": 2.2429906542056074e-07, "loss": 0.8755, "step": 12 }, { "epoch": 0.0072992700729927005, "grad_norm": 5.700766563415527, "learning_rate": 2.429906542056075e-07, "loss": 0.8512, "step": 13 }, { "epoch": 0.007860752386299831, "grad_norm": 5.972858428955078, "learning_rate": 2.6168224299065424e-07, "loss": 0.8652, "step": 14 }, { "epoch": 0.008422234699606962, "grad_norm": 5.6359686851501465, "learning_rate": 2.8037383177570096e-07, "loss": 0.8382, "step": 15 }, { "epoch": 0.008983717012914094, "grad_norm": 5.448653221130371, "learning_rate": 2.990654205607477e-07, "loss": 0.8233, "step": 16 }, { "epoch": 0.009545199326221224, "grad_norm": 5.464986801147461, "learning_rate": 3.1775700934579444e-07, "loss": 0.8397, "step": 17 }, { "epoch": 0.010106681639528355, "grad_norm": 5.482314586639404, "learning_rate": 3.3644859813084115e-07, "loss": 0.8484, "step": 18 }, { "epoch": 0.010668163952835485, "grad_norm": 5.304565906524658, "learning_rate": 3.551401869158879e-07, "loss": 0.8437, "step": 19 }, { "epoch": 0.011229646266142616, "grad_norm": 5.463379383087158, "learning_rate": 3.7383177570093457e-07, "loss": 0.8702, "step": 20 }, { "epoch": 0.011791128579449747, "grad_norm": 5.588697910308838, "learning_rate": 3.9252336448598134e-07, "loss": 0.8683, "step": 21 }, { "epoch": 0.012352610892756879, "grad_norm": 4.804734706878662, "learning_rate": 4.1121495327102805e-07, "loss": 0.8161, "step": 22 }, { "epoch": 0.01291409320606401, "grad_norm": 4.632859230041504, "learning_rate": 4.299065420560748e-07, "loss": 0.8293, "step": 23 }, { "epoch": 0.01347557551937114, "grad_norm": 4.176572799682617, "learning_rate": 4.485981308411215e-07, "loss": 0.7383, "step": 24 }, { "epoch": 0.01403705783267827, "grad_norm": 4.441606521606445, "learning_rate": 4.6728971962616824e-07, "loss": 0.8235, "step": 25 }, { "epoch": 0.014598540145985401, "grad_norm": 4.495850086212158, "learning_rate": 4.85981308411215e-07, "loss": 0.8319, "step": 26 }, { "epoch": 0.015160022459292532, "grad_norm": 3.9520819187164307, "learning_rate": 5.046728971962617e-07, "loss": 0.7975, "step": 27 }, { "epoch": 0.015721504772599662, "grad_norm": 4.069424152374268, "learning_rate": 5.233644859813085e-07, "loss": 0.847, "step": 28 }, { "epoch": 0.016282987085906794, "grad_norm": 4.02386474609375, "learning_rate": 5.420560747663551e-07, "loss": 0.771, "step": 29 }, { "epoch": 0.016844469399213923, "grad_norm": 3.6990175247192383, "learning_rate": 5.607476635514019e-07, "loss": 0.7897, "step": 30 }, { "epoch": 0.017405951712521055, "grad_norm": 2.4454500675201416, "learning_rate": 5.794392523364487e-07, "loss": 0.7433, "step": 31 }, { "epoch": 0.017967434025828188, "grad_norm": 2.3628153800964355, "learning_rate": 5.981308411214954e-07, "loss": 0.7456, "step": 32 }, { "epoch": 0.018528916339135316, "grad_norm": 2.3046045303344727, "learning_rate": 6.168224299065421e-07, "loss": 0.7487, "step": 33 }, { "epoch": 0.01909039865244245, "grad_norm": 2.20914363861084, "learning_rate": 6.355140186915889e-07, "loss": 0.7303, "step": 34 }, { "epoch": 0.019651880965749578, "grad_norm": 2.1492767333984375, "learning_rate": 6.542056074766355e-07, "loss": 0.7657, "step": 35 }, { "epoch": 0.02021336327905671, "grad_norm": 2.201486587524414, "learning_rate": 6.728971962616823e-07, "loss": 0.7846, "step": 36 }, { "epoch": 0.020774845592363842, "grad_norm": 2.1416687965393066, "learning_rate": 6.91588785046729e-07, "loss": 0.8095, "step": 37 }, { "epoch": 0.02133632790567097, "grad_norm": 1.9325106143951416, "learning_rate": 7.102803738317758e-07, "loss": 0.7363, "step": 38 }, { "epoch": 0.021897810218978103, "grad_norm": 2.010739803314209, "learning_rate": 7.289719626168225e-07, "loss": 0.7917, "step": 39 }, { "epoch": 0.022459292532285232, "grad_norm": 1.7687816619873047, "learning_rate": 7.476635514018691e-07, "loss": 0.7608, "step": 40 }, { "epoch": 0.023020774845592364, "grad_norm": 1.5275899171829224, "learning_rate": 7.66355140186916e-07, "loss": 0.7327, "step": 41 }, { "epoch": 0.023582257158899493, "grad_norm": 1.4784352779388428, "learning_rate": 7.850467289719627e-07, "loss": 0.7195, "step": 42 }, { "epoch": 0.024143739472206625, "grad_norm": 1.6322327852249146, "learning_rate": 8.037383177570094e-07, "loss": 0.6945, "step": 43 }, { "epoch": 0.024705221785513758, "grad_norm": 1.7997164726257324, "learning_rate": 8.224299065420561e-07, "loss": 0.733, "step": 44 }, { "epoch": 0.025266704098820886, "grad_norm": 1.9916223287582397, "learning_rate": 8.411214953271029e-07, "loss": 0.736, "step": 45 }, { "epoch": 0.02582818641212802, "grad_norm": 1.9249279499053955, "learning_rate": 8.598130841121496e-07, "loss": 0.7111, "step": 46 }, { "epoch": 0.026389668725435148, "grad_norm": 1.7830342054367065, "learning_rate": 8.785046728971963e-07, "loss": 0.676, "step": 47 }, { "epoch": 0.02695115103874228, "grad_norm": 1.8361444473266602, "learning_rate": 8.97196261682243e-07, "loss": 0.7291, "step": 48 }, { "epoch": 0.027512633352049412, "grad_norm": 1.631304144859314, "learning_rate": 9.158878504672898e-07, "loss": 0.7022, "step": 49 }, { "epoch": 0.02807411566535654, "grad_norm": 1.41349196434021, "learning_rate": 9.345794392523365e-07, "loss": 0.6857, "step": 50 }, { "epoch": 0.028635597978663673, "grad_norm": 1.3217582702636719, "learning_rate": 9.532710280373834e-07, "loss": 0.6777, "step": 51 }, { "epoch": 0.029197080291970802, "grad_norm": 1.1969060897827148, "learning_rate": 9.7196261682243e-07, "loss": 0.6946, "step": 52 }, { "epoch": 0.029758562605277934, "grad_norm": 1.1455937623977661, "learning_rate": 9.906542056074767e-07, "loss": 0.704, "step": 53 }, { "epoch": 0.030320044918585063, "grad_norm": 1.0362155437469482, "learning_rate": 1.0093457943925234e-06, "loss": 0.6765, "step": 54 }, { "epoch": 0.030881527231892195, "grad_norm": 0.9549208283424377, "learning_rate": 1.0280373831775702e-06, "loss": 0.6553, "step": 55 }, { "epoch": 0.031443009545199324, "grad_norm": 1.1246775388717651, "learning_rate": 1.046728971962617e-06, "loss": 0.6814, "step": 56 }, { "epoch": 0.032004491858506456, "grad_norm": 1.1046494245529175, "learning_rate": 1.0654205607476637e-06, "loss": 0.6848, "step": 57 }, { "epoch": 0.03256597417181359, "grad_norm": 1.1115449666976929, "learning_rate": 1.0841121495327103e-06, "loss": 0.6986, "step": 58 }, { "epoch": 0.03312745648512072, "grad_norm": 0.9690859317779541, "learning_rate": 1.102803738317757e-06, "loss": 0.6489, "step": 59 }, { "epoch": 0.033688938798427846, "grad_norm": 0.8515921831130981, "learning_rate": 1.1214953271028038e-06, "loss": 0.6435, "step": 60 }, { "epoch": 0.03425042111173498, "grad_norm": 0.8416945338249207, "learning_rate": 1.1401869158878506e-06, "loss": 0.6622, "step": 61 }, { "epoch": 0.03481190342504211, "grad_norm": 0.8463918566703796, "learning_rate": 1.1588785046728974e-06, "loss": 0.6631, "step": 62 }, { "epoch": 0.03537338573834924, "grad_norm": 0.8924703598022461, "learning_rate": 1.177570093457944e-06, "loss": 0.6095, "step": 63 }, { "epoch": 0.035934868051656375, "grad_norm": 0.8194422125816345, "learning_rate": 1.1962616822429909e-06, "loss": 0.6493, "step": 64 }, { "epoch": 0.0364963503649635, "grad_norm": 0.8139665722846985, "learning_rate": 1.2149532710280374e-06, "loss": 0.6391, "step": 65 }, { "epoch": 0.03705783267827063, "grad_norm": 0.6688620448112488, "learning_rate": 1.2336448598130842e-06, "loss": 0.6203, "step": 66 }, { "epoch": 0.037619314991577765, "grad_norm": 0.6619188785552979, "learning_rate": 1.252336448598131e-06, "loss": 0.6384, "step": 67 }, { "epoch": 0.0381807973048849, "grad_norm": 0.7313023805618286, "learning_rate": 1.2710280373831777e-06, "loss": 0.6122, "step": 68 }, { "epoch": 0.03874227961819203, "grad_norm": 0.6864597797393799, "learning_rate": 1.2897196261682243e-06, "loss": 0.6231, "step": 69 }, { "epoch": 0.039303761931499155, "grad_norm": 0.7016299366950989, "learning_rate": 1.308411214953271e-06, "loss": 0.6528, "step": 70 }, { "epoch": 0.03986524424480629, "grad_norm": 0.664181113243103, "learning_rate": 1.327102803738318e-06, "loss": 0.6443, "step": 71 }, { "epoch": 0.04042672655811342, "grad_norm": 0.6343922019004822, "learning_rate": 1.3457943925233646e-06, "loss": 0.6008, "step": 72 }, { "epoch": 0.04098820887142055, "grad_norm": 0.6530560255050659, "learning_rate": 1.3644859813084114e-06, "loss": 0.6293, "step": 73 }, { "epoch": 0.041549691184727684, "grad_norm": 0.6199630498886108, "learning_rate": 1.383177570093458e-06, "loss": 0.6004, "step": 74 }, { "epoch": 0.04211117349803481, "grad_norm": 0.5731170773506165, "learning_rate": 1.4018691588785047e-06, "loss": 0.5778, "step": 75 }, { "epoch": 0.04267265581134194, "grad_norm": 0.6075066328048706, "learning_rate": 1.4205607476635517e-06, "loss": 0.6062, "step": 76 }, { "epoch": 0.043234138124649074, "grad_norm": 0.6108478903770447, "learning_rate": 1.4392523364485982e-06, "loss": 0.6263, "step": 77 }, { "epoch": 0.043795620437956206, "grad_norm": 0.6291508674621582, "learning_rate": 1.457943925233645e-06, "loss": 0.6065, "step": 78 }, { "epoch": 0.04435710275126333, "grad_norm": 0.6872438788414001, "learning_rate": 1.4766355140186917e-06, "loss": 0.6508, "step": 79 }, { "epoch": 0.044918585064570464, "grad_norm": 0.5833560824394226, "learning_rate": 1.4953271028037383e-06, "loss": 0.6146, "step": 80 }, { "epoch": 0.045480067377877596, "grad_norm": 0.5691941976547241, "learning_rate": 1.5140186915887853e-06, "loss": 0.6367, "step": 81 }, { "epoch": 0.04604154969118473, "grad_norm": 0.5872597098350525, "learning_rate": 1.532710280373832e-06, "loss": 0.6599, "step": 82 }, { "epoch": 0.04660303200449186, "grad_norm": 0.5536590814590454, "learning_rate": 1.5514018691588786e-06, "loss": 0.5799, "step": 83 }, { "epoch": 0.047164514317798986, "grad_norm": 0.5517371296882629, "learning_rate": 1.5700934579439254e-06, "loss": 0.6208, "step": 84 }, { "epoch": 0.04772599663110612, "grad_norm": 0.546536386013031, "learning_rate": 1.588785046728972e-06, "loss": 0.6009, "step": 85 }, { "epoch": 0.04828747894441325, "grad_norm": 0.5205737352371216, "learning_rate": 1.6074766355140189e-06, "loss": 0.6322, "step": 86 }, { "epoch": 0.04884896125772038, "grad_norm": 0.5500140190124512, "learning_rate": 1.6261682242990657e-06, "loss": 0.5642, "step": 87 }, { "epoch": 0.049410443571027515, "grad_norm": 0.4702149033546448, "learning_rate": 1.6448598130841122e-06, "loss": 0.5587, "step": 88 }, { "epoch": 0.04997192588433464, "grad_norm": 0.5487768650054932, "learning_rate": 1.663551401869159e-06, "loss": 0.5909, "step": 89 }, { "epoch": 0.05053340819764177, "grad_norm": 0.541119396686554, "learning_rate": 1.6822429906542057e-06, "loss": 0.6127, "step": 90 }, { "epoch": 0.051094890510948905, "grad_norm": 0.48587819933891296, "learning_rate": 1.7009345794392525e-06, "loss": 0.5634, "step": 91 }, { "epoch": 0.05165637282425604, "grad_norm": 0.6195039749145508, "learning_rate": 1.7196261682242993e-06, "loss": 0.5912, "step": 92 }, { "epoch": 0.05221785513756317, "grad_norm": 0.5789273381233215, "learning_rate": 1.738317757009346e-06, "loss": 0.5872, "step": 93 }, { "epoch": 0.052779337450870295, "grad_norm": 0.5417637228965759, "learning_rate": 1.7570093457943926e-06, "loss": 0.5999, "step": 94 }, { "epoch": 0.05334081976417743, "grad_norm": 0.6048632860183716, "learning_rate": 1.7757009345794394e-06, "loss": 0.5803, "step": 95 }, { "epoch": 0.05390230207748456, "grad_norm": 0.5564274191856384, "learning_rate": 1.794392523364486e-06, "loss": 0.5473, "step": 96 }, { "epoch": 0.05446378439079169, "grad_norm": 0.5306727886199951, "learning_rate": 1.8130841121495329e-06, "loss": 0.5609, "step": 97 }, { "epoch": 0.055025266704098824, "grad_norm": 0.6427491903305054, "learning_rate": 1.8317757009345797e-06, "loss": 0.581, "step": 98 }, { "epoch": 0.05558674901740595, "grad_norm": 0.5311000347137451, "learning_rate": 1.8504672897196262e-06, "loss": 0.5909, "step": 99 }, { "epoch": 0.05614823133071308, "grad_norm": 0.4643082320690155, "learning_rate": 1.869158878504673e-06, "loss": 0.5638, "step": 100 }, { "epoch": 0.056709713644020214, "grad_norm": 0.5026910901069641, "learning_rate": 1.8878504672897197e-06, "loss": 0.5481, "step": 101 }, { "epoch": 0.057271195957327346, "grad_norm": 0.5044028162956238, "learning_rate": 1.9065420560747667e-06, "loss": 0.5414, "step": 102 }, { "epoch": 0.05783267827063447, "grad_norm": 0.6037092208862305, "learning_rate": 1.9252336448598133e-06, "loss": 0.5779, "step": 103 }, { "epoch": 0.058394160583941604, "grad_norm": 0.5051238536834717, "learning_rate": 1.94392523364486e-06, "loss": 0.577, "step": 104 }, { "epoch": 0.058955642897248736, "grad_norm": 0.513594925403595, "learning_rate": 1.962616822429907e-06, "loss": 0.551, "step": 105 }, { "epoch": 0.05951712521055587, "grad_norm": 0.5145809650421143, "learning_rate": 1.9813084112149534e-06, "loss": 0.5781, "step": 106 }, { "epoch": 0.060078607523863, "grad_norm": 0.5195441842079163, "learning_rate": 2.0000000000000003e-06, "loss": 0.5815, "step": 107 }, { "epoch": 0.060640089837170126, "grad_norm": 0.5372945666313171, "learning_rate": 2.018691588785047e-06, "loss": 0.5651, "step": 108 }, { "epoch": 0.06120157215047726, "grad_norm": 0.4849877655506134, "learning_rate": 2.0373831775700934e-06, "loss": 0.5811, "step": 109 }, { "epoch": 0.06176305446378439, "grad_norm": 0.5192245244979858, "learning_rate": 2.0560747663551404e-06, "loss": 0.5796, "step": 110 }, { "epoch": 0.06232453677709152, "grad_norm": 0.48092445731163025, "learning_rate": 2.074766355140187e-06, "loss": 0.5703, "step": 111 }, { "epoch": 0.06288601909039865, "grad_norm": 0.5670341849327087, "learning_rate": 2.093457943925234e-06, "loss": 0.5519, "step": 112 }, { "epoch": 0.06344750140370578, "grad_norm": 0.48778948187828064, "learning_rate": 2.1121495327102805e-06, "loss": 0.5167, "step": 113 }, { "epoch": 0.06400898371701291, "grad_norm": 0.505142331123352, "learning_rate": 2.1308411214953275e-06, "loss": 0.546, "step": 114 }, { "epoch": 0.06457046603032005, "grad_norm": 0.48205527663230896, "learning_rate": 2.149532710280374e-06, "loss": 0.5459, "step": 115 }, { "epoch": 0.06513194834362718, "grad_norm": 0.5646938681602478, "learning_rate": 2.1682242990654206e-06, "loss": 0.6041, "step": 116 }, { "epoch": 0.06569343065693431, "grad_norm": 0.48897823691368103, "learning_rate": 2.1869158878504676e-06, "loss": 0.591, "step": 117 }, { "epoch": 0.06625491297024144, "grad_norm": 0.515849769115448, "learning_rate": 2.205607476635514e-06, "loss": 0.5597, "step": 118 }, { "epoch": 0.06681639528354857, "grad_norm": 0.4776514172554016, "learning_rate": 2.224299065420561e-06, "loss": 0.5507, "step": 119 }, { "epoch": 0.06737787759685569, "grad_norm": 0.6082764863967896, "learning_rate": 2.2429906542056077e-06, "loss": 0.5722, "step": 120 }, { "epoch": 0.06793935991016282, "grad_norm": 0.5153414607048035, "learning_rate": 2.261682242990654e-06, "loss": 0.5484, "step": 121 }, { "epoch": 0.06850084222346996, "grad_norm": 0.5768982768058777, "learning_rate": 2.280373831775701e-06, "loss": 0.5793, "step": 122 }, { "epoch": 0.06906232453677709, "grad_norm": 0.5589241981506348, "learning_rate": 2.2990654205607477e-06, "loss": 0.5727, "step": 123 }, { "epoch": 0.06962380685008422, "grad_norm": 0.51249760389328, "learning_rate": 2.3177570093457947e-06, "loss": 0.5198, "step": 124 }, { "epoch": 0.07018528916339135, "grad_norm": 0.5239670872688293, "learning_rate": 2.3364485981308413e-06, "loss": 0.5601, "step": 125 }, { "epoch": 0.07074677147669849, "grad_norm": 0.5141129493713379, "learning_rate": 2.355140186915888e-06, "loss": 0.5451, "step": 126 }, { "epoch": 0.07130825379000562, "grad_norm": 0.49830353260040283, "learning_rate": 2.373831775700935e-06, "loss": 0.5335, "step": 127 }, { "epoch": 0.07186973610331275, "grad_norm": 0.5970834493637085, "learning_rate": 2.3925233644859818e-06, "loss": 0.5968, "step": 128 }, { "epoch": 0.07243121841661988, "grad_norm": 0.5098127722740173, "learning_rate": 2.4112149532710283e-06, "loss": 0.5632, "step": 129 }, { "epoch": 0.072992700729927, "grad_norm": 0.5146243572235107, "learning_rate": 2.429906542056075e-06, "loss": 0.5622, "step": 130 }, { "epoch": 0.07355418304323413, "grad_norm": 0.5628353953361511, "learning_rate": 2.4485981308411214e-06, "loss": 0.5824, "step": 131 }, { "epoch": 0.07411566535654127, "grad_norm": 0.5666977763175964, "learning_rate": 2.4672897196261684e-06, "loss": 0.5583, "step": 132 }, { "epoch": 0.0746771476698484, "grad_norm": 0.5500320196151733, "learning_rate": 2.4859813084112154e-06, "loss": 0.5582, "step": 133 }, { "epoch": 0.07523862998315553, "grad_norm": 0.4723847806453705, "learning_rate": 2.504672897196262e-06, "loss": 0.5713, "step": 134 }, { "epoch": 0.07580011229646266, "grad_norm": 0.5439716577529907, "learning_rate": 2.5233644859813085e-06, "loss": 0.5776, "step": 135 }, { "epoch": 0.0763615946097698, "grad_norm": 0.5559297800064087, "learning_rate": 2.5420560747663555e-06, "loss": 0.5579, "step": 136 }, { "epoch": 0.07692307692307693, "grad_norm": 0.45916908979415894, "learning_rate": 2.5607476635514025e-06, "loss": 0.529, "step": 137 }, { "epoch": 0.07748455923638406, "grad_norm": 0.49797847867012024, "learning_rate": 2.5794392523364486e-06, "loss": 0.5378, "step": 138 }, { "epoch": 0.07804604154969118, "grad_norm": 0.5377855896949768, "learning_rate": 2.5981308411214956e-06, "loss": 0.5331, "step": 139 }, { "epoch": 0.07860752386299831, "grad_norm": 0.550381064414978, "learning_rate": 2.616822429906542e-06, "loss": 0.5966, "step": 140 }, { "epoch": 0.07916900617630544, "grad_norm": 0.5161311626434326, "learning_rate": 2.635514018691589e-06, "loss": 0.5717, "step": 141 }, { "epoch": 0.07973048848961257, "grad_norm": 0.5422372221946716, "learning_rate": 2.654205607476636e-06, "loss": 0.545, "step": 142 }, { "epoch": 0.08029197080291971, "grad_norm": 0.5626325607299805, "learning_rate": 2.672897196261682e-06, "loss": 0.5325, "step": 143 }, { "epoch": 0.08085345311622684, "grad_norm": 0.4905477464199066, "learning_rate": 2.691588785046729e-06, "loss": 0.5502, "step": 144 }, { "epoch": 0.08141493542953397, "grad_norm": 0.6025540828704834, "learning_rate": 2.7102803738317757e-06, "loss": 0.57, "step": 145 }, { "epoch": 0.0819764177428411, "grad_norm": 0.44451966881752014, "learning_rate": 2.7289719626168227e-06, "loss": 0.5445, "step": 146 }, { "epoch": 0.08253790005614824, "grad_norm": 0.5365010499954224, "learning_rate": 2.7476635514018697e-06, "loss": 0.556, "step": 147 }, { "epoch": 0.08309938236945537, "grad_norm": 0.6012800931930542, "learning_rate": 2.766355140186916e-06, "loss": 0.536, "step": 148 }, { "epoch": 0.08366086468276249, "grad_norm": 0.5412779450416565, "learning_rate": 2.785046728971963e-06, "loss": 0.5568, "step": 149 }, { "epoch": 0.08422234699606962, "grad_norm": 0.49735352396965027, "learning_rate": 2.8037383177570094e-06, "loss": 0.5552, "step": 150 }, { "epoch": 0.08478382930937675, "grad_norm": 0.5109115242958069, "learning_rate": 2.8224299065420563e-06, "loss": 0.5676, "step": 151 }, { "epoch": 0.08534531162268388, "grad_norm": 0.5388306975364685, "learning_rate": 2.8411214953271033e-06, "loss": 0.5272, "step": 152 }, { "epoch": 0.08590679393599102, "grad_norm": 0.5270246267318726, "learning_rate": 2.8598130841121494e-06, "loss": 0.5968, "step": 153 }, { "epoch": 0.08646827624929815, "grad_norm": 0.5044521689414978, "learning_rate": 2.8785046728971964e-06, "loss": 0.533, "step": 154 }, { "epoch": 0.08702975856260528, "grad_norm": 0.5079155564308167, "learning_rate": 2.897196261682243e-06, "loss": 0.5577, "step": 155 }, { "epoch": 0.08759124087591241, "grad_norm": 0.5648968815803528, "learning_rate": 2.91588785046729e-06, "loss": 0.5746, "step": 156 }, { "epoch": 0.08815272318921955, "grad_norm": 0.5088842511177063, "learning_rate": 2.934579439252337e-06, "loss": 0.549, "step": 157 }, { "epoch": 0.08871420550252666, "grad_norm": 0.5372620224952698, "learning_rate": 2.9532710280373835e-06, "loss": 0.511, "step": 158 }, { "epoch": 0.0892756878158338, "grad_norm": 0.5546998977661133, "learning_rate": 2.9719626168224305e-06, "loss": 0.555, "step": 159 }, { "epoch": 0.08983717012914093, "grad_norm": 0.5083873271942139, "learning_rate": 2.9906542056074766e-06, "loss": 0.5647, "step": 160 }, { "epoch": 0.09039865244244806, "grad_norm": 0.49548789858818054, "learning_rate": 3.0093457943925236e-06, "loss": 0.5327, "step": 161 }, { "epoch": 0.09096013475575519, "grad_norm": 0.4746195077896118, "learning_rate": 3.0280373831775705e-06, "loss": 0.5392, "step": 162 }, { "epoch": 0.09152161706906232, "grad_norm": 0.5294780731201172, "learning_rate": 3.046728971962617e-06, "loss": 0.5336, "step": 163 }, { "epoch": 0.09208309938236946, "grad_norm": 0.506287157535553, "learning_rate": 3.065420560747664e-06, "loss": 0.5612, "step": 164 }, { "epoch": 0.09264458169567659, "grad_norm": 0.4978802502155304, "learning_rate": 3.08411214953271e-06, "loss": 0.5362, "step": 165 }, { "epoch": 0.09320606400898372, "grad_norm": 0.5676380395889282, "learning_rate": 3.102803738317757e-06, "loss": 0.5633, "step": 166 }, { "epoch": 0.09376754632229085, "grad_norm": 0.5164046883583069, "learning_rate": 3.121495327102804e-06, "loss": 0.5504, "step": 167 }, { "epoch": 0.09432902863559797, "grad_norm": 0.5138478875160217, "learning_rate": 3.1401869158878507e-06, "loss": 0.5199, "step": 168 }, { "epoch": 0.0948905109489051, "grad_norm": 0.5282585620880127, "learning_rate": 3.1588785046728977e-06, "loss": 0.5452, "step": 169 }, { "epoch": 0.09545199326221224, "grad_norm": 0.5155416131019592, "learning_rate": 3.177570093457944e-06, "loss": 0.5176, "step": 170 }, { "epoch": 0.09601347557551937, "grad_norm": 0.5294981002807617, "learning_rate": 3.196261682242991e-06, "loss": 0.5647, "step": 171 }, { "epoch": 0.0965749578888265, "grad_norm": 0.5615307688713074, "learning_rate": 3.2149532710280378e-06, "loss": 0.5358, "step": 172 }, { "epoch": 0.09713644020213363, "grad_norm": 0.5477790832519531, "learning_rate": 3.2336448598130843e-06, "loss": 0.5322, "step": 173 }, { "epoch": 0.09769792251544077, "grad_norm": 0.5162507891654968, "learning_rate": 3.2523364485981313e-06, "loss": 0.5332, "step": 174 }, { "epoch": 0.0982594048287479, "grad_norm": 0.48122143745422363, "learning_rate": 3.2710280373831774e-06, "loss": 0.5501, "step": 175 }, { "epoch": 0.09882088714205503, "grad_norm": 0.4893978238105774, "learning_rate": 3.2897196261682244e-06, "loss": 0.531, "step": 176 }, { "epoch": 0.09938236945536216, "grad_norm": 0.5128282904624939, "learning_rate": 3.3084112149532714e-06, "loss": 0.544, "step": 177 }, { "epoch": 0.09994385176866928, "grad_norm": 0.463338702917099, "learning_rate": 3.327102803738318e-06, "loss": 0.5239, "step": 178 }, { "epoch": 0.10050533408197641, "grad_norm": 0.534822940826416, "learning_rate": 3.345794392523365e-06, "loss": 0.5255, "step": 179 }, { "epoch": 0.10106681639528355, "grad_norm": 0.5621729493141174, "learning_rate": 3.3644859813084115e-06, "loss": 0.5401, "step": 180 }, { "epoch": 0.10162829870859068, "grad_norm": 0.49159976840019226, "learning_rate": 3.3831775700934585e-06, "loss": 0.5706, "step": 181 }, { "epoch": 0.10218978102189781, "grad_norm": 0.5380467772483826, "learning_rate": 3.401869158878505e-06, "loss": 0.5383, "step": 182 }, { "epoch": 0.10275126333520494, "grad_norm": 0.4590565264225006, "learning_rate": 3.4205607476635516e-06, "loss": 0.525, "step": 183 }, { "epoch": 0.10331274564851207, "grad_norm": 0.5124627947807312, "learning_rate": 3.4392523364485985e-06, "loss": 0.5328, "step": 184 }, { "epoch": 0.10387422796181921, "grad_norm": 0.5614336729049683, "learning_rate": 3.457943925233645e-06, "loss": 0.5166, "step": 185 }, { "epoch": 0.10443571027512634, "grad_norm": 0.5146556496620178, "learning_rate": 3.476635514018692e-06, "loss": 0.4973, "step": 186 }, { "epoch": 0.10499719258843346, "grad_norm": 0.5831396579742432, "learning_rate": 3.495327102803739e-06, "loss": 0.5158, "step": 187 }, { "epoch": 0.10555867490174059, "grad_norm": 0.5317298173904419, "learning_rate": 3.514018691588785e-06, "loss": 0.5566, "step": 188 }, { "epoch": 0.10612015721504772, "grad_norm": 0.5605750679969788, "learning_rate": 3.532710280373832e-06, "loss": 0.5585, "step": 189 }, { "epoch": 0.10668163952835485, "grad_norm": 0.5136295557022095, "learning_rate": 3.5514018691588787e-06, "loss": 0.5461, "step": 190 }, { "epoch": 0.10724312184166199, "grad_norm": 0.5151114463806152, "learning_rate": 3.5700934579439257e-06, "loss": 0.5143, "step": 191 }, { "epoch": 0.10780460415496912, "grad_norm": 0.5304511785507202, "learning_rate": 3.588785046728972e-06, "loss": 0.5082, "step": 192 }, { "epoch": 0.10836608646827625, "grad_norm": 0.5143433213233948, "learning_rate": 3.607476635514019e-06, "loss": 0.5151, "step": 193 }, { "epoch": 0.10892756878158338, "grad_norm": 0.5352661609649658, "learning_rate": 3.6261682242990658e-06, "loss": 0.5486, "step": 194 }, { "epoch": 0.10948905109489052, "grad_norm": 0.5392684936523438, "learning_rate": 3.6448598130841123e-06, "loss": 0.5553, "step": 195 }, { "epoch": 0.11005053340819765, "grad_norm": 0.4815359115600586, "learning_rate": 3.6635514018691593e-06, "loss": 0.5438, "step": 196 }, { "epoch": 0.11061201572150477, "grad_norm": 0.5181934833526611, "learning_rate": 3.682242990654206e-06, "loss": 0.5029, "step": 197 }, { "epoch": 0.1111734980348119, "grad_norm": 0.5692335367202759, "learning_rate": 3.7009345794392524e-06, "loss": 0.5212, "step": 198 }, { "epoch": 0.11173498034811903, "grad_norm": 0.5863697528839111, "learning_rate": 3.7196261682242994e-06, "loss": 0.5297, "step": 199 }, { "epoch": 0.11229646266142616, "grad_norm": 0.568873941898346, "learning_rate": 3.738317757009346e-06, "loss": 0.5415, "step": 200 }, { "epoch": 0.1128579449747333, "grad_norm": 0.5254502296447754, "learning_rate": 3.757009345794393e-06, "loss": 0.5296, "step": 201 }, { "epoch": 0.11341942728804043, "grad_norm": 0.5149471163749695, "learning_rate": 3.7757009345794395e-06, "loss": 0.5187, "step": 202 }, { "epoch": 0.11398090960134756, "grad_norm": 0.5104945302009583, "learning_rate": 3.7943925233644865e-06, "loss": 0.5157, "step": 203 }, { "epoch": 0.11454239191465469, "grad_norm": 0.5995305180549622, "learning_rate": 3.8130841121495334e-06, "loss": 0.564, "step": 204 }, { "epoch": 0.11510387422796182, "grad_norm": 0.5025469660758972, "learning_rate": 3.8317757009345796e-06, "loss": 0.5108, "step": 205 }, { "epoch": 0.11566535654126894, "grad_norm": 0.5697150230407715, "learning_rate": 3.8504672897196265e-06, "loss": 0.5083, "step": 206 }, { "epoch": 0.11622683885457608, "grad_norm": 0.5622625350952148, "learning_rate": 3.869158878504673e-06, "loss": 0.5251, "step": 207 }, { "epoch": 0.11678832116788321, "grad_norm": 0.49050793051719666, "learning_rate": 3.88785046728972e-06, "loss": 0.4865, "step": 208 }, { "epoch": 0.11734980348119034, "grad_norm": 0.5602784156799316, "learning_rate": 3.906542056074767e-06, "loss": 0.4963, "step": 209 }, { "epoch": 0.11791128579449747, "grad_norm": 0.5663114786148071, "learning_rate": 3.925233644859814e-06, "loss": 0.5345, "step": 210 }, { "epoch": 0.1184727681078046, "grad_norm": 0.49373361468315125, "learning_rate": 3.943925233644861e-06, "loss": 0.5121, "step": 211 }, { "epoch": 0.11903425042111174, "grad_norm": 0.5427638292312622, "learning_rate": 3.962616822429907e-06, "loss": 0.5394, "step": 212 }, { "epoch": 0.11959573273441887, "grad_norm": 0.5395470857620239, "learning_rate": 3.981308411214954e-06, "loss": 0.528, "step": 213 }, { "epoch": 0.120157215047726, "grad_norm": 0.518747091293335, "learning_rate": 4.000000000000001e-06, "loss": 0.5095, "step": 214 }, { "epoch": 0.12071869736103313, "grad_norm": 0.4676249921321869, "learning_rate": 4.018691588785047e-06, "loss": 0.5152, "step": 215 }, { "epoch": 0.12128017967434025, "grad_norm": 0.5247534513473511, "learning_rate": 4.037383177570094e-06, "loss": 0.5424, "step": 216 }, { "epoch": 0.12184166198764738, "grad_norm": 0.5519428253173828, "learning_rate": 4.05607476635514e-06, "loss": 0.53, "step": 217 }, { "epoch": 0.12240314430095452, "grad_norm": 0.48265621066093445, "learning_rate": 4.074766355140187e-06, "loss": 0.5283, "step": 218 }, { "epoch": 0.12296462661426165, "grad_norm": 0.5158567428588867, "learning_rate": 4.093457943925234e-06, "loss": 0.5276, "step": 219 }, { "epoch": 0.12352610892756878, "grad_norm": 0.5545607805252075, "learning_rate": 4.112149532710281e-06, "loss": 0.5375, "step": 220 }, { "epoch": 0.12408759124087591, "grad_norm": 0.5172088146209717, "learning_rate": 4.130841121495328e-06, "loss": 0.5182, "step": 221 }, { "epoch": 0.12464907355418305, "grad_norm": 0.5419506430625916, "learning_rate": 4.149532710280374e-06, "loss": 0.542, "step": 222 }, { "epoch": 0.12521055586749016, "grad_norm": 0.5324147939682007, "learning_rate": 4.168224299065421e-06, "loss": 0.5207, "step": 223 }, { "epoch": 0.1257720381807973, "grad_norm": 0.5635223388671875, "learning_rate": 4.186915887850468e-06, "loss": 0.5288, "step": 224 }, { "epoch": 0.12633352049410443, "grad_norm": 0.5783401131629944, "learning_rate": 4.205607476635514e-06, "loss": 0.5447, "step": 225 }, { "epoch": 0.12689500280741156, "grad_norm": 0.5283171534538269, "learning_rate": 4.224299065420561e-06, "loss": 0.5015, "step": 226 }, { "epoch": 0.1274564851207187, "grad_norm": 0.5164048075675964, "learning_rate": 4.242990654205608e-06, "loss": 0.4681, "step": 227 }, { "epoch": 0.12801796743402583, "grad_norm": 0.5482809543609619, "learning_rate": 4.261682242990655e-06, "loss": 0.5344, "step": 228 }, { "epoch": 0.12857944974733296, "grad_norm": 0.5465124249458313, "learning_rate": 4.280373831775702e-06, "loss": 0.5276, "step": 229 }, { "epoch": 0.1291409320606401, "grad_norm": 0.6989641785621643, "learning_rate": 4.299065420560748e-06, "loss": 0.5304, "step": 230 }, { "epoch": 0.12970241437394722, "grad_norm": 0.5421445369720459, "learning_rate": 4.317757009345795e-06, "loss": 0.5128, "step": 231 }, { "epoch": 0.13026389668725435, "grad_norm": 0.4625307321548462, "learning_rate": 4.336448598130841e-06, "loss": 0.5435, "step": 232 }, { "epoch": 0.1308253790005615, "grad_norm": 0.6012388467788696, "learning_rate": 4.355140186915888e-06, "loss": 0.5594, "step": 233 }, { "epoch": 0.13138686131386862, "grad_norm": 0.5257087349891663, "learning_rate": 4.373831775700935e-06, "loss": 0.5152, "step": 234 }, { "epoch": 0.13194834362717575, "grad_norm": 0.5837084054946899, "learning_rate": 4.392523364485981e-06, "loss": 0.5605, "step": 235 }, { "epoch": 0.13250982594048288, "grad_norm": 0.5396296381950378, "learning_rate": 4.411214953271028e-06, "loss": 0.5228, "step": 236 }, { "epoch": 0.13307130825379002, "grad_norm": 0.4601995646953583, "learning_rate": 4.429906542056075e-06, "loss": 0.5215, "step": 237 }, { "epoch": 0.13363279056709715, "grad_norm": 0.5359197854995728, "learning_rate": 4.448598130841122e-06, "loss": 0.5276, "step": 238 }, { "epoch": 0.13419427288040428, "grad_norm": 0.5293643474578857, "learning_rate": 4.467289719626169e-06, "loss": 0.5396, "step": 239 }, { "epoch": 0.13475575519371139, "grad_norm": 0.526126503944397, "learning_rate": 4.485981308411215e-06, "loss": 0.5122, "step": 240 }, { "epoch": 0.13531723750701852, "grad_norm": 0.506176233291626, "learning_rate": 4.504672897196262e-06, "loss": 0.5491, "step": 241 }, { "epoch": 0.13587871982032565, "grad_norm": 0.5577577948570251, "learning_rate": 4.523364485981308e-06, "loss": 0.5196, "step": 242 }, { "epoch": 0.13644020213363278, "grad_norm": 0.6053435206413269, "learning_rate": 4.542056074766355e-06, "loss": 0.525, "step": 243 }, { "epoch": 0.13700168444693991, "grad_norm": 0.5300360321998596, "learning_rate": 4.560747663551402e-06, "loss": 0.4926, "step": 244 }, { "epoch": 0.13756316676024705, "grad_norm": 0.5235571265220642, "learning_rate": 4.579439252336449e-06, "loss": 0.5485, "step": 245 }, { "epoch": 0.13812464907355418, "grad_norm": 0.5371932983398438, "learning_rate": 4.5981308411214955e-06, "loss": 0.5555, "step": 246 }, { "epoch": 0.1386861313868613, "grad_norm": 0.5221948027610779, "learning_rate": 4.6168224299065425e-06, "loss": 0.5033, "step": 247 }, { "epoch": 0.13924761370016844, "grad_norm": 0.5020176768302917, "learning_rate": 4.6355140186915894e-06, "loss": 0.5441, "step": 248 }, { "epoch": 0.13980909601347558, "grad_norm": 0.4835977554321289, "learning_rate": 4.6542056074766356e-06, "loss": 0.5313, "step": 249 }, { "epoch": 0.1403705783267827, "grad_norm": 0.5282566547393799, "learning_rate": 4.6728971962616825e-06, "loss": 0.526, "step": 250 }, { "epoch": 0.14093206064008984, "grad_norm": 0.5329122543334961, "learning_rate": 4.6915887850467295e-06, "loss": 0.5184, "step": 251 }, { "epoch": 0.14149354295339697, "grad_norm": 0.571976363658905, "learning_rate": 4.710280373831776e-06, "loss": 0.535, "step": 252 }, { "epoch": 0.1420550252667041, "grad_norm": 0.6141718626022339, "learning_rate": 4.728971962616823e-06, "loss": 0.5265, "step": 253 }, { "epoch": 0.14261650758001124, "grad_norm": 0.5334063768386841, "learning_rate": 4.74766355140187e-06, "loss": 0.5523, "step": 254 }, { "epoch": 0.14317798989331837, "grad_norm": 0.5084075927734375, "learning_rate": 4.766355140186917e-06, "loss": 0.5033, "step": 255 }, { "epoch": 0.1437394722066255, "grad_norm": 0.5196272730827332, "learning_rate": 4.7850467289719636e-06, "loss": 0.4939, "step": 256 }, { "epoch": 0.14430095451993263, "grad_norm": 0.6254199743270874, "learning_rate": 4.80373831775701e-06, "loss": 0.546, "step": 257 }, { "epoch": 0.14486243683323977, "grad_norm": 0.561668872833252, "learning_rate": 4.822429906542057e-06, "loss": 0.505, "step": 258 }, { "epoch": 0.14542391914654687, "grad_norm": 0.5426413416862488, "learning_rate": 4.841121495327103e-06, "loss": 0.5291, "step": 259 }, { "epoch": 0.145985401459854, "grad_norm": 0.5959562659263611, "learning_rate": 4.85981308411215e-06, "loss": 0.498, "step": 260 }, { "epoch": 0.14654688377316114, "grad_norm": 0.4923771619796753, "learning_rate": 4.878504672897197e-06, "loss": 0.4784, "step": 261 }, { "epoch": 0.14710836608646827, "grad_norm": 0.5952348709106445, "learning_rate": 4.897196261682243e-06, "loss": 0.5386, "step": 262 }, { "epoch": 0.1476698483997754, "grad_norm": 0.5596470832824707, "learning_rate": 4.91588785046729e-06, "loss": 0.4919, "step": 263 }, { "epoch": 0.14823133071308253, "grad_norm": 0.5192443132400513, "learning_rate": 4.934579439252337e-06, "loss": 0.5036, "step": 264 }, { "epoch": 0.14879281302638966, "grad_norm": 0.5843799710273743, "learning_rate": 4.953271028037384e-06, "loss": 0.5386, "step": 265 }, { "epoch": 0.1493542953396968, "grad_norm": 0.5129218101501465, "learning_rate": 4.971962616822431e-06, "loss": 0.5065, "step": 266 }, { "epoch": 0.14991577765300393, "grad_norm": 0.6191100478172302, "learning_rate": 4.990654205607477e-06, "loss": 0.5677, "step": 267 }, { "epoch": 0.15047725996631106, "grad_norm": 0.5753009915351868, "learning_rate": 5.009345794392524e-06, "loss": 0.4979, "step": 268 }, { "epoch": 0.1510387422796182, "grad_norm": 0.5379517078399658, "learning_rate": 5.028037383177571e-06, "loss": 0.5508, "step": 269 }, { "epoch": 0.15160022459292533, "grad_norm": 0.601753830909729, "learning_rate": 5.046728971962617e-06, "loss": 0.4906, "step": 270 }, { "epoch": 0.15216170690623246, "grad_norm": 0.5323045253753662, "learning_rate": 5.065420560747664e-06, "loss": 0.4705, "step": 271 }, { "epoch": 0.1527231892195396, "grad_norm": 0.6194905042648315, "learning_rate": 5.084112149532711e-06, "loss": 0.547, "step": 272 }, { "epoch": 0.15328467153284672, "grad_norm": 0.5551636219024658, "learning_rate": 5.102803738317758e-06, "loss": 0.4978, "step": 273 }, { "epoch": 0.15384615384615385, "grad_norm": 0.5420138239860535, "learning_rate": 5.121495327102805e-06, "loss": 0.5191, "step": 274 }, { "epoch": 0.154407636159461, "grad_norm": 0.5451993346214294, "learning_rate": 5.14018691588785e-06, "loss": 0.519, "step": 275 }, { "epoch": 0.15496911847276812, "grad_norm": 0.5864368081092834, "learning_rate": 5.158878504672897e-06, "loss": 0.5391, "step": 276 }, { "epoch": 0.15553060078607525, "grad_norm": 0.6334691047668457, "learning_rate": 5.177570093457944e-06, "loss": 0.5413, "step": 277 }, { "epoch": 0.15609208309938236, "grad_norm": 0.5098764896392822, "learning_rate": 5.196261682242991e-06, "loss": 0.5238, "step": 278 }, { "epoch": 0.1566535654126895, "grad_norm": 0.5493597388267517, "learning_rate": 5.214953271028038e-06, "loss": 0.5098, "step": 279 }, { "epoch": 0.15721504772599662, "grad_norm": 0.6663073301315308, "learning_rate": 5.233644859813084e-06, "loss": 0.4874, "step": 280 }, { "epoch": 0.15777653003930375, "grad_norm": 0.5497331023216248, "learning_rate": 5.252336448598131e-06, "loss": 0.4843, "step": 281 }, { "epoch": 0.15833801235261089, "grad_norm": 0.5485034584999084, "learning_rate": 5.271028037383178e-06, "loss": 0.5147, "step": 282 }, { "epoch": 0.15889949466591802, "grad_norm": 0.6035282015800476, "learning_rate": 5.289719626168225e-06, "loss": 0.5582, "step": 283 }, { "epoch": 0.15946097697922515, "grad_norm": 0.5660860538482666, "learning_rate": 5.308411214953272e-06, "loss": 0.513, "step": 284 }, { "epoch": 0.16002245929253228, "grad_norm": 0.5625800490379333, "learning_rate": 5.3271028037383174e-06, "loss": 0.5099, "step": 285 }, { "epoch": 0.16058394160583941, "grad_norm": 0.5266700387001038, "learning_rate": 5.345794392523364e-06, "loss": 0.4972, "step": 286 }, { "epoch": 0.16114542391914655, "grad_norm": 0.567945659160614, "learning_rate": 5.364485981308411e-06, "loss": 0.4867, "step": 287 }, { "epoch": 0.16170690623245368, "grad_norm": 0.6312300562858582, "learning_rate": 5.383177570093458e-06, "loss": 0.5026, "step": 288 }, { "epoch": 0.1622683885457608, "grad_norm": 0.488113135099411, "learning_rate": 5.401869158878505e-06, "loss": 0.4995, "step": 289 }, { "epoch": 0.16282987085906794, "grad_norm": 0.5753918290138245, "learning_rate": 5.4205607476635515e-06, "loss": 0.4932, "step": 290 }, { "epoch": 0.16339135317237508, "grad_norm": 0.5450886487960815, "learning_rate": 5.4392523364485985e-06, "loss": 0.5092, "step": 291 }, { "epoch": 0.1639528354856822, "grad_norm": 0.5247456431388855, "learning_rate": 5.4579439252336454e-06, "loss": 0.4789, "step": 292 }, { "epoch": 0.16451431779898934, "grad_norm": 0.6197958588600159, "learning_rate": 5.476635514018692e-06, "loss": 0.5207, "step": 293 }, { "epoch": 0.16507580011229647, "grad_norm": 0.49786123633384705, "learning_rate": 5.495327102803739e-06, "loss": 0.4803, "step": 294 }, { "epoch": 0.1656372824256036, "grad_norm": 0.5488682985305786, "learning_rate": 5.514018691588785e-06, "loss": 0.5223, "step": 295 }, { "epoch": 0.16619876473891074, "grad_norm": 0.738577127456665, "learning_rate": 5.532710280373832e-06, "loss": 0.5043, "step": 296 }, { "epoch": 0.16676024705221784, "grad_norm": 0.5574642419815063, "learning_rate": 5.551401869158879e-06, "loss": 0.5487, "step": 297 }, { "epoch": 0.16732172936552497, "grad_norm": 0.5056718587875366, "learning_rate": 5.570093457943926e-06, "loss": 0.5268, "step": 298 }, { "epoch": 0.1678832116788321, "grad_norm": 0.5359535217285156, "learning_rate": 5.588785046728973e-06, "loss": 0.4937, "step": 299 }, { "epoch": 0.16844469399213924, "grad_norm": 0.6076109409332275, "learning_rate": 5.607476635514019e-06, "loss": 0.5332, "step": 300 }, { "epoch": 0.16900617630544637, "grad_norm": 0.5157957077026367, "learning_rate": 5.626168224299066e-06, "loss": 0.5117, "step": 301 }, { "epoch": 0.1695676586187535, "grad_norm": 0.5176049470901489, "learning_rate": 5.644859813084113e-06, "loss": 0.49, "step": 302 }, { "epoch": 0.17012914093206064, "grad_norm": 0.5272052884101868, "learning_rate": 5.66355140186916e-06, "loss": 0.5094, "step": 303 }, { "epoch": 0.17069062324536777, "grad_norm": 0.5226345658302307, "learning_rate": 5.682242990654207e-06, "loss": 0.5197, "step": 304 }, { "epoch": 0.1712521055586749, "grad_norm": 0.5492976903915405, "learning_rate": 5.700934579439253e-06, "loss": 0.4902, "step": 305 }, { "epoch": 0.17181358787198203, "grad_norm": 0.5311779975891113, "learning_rate": 5.719626168224299e-06, "loss": 0.5391, "step": 306 }, { "epoch": 0.17237507018528916, "grad_norm": 0.5369917154312134, "learning_rate": 5.738317757009346e-06, "loss": 0.4969, "step": 307 }, { "epoch": 0.1729365524985963, "grad_norm": 0.47213488817214966, "learning_rate": 5.757009345794393e-06, "loss": 0.4719, "step": 308 }, { "epoch": 0.17349803481190343, "grad_norm": 0.5934032201766968, "learning_rate": 5.77570093457944e-06, "loss": 0.5103, "step": 309 }, { "epoch": 0.17405951712521056, "grad_norm": 0.5415632724761963, "learning_rate": 5.794392523364486e-06, "loss": 0.5129, "step": 310 }, { "epoch": 0.1746209994385177, "grad_norm": 0.5268533825874329, "learning_rate": 5.813084112149533e-06, "loss": 0.4919, "step": 311 }, { "epoch": 0.17518248175182483, "grad_norm": 0.5908380150794983, "learning_rate": 5.83177570093458e-06, "loss": 0.5227, "step": 312 }, { "epoch": 0.17574396406513196, "grad_norm": 0.5409054160118103, "learning_rate": 5.850467289719627e-06, "loss": 0.5149, "step": 313 }, { "epoch": 0.1763054463784391, "grad_norm": 0.5120905041694641, "learning_rate": 5.869158878504674e-06, "loss": 0.4877, "step": 314 }, { "epoch": 0.17686692869174622, "grad_norm": 0.6389188170433044, "learning_rate": 5.88785046728972e-06, "loss": 0.5147, "step": 315 }, { "epoch": 0.17742841100505333, "grad_norm": 0.5886951684951782, "learning_rate": 5.906542056074767e-06, "loss": 0.5522, "step": 316 }, { "epoch": 0.17798989331836046, "grad_norm": 0.46608081459999084, "learning_rate": 5.925233644859814e-06, "loss": 0.485, "step": 317 }, { "epoch": 0.1785513756316676, "grad_norm": 0.6622824668884277, "learning_rate": 5.943925233644861e-06, "loss": 0.4888, "step": 318 }, { "epoch": 0.17911285794497472, "grad_norm": 0.5817224979400635, "learning_rate": 5.962616822429908e-06, "loss": 0.4963, "step": 319 }, { "epoch": 0.17967434025828186, "grad_norm": 0.5209051370620728, "learning_rate": 5.981308411214953e-06, "loss": 0.5422, "step": 320 }, { "epoch": 0.180235822571589, "grad_norm": 0.6068258285522461, "learning_rate": 6e-06, "loss": 0.5189, "step": 321 }, { "epoch": 0.18079730488489612, "grad_norm": 0.72153240442276, "learning_rate": 6.018691588785047e-06, "loss": 0.4977, "step": 322 }, { "epoch": 0.18135878719820325, "grad_norm": 0.5457529425621033, "learning_rate": 6.037383177570094e-06, "loss": 0.5519, "step": 323 }, { "epoch": 0.18192026951151039, "grad_norm": 0.6254226565361023, "learning_rate": 6.056074766355141e-06, "loss": 0.5195, "step": 324 }, { "epoch": 0.18248175182481752, "grad_norm": 0.5446473956108093, "learning_rate": 6.074766355140187e-06, "loss": 0.5092, "step": 325 }, { "epoch": 0.18304323413812465, "grad_norm": 0.5605732202529907, "learning_rate": 6.093457943925234e-06, "loss": 0.4823, "step": 326 }, { "epoch": 0.18360471645143178, "grad_norm": 0.5811764001846313, "learning_rate": 6.112149532710281e-06, "loss": 0.5073, "step": 327 }, { "epoch": 0.18416619876473891, "grad_norm": 0.5872640609741211, "learning_rate": 6.130841121495328e-06, "loss": 0.5285, "step": 328 }, { "epoch": 0.18472768107804605, "grad_norm": 0.5084826946258545, "learning_rate": 6.149532710280375e-06, "loss": 0.5001, "step": 329 }, { "epoch": 0.18528916339135318, "grad_norm": 0.6071262359619141, "learning_rate": 6.16822429906542e-06, "loss": 0.4981, "step": 330 }, { "epoch": 0.1858506457046603, "grad_norm": 0.5533544421195984, "learning_rate": 6.186915887850467e-06, "loss": 0.4947, "step": 331 }, { "epoch": 0.18641212801796744, "grad_norm": 0.5573198795318604, "learning_rate": 6.205607476635514e-06, "loss": 0.5021, "step": 332 }, { "epoch": 0.18697361033127458, "grad_norm": 0.5753386616706848, "learning_rate": 6.224299065420561e-06, "loss": 0.5191, "step": 333 }, { "epoch": 0.1875350926445817, "grad_norm": 0.5553855895996094, "learning_rate": 6.242990654205608e-06, "loss": 0.5225, "step": 334 }, { "epoch": 0.18809657495788884, "grad_norm": 0.5927571058273315, "learning_rate": 6.2616822429906544e-06, "loss": 0.4864, "step": 335 }, { "epoch": 0.18865805727119594, "grad_norm": 0.5531664490699768, "learning_rate": 6.2803738317757014e-06, "loss": 0.4844, "step": 336 }, { "epoch": 0.18921953958450308, "grad_norm": 0.5487024784088135, "learning_rate": 6.299065420560748e-06, "loss": 0.4927, "step": 337 }, { "epoch": 0.1897810218978102, "grad_norm": 0.5995740294456482, "learning_rate": 6.317757009345795e-06, "loss": 0.5027, "step": 338 }, { "epoch": 0.19034250421111734, "grad_norm": 0.5572948455810547, "learning_rate": 6.336448598130842e-06, "loss": 0.4922, "step": 339 }, { "epoch": 0.19090398652442447, "grad_norm": 0.6223042607307434, "learning_rate": 6.355140186915888e-06, "loss": 0.5036, "step": 340 }, { "epoch": 0.1914654688377316, "grad_norm": 0.5123772025108337, "learning_rate": 6.373831775700935e-06, "loss": 0.5159, "step": 341 }, { "epoch": 0.19202695115103874, "grad_norm": 0.5524616837501526, "learning_rate": 6.392523364485982e-06, "loss": 0.5075, "step": 342 }, { "epoch": 0.19258843346434587, "grad_norm": 0.5154120922088623, "learning_rate": 6.4112149532710286e-06, "loss": 0.4796, "step": 343 }, { "epoch": 0.193149915777653, "grad_norm": 0.5266824960708618, "learning_rate": 6.4299065420560756e-06, "loss": 0.5009, "step": 344 }, { "epoch": 0.19371139809096014, "grad_norm": 0.5817278623580933, "learning_rate": 6.448598130841122e-06, "loss": 0.5166, "step": 345 }, { "epoch": 0.19427288040426727, "grad_norm": 0.5452004075050354, "learning_rate": 6.467289719626169e-06, "loss": 0.5272, "step": 346 }, { "epoch": 0.1948343627175744, "grad_norm": 0.5986294746398926, "learning_rate": 6.485981308411216e-06, "loss": 0.4769, "step": 347 }, { "epoch": 0.19539584503088153, "grad_norm": 0.598188042640686, "learning_rate": 6.504672897196263e-06, "loss": 0.4663, "step": 348 }, { "epoch": 0.19595732734418866, "grad_norm": 0.6233293414115906, "learning_rate": 6.52336448598131e-06, "loss": 0.5348, "step": 349 }, { "epoch": 0.1965188096574958, "grad_norm": 0.6342386603355408, "learning_rate": 6.542056074766355e-06, "loss": 0.5534, "step": 350 }, { "epoch": 0.19708029197080293, "grad_norm": 0.5552769899368286, "learning_rate": 6.560747663551402e-06, "loss": 0.4883, "step": 351 }, { "epoch": 0.19764177428411006, "grad_norm": 0.5914608836174011, "learning_rate": 6.579439252336449e-06, "loss": 0.4627, "step": 352 }, { "epoch": 0.1982032565974172, "grad_norm": 0.58082115650177, "learning_rate": 6.598130841121496e-06, "loss": 0.5081, "step": 353 }, { "epoch": 0.19876473891072433, "grad_norm": 0.5514166951179504, "learning_rate": 6.616822429906543e-06, "loss": 0.4967, "step": 354 }, { "epoch": 0.19932622122403143, "grad_norm": 0.5988174676895142, "learning_rate": 6.635514018691589e-06, "loss": 0.5044, "step": 355 }, { "epoch": 0.19988770353733856, "grad_norm": 0.6487159132957458, "learning_rate": 6.654205607476636e-06, "loss": 0.4977, "step": 356 }, { "epoch": 0.2004491858506457, "grad_norm": 0.5257630944252014, "learning_rate": 6.672897196261683e-06, "loss": 0.5071, "step": 357 }, { "epoch": 0.20101066816395283, "grad_norm": 0.5930215120315552, "learning_rate": 6.69158878504673e-06, "loss": 0.4917, "step": 358 }, { "epoch": 0.20157215047725996, "grad_norm": 0.534083366394043, "learning_rate": 6.710280373831777e-06, "loss": 0.4705, "step": 359 }, { "epoch": 0.2021336327905671, "grad_norm": 0.5709120035171509, "learning_rate": 6.728971962616823e-06, "loss": 0.5049, "step": 360 }, { "epoch": 0.20269511510387422, "grad_norm": 0.6237732768058777, "learning_rate": 6.74766355140187e-06, "loss": 0.5156, "step": 361 }, { "epoch": 0.20325659741718136, "grad_norm": 0.5226709246635437, "learning_rate": 6.766355140186917e-06, "loss": 0.4975, "step": 362 }, { "epoch": 0.2038180797304885, "grad_norm": 0.5731485486030579, "learning_rate": 6.785046728971964e-06, "loss": 0.4876, "step": 363 }, { "epoch": 0.20437956204379562, "grad_norm": 0.6437954902648926, "learning_rate": 6.80373831775701e-06, "loss": 0.5452, "step": 364 }, { "epoch": 0.20494104435710275, "grad_norm": 0.541183590888977, "learning_rate": 6.822429906542056e-06, "loss": 0.5086, "step": 365 }, { "epoch": 0.20550252667040989, "grad_norm": 0.6118248105049133, "learning_rate": 6.841121495327103e-06, "loss": 0.5018, "step": 366 }, { "epoch": 0.20606400898371702, "grad_norm": 0.5792096853256226, "learning_rate": 6.85981308411215e-06, "loss": 0.5202, "step": 367 }, { "epoch": 0.20662549129702415, "grad_norm": 0.5560618042945862, "learning_rate": 6.878504672897197e-06, "loss": 0.4637, "step": 368 }, { "epoch": 0.20718697361033128, "grad_norm": 0.5785664916038513, "learning_rate": 6.897196261682244e-06, "loss": 0.5043, "step": 369 }, { "epoch": 0.20774845592363841, "grad_norm": 0.5500240921974182, "learning_rate": 6.91588785046729e-06, "loss": 0.5191, "step": 370 }, { "epoch": 0.20830993823694555, "grad_norm": 0.5546773672103882, "learning_rate": 6.934579439252337e-06, "loss": 0.5039, "step": 371 }, { "epoch": 0.20887142055025268, "grad_norm": 0.5626396536827087, "learning_rate": 6.953271028037384e-06, "loss": 0.5045, "step": 372 }, { "epoch": 0.2094329028635598, "grad_norm": 0.5397660732269287, "learning_rate": 6.971962616822431e-06, "loss": 0.4802, "step": 373 }, { "epoch": 0.20999438517686692, "grad_norm": 0.6032595634460449, "learning_rate": 6.990654205607478e-06, "loss": 0.4989, "step": 374 }, { "epoch": 0.21055586749017405, "grad_norm": 0.5694195032119751, "learning_rate": 7.009345794392523e-06, "loss": 0.4703, "step": 375 }, { "epoch": 0.21111734980348118, "grad_norm": 0.6331584453582764, "learning_rate": 7.02803738317757e-06, "loss": 0.5204, "step": 376 }, { "epoch": 0.2116788321167883, "grad_norm": 0.5436736941337585, "learning_rate": 7.046728971962617e-06, "loss": 0.489, "step": 377 }, { "epoch": 0.21224031443009544, "grad_norm": 0.6157930493354797, "learning_rate": 7.065420560747664e-06, "loss": 0.4868, "step": 378 }, { "epoch": 0.21280179674340258, "grad_norm": 0.5815862417221069, "learning_rate": 7.0841121495327104e-06, "loss": 0.4853, "step": 379 }, { "epoch": 0.2133632790567097, "grad_norm": 0.6115589141845703, "learning_rate": 7.1028037383177574e-06, "loss": 0.4975, "step": 380 }, { "epoch": 0.21392476137001684, "grad_norm": 0.6007776260375977, "learning_rate": 7.121495327102804e-06, "loss": 0.5404, "step": 381 }, { "epoch": 0.21448624368332397, "grad_norm": 0.6242786049842834, "learning_rate": 7.140186915887851e-06, "loss": 0.4932, "step": 382 }, { "epoch": 0.2150477259966311, "grad_norm": 0.54986572265625, "learning_rate": 7.158878504672898e-06, "loss": 0.4858, "step": 383 }, { "epoch": 0.21560920830993824, "grad_norm": 0.6455982327461243, "learning_rate": 7.177570093457944e-06, "loss": 0.5187, "step": 384 }, { "epoch": 0.21617069062324537, "grad_norm": 0.5220648646354675, "learning_rate": 7.196261682242991e-06, "loss": 0.4939, "step": 385 }, { "epoch": 0.2167321729365525, "grad_norm": 0.6050408482551575, "learning_rate": 7.214953271028038e-06, "loss": 0.5066, "step": 386 }, { "epoch": 0.21729365524985964, "grad_norm": 0.5385282039642334, "learning_rate": 7.2336448598130846e-06, "loss": 0.4729, "step": 387 }, { "epoch": 0.21785513756316677, "grad_norm": 0.5264241695404053, "learning_rate": 7.2523364485981315e-06, "loss": 0.5122, "step": 388 }, { "epoch": 0.2184166198764739, "grad_norm": 0.545654296875, "learning_rate": 7.271028037383178e-06, "loss": 0.4731, "step": 389 }, { "epoch": 0.21897810218978103, "grad_norm": 0.6230910420417786, "learning_rate": 7.289719626168225e-06, "loss": 0.4821, "step": 390 }, { "epoch": 0.21953958450308816, "grad_norm": 0.5311231017112732, "learning_rate": 7.308411214953272e-06, "loss": 0.4784, "step": 391 }, { "epoch": 0.2201010668163953, "grad_norm": 0.53107750415802, "learning_rate": 7.327102803738319e-06, "loss": 0.4798, "step": 392 }, { "epoch": 0.2206625491297024, "grad_norm": 0.5852460265159607, "learning_rate": 7.345794392523366e-06, "loss": 0.4825, "step": 393 }, { "epoch": 0.22122403144300953, "grad_norm": 0.6133987903594971, "learning_rate": 7.364485981308412e-06, "loss": 0.5582, "step": 394 }, { "epoch": 0.22178551375631667, "grad_norm": 0.5947113037109375, "learning_rate": 7.383177570093458e-06, "loss": 0.5103, "step": 395 }, { "epoch": 0.2223469960696238, "grad_norm": 0.595399022102356, "learning_rate": 7.401869158878505e-06, "loss": 0.5236, "step": 396 }, { "epoch": 0.22290847838293093, "grad_norm": 0.6083643436431885, "learning_rate": 7.420560747663552e-06, "loss": 0.522, "step": 397 }, { "epoch": 0.22346996069623806, "grad_norm": 0.5888304710388184, "learning_rate": 7.439252336448599e-06, "loss": 0.4974, "step": 398 }, { "epoch": 0.2240314430095452, "grad_norm": 0.5949013829231262, "learning_rate": 7.457943925233645e-06, "loss": 0.5013, "step": 399 }, { "epoch": 0.22459292532285233, "grad_norm": 0.5905395746231079, "learning_rate": 7.476635514018692e-06, "loss": 0.4971, "step": 400 }, { "epoch": 0.22515440763615946, "grad_norm": 0.5874047875404358, "learning_rate": 7.495327102803739e-06, "loss": 0.519, "step": 401 }, { "epoch": 0.2257158899494666, "grad_norm": 0.5851378440856934, "learning_rate": 7.514018691588786e-06, "loss": 0.4858, "step": 402 }, { "epoch": 0.22627737226277372, "grad_norm": 0.6082876920700073, "learning_rate": 7.532710280373833e-06, "loss": 0.4767, "step": 403 }, { "epoch": 0.22683885457608086, "grad_norm": 0.5507972836494446, "learning_rate": 7.551401869158879e-06, "loss": 0.4642, "step": 404 }, { "epoch": 0.227400336889388, "grad_norm": 0.6616259813308716, "learning_rate": 7.570093457943926e-06, "loss": 0.5067, "step": 405 }, { "epoch": 0.22796181920269512, "grad_norm": 0.7018536925315857, "learning_rate": 7.588785046728973e-06, "loss": 0.5158, "step": 406 }, { "epoch": 0.22852330151600225, "grad_norm": 0.5942382216453552, "learning_rate": 7.60747663551402e-06, "loss": 0.4947, "step": 407 }, { "epoch": 0.22908478382930939, "grad_norm": 0.812664806842804, "learning_rate": 7.626168224299067e-06, "loss": 0.5114, "step": 408 }, { "epoch": 0.22964626614261652, "grad_norm": 0.6762606501579285, "learning_rate": 7.644859813084112e-06, "loss": 0.5106, "step": 409 }, { "epoch": 0.23020774845592365, "grad_norm": 0.6451171636581421, "learning_rate": 7.663551401869159e-06, "loss": 0.4707, "step": 410 }, { "epoch": 0.23076923076923078, "grad_norm": 0.6104170680046082, "learning_rate": 7.682242990654206e-06, "loss": 0.4674, "step": 411 }, { "epoch": 0.2313307130825379, "grad_norm": 0.6288776993751526, "learning_rate": 7.700934579439253e-06, "loss": 0.5022, "step": 412 }, { "epoch": 0.23189219539584502, "grad_norm": 0.5968853235244751, "learning_rate": 7.7196261682243e-06, "loss": 0.4997, "step": 413 }, { "epoch": 0.23245367770915215, "grad_norm": 0.4788106083869934, "learning_rate": 7.738317757009345e-06, "loss": 0.5024, "step": 414 }, { "epoch": 0.23301516002245928, "grad_norm": 0.5784845352172852, "learning_rate": 7.757009345794392e-06, "loss": 0.494, "step": 415 }, { "epoch": 0.23357664233576642, "grad_norm": 0.6584279537200928, "learning_rate": 7.77570093457944e-06, "loss": 0.5529, "step": 416 }, { "epoch": 0.23413812464907355, "grad_norm": 0.6224302053451538, "learning_rate": 7.794392523364486e-06, "loss": 0.5258, "step": 417 }, { "epoch": 0.23469960696238068, "grad_norm": 0.5566938519477844, "learning_rate": 7.813084112149533e-06, "loss": 0.5162, "step": 418 }, { "epoch": 0.2352610892756878, "grad_norm": 0.580614447593689, "learning_rate": 7.83177570093458e-06, "loss": 0.4656, "step": 419 }, { "epoch": 0.23582257158899494, "grad_norm": 0.6026407480239868, "learning_rate": 7.850467289719627e-06, "loss": 0.4798, "step": 420 }, { "epoch": 0.23638405390230208, "grad_norm": 0.6312263011932373, "learning_rate": 7.869158878504674e-06, "loss": 0.5294, "step": 421 }, { "epoch": 0.2369455362156092, "grad_norm": 0.558363139629364, "learning_rate": 7.887850467289721e-06, "loss": 0.4945, "step": 422 }, { "epoch": 0.23750701852891634, "grad_norm": 0.535563588142395, "learning_rate": 7.906542056074768e-06, "loss": 0.4864, "step": 423 }, { "epoch": 0.23806850084222347, "grad_norm": 0.5808517336845398, "learning_rate": 7.925233644859813e-06, "loss": 0.4911, "step": 424 }, { "epoch": 0.2386299831555306, "grad_norm": 0.5400141477584839, "learning_rate": 7.94392523364486e-06, "loss": 0.4962, "step": 425 }, { "epoch": 0.23919146546883774, "grad_norm": 0.5512329339981079, "learning_rate": 7.962616822429907e-06, "loss": 0.4792, "step": 426 }, { "epoch": 0.23975294778214487, "grad_norm": 0.5886077284812927, "learning_rate": 7.981308411214954e-06, "loss": 0.5463, "step": 427 }, { "epoch": 0.240314430095452, "grad_norm": 0.5762323141098022, "learning_rate": 8.000000000000001e-06, "loss": 0.4816, "step": 428 }, { "epoch": 0.24087591240875914, "grad_norm": 0.6191070675849915, "learning_rate": 8.018691588785047e-06, "loss": 0.4848, "step": 429 }, { "epoch": 0.24143739472206627, "grad_norm": 0.6425450444221497, "learning_rate": 8.037383177570094e-06, "loss": 0.5061, "step": 430 }, { "epoch": 0.24199887703537337, "grad_norm": 0.6068159341812134, "learning_rate": 8.05607476635514e-06, "loss": 0.5322, "step": 431 }, { "epoch": 0.2425603593486805, "grad_norm": 0.5641397833824158, "learning_rate": 8.074766355140188e-06, "loss": 0.4746, "step": 432 }, { "epoch": 0.24312184166198764, "grad_norm": 0.6732816100120544, "learning_rate": 8.093457943925235e-06, "loss": 0.4572, "step": 433 }, { "epoch": 0.24368332397529477, "grad_norm": 0.5947111248970032, "learning_rate": 8.11214953271028e-06, "loss": 0.474, "step": 434 }, { "epoch": 0.2442448062886019, "grad_norm": 0.5817164778709412, "learning_rate": 8.130841121495327e-06, "loss": 0.4757, "step": 435 }, { "epoch": 0.24480628860190903, "grad_norm": 0.677460253238678, "learning_rate": 8.149532710280374e-06, "loss": 0.4919, "step": 436 }, { "epoch": 0.24536777091521617, "grad_norm": 0.6698125600814819, "learning_rate": 8.16822429906542e-06, "loss": 0.508, "step": 437 }, { "epoch": 0.2459292532285233, "grad_norm": 0.5366109609603882, "learning_rate": 8.186915887850468e-06, "loss": 0.4327, "step": 438 }, { "epoch": 0.24649073554183043, "grad_norm": 0.5972552299499512, "learning_rate": 8.205607476635515e-06, "loss": 0.5137, "step": 439 }, { "epoch": 0.24705221785513756, "grad_norm": 0.6460276246070862, "learning_rate": 8.224299065420562e-06, "loss": 0.4849, "step": 440 }, { "epoch": 0.2476137001684447, "grad_norm": 0.5455697774887085, "learning_rate": 8.242990654205609e-06, "loss": 0.5233, "step": 441 }, { "epoch": 0.24817518248175183, "grad_norm": 0.5630013346672058, "learning_rate": 8.261682242990656e-06, "loss": 0.4712, "step": 442 }, { "epoch": 0.24873666479505896, "grad_norm": 0.7004854679107666, "learning_rate": 8.280373831775703e-06, "loss": 0.5004, "step": 443 }, { "epoch": 0.2492981471083661, "grad_norm": 0.5845527052879333, "learning_rate": 8.299065420560748e-06, "loss": 0.5077, "step": 444 }, { "epoch": 0.24985962942167322, "grad_norm": 0.6031231880187988, "learning_rate": 8.317757009345795e-06, "loss": 0.4757, "step": 445 }, { "epoch": 0.25042111173498033, "grad_norm": 0.5969230532646179, "learning_rate": 8.336448598130842e-06, "loss": 0.5065, "step": 446 }, { "epoch": 0.2509825940482875, "grad_norm": 0.5948389768600464, "learning_rate": 8.355140186915889e-06, "loss": 0.4806, "step": 447 }, { "epoch": 0.2515440763615946, "grad_norm": 0.5883708000183105, "learning_rate": 8.373831775700936e-06, "loss": 0.4954, "step": 448 }, { "epoch": 0.25210555867490175, "grad_norm": 0.6650047302246094, "learning_rate": 8.392523364485981e-06, "loss": 0.4997, "step": 449 }, { "epoch": 0.25266704098820886, "grad_norm": 0.6043297648429871, "learning_rate": 8.411214953271028e-06, "loss": 0.4968, "step": 450 }, { "epoch": 0.253228523301516, "grad_norm": 0.5786547064781189, "learning_rate": 8.429906542056075e-06, "loss": 0.49, "step": 451 }, { "epoch": 0.2537900056148231, "grad_norm": 0.6476893424987793, "learning_rate": 8.448598130841122e-06, "loss": 0.5073, "step": 452 }, { "epoch": 0.2543514879281303, "grad_norm": 0.5443849563598633, "learning_rate": 8.467289719626169e-06, "loss": 0.4502, "step": 453 }, { "epoch": 0.2549129702414374, "grad_norm": 0.6505762338638306, "learning_rate": 8.485981308411216e-06, "loss": 0.5137, "step": 454 }, { "epoch": 0.25547445255474455, "grad_norm": 0.6667113900184631, "learning_rate": 8.504672897196263e-06, "loss": 0.4915, "step": 455 }, { "epoch": 0.25603593486805165, "grad_norm": 0.6364516019821167, "learning_rate": 8.52336448598131e-06, "loss": 0.4977, "step": 456 }, { "epoch": 0.2565974171813588, "grad_norm": 0.6998821496963501, "learning_rate": 8.542056074766357e-06, "loss": 0.4977, "step": 457 }, { "epoch": 0.2571588994946659, "grad_norm": 0.8241758942604065, "learning_rate": 8.560747663551404e-06, "loss": 0.4987, "step": 458 }, { "epoch": 0.2577203818079731, "grad_norm": 0.5717706680297852, "learning_rate": 8.57943925233645e-06, "loss": 0.46, "step": 459 }, { "epoch": 0.2582818641212802, "grad_norm": 0.6626983284950256, "learning_rate": 8.598130841121496e-06, "loss": 0.5091, "step": 460 }, { "epoch": 0.2588433464345873, "grad_norm": 0.7759358882904053, "learning_rate": 8.616822429906543e-06, "loss": 0.4867, "step": 461 }, { "epoch": 0.25940482874789444, "grad_norm": 0.562786340713501, "learning_rate": 8.63551401869159e-06, "loss": 0.4941, "step": 462 }, { "epoch": 0.25996631106120155, "grad_norm": 0.6443027257919312, "learning_rate": 8.654205607476637e-06, "loss": 0.4932, "step": 463 }, { "epoch": 0.2605277933745087, "grad_norm": 0.5393490791320801, "learning_rate": 8.672897196261682e-06, "loss": 0.4673, "step": 464 }, { "epoch": 0.2610892756878158, "grad_norm": 0.6316348910331726, "learning_rate": 8.69158878504673e-06, "loss": 0.4774, "step": 465 }, { "epoch": 0.261650758001123, "grad_norm": 0.6481581330299377, "learning_rate": 8.710280373831776e-06, "loss": 0.5051, "step": 466 }, { "epoch": 0.2622122403144301, "grad_norm": 0.5675401091575623, "learning_rate": 8.728971962616823e-06, "loss": 0.5017, "step": 467 }, { "epoch": 0.26277372262773724, "grad_norm": 0.6409372091293335, "learning_rate": 8.74766355140187e-06, "loss": 0.4905, "step": 468 }, { "epoch": 0.26333520494104434, "grad_norm": 0.6161016225814819, "learning_rate": 8.766355140186916e-06, "loss": 0.5057, "step": 469 }, { "epoch": 0.2638966872543515, "grad_norm": 0.5478185415267944, "learning_rate": 8.785046728971963e-06, "loss": 0.494, "step": 470 }, { "epoch": 0.2644581695676586, "grad_norm": 0.6255790591239929, "learning_rate": 8.80373831775701e-06, "loss": 0.5215, "step": 471 }, { "epoch": 0.26501965188096577, "grad_norm": 0.5488994121551514, "learning_rate": 8.822429906542056e-06, "loss": 0.4733, "step": 472 }, { "epoch": 0.26558113419427287, "grad_norm": 0.6423689723014832, "learning_rate": 8.841121495327103e-06, "loss": 0.4694, "step": 473 }, { "epoch": 0.26614261650758003, "grad_norm": 0.6147846579551697, "learning_rate": 8.85981308411215e-06, "loss": 0.518, "step": 474 }, { "epoch": 0.26670409882088714, "grad_norm": 0.6425021290779114, "learning_rate": 8.878504672897197e-06, "loss": 0.4836, "step": 475 }, { "epoch": 0.2672655811341943, "grad_norm": 0.5844836831092834, "learning_rate": 8.897196261682244e-06, "loss": 0.4793, "step": 476 }, { "epoch": 0.2678270634475014, "grad_norm": 0.5992730855941772, "learning_rate": 8.915887850467291e-06, "loss": 0.4975, "step": 477 }, { "epoch": 0.26838854576080856, "grad_norm": 0.6294432282447815, "learning_rate": 8.934579439252338e-06, "loss": 0.5102, "step": 478 }, { "epoch": 0.26895002807411567, "grad_norm": 0.5589650869369507, "learning_rate": 8.953271028037384e-06, "loss": 0.4975, "step": 479 }, { "epoch": 0.26951151038742277, "grad_norm": 0.5278250575065613, "learning_rate": 8.97196261682243e-06, "loss": 0.4694, "step": 480 }, { "epoch": 0.27007299270072993, "grad_norm": 0.6472972631454468, "learning_rate": 8.990654205607478e-06, "loss": 0.4936, "step": 481 }, { "epoch": 0.27063447501403703, "grad_norm": 0.5857380628585815, "learning_rate": 9.009345794392525e-06, "loss": 0.4913, "step": 482 }, { "epoch": 0.2711959573273442, "grad_norm": 0.5924478769302368, "learning_rate": 9.02803738317757e-06, "loss": 0.5107, "step": 483 }, { "epoch": 0.2717574396406513, "grad_norm": 0.5974671840667725, "learning_rate": 9.046728971962617e-06, "loss": 0.4735, "step": 484 }, { "epoch": 0.27231892195395846, "grad_norm": 0.6146833896636963, "learning_rate": 9.065420560747664e-06, "loss": 0.4826, "step": 485 }, { "epoch": 0.27288040426726556, "grad_norm": 0.6232050657272339, "learning_rate": 9.08411214953271e-06, "loss": 0.4733, "step": 486 }, { "epoch": 0.2734418865805727, "grad_norm": 0.6057919263839722, "learning_rate": 9.102803738317758e-06, "loss": 0.5013, "step": 487 }, { "epoch": 0.27400336889387983, "grad_norm": 0.5516281127929688, "learning_rate": 9.121495327102805e-06, "loss": 0.4617, "step": 488 }, { "epoch": 0.274564851207187, "grad_norm": 0.636155366897583, "learning_rate": 9.140186915887852e-06, "loss": 0.488, "step": 489 }, { "epoch": 0.2751263335204941, "grad_norm": 0.6760314106941223, "learning_rate": 9.158878504672899e-06, "loss": 0.4874, "step": 490 }, { "epoch": 0.27568781583380125, "grad_norm": 0.5903568863868713, "learning_rate": 9.177570093457944e-06, "loss": 0.4814, "step": 491 }, { "epoch": 0.27624929814710836, "grad_norm": 0.6354197263717651, "learning_rate": 9.196261682242991e-06, "loss": 0.4708, "step": 492 }, { "epoch": 0.2768107804604155, "grad_norm": 0.6159335374832153, "learning_rate": 9.214953271028038e-06, "loss": 0.4717, "step": 493 }, { "epoch": 0.2773722627737226, "grad_norm": 0.7262251973152161, "learning_rate": 9.233644859813085e-06, "loss": 0.5007, "step": 494 }, { "epoch": 0.2779337450870298, "grad_norm": 0.5721843838691711, "learning_rate": 9.252336448598132e-06, "loss": 0.5004, "step": 495 }, { "epoch": 0.2784952274003369, "grad_norm": 0.6459770202636719, "learning_rate": 9.271028037383179e-06, "loss": 0.52, "step": 496 }, { "epoch": 0.27905670971364405, "grad_norm": 0.6006195545196533, "learning_rate": 9.289719626168226e-06, "loss": 0.4652, "step": 497 }, { "epoch": 0.27961819202695115, "grad_norm": 0.6801638007164001, "learning_rate": 9.308411214953271e-06, "loss": 0.4696, "step": 498 }, { "epoch": 0.28017967434025826, "grad_norm": 0.5677798986434937, "learning_rate": 9.327102803738318e-06, "loss": 0.4907, "step": 499 }, { "epoch": 0.2807411566535654, "grad_norm": 0.6845209002494812, "learning_rate": 9.345794392523365e-06, "loss": 0.5608, "step": 500 }, { "epoch": 0.2813026389668725, "grad_norm": 0.6536325812339783, "learning_rate": 9.364485981308412e-06, "loss": 0.4971, "step": 501 }, { "epoch": 0.2818641212801797, "grad_norm": 0.7101975083351135, "learning_rate": 9.383177570093459e-06, "loss": 0.508, "step": 502 }, { "epoch": 0.2824256035934868, "grad_norm": 0.5692508816719055, "learning_rate": 9.401869158878504e-06, "loss": 0.4944, "step": 503 }, { "epoch": 0.28298708590679394, "grad_norm": 0.8482427000999451, "learning_rate": 9.420560747663551e-06, "loss": 0.5101, "step": 504 }, { "epoch": 0.28354856822010105, "grad_norm": 0.5255241394042969, "learning_rate": 9.439252336448598e-06, "loss": 0.5027, "step": 505 }, { "epoch": 0.2841100505334082, "grad_norm": 0.503354549407959, "learning_rate": 9.457943925233645e-06, "loss": 0.4719, "step": 506 }, { "epoch": 0.2846715328467153, "grad_norm": 0.6531822681427002, "learning_rate": 9.476635514018692e-06, "loss": 0.4491, "step": 507 }, { "epoch": 0.2852330151600225, "grad_norm": 0.616445779800415, "learning_rate": 9.49532710280374e-06, "loss": 0.472, "step": 508 }, { "epoch": 0.2857944974733296, "grad_norm": 0.5447178483009338, "learning_rate": 9.514018691588786e-06, "loss": 0.4922, "step": 509 }, { "epoch": 0.28635597978663674, "grad_norm": 0.6563045978546143, "learning_rate": 9.532710280373833e-06, "loss": 0.5044, "step": 510 }, { "epoch": 0.28691746209994384, "grad_norm": 0.6533622741699219, "learning_rate": 9.55140186915888e-06, "loss": 0.4813, "step": 511 }, { "epoch": 0.287478944413251, "grad_norm": 0.5659510493278503, "learning_rate": 9.570093457943927e-06, "loss": 0.4795, "step": 512 }, { "epoch": 0.2880404267265581, "grad_norm": 0.6325262188911438, "learning_rate": 9.588785046728972e-06, "loss": 0.4823, "step": 513 }, { "epoch": 0.28860190903986527, "grad_norm": 0.662864089012146, "learning_rate": 9.60747663551402e-06, "loss": 0.5154, "step": 514 }, { "epoch": 0.28916339135317237, "grad_norm": 0.5904008746147156, "learning_rate": 9.626168224299066e-06, "loss": 0.5109, "step": 515 }, { "epoch": 0.28972487366647953, "grad_norm": 0.6146894693374634, "learning_rate": 9.644859813084113e-06, "loss": 0.5094, "step": 516 }, { "epoch": 0.29028635597978664, "grad_norm": 0.5517740845680237, "learning_rate": 9.66355140186916e-06, "loss": 0.513, "step": 517 }, { "epoch": 0.29084783829309374, "grad_norm": 0.6402063369750977, "learning_rate": 9.682242990654206e-06, "loss": 0.4738, "step": 518 }, { "epoch": 0.2914093206064009, "grad_norm": 0.5052649974822998, "learning_rate": 9.700934579439253e-06, "loss": 0.4421, "step": 519 }, { "epoch": 0.291970802919708, "grad_norm": 0.5606950521469116, "learning_rate": 9.7196261682243e-06, "loss": 0.4737, "step": 520 }, { "epoch": 0.29253228523301517, "grad_norm": 0.541986882686615, "learning_rate": 9.738317757009347e-06, "loss": 0.438, "step": 521 }, { "epoch": 0.29309376754632227, "grad_norm": 0.6044184565544128, "learning_rate": 9.757009345794393e-06, "loss": 0.483, "step": 522 }, { "epoch": 0.29365524985962943, "grad_norm": 0.5376028418540955, "learning_rate": 9.775700934579439e-06, "loss": 0.5027, "step": 523 }, { "epoch": 0.29421673217293653, "grad_norm": 0.7020241618156433, "learning_rate": 9.794392523364486e-06, "loss": 0.4955, "step": 524 }, { "epoch": 0.2947782144862437, "grad_norm": 0.6261412501335144, "learning_rate": 9.813084112149533e-06, "loss": 0.4845, "step": 525 }, { "epoch": 0.2953396967995508, "grad_norm": 0.5997791886329651, "learning_rate": 9.83177570093458e-06, "loss": 0.4565, "step": 526 }, { "epoch": 0.29590117911285796, "grad_norm": 0.6261727213859558, "learning_rate": 9.850467289719627e-06, "loss": 0.4828, "step": 527 }, { "epoch": 0.29646266142616506, "grad_norm": 0.6748504638671875, "learning_rate": 9.869158878504674e-06, "loss": 0.4512, "step": 528 }, { "epoch": 0.2970241437394722, "grad_norm": 0.7056427597999573, "learning_rate": 9.88785046728972e-06, "loss": 0.5109, "step": 529 }, { "epoch": 0.29758562605277933, "grad_norm": 0.7199645042419434, "learning_rate": 9.906542056074768e-06, "loss": 0.5183, "step": 530 }, { "epoch": 0.2981471083660865, "grad_norm": 0.7202709913253784, "learning_rate": 9.925233644859815e-06, "loss": 0.5061, "step": 531 }, { "epoch": 0.2987085906793936, "grad_norm": 0.6551154255867004, "learning_rate": 9.943925233644862e-06, "loss": 0.4645, "step": 532 }, { "epoch": 0.29927007299270075, "grad_norm": 0.6740177869796753, "learning_rate": 9.962616822429907e-06, "loss": 0.4854, "step": 533 }, { "epoch": 0.29983155530600786, "grad_norm": 0.672265887260437, "learning_rate": 9.981308411214954e-06, "loss": 0.4848, "step": 534 }, { "epoch": 0.300393037619315, "grad_norm": 0.7126233577728271, "learning_rate": 1e-05, "loss": 0.4967, "step": 535 }, { "epoch": 0.3009545199326221, "grad_norm": 0.6888909339904785, "learning_rate": 9.999998932640253e-06, "loss": 0.488, "step": 536 }, { "epoch": 0.3015160022459292, "grad_norm": 0.7566066980361938, "learning_rate": 9.999995730561465e-06, "loss": 0.4895, "step": 537 }, { "epoch": 0.3020774845592364, "grad_norm": 0.7075061202049255, "learning_rate": 9.999990393765007e-06, "loss": 0.5032, "step": 538 }, { "epoch": 0.3026389668725435, "grad_norm": 0.6433613300323486, "learning_rate": 9.999982922253152e-06, "loss": 0.5097, "step": 539 }, { "epoch": 0.30320044918585065, "grad_norm": 0.6404690742492676, "learning_rate": 9.999973316029094e-06, "loss": 0.4424, "step": 540 }, { "epoch": 0.30376193149915776, "grad_norm": 0.726622998714447, "learning_rate": 9.999961575096935e-06, "loss": 0.467, "step": 541 }, { "epoch": 0.3043234138124649, "grad_norm": 0.5744560956954956, "learning_rate": 9.999947699461686e-06, "loss": 0.4776, "step": 542 }, { "epoch": 0.304884896125772, "grad_norm": 0.6869432926177979, "learning_rate": 9.999931689129269e-06, "loss": 0.5112, "step": 543 }, { "epoch": 0.3054463784390792, "grad_norm": 0.5740162134170532, "learning_rate": 9.999913544106523e-06, "loss": 0.4636, "step": 544 }, { "epoch": 0.3060078607523863, "grad_norm": 0.5996674299240112, "learning_rate": 9.999893264401192e-06, "loss": 0.5075, "step": 545 }, { "epoch": 0.30656934306569344, "grad_norm": 0.6099842190742493, "learning_rate": 9.999870850021937e-06, "loss": 0.4888, "step": 546 }, { "epoch": 0.30713082537900055, "grad_norm": 0.6563755869865417, "learning_rate": 9.999846300978327e-06, "loss": 0.4845, "step": 547 }, { "epoch": 0.3076923076923077, "grad_norm": 0.590053141117096, "learning_rate": 9.999819617280843e-06, "loss": 0.4858, "step": 548 }, { "epoch": 0.3082537900056148, "grad_norm": 0.5961744785308838, "learning_rate": 9.999790798940876e-06, "loss": 0.4977, "step": 549 }, { "epoch": 0.308815272318922, "grad_norm": 0.6326956748962402, "learning_rate": 9.99975984597073e-06, "loss": 0.4816, "step": 550 }, { "epoch": 0.3093767546322291, "grad_norm": 0.6592427492141724, "learning_rate": 9.999726758383624e-06, "loss": 0.4971, "step": 551 }, { "epoch": 0.30993823694553624, "grad_norm": 0.6441349983215332, "learning_rate": 9.999691536193679e-06, "loss": 0.4798, "step": 552 }, { "epoch": 0.31049971925884334, "grad_norm": 0.5824711322784424, "learning_rate": 9.999654179415936e-06, "loss": 0.4687, "step": 553 }, { "epoch": 0.3110612015721505, "grad_norm": 0.6174152493476868, "learning_rate": 9.999614688066345e-06, "loss": 0.5063, "step": 554 }, { "epoch": 0.3116226838854576, "grad_norm": 0.5334718823432922, "learning_rate": 9.999573062161765e-06, "loss": 0.4696, "step": 555 }, { "epoch": 0.3121841661987647, "grad_norm": 0.614521861076355, "learning_rate": 9.999529301719967e-06, "loss": 0.4624, "step": 556 }, { "epoch": 0.31274564851207187, "grad_norm": 0.6167088150978088, "learning_rate": 9.999483406759636e-06, "loss": 0.5255, "step": 557 }, { "epoch": 0.313307130825379, "grad_norm": 0.6591715216636658, "learning_rate": 9.999435377300366e-06, "loss": 0.4758, "step": 558 }, { "epoch": 0.31386861313868614, "grad_norm": 0.5817695260047913, "learning_rate": 9.999385213362663e-06, "loss": 0.4919, "step": 559 }, { "epoch": 0.31443009545199324, "grad_norm": 0.696372926235199, "learning_rate": 9.999332914967946e-06, "loss": 0.4684, "step": 560 }, { "epoch": 0.3149915777653004, "grad_norm": 0.6518309712409973, "learning_rate": 9.999278482138539e-06, "loss": 0.4803, "step": 561 }, { "epoch": 0.3155530600786075, "grad_norm": 0.6153174042701721, "learning_rate": 9.999221914897686e-06, "loss": 0.4321, "step": 562 }, { "epoch": 0.31611454239191467, "grad_norm": 0.6802576184272766, "learning_rate": 9.999163213269536e-06, "loss": 0.4687, "step": 563 }, { "epoch": 0.31667602470522177, "grad_norm": 0.6208354234695435, "learning_rate": 9.99910237727915e-06, "loss": 0.5234, "step": 564 }, { "epoch": 0.31723750701852893, "grad_norm": 0.540846049785614, "learning_rate": 9.999039406952506e-06, "loss": 0.4815, "step": 565 }, { "epoch": 0.31779898933183603, "grad_norm": 0.7106029987335205, "learning_rate": 9.998974302316485e-06, "loss": 0.4567, "step": 566 }, { "epoch": 0.3183604716451432, "grad_norm": 0.720335841178894, "learning_rate": 9.998907063398884e-06, "loss": 0.5088, "step": 567 }, { "epoch": 0.3189219539584503, "grad_norm": 0.6540274024009705, "learning_rate": 9.998837690228412e-06, "loss": 0.4964, "step": 568 }, { "epoch": 0.31948343627175746, "grad_norm": 0.6151735186576843, "learning_rate": 9.998766182834683e-06, "loss": 0.4727, "step": 569 }, { "epoch": 0.32004491858506456, "grad_norm": 0.6639697551727295, "learning_rate": 9.99869254124823e-06, "loss": 0.4942, "step": 570 }, { "epoch": 0.3206064008983717, "grad_norm": 0.6260003447532654, "learning_rate": 9.998616765500495e-06, "loss": 0.5069, "step": 571 }, { "epoch": 0.32116788321167883, "grad_norm": 0.5463575720787048, "learning_rate": 9.998538855623828e-06, "loss": 0.484, "step": 572 }, { "epoch": 0.321729365524986, "grad_norm": 0.6952374577522278, "learning_rate": 9.998458811651491e-06, "loss": 0.5208, "step": 573 }, { "epoch": 0.3222908478382931, "grad_norm": 0.6541756391525269, "learning_rate": 9.998376633617662e-06, "loss": 0.495, "step": 574 }, { "epoch": 0.3228523301516002, "grad_norm": 0.6143467426300049, "learning_rate": 9.99829232155742e-06, "loss": 0.5036, "step": 575 }, { "epoch": 0.32341381246490736, "grad_norm": 0.634221613407135, "learning_rate": 9.99820587550677e-06, "loss": 0.5371, "step": 576 }, { "epoch": 0.32397529477821446, "grad_norm": 0.6687210202217102, "learning_rate": 9.998117295502612e-06, "loss": 0.4553, "step": 577 }, { "epoch": 0.3245367770915216, "grad_norm": 0.659206211566925, "learning_rate": 9.99802658158277e-06, "loss": 0.4582, "step": 578 }, { "epoch": 0.3250982594048287, "grad_norm": 0.613252580165863, "learning_rate": 9.997933733785972e-06, "loss": 0.5029, "step": 579 }, { "epoch": 0.3256597417181359, "grad_norm": 0.6920384764671326, "learning_rate": 9.997838752151859e-06, "loss": 0.4825, "step": 580 }, { "epoch": 0.326221224031443, "grad_norm": 0.599012017250061, "learning_rate": 9.997741636720978e-06, "loss": 0.4935, "step": 581 }, { "epoch": 0.32678270634475015, "grad_norm": 0.617759644985199, "learning_rate": 9.997642387534801e-06, "loss": 0.5181, "step": 582 }, { "epoch": 0.32734418865805726, "grad_norm": 0.6980809569358826, "learning_rate": 9.997541004635695e-06, "loss": 0.4394, "step": 583 }, { "epoch": 0.3279056709713644, "grad_norm": 0.6711085438728333, "learning_rate": 9.997437488066948e-06, "loss": 0.4741, "step": 584 }, { "epoch": 0.3284671532846715, "grad_norm": 0.5653877258300781, "learning_rate": 9.997331837872752e-06, "loss": 0.4643, "step": 585 }, { "epoch": 0.3290286355979787, "grad_norm": 0.6528114676475525, "learning_rate": 9.99722405409822e-06, "loss": 0.473, "step": 586 }, { "epoch": 0.3295901179112858, "grad_norm": 0.7015301585197449, "learning_rate": 9.997114136789363e-06, "loss": 0.5252, "step": 587 }, { "epoch": 0.33015160022459294, "grad_norm": 0.5674737691879272, "learning_rate": 9.997002085993112e-06, "loss": 0.4847, "step": 588 }, { "epoch": 0.33071308253790005, "grad_norm": 0.5571652054786682, "learning_rate": 9.99688790175731e-06, "loss": 0.4774, "step": 589 }, { "epoch": 0.3312745648512072, "grad_norm": 0.6391467452049255, "learning_rate": 9.996771584130703e-06, "loss": 0.5155, "step": 590 }, { "epoch": 0.3318360471645143, "grad_norm": 0.5601240992546082, "learning_rate": 9.996653133162951e-06, "loss": 0.4721, "step": 591 }, { "epoch": 0.3323975294778215, "grad_norm": 0.6685102581977844, "learning_rate": 9.99653254890463e-06, "loss": 0.4679, "step": 592 }, { "epoch": 0.3329590117911286, "grad_norm": 0.5798707008361816, "learning_rate": 9.996409831407222e-06, "loss": 0.5046, "step": 593 }, { "epoch": 0.3335204941044357, "grad_norm": 0.6773939728736877, "learning_rate": 9.996284980723116e-06, "loss": 0.4833, "step": 594 }, { "epoch": 0.33408197641774284, "grad_norm": 0.6108511090278625, "learning_rate": 9.996157996905622e-06, "loss": 0.4847, "step": 595 }, { "epoch": 0.33464345873104995, "grad_norm": 0.5785918831825256, "learning_rate": 9.996028880008952e-06, "loss": 0.4863, "step": 596 }, { "epoch": 0.3352049410443571, "grad_norm": 0.7960228323936462, "learning_rate": 9.995897630088232e-06, "loss": 0.5248, "step": 597 }, { "epoch": 0.3357664233576642, "grad_norm": 0.7487167119979858, "learning_rate": 9.995764247199498e-06, "loss": 0.5284, "step": 598 }, { "epoch": 0.33632790567097137, "grad_norm": 0.6396642923355103, "learning_rate": 9.995628731399699e-06, "loss": 0.4864, "step": 599 }, { "epoch": 0.3368893879842785, "grad_norm": 0.8160842657089233, "learning_rate": 9.99549108274669e-06, "loss": 0.4783, "step": 600 }, { "epoch": 0.33745087029758564, "grad_norm": 0.5324838161468506, "learning_rate": 9.995351301299242e-06, "loss": 0.4542, "step": 601 }, { "epoch": 0.33801235261089274, "grad_norm": 0.6088083982467651, "learning_rate": 9.99520938711703e-06, "loss": 0.4655, "step": 602 }, { "epoch": 0.3385738349241999, "grad_norm": 0.6525353789329529, "learning_rate": 9.995065340260648e-06, "loss": 0.5022, "step": 603 }, { "epoch": 0.339135317237507, "grad_norm": 0.6307079195976257, "learning_rate": 9.994919160791592e-06, "loss": 0.483, "step": 604 }, { "epoch": 0.33969679955081417, "grad_norm": 0.6334748864173889, "learning_rate": 9.994770848772274e-06, "loss": 0.4621, "step": 605 }, { "epoch": 0.34025828186412127, "grad_norm": 0.6524969339370728, "learning_rate": 9.994620404266017e-06, "loss": 0.4632, "step": 606 }, { "epoch": 0.34081976417742843, "grad_norm": 0.698352038860321, "learning_rate": 9.994467827337048e-06, "loss": 0.5153, "step": 607 }, { "epoch": 0.34138124649073553, "grad_norm": 0.6269802451133728, "learning_rate": 9.994313118050513e-06, "loss": 0.499, "step": 608 }, { "epoch": 0.3419427288040427, "grad_norm": 0.6260303854942322, "learning_rate": 9.994156276472462e-06, "loss": 0.4877, "step": 609 }, { "epoch": 0.3425042111173498, "grad_norm": 0.5416358113288879, "learning_rate": 9.993997302669857e-06, "loss": 0.4715, "step": 610 }, { "epoch": 0.34306569343065696, "grad_norm": 0.6757084131240845, "learning_rate": 9.993836196710572e-06, "loss": 0.4877, "step": 611 }, { "epoch": 0.34362717574396406, "grad_norm": 0.6194686889648438, "learning_rate": 9.99367295866339e-06, "loss": 0.4908, "step": 612 }, { "epoch": 0.34418865805727117, "grad_norm": 0.48504143953323364, "learning_rate": 9.993507588598004e-06, "loss": 0.4875, "step": 613 }, { "epoch": 0.34475014037057833, "grad_norm": 0.5754215121269226, "learning_rate": 9.99334008658502e-06, "loss": 0.4927, "step": 614 }, { "epoch": 0.34531162268388543, "grad_norm": 0.5979257225990295, "learning_rate": 9.993170452695948e-06, "loss": 0.4797, "step": 615 }, { "epoch": 0.3458731049971926, "grad_norm": 0.5077643394470215, "learning_rate": 9.992998687003214e-06, "loss": 0.5177, "step": 616 }, { "epoch": 0.3464345873104997, "grad_norm": 0.5822879076004028, "learning_rate": 9.992824789580155e-06, "loss": 0.4666, "step": 617 }, { "epoch": 0.34699606962380686, "grad_norm": 0.6227225065231323, "learning_rate": 9.992648760501011e-06, "loss": 0.4985, "step": 618 }, { "epoch": 0.34755755193711396, "grad_norm": 0.570460855960846, "learning_rate": 9.99247059984094e-06, "loss": 0.4656, "step": 619 }, { "epoch": 0.3481190342504211, "grad_norm": 0.5625332593917847, "learning_rate": 9.992290307676004e-06, "loss": 0.4764, "step": 620 }, { "epoch": 0.3486805165637282, "grad_norm": 0.7892563939094543, "learning_rate": 9.99210788408318e-06, "loss": 0.4854, "step": 621 }, { "epoch": 0.3492419988770354, "grad_norm": 0.644889235496521, "learning_rate": 9.991923329140352e-06, "loss": 0.4638, "step": 622 }, { "epoch": 0.3498034811903425, "grad_norm": 0.5973142385482788, "learning_rate": 9.991736642926311e-06, "loss": 0.4605, "step": 623 }, { "epoch": 0.35036496350364965, "grad_norm": 0.7218355536460876, "learning_rate": 9.991547825520768e-06, "loss": 0.5056, "step": 624 }, { "epoch": 0.35092644581695676, "grad_norm": 0.5331411361694336, "learning_rate": 9.991356877004332e-06, "loss": 0.4741, "step": 625 }, { "epoch": 0.3514879281302639, "grad_norm": 0.5863574147224426, "learning_rate": 9.99116379745853e-06, "loss": 0.4692, "step": 626 }, { "epoch": 0.352049410443571, "grad_norm": 0.6889180541038513, "learning_rate": 9.990968586965796e-06, "loss": 0.489, "step": 627 }, { "epoch": 0.3526108927568782, "grad_norm": 0.5508275628089905, "learning_rate": 9.990771245609475e-06, "loss": 0.5046, "step": 628 }, { "epoch": 0.3531723750701853, "grad_norm": 0.5482058525085449, "learning_rate": 9.990571773473816e-06, "loss": 0.4931, "step": 629 }, { "epoch": 0.35373385738349244, "grad_norm": 0.7193758487701416, "learning_rate": 9.990370170643987e-06, "loss": 0.5064, "step": 630 }, { "epoch": 0.35429533969679955, "grad_norm": 0.6425488591194153, "learning_rate": 9.99016643720606e-06, "loss": 0.5047, "step": 631 }, { "epoch": 0.35485682201010665, "grad_norm": 0.6007935404777527, "learning_rate": 9.989960573247016e-06, "loss": 0.4587, "step": 632 }, { "epoch": 0.3554183043234138, "grad_norm": 0.6143595576286316, "learning_rate": 9.98975257885475e-06, "loss": 0.4688, "step": 633 }, { "epoch": 0.3559797866367209, "grad_norm": 0.6141740679740906, "learning_rate": 9.989542454118063e-06, "loss": 0.4989, "step": 634 }, { "epoch": 0.3565412689500281, "grad_norm": 0.6778146624565125, "learning_rate": 9.989330199126666e-06, "loss": 0.5162, "step": 635 }, { "epoch": 0.3571027512633352, "grad_norm": 0.7215615510940552, "learning_rate": 9.989115813971178e-06, "loss": 0.505, "step": 636 }, { "epoch": 0.35766423357664234, "grad_norm": 0.5518330931663513, "learning_rate": 9.988899298743132e-06, "loss": 0.4822, "step": 637 }, { "epoch": 0.35822571588994945, "grad_norm": 0.8082155585289001, "learning_rate": 9.98868065353497e-06, "loss": 0.4653, "step": 638 }, { "epoch": 0.3587871982032566, "grad_norm": 0.5523286461830139, "learning_rate": 9.988459878440038e-06, "loss": 0.4778, "step": 639 }, { "epoch": 0.3593486805165637, "grad_norm": 0.7196412682533264, "learning_rate": 9.988236973552594e-06, "loss": 0.4689, "step": 640 }, { "epoch": 0.35991016282987087, "grad_norm": 0.7045707702636719, "learning_rate": 9.988011938967808e-06, "loss": 0.4987, "step": 641 }, { "epoch": 0.360471645143178, "grad_norm": 0.6602107286453247, "learning_rate": 9.987784774781758e-06, "loss": 0.5001, "step": 642 }, { "epoch": 0.36103312745648514, "grad_norm": 0.74434494972229, "learning_rate": 9.987555481091425e-06, "loss": 0.4882, "step": 643 }, { "epoch": 0.36159460976979224, "grad_norm": 0.6435182690620422, "learning_rate": 9.98732405799471e-06, "loss": 0.507, "step": 644 }, { "epoch": 0.3621560920830994, "grad_norm": 0.5853757858276367, "learning_rate": 9.987090505590416e-06, "loss": 0.4674, "step": 645 }, { "epoch": 0.3627175743964065, "grad_norm": 0.7377463579177856, "learning_rate": 9.986854823978255e-06, "loss": 0.4579, "step": 646 }, { "epoch": 0.36327905670971367, "grad_norm": 0.6361244320869446, "learning_rate": 9.986617013258854e-06, "loss": 0.5123, "step": 647 }, { "epoch": 0.36384053902302077, "grad_norm": 0.6835609078407288, "learning_rate": 9.98637707353374e-06, "loss": 0.4776, "step": 648 }, { "epoch": 0.36440202133632793, "grad_norm": 0.7298732399940491, "learning_rate": 9.986135004905357e-06, "loss": 0.4848, "step": 649 }, { "epoch": 0.36496350364963503, "grad_norm": 0.6481173038482666, "learning_rate": 9.985890807477054e-06, "loss": 0.4974, "step": 650 }, { "epoch": 0.3655249859629422, "grad_norm": 0.6771376132965088, "learning_rate": 9.985644481353091e-06, "loss": 0.4825, "step": 651 }, { "epoch": 0.3660864682762493, "grad_norm": 0.6069446206092834, "learning_rate": 9.985396026638632e-06, "loss": 0.471, "step": 652 }, { "epoch": 0.3666479505895564, "grad_norm": 0.6943634152412415, "learning_rate": 9.985145443439754e-06, "loss": 0.5025, "step": 653 }, { "epoch": 0.36720943290286356, "grad_norm": 0.6880871653556824, "learning_rate": 9.984892731863444e-06, "loss": 0.4965, "step": 654 }, { "epoch": 0.36777091521617067, "grad_norm": 0.7234885096549988, "learning_rate": 9.984637892017597e-06, "loss": 0.5156, "step": 655 }, { "epoch": 0.36833239752947783, "grad_norm": 0.5515395998954773, "learning_rate": 9.984380924011011e-06, "loss": 0.4731, "step": 656 }, { "epoch": 0.36889387984278493, "grad_norm": 0.7107821106910706, "learning_rate": 9.984121827953399e-06, "loss": 0.4824, "step": 657 }, { "epoch": 0.3694553621560921, "grad_norm": 0.570938229560852, "learning_rate": 9.98386060395538e-06, "loss": 0.4482, "step": 658 }, { "epoch": 0.3700168444693992, "grad_norm": 0.6416492462158203, "learning_rate": 9.983597252128483e-06, "loss": 0.4859, "step": 659 }, { "epoch": 0.37057832678270636, "grad_norm": 0.6199230551719666, "learning_rate": 9.983331772585144e-06, "loss": 0.4794, "step": 660 }, { "epoch": 0.37113980909601346, "grad_norm": 0.6251494288444519, "learning_rate": 9.983064165438707e-06, "loss": 0.4872, "step": 661 }, { "epoch": 0.3717012914093206, "grad_norm": 0.6157795786857605, "learning_rate": 9.982794430803428e-06, "loss": 0.4926, "step": 662 }, { "epoch": 0.3722627737226277, "grad_norm": 0.5687664151191711, "learning_rate": 9.982522568794466e-06, "loss": 0.4895, "step": 663 }, { "epoch": 0.3728242560359349, "grad_norm": 0.68374103307724, "learning_rate": 9.98224857952789e-06, "loss": 0.4951, "step": 664 }, { "epoch": 0.373385738349242, "grad_norm": 0.6192730665206909, "learning_rate": 9.981972463120679e-06, "loss": 0.4983, "step": 665 }, { "epoch": 0.37394722066254915, "grad_norm": 0.5433259010314941, "learning_rate": 9.98169421969072e-06, "loss": 0.4811, "step": 666 }, { "epoch": 0.37450870297585626, "grad_norm": 0.6925939321517944, "learning_rate": 9.981413849356809e-06, "loss": 0.5015, "step": 667 }, { "epoch": 0.3750701852891634, "grad_norm": 0.5218213796615601, "learning_rate": 9.981131352238644e-06, "loss": 0.4692, "step": 668 }, { "epoch": 0.3756316676024705, "grad_norm": 0.6678874492645264, "learning_rate": 9.980846728456839e-06, "loss": 0.4865, "step": 669 }, { "epoch": 0.3761931499157777, "grad_norm": 0.6147998571395874, "learning_rate": 9.98055997813291e-06, "loss": 0.4762, "step": 670 }, { "epoch": 0.3767546322290848, "grad_norm": 0.656909704208374, "learning_rate": 9.980271101389287e-06, "loss": 0.4796, "step": 671 }, { "epoch": 0.3773161145423919, "grad_norm": 0.6738446950912476, "learning_rate": 9.9799800983493e-06, "loss": 0.4856, "step": 672 }, { "epoch": 0.37787759685569905, "grad_norm": 0.5343733429908752, "learning_rate": 9.979686969137192e-06, "loss": 0.5079, "step": 673 }, { "epoch": 0.37843907916900615, "grad_norm": 0.6019995808601379, "learning_rate": 9.979391713878114e-06, "loss": 0.4886, "step": 674 }, { "epoch": 0.3790005614823133, "grad_norm": 0.6628249287605286, "learning_rate": 9.979094332698124e-06, "loss": 0.4832, "step": 675 }, { "epoch": 0.3795620437956204, "grad_norm": 0.6247780323028564, "learning_rate": 9.978794825724185e-06, "loss": 0.4568, "step": 676 }, { "epoch": 0.3801235261089276, "grad_norm": 0.5778994560241699, "learning_rate": 9.97849319308417e-06, "loss": 0.4642, "step": 677 }, { "epoch": 0.3806850084222347, "grad_norm": 0.6601492166519165, "learning_rate": 9.97818943490686e-06, "loss": 0.48, "step": 678 }, { "epoch": 0.38124649073554184, "grad_norm": 0.568792998790741, "learning_rate": 9.977883551321943e-06, "loss": 0.4758, "step": 679 }, { "epoch": 0.38180797304884895, "grad_norm": 0.6137762069702148, "learning_rate": 9.977575542460012e-06, "loss": 0.4914, "step": 680 }, { "epoch": 0.3823694553621561, "grad_norm": 0.6125585436820984, "learning_rate": 9.977265408452571e-06, "loss": 0.4497, "step": 681 }, { "epoch": 0.3829309376754632, "grad_norm": 0.579738199710846, "learning_rate": 9.976953149432034e-06, "loss": 0.4755, "step": 682 }, { "epoch": 0.38349241998877037, "grad_norm": 0.5827937722206116, "learning_rate": 9.976638765531709e-06, "loss": 0.4769, "step": 683 }, { "epoch": 0.3840539023020775, "grad_norm": 0.5925636291503906, "learning_rate": 9.976322256885828e-06, "loss": 0.4589, "step": 684 }, { "epoch": 0.38461538461538464, "grad_norm": 0.6079484820365906, "learning_rate": 9.976003623629519e-06, "loss": 0.4525, "step": 685 }, { "epoch": 0.38517686692869174, "grad_norm": 0.7009006142616272, "learning_rate": 9.975682865898821e-06, "loss": 0.4795, "step": 686 }, { "epoch": 0.3857383492419989, "grad_norm": 0.6466800570487976, "learning_rate": 9.975359983830681e-06, "loss": 0.4805, "step": 687 }, { "epoch": 0.386299831555306, "grad_norm": 0.7069918513298035, "learning_rate": 9.97503497756295e-06, "loss": 0.4954, "step": 688 }, { "epoch": 0.38686131386861317, "grad_norm": 0.6978471279144287, "learning_rate": 9.974707847234387e-06, "loss": 0.5058, "step": 689 }, { "epoch": 0.38742279618192027, "grad_norm": 0.6149472594261169, "learning_rate": 9.97437859298466e-06, "loss": 0.478, "step": 690 }, { "epoch": 0.3879842784952274, "grad_norm": 0.6552645564079285, "learning_rate": 9.97404721495434e-06, "loss": 0.4612, "step": 691 }, { "epoch": 0.38854576080853453, "grad_norm": 0.5619072914123535, "learning_rate": 9.973713713284908e-06, "loss": 0.4572, "step": 692 }, { "epoch": 0.38910724312184164, "grad_norm": 0.6136743426322937, "learning_rate": 9.973378088118752e-06, "loss": 0.4453, "step": 693 }, { "epoch": 0.3896687254351488, "grad_norm": 0.5822853446006775, "learning_rate": 9.973040339599163e-06, "loss": 0.4884, "step": 694 }, { "epoch": 0.3902302077484559, "grad_norm": 0.5643738508224487, "learning_rate": 9.97270046787034e-06, "loss": 0.4789, "step": 695 }, { "epoch": 0.39079169006176306, "grad_norm": 0.6689649224281311, "learning_rate": 9.972358473077391e-06, "loss": 0.4571, "step": 696 }, { "epoch": 0.39135317237507017, "grad_norm": 0.5762844681739807, "learning_rate": 9.972014355366329e-06, "loss": 0.4776, "step": 697 }, { "epoch": 0.39191465468837733, "grad_norm": 0.691846489906311, "learning_rate": 9.97166811488407e-06, "loss": 0.5015, "step": 698 }, { "epoch": 0.39247613700168443, "grad_norm": 0.6377466320991516, "learning_rate": 9.971319751778444e-06, "loss": 0.445, "step": 699 }, { "epoch": 0.3930376193149916, "grad_norm": 0.6744887232780457, "learning_rate": 9.970969266198178e-06, "loss": 0.4836, "step": 700 }, { "epoch": 0.3935991016282987, "grad_norm": 0.7905594110488892, "learning_rate": 9.970616658292912e-06, "loss": 0.4778, "step": 701 }, { "epoch": 0.39416058394160586, "grad_norm": 0.5043938159942627, "learning_rate": 9.97026192821319e-06, "loss": 0.4659, "step": 702 }, { "epoch": 0.39472206625491296, "grad_norm": 0.6422023177146912, "learning_rate": 9.969905076110458e-06, "loss": 0.4752, "step": 703 }, { "epoch": 0.3952835485682201, "grad_norm": 0.651925802230835, "learning_rate": 9.969546102137078e-06, "loss": 0.4457, "step": 704 }, { "epoch": 0.3958450308815272, "grad_norm": 0.6396771669387817, "learning_rate": 9.969185006446307e-06, "loss": 0.4876, "step": 705 }, { "epoch": 0.3964065131948344, "grad_norm": 0.6739466786384583, "learning_rate": 9.968821789192316e-06, "loss": 0.4964, "step": 706 }, { "epoch": 0.3969679955081415, "grad_norm": 0.6628484725952148, "learning_rate": 9.968456450530175e-06, "loss": 0.4588, "step": 707 }, { "epoch": 0.39752947782144865, "grad_norm": 0.7460634112358093, "learning_rate": 9.968088990615865e-06, "loss": 0.5081, "step": 708 }, { "epoch": 0.39809096013475576, "grad_norm": 0.6104995012283325, "learning_rate": 9.967719409606271e-06, "loss": 0.4918, "step": 709 }, { "epoch": 0.39865244244806286, "grad_norm": 0.7260275483131409, "learning_rate": 9.967347707659184e-06, "loss": 0.4942, "step": 710 }, { "epoch": 0.39921392476137, "grad_norm": 0.6126789450645447, "learning_rate": 9.966973884933298e-06, "loss": 0.4508, "step": 711 }, { "epoch": 0.3997754070746771, "grad_norm": 0.5747596621513367, "learning_rate": 9.966597941588215e-06, "loss": 0.4818, "step": 712 }, { "epoch": 0.4003368893879843, "grad_norm": 0.6592295169830322, "learning_rate": 9.96621987778444e-06, "loss": 0.5017, "step": 713 }, { "epoch": 0.4008983717012914, "grad_norm": 0.5649883151054382, "learning_rate": 9.965839693683387e-06, "loss": 0.5114, "step": 714 }, { "epoch": 0.40145985401459855, "grad_norm": 0.6057443022727966, "learning_rate": 9.965457389447376e-06, "loss": 0.4774, "step": 715 }, { "epoch": 0.40202133632790565, "grad_norm": 0.5168583393096924, "learning_rate": 9.965072965239625e-06, "loss": 0.4746, "step": 716 }, { "epoch": 0.4025828186412128, "grad_norm": 0.5572659373283386, "learning_rate": 9.964686421224262e-06, "loss": 0.4411, "step": 717 }, { "epoch": 0.4031443009545199, "grad_norm": 0.5464321970939636, "learning_rate": 9.964297757566323e-06, "loss": 0.4892, "step": 718 }, { "epoch": 0.4037057832678271, "grad_norm": 0.6230902671813965, "learning_rate": 9.963906974431742e-06, "loss": 0.465, "step": 719 }, { "epoch": 0.4042672655811342, "grad_norm": 0.5172702670097351, "learning_rate": 9.963514071987362e-06, "loss": 0.4834, "step": 720 }, { "epoch": 0.40482874789444134, "grad_norm": 0.5706448554992676, "learning_rate": 9.963119050400934e-06, "loss": 0.4511, "step": 721 }, { "epoch": 0.40539023020774845, "grad_norm": 0.5451928973197937, "learning_rate": 9.962721909841104e-06, "loss": 0.4192, "step": 722 }, { "epoch": 0.4059517125210556, "grad_norm": 0.5835235714912415, "learning_rate": 9.962322650477434e-06, "loss": 0.523, "step": 723 }, { "epoch": 0.4065131948343627, "grad_norm": 0.5130033493041992, "learning_rate": 9.961921272480382e-06, "loss": 0.4805, "step": 724 }, { "epoch": 0.40707467714766987, "grad_norm": 0.6199596524238586, "learning_rate": 9.961517776021316e-06, "loss": 0.4983, "step": 725 }, { "epoch": 0.407636159460977, "grad_norm": 0.580241322517395, "learning_rate": 9.961112161272505e-06, "loss": 0.4888, "step": 726 }, { "epoch": 0.40819764177428414, "grad_norm": 0.5732046365737915, "learning_rate": 9.960704428407124e-06, "loss": 0.4979, "step": 727 }, { "epoch": 0.40875912408759124, "grad_norm": 0.609740138053894, "learning_rate": 9.960294577599253e-06, "loss": 0.4972, "step": 728 }, { "epoch": 0.40932060640089835, "grad_norm": 0.6033239960670471, "learning_rate": 9.959882609023874e-06, "loss": 0.4548, "step": 729 }, { "epoch": 0.4098820887142055, "grad_norm": 0.5488548874855042, "learning_rate": 9.959468522856876e-06, "loss": 0.4811, "step": 730 }, { "epoch": 0.4104435710275126, "grad_norm": 0.7320557832717896, "learning_rate": 9.959052319275049e-06, "loss": 0.4907, "step": 731 }, { "epoch": 0.41100505334081977, "grad_norm": 0.6219567656517029, "learning_rate": 9.958633998456089e-06, "loss": 0.4748, "step": 732 }, { "epoch": 0.4115665356541269, "grad_norm": 0.6377459764480591, "learning_rate": 9.958213560578596e-06, "loss": 0.4506, "step": 733 }, { "epoch": 0.41212801796743403, "grad_norm": 0.6340068578720093, "learning_rate": 9.957791005822073e-06, "loss": 0.4895, "step": 734 }, { "epoch": 0.41268950028074114, "grad_norm": 0.6366626024246216, "learning_rate": 9.957366334366928e-06, "loss": 0.4571, "step": 735 }, { "epoch": 0.4132509825940483, "grad_norm": 0.638066291809082, "learning_rate": 9.95693954639447e-06, "loss": 0.5005, "step": 736 }, { "epoch": 0.4138124649073554, "grad_norm": 0.5280675888061523, "learning_rate": 9.956510642086916e-06, "loss": 0.4842, "step": 737 }, { "epoch": 0.41437394722066256, "grad_norm": 0.6171002388000488, "learning_rate": 9.956079621627382e-06, "loss": 0.4693, "step": 738 }, { "epoch": 0.41493542953396967, "grad_norm": 0.62358158826828, "learning_rate": 9.95564648519989e-06, "loss": 0.4686, "step": 739 }, { "epoch": 0.41549691184727683, "grad_norm": 0.5446628928184509, "learning_rate": 9.955211232989368e-06, "loss": 0.4586, "step": 740 }, { "epoch": 0.41605839416058393, "grad_norm": 0.48564454913139343, "learning_rate": 9.954773865181639e-06, "loss": 0.4445, "step": 741 }, { "epoch": 0.4166198764738911, "grad_norm": 0.5866141319274902, "learning_rate": 9.954334381963436e-06, "loss": 0.469, "step": 742 }, { "epoch": 0.4171813587871982, "grad_norm": 0.6213503479957581, "learning_rate": 9.953892783522396e-06, "loss": 0.5163, "step": 743 }, { "epoch": 0.41774284110050536, "grad_norm": 0.6302055716514587, "learning_rate": 9.953449070047056e-06, "loss": 0.4965, "step": 744 }, { "epoch": 0.41830432341381246, "grad_norm": 0.6292455196380615, "learning_rate": 9.953003241726858e-06, "loss": 0.4765, "step": 745 }, { "epoch": 0.4188658057271196, "grad_norm": 0.6649808883666992, "learning_rate": 9.95255529875214e-06, "loss": 0.4984, "step": 746 }, { "epoch": 0.4194272880404267, "grad_norm": 0.5715693235397339, "learning_rate": 9.952105241314157e-06, "loss": 0.4642, "step": 747 }, { "epoch": 0.41998877035373383, "grad_norm": 0.57093346118927, "learning_rate": 9.951653069605049e-06, "loss": 0.4959, "step": 748 }, { "epoch": 0.420550252667041, "grad_norm": 0.6142703294754028, "learning_rate": 9.951198783817876e-06, "loss": 0.4686, "step": 749 }, { "epoch": 0.4211117349803481, "grad_norm": 0.6213297843933105, "learning_rate": 9.950742384146588e-06, "loss": 0.4856, "step": 750 }, { "epoch": 0.42167321729365526, "grad_norm": 0.6713946461677551, "learning_rate": 9.950283870786044e-06, "loss": 0.4776, "step": 751 }, { "epoch": 0.42223469960696236, "grad_norm": 0.5497424602508545, "learning_rate": 9.949823243932003e-06, "loss": 0.4881, "step": 752 }, { "epoch": 0.4227961819202695, "grad_norm": 0.7140569686889648, "learning_rate": 9.949360503781125e-06, "loss": 0.4818, "step": 753 }, { "epoch": 0.4233576642335766, "grad_norm": 0.5247882008552551, "learning_rate": 9.948895650530977e-06, "loss": 0.4857, "step": 754 }, { "epoch": 0.4239191465468838, "grad_norm": 0.6468997001647949, "learning_rate": 9.948428684380025e-06, "loss": 0.4684, "step": 755 }, { "epoch": 0.4244806288601909, "grad_norm": 0.7057627439498901, "learning_rate": 9.947959605527633e-06, "loss": 0.4675, "step": 756 }, { "epoch": 0.42504211117349805, "grad_norm": 0.5241835117340088, "learning_rate": 9.947488414174078e-06, "loss": 0.4705, "step": 757 }, { "epoch": 0.42560359348680515, "grad_norm": 0.6848989129066467, "learning_rate": 9.947015110520528e-06, "loss": 0.4955, "step": 758 }, { "epoch": 0.4261650758001123, "grad_norm": 0.5578830242156982, "learning_rate": 9.946539694769057e-06, "loss": 0.4637, "step": 759 }, { "epoch": 0.4267265581134194, "grad_norm": 0.69393390417099, "learning_rate": 9.946062167122644e-06, "loss": 0.5046, "step": 760 }, { "epoch": 0.4272880404267266, "grad_norm": 0.6170006394386292, "learning_rate": 9.945582527785163e-06, "loss": 0.4661, "step": 761 }, { "epoch": 0.4278495227400337, "grad_norm": 0.5966355800628662, "learning_rate": 9.945100776961394e-06, "loss": 0.4687, "step": 762 }, { "epoch": 0.42841100505334084, "grad_norm": 0.7561265230178833, "learning_rate": 9.94461691485702e-06, "loss": 0.4818, "step": 763 }, { "epoch": 0.42897248736664795, "grad_norm": 0.6462828516960144, "learning_rate": 9.944130941678619e-06, "loss": 0.4327, "step": 764 }, { "epoch": 0.4295339696799551, "grad_norm": 0.5786879062652588, "learning_rate": 9.943642857633677e-06, "loss": 0.4596, "step": 765 }, { "epoch": 0.4300954519932622, "grad_norm": 0.6286542415618896, "learning_rate": 9.943152662930577e-06, "loss": 0.4328, "step": 766 }, { "epoch": 0.4306569343065693, "grad_norm": 0.5515727996826172, "learning_rate": 9.942660357778607e-06, "loss": 0.4885, "step": 767 }, { "epoch": 0.4312184166198765, "grad_norm": 0.5742945075035095, "learning_rate": 9.942165942387953e-06, "loss": 0.4624, "step": 768 }, { "epoch": 0.4317798989331836, "grad_norm": 0.5401940941810608, "learning_rate": 9.941669416969699e-06, "loss": 0.4565, "step": 769 }, { "epoch": 0.43234138124649074, "grad_norm": 0.6031684279441833, "learning_rate": 9.941170781735838e-06, "loss": 0.4851, "step": 770 }, { "epoch": 0.43290286355979785, "grad_norm": 0.5544772744178772, "learning_rate": 9.940670036899258e-06, "loss": 0.488, "step": 771 }, { "epoch": 0.433464345873105, "grad_norm": 0.6335550546646118, "learning_rate": 9.940167182673748e-06, "loss": 0.5096, "step": 772 }, { "epoch": 0.4340258281864121, "grad_norm": 0.6071961522102356, "learning_rate": 9.939662219274e-06, "loss": 0.4712, "step": 773 }, { "epoch": 0.43458731049971927, "grad_norm": 0.6205436587333679, "learning_rate": 9.939155146915603e-06, "loss": 0.4796, "step": 774 }, { "epoch": 0.4351487928130264, "grad_norm": 0.5668438076972961, "learning_rate": 9.938645965815049e-06, "loss": 0.4684, "step": 775 }, { "epoch": 0.43571027512633353, "grad_norm": 0.628088116645813, "learning_rate": 9.938134676189731e-06, "loss": 0.4931, "step": 776 }, { "epoch": 0.43627175743964064, "grad_norm": 0.5500897765159607, "learning_rate": 9.93762127825794e-06, "loss": 0.4532, "step": 777 }, { "epoch": 0.4368332397529478, "grad_norm": 0.5753090977668762, "learning_rate": 9.937105772238868e-06, "loss": 0.4563, "step": 778 }, { "epoch": 0.4373947220662549, "grad_norm": 0.6161931157112122, "learning_rate": 9.936588158352609e-06, "loss": 0.5018, "step": 779 }, { "epoch": 0.43795620437956206, "grad_norm": 0.588248074054718, "learning_rate": 9.936068436820155e-06, "loss": 0.4475, "step": 780 }, { "epoch": 0.43851768669286917, "grad_norm": 0.6825423240661621, "learning_rate": 9.935546607863394e-06, "loss": 0.4868, "step": 781 }, { "epoch": 0.43907916900617633, "grad_norm": 0.7200890779495239, "learning_rate": 9.93502267170512e-06, "loss": 0.4835, "step": 782 }, { "epoch": 0.43964065131948343, "grad_norm": 0.5322901606559753, "learning_rate": 9.934496628569025e-06, "loss": 0.4916, "step": 783 }, { "epoch": 0.4402021336327906, "grad_norm": 0.6314575672149658, "learning_rate": 9.9339684786797e-06, "loss": 0.4784, "step": 784 }, { "epoch": 0.4407636159460977, "grad_norm": 0.6586915850639343, "learning_rate": 9.933438222262637e-06, "loss": 0.4414, "step": 785 }, { "epoch": 0.4413250982594048, "grad_norm": 0.5654212236404419, "learning_rate": 9.932905859544222e-06, "loss": 0.4681, "step": 786 }, { "epoch": 0.44188658057271196, "grad_norm": 0.5721986293792725, "learning_rate": 9.932371390751746e-06, "loss": 0.4684, "step": 787 }, { "epoch": 0.44244806288601907, "grad_norm": 0.69707190990448, "learning_rate": 9.931834816113395e-06, "loss": 0.4276, "step": 788 }, { "epoch": 0.4430095451993262, "grad_norm": 0.5771670937538147, "learning_rate": 9.931296135858259e-06, "loss": 0.4624, "step": 789 }, { "epoch": 0.44357102751263333, "grad_norm": 0.7929498553276062, "learning_rate": 9.930755350216324e-06, "loss": 0.4809, "step": 790 }, { "epoch": 0.4441325098259405, "grad_norm": 0.5436962842941284, "learning_rate": 9.930212459418475e-06, "loss": 0.4629, "step": 791 }, { "epoch": 0.4446939921392476, "grad_norm": 0.6270423531532288, "learning_rate": 9.929667463696495e-06, "loss": 0.4518, "step": 792 }, { "epoch": 0.44525547445255476, "grad_norm": 0.6368919610977173, "learning_rate": 9.929120363283067e-06, "loss": 0.4629, "step": 793 }, { "epoch": 0.44581695676586186, "grad_norm": 0.6362285017967224, "learning_rate": 9.928571158411772e-06, "loss": 0.4694, "step": 794 }, { "epoch": 0.446378439079169, "grad_norm": 0.6335340142250061, "learning_rate": 9.92801984931709e-06, "loss": 0.4485, "step": 795 }, { "epoch": 0.4469399213924761, "grad_norm": 0.5832274556159973, "learning_rate": 9.927466436234399e-06, "loss": 0.46, "step": 796 }, { "epoch": 0.4475014037057833, "grad_norm": 0.695805013179779, "learning_rate": 9.926910919399975e-06, "loss": 0.4825, "step": 797 }, { "epoch": 0.4480628860190904, "grad_norm": 0.5159093737602234, "learning_rate": 9.926353299050993e-06, "loss": 0.4513, "step": 798 }, { "epoch": 0.44862436833239755, "grad_norm": 0.6740403175354004, "learning_rate": 9.925793575425526e-06, "loss": 0.4768, "step": 799 }, { "epoch": 0.44918585064570465, "grad_norm": 0.6113595366477966, "learning_rate": 9.925231748762542e-06, "loss": 0.4615, "step": 800 }, { "epoch": 0.4497473329590118, "grad_norm": 0.587507426738739, "learning_rate": 9.924667819301913e-06, "loss": 0.4535, "step": 801 }, { "epoch": 0.4503088152723189, "grad_norm": 0.555439829826355, "learning_rate": 9.924101787284403e-06, "loss": 0.4732, "step": 802 }, { "epoch": 0.4508702975856261, "grad_norm": 0.6153371930122375, "learning_rate": 9.923533652951678e-06, "loss": 0.4608, "step": 803 }, { "epoch": 0.4514317798989332, "grad_norm": 0.5431650876998901, "learning_rate": 9.922963416546297e-06, "loss": 0.4697, "step": 804 }, { "epoch": 0.4519932622122403, "grad_norm": 0.6368337273597717, "learning_rate": 9.922391078311722e-06, "loss": 0.4669, "step": 805 }, { "epoch": 0.45255474452554745, "grad_norm": 0.6214928030967712, "learning_rate": 9.921816638492305e-06, "loss": 0.5037, "step": 806 }, { "epoch": 0.45311622683885455, "grad_norm": 0.5506566762924194, "learning_rate": 9.921240097333302e-06, "loss": 0.4928, "step": 807 }, { "epoch": 0.4536777091521617, "grad_norm": 0.6251887083053589, "learning_rate": 9.920661455080865e-06, "loss": 0.5016, "step": 808 }, { "epoch": 0.4542391914654688, "grad_norm": 0.6105179786682129, "learning_rate": 9.92008071198204e-06, "loss": 0.5024, "step": 809 }, { "epoch": 0.454800673778776, "grad_norm": 0.6588506102561951, "learning_rate": 9.919497868284772e-06, "loss": 0.4703, "step": 810 }, { "epoch": 0.4553621560920831, "grad_norm": 0.5221635103225708, "learning_rate": 9.918912924237902e-06, "loss": 0.4672, "step": 811 }, { "epoch": 0.45592363840539024, "grad_norm": 0.6516666412353516, "learning_rate": 9.91832588009117e-06, "loss": 0.4664, "step": 812 }, { "epoch": 0.45648512071869735, "grad_norm": 0.5220449566841125, "learning_rate": 9.917736736095211e-06, "loss": 0.4705, "step": 813 }, { "epoch": 0.4570466030320045, "grad_norm": 0.6655415892601013, "learning_rate": 9.917145492501554e-06, "loss": 0.4537, "step": 814 }, { "epoch": 0.4576080853453116, "grad_norm": 0.5574368834495544, "learning_rate": 9.916552149562628e-06, "loss": 0.4999, "step": 815 }, { "epoch": 0.45816956765861877, "grad_norm": 0.6260889768600464, "learning_rate": 9.915956707531758e-06, "loss": 0.4866, "step": 816 }, { "epoch": 0.4587310499719259, "grad_norm": 0.5649206042289734, "learning_rate": 9.915359166663163e-06, "loss": 0.4352, "step": 817 }, { "epoch": 0.45929253228523303, "grad_norm": 0.5236784219741821, "learning_rate": 9.914759527211961e-06, "loss": 0.4599, "step": 818 }, { "epoch": 0.45985401459854014, "grad_norm": 0.6167043447494507, "learning_rate": 9.914157789434163e-06, "loss": 0.4618, "step": 819 }, { "epoch": 0.4604154969118473, "grad_norm": 0.6552812457084656, "learning_rate": 9.913553953586679e-06, "loss": 0.5123, "step": 820 }, { "epoch": 0.4609769792251544, "grad_norm": 0.567061722278595, "learning_rate": 9.912948019927312e-06, "loss": 0.5132, "step": 821 }, { "epoch": 0.46153846153846156, "grad_norm": 0.6554179787635803, "learning_rate": 9.91233998871476e-06, "loss": 0.4873, "step": 822 }, { "epoch": 0.46209994385176867, "grad_norm": 0.6341487765312195, "learning_rate": 9.91172986020862e-06, "loss": 0.4624, "step": 823 }, { "epoch": 0.4626614261650758, "grad_norm": 0.576332688331604, "learning_rate": 9.911117634669382e-06, "loss": 0.4747, "step": 824 }, { "epoch": 0.46322290847838293, "grad_norm": 0.6739412546157837, "learning_rate": 9.910503312358433e-06, "loss": 0.4649, "step": 825 }, { "epoch": 0.46378439079169004, "grad_norm": 0.6120472550392151, "learning_rate": 9.909886893538054e-06, "loss": 0.4597, "step": 826 }, { "epoch": 0.4643458731049972, "grad_norm": 0.6097128987312317, "learning_rate": 9.90926837847142e-06, "loss": 0.4643, "step": 827 }, { "epoch": 0.4649073554183043, "grad_norm": 0.6674468517303467, "learning_rate": 9.908647767422603e-06, "loss": 0.4729, "step": 828 }, { "epoch": 0.46546883773161146, "grad_norm": 0.6574826240539551, "learning_rate": 9.90802506065657e-06, "loss": 0.4588, "step": 829 }, { "epoch": 0.46603032004491857, "grad_norm": 0.6894350051879883, "learning_rate": 9.90740025843918e-06, "loss": 0.4794, "step": 830 }, { "epoch": 0.4665918023582257, "grad_norm": 0.7085055708885193, "learning_rate": 9.906773361037192e-06, "loss": 0.4624, "step": 831 }, { "epoch": 0.46715328467153283, "grad_norm": 0.6006091833114624, "learning_rate": 9.906144368718252e-06, "loss": 0.466, "step": 832 }, { "epoch": 0.46771476698484, "grad_norm": 0.6528046727180481, "learning_rate": 9.905513281750905e-06, "loss": 0.4707, "step": 833 }, { "epoch": 0.4682762492981471, "grad_norm": 0.600399911403656, "learning_rate": 9.90488010040459e-06, "loss": 0.4781, "step": 834 }, { "epoch": 0.46883773161145426, "grad_norm": 0.5789554119110107, "learning_rate": 9.90424482494964e-06, "loss": 0.4555, "step": 835 }, { "epoch": 0.46939921392476136, "grad_norm": 0.5756786465644836, "learning_rate": 9.903607455657287e-06, "loss": 0.4425, "step": 836 }, { "epoch": 0.4699606962380685, "grad_norm": 0.5879197120666504, "learning_rate": 9.902967992799643e-06, "loss": 0.4712, "step": 837 }, { "epoch": 0.4705221785513756, "grad_norm": 0.6233859658241272, "learning_rate": 9.902326436649729e-06, "loss": 0.4596, "step": 838 }, { "epoch": 0.4710836608646828, "grad_norm": 0.5190982222557068, "learning_rate": 9.901682787481452e-06, "loss": 0.462, "step": 839 }, { "epoch": 0.4716451431779899, "grad_norm": 0.609997034072876, "learning_rate": 9.901037045569614e-06, "loss": 0.457, "step": 840 }, { "epoch": 0.47220662549129705, "grad_norm": 0.5319073796272278, "learning_rate": 9.900389211189909e-06, "loss": 0.4422, "step": 841 }, { "epoch": 0.47276810780460415, "grad_norm": 0.5420411229133606, "learning_rate": 9.899739284618928e-06, "loss": 0.4951, "step": 842 }, { "epoch": 0.47332959011791126, "grad_norm": 0.546933114528656, "learning_rate": 9.899087266134153e-06, "loss": 0.4779, "step": 843 }, { "epoch": 0.4738910724312184, "grad_norm": 0.513558566570282, "learning_rate": 9.898433156013959e-06, "loss": 0.4967, "step": 844 }, { "epoch": 0.4744525547445255, "grad_norm": 0.4705963730812073, "learning_rate": 9.897776954537613e-06, "loss": 0.418, "step": 845 }, { "epoch": 0.4750140370578327, "grad_norm": 0.5752717852592468, "learning_rate": 9.89711866198528e-06, "loss": 0.4767, "step": 846 }, { "epoch": 0.4755755193711398, "grad_norm": 0.6055124998092651, "learning_rate": 9.896458278638008e-06, "loss": 0.4562, "step": 847 }, { "epoch": 0.47613700168444695, "grad_norm": 0.5086150765419006, "learning_rate": 9.895795804777749e-06, "loss": 0.4727, "step": 848 }, { "epoch": 0.47669848399775405, "grad_norm": 0.5477786064147949, "learning_rate": 9.895131240687339e-06, "loss": 0.4722, "step": 849 }, { "epoch": 0.4772599663110612, "grad_norm": 0.5507827401161194, "learning_rate": 9.89446458665051e-06, "loss": 0.4751, "step": 850 }, { "epoch": 0.4778214486243683, "grad_norm": 0.6649587154388428, "learning_rate": 9.893795842951888e-06, "loss": 0.4904, "step": 851 }, { "epoch": 0.4783829309376755, "grad_norm": 0.5696722865104675, "learning_rate": 9.893125009876987e-06, "loss": 0.4789, "step": 852 }, { "epoch": 0.4789444132509826, "grad_norm": 0.6192601919174194, "learning_rate": 9.892452087712216e-06, "loss": 0.4351, "step": 853 }, { "epoch": 0.47950589556428974, "grad_norm": 0.7516271471977234, "learning_rate": 9.891777076744876e-06, "loss": 0.4627, "step": 854 }, { "epoch": 0.48006737787759685, "grad_norm": 0.6651103496551514, "learning_rate": 9.891099977263156e-06, "loss": 0.4463, "step": 855 }, { "epoch": 0.480628860190904, "grad_norm": 0.6699653267860413, "learning_rate": 9.890420789556142e-06, "loss": 0.4739, "step": 856 }, { "epoch": 0.4811903425042111, "grad_norm": 0.7136918306350708, "learning_rate": 9.889739513913808e-06, "loss": 0.4884, "step": 857 }, { "epoch": 0.48175182481751827, "grad_norm": 0.7006744742393494, "learning_rate": 9.88905615062702e-06, "loss": 0.4506, "step": 858 }, { "epoch": 0.4823133071308254, "grad_norm": 0.6310953497886658, "learning_rate": 9.888370699987538e-06, "loss": 0.4916, "step": 859 }, { "epoch": 0.48287478944413254, "grad_norm": 0.5223665833473206, "learning_rate": 9.887683162288006e-06, "loss": 0.454, "step": 860 }, { "epoch": 0.48343627175743964, "grad_norm": 0.6904533505439758, "learning_rate": 9.88699353782197e-06, "loss": 0.5059, "step": 861 }, { "epoch": 0.48399775407074674, "grad_norm": 0.661899745464325, "learning_rate": 9.886301826883858e-06, "loss": 0.4709, "step": 862 }, { "epoch": 0.4845592363840539, "grad_norm": 0.6379549503326416, "learning_rate": 9.88560802976899e-06, "loss": 0.4737, "step": 863 }, { "epoch": 0.485120718697361, "grad_norm": 0.5662031173706055, "learning_rate": 9.884912146773585e-06, "loss": 0.452, "step": 864 }, { "epoch": 0.48568220101066817, "grad_norm": 0.65473473072052, "learning_rate": 9.884214178194739e-06, "loss": 0.5156, "step": 865 }, { "epoch": 0.4862436833239753, "grad_norm": 0.4941713809967041, "learning_rate": 9.883514124330447e-06, "loss": 0.4618, "step": 866 }, { "epoch": 0.48680516563728243, "grad_norm": 0.5886207818984985, "learning_rate": 9.882811985479595e-06, "loss": 0.4657, "step": 867 }, { "epoch": 0.48736664795058954, "grad_norm": 0.6978502869606018, "learning_rate": 9.882107761941954e-06, "loss": 0.4516, "step": 868 }, { "epoch": 0.4879281302638967, "grad_norm": 0.6012189388275146, "learning_rate": 9.881401454018192e-06, "loss": 0.4812, "step": 869 }, { "epoch": 0.4884896125772038, "grad_norm": 0.5946515202522278, "learning_rate": 9.88069306200986e-06, "loss": 0.4606, "step": 870 }, { "epoch": 0.48905109489051096, "grad_norm": 0.6577261090278625, "learning_rate": 9.879982586219401e-06, "loss": 0.4895, "step": 871 }, { "epoch": 0.48961257720381807, "grad_norm": 0.5876297354698181, "learning_rate": 9.879270026950149e-06, "loss": 0.4621, "step": 872 }, { "epoch": 0.4901740595171252, "grad_norm": 0.5888384580612183, "learning_rate": 9.878555384506328e-06, "loss": 0.4859, "step": 873 }, { "epoch": 0.49073554183043233, "grad_norm": 0.6316752433776855, "learning_rate": 9.877838659193048e-06, "loss": 0.4715, "step": 874 }, { "epoch": 0.4912970241437395, "grad_norm": 0.6064761877059937, "learning_rate": 9.877119851316314e-06, "loss": 0.465, "step": 875 }, { "epoch": 0.4918585064570466, "grad_norm": 0.5941804647445679, "learning_rate": 9.876398961183014e-06, "loss": 0.4628, "step": 876 }, { "epoch": 0.49241998877035376, "grad_norm": 0.7835717797279358, "learning_rate": 9.875675989100928e-06, "loss": 0.463, "step": 877 }, { "epoch": 0.49298147108366086, "grad_norm": 0.5778387188911438, "learning_rate": 9.874950935378724e-06, "loss": 0.461, "step": 878 }, { "epoch": 0.493542953396968, "grad_norm": 0.6206223964691162, "learning_rate": 9.87422380032596e-06, "loss": 0.4762, "step": 879 }, { "epoch": 0.4941044357102751, "grad_norm": 0.6955769062042236, "learning_rate": 9.873494584253083e-06, "loss": 0.4933, "step": 880 }, { "epoch": 0.49466591802358223, "grad_norm": 0.5494604706764221, "learning_rate": 9.872763287471426e-06, "loss": 0.4694, "step": 881 }, { "epoch": 0.4952274003368894, "grad_norm": 0.6478627324104309, "learning_rate": 9.87202991029321e-06, "loss": 0.4612, "step": 882 }, { "epoch": 0.4957888826501965, "grad_norm": 0.6621410846710205, "learning_rate": 9.87129445303155e-06, "loss": 0.5138, "step": 883 }, { "epoch": 0.49635036496350365, "grad_norm": 0.5670858025550842, "learning_rate": 9.870556916000443e-06, "loss": 0.46, "step": 884 }, { "epoch": 0.49691184727681076, "grad_norm": 0.5697691440582275, "learning_rate": 9.869817299514775e-06, "loss": 0.4864, "step": 885 }, { "epoch": 0.4974733295901179, "grad_norm": 0.5803970098495483, "learning_rate": 9.869075603890322e-06, "loss": 0.4438, "step": 886 }, { "epoch": 0.498034811903425, "grad_norm": 0.545804500579834, "learning_rate": 9.868331829443746e-06, "loss": 0.4875, "step": 887 }, { "epoch": 0.4985962942167322, "grad_norm": 0.5060734152793884, "learning_rate": 9.867585976492595e-06, "loss": 0.4744, "step": 888 }, { "epoch": 0.4991577765300393, "grad_norm": 0.5491567850112915, "learning_rate": 9.86683804535531e-06, "loss": 0.4644, "step": 889 }, { "epoch": 0.49971925884334645, "grad_norm": 0.4969500005245209, "learning_rate": 9.866088036351211e-06, "loss": 0.4481, "step": 890 }, { "epoch": 0.5002807411566536, "grad_norm": 0.4930895268917084, "learning_rate": 9.865335949800517e-06, "loss": 0.4387, "step": 891 }, { "epoch": 0.5008422234699607, "grad_norm": 0.6362313628196716, "learning_rate": 9.864581786024318e-06, "loss": 0.4996, "step": 892 }, { "epoch": 0.5014037057832679, "grad_norm": 0.5780199766159058, "learning_rate": 9.863825545344608e-06, "loss": 0.4848, "step": 893 }, { "epoch": 0.501965188096575, "grad_norm": 0.6645207405090332, "learning_rate": 9.863067228084253e-06, "loss": 0.4685, "step": 894 }, { "epoch": 0.5025266704098821, "grad_norm": 0.5542525053024292, "learning_rate": 9.862306834567014e-06, "loss": 0.4539, "step": 895 }, { "epoch": 0.5030881527231892, "grad_norm": 0.600723385810852, "learning_rate": 9.861544365117539e-06, "loss": 0.4533, "step": 896 }, { "epoch": 0.5036496350364964, "grad_norm": 0.6015508770942688, "learning_rate": 9.860779820061356e-06, "loss": 0.457, "step": 897 }, { "epoch": 0.5042111173498035, "grad_norm": 0.6092362999916077, "learning_rate": 9.860013199724885e-06, "loss": 0.4689, "step": 898 }, { "epoch": 0.5047725996631106, "grad_norm": 0.588207483291626, "learning_rate": 9.859244504435428e-06, "loss": 0.4878, "step": 899 }, { "epoch": 0.5053340819764177, "grad_norm": 0.5679312348365784, "learning_rate": 9.858473734521176e-06, "loss": 0.4626, "step": 900 }, { "epoch": 0.5058955642897248, "grad_norm": 0.6437835097312927, "learning_rate": 9.857700890311203e-06, "loss": 0.4907, "step": 901 }, { "epoch": 0.506457046603032, "grad_norm": 0.625053882598877, "learning_rate": 9.856925972135474e-06, "loss": 0.4841, "step": 902 }, { "epoch": 0.5070185289163391, "grad_norm": 0.5608245134353638, "learning_rate": 9.856148980324832e-06, "loss": 0.498, "step": 903 }, { "epoch": 0.5075800112296462, "grad_norm": 0.7052861452102661, "learning_rate": 9.855369915211011e-06, "loss": 0.4475, "step": 904 }, { "epoch": 0.5081414935429533, "grad_norm": 0.6163288354873657, "learning_rate": 9.854588777126626e-06, "loss": 0.4815, "step": 905 }, { "epoch": 0.5087029758562606, "grad_norm": 0.596463680267334, "learning_rate": 9.85380556640518e-06, "loss": 0.4983, "step": 906 }, { "epoch": 0.5092644581695677, "grad_norm": 0.6328208446502686, "learning_rate": 9.853020283381061e-06, "loss": 0.4761, "step": 907 }, { "epoch": 0.5098259404828748, "grad_norm": 0.5891175270080566, "learning_rate": 9.85223292838954e-06, "loss": 0.4829, "step": 908 }, { "epoch": 0.5103874227961819, "grad_norm": 0.6386239528656006, "learning_rate": 9.851443501766773e-06, "loss": 0.4768, "step": 909 }, { "epoch": 0.5109489051094891, "grad_norm": 0.552609384059906, "learning_rate": 9.850652003849801e-06, "loss": 0.4686, "step": 910 }, { "epoch": 0.5115103874227962, "grad_norm": 0.5483083128929138, "learning_rate": 9.849858434976549e-06, "loss": 0.4743, "step": 911 }, { "epoch": 0.5120718697361033, "grad_norm": 0.5701147317886353, "learning_rate": 9.849062795485829e-06, "loss": 0.4616, "step": 912 }, { "epoch": 0.5126333520494104, "grad_norm": 0.5443629622459412, "learning_rate": 9.848265085717332e-06, "loss": 0.4487, "step": 913 }, { "epoch": 0.5131948343627176, "grad_norm": 0.7080327868461609, "learning_rate": 9.847465306011634e-06, "loss": 0.4955, "step": 914 }, { "epoch": 0.5137563166760247, "grad_norm": 0.5545181035995483, "learning_rate": 9.8466634567102e-06, "loss": 0.4658, "step": 915 }, { "epoch": 0.5143177989893318, "grad_norm": 0.5489552021026611, "learning_rate": 9.84585953815537e-06, "loss": 0.4613, "step": 916 }, { "epoch": 0.5148792813026389, "grad_norm": 0.5872184634208679, "learning_rate": 9.845053550690376e-06, "loss": 0.4637, "step": 917 }, { "epoch": 0.5154407636159462, "grad_norm": 0.647486686706543, "learning_rate": 9.844245494659328e-06, "loss": 0.4626, "step": 918 }, { "epoch": 0.5160022459292533, "grad_norm": 0.619476854801178, "learning_rate": 9.84343537040722e-06, "loss": 0.4749, "step": 919 }, { "epoch": 0.5165637282425604, "grad_norm": 0.5136461853981018, "learning_rate": 9.84262317827993e-06, "loss": 0.4461, "step": 920 }, { "epoch": 0.5171252105558675, "grad_norm": 0.7591120600700378, "learning_rate": 9.841808918624217e-06, "loss": 0.5193, "step": 921 }, { "epoch": 0.5176866928691746, "grad_norm": 0.608278751373291, "learning_rate": 9.84099259178773e-06, "loss": 0.4739, "step": 922 }, { "epoch": 0.5182481751824818, "grad_norm": 0.5614476203918457, "learning_rate": 9.840174198118989e-06, "loss": 0.4521, "step": 923 }, { "epoch": 0.5188096574957889, "grad_norm": 0.6261391639709473, "learning_rate": 9.839353737967404e-06, "loss": 0.4574, "step": 924 }, { "epoch": 0.519371139809096, "grad_norm": 0.8089165687561035, "learning_rate": 9.838531211683261e-06, "loss": 0.4627, "step": 925 }, { "epoch": 0.5199326221224031, "grad_norm": 0.6090323328971863, "learning_rate": 9.837706619617742e-06, "loss": 0.5064, "step": 926 }, { "epoch": 0.5204941044357103, "grad_norm": 0.7856566905975342, "learning_rate": 9.836879962122895e-06, "loss": 0.4744, "step": 927 }, { "epoch": 0.5210555867490174, "grad_norm": 0.6732569932937622, "learning_rate": 9.836051239551658e-06, "loss": 0.496, "step": 928 }, { "epoch": 0.5216170690623245, "grad_norm": 0.6310299634933472, "learning_rate": 9.835220452257848e-06, "loss": 0.4418, "step": 929 }, { "epoch": 0.5221785513756316, "grad_norm": 0.5662758350372314, "learning_rate": 9.834387600596165e-06, "loss": 0.4654, "step": 930 }, { "epoch": 0.5227400336889388, "grad_norm": 0.6582419276237488, "learning_rate": 9.83355268492219e-06, "loss": 0.4537, "step": 931 }, { "epoch": 0.523301516002246, "grad_norm": 0.8025732040405273, "learning_rate": 9.832715705592386e-06, "loss": 0.4637, "step": 932 }, { "epoch": 0.523862998315553, "grad_norm": 0.5203531384468079, "learning_rate": 9.831876662964096e-06, "loss": 0.4832, "step": 933 }, { "epoch": 0.5244244806288602, "grad_norm": 0.6465120911598206, "learning_rate": 9.831035557395545e-06, "loss": 0.4602, "step": 934 }, { "epoch": 0.5249859629421674, "grad_norm": 0.6982278227806091, "learning_rate": 9.830192389245835e-06, "loss": 0.498, "step": 935 }, { "epoch": 0.5255474452554745, "grad_norm": 0.5495709180831909, "learning_rate": 9.829347158874954e-06, "loss": 0.4742, "step": 936 }, { "epoch": 0.5261089275687816, "grad_norm": 0.5452244877815247, "learning_rate": 9.828499866643766e-06, "loss": 0.4633, "step": 937 }, { "epoch": 0.5266704098820887, "grad_norm": 0.650924801826477, "learning_rate": 9.82765051291402e-06, "loss": 0.445, "step": 938 }, { "epoch": 0.5272318921953958, "grad_norm": 0.6532645225524902, "learning_rate": 9.826799098048339e-06, "loss": 0.456, "step": 939 }, { "epoch": 0.527793374508703, "grad_norm": 0.5290435552597046, "learning_rate": 9.825945622410232e-06, "loss": 0.5087, "step": 940 }, { "epoch": 0.5283548568220101, "grad_norm": 0.6162605881690979, "learning_rate": 9.825090086364084e-06, "loss": 0.4741, "step": 941 }, { "epoch": 0.5289163391353172, "grad_norm": 0.787295401096344, "learning_rate": 9.824232490275163e-06, "loss": 0.497, "step": 942 }, { "epoch": 0.5294778214486243, "grad_norm": 0.6119143962860107, "learning_rate": 9.82337283450961e-06, "loss": 0.4702, "step": 943 }, { "epoch": 0.5300393037619315, "grad_norm": 0.6093367338180542, "learning_rate": 9.822511119434454e-06, "loss": 0.471, "step": 944 }, { "epoch": 0.5306007860752386, "grad_norm": 0.72612065076828, "learning_rate": 9.821647345417598e-06, "loss": 0.4683, "step": 945 }, { "epoch": 0.5311622683885457, "grad_norm": 0.5371798276901245, "learning_rate": 9.820781512827825e-06, "loss": 0.4409, "step": 946 }, { "epoch": 0.5317237507018528, "grad_norm": 0.6906511783599854, "learning_rate": 9.819913622034794e-06, "loss": 0.4822, "step": 947 }, { "epoch": 0.5322852330151601, "grad_norm": 0.7846553325653076, "learning_rate": 9.819043673409048e-06, "loss": 0.5001, "step": 948 }, { "epoch": 0.5328467153284672, "grad_norm": 0.5741400718688965, "learning_rate": 9.818171667322008e-06, "loss": 0.4626, "step": 949 }, { "epoch": 0.5334081976417743, "grad_norm": 0.5912113189697266, "learning_rate": 9.81729760414597e-06, "loss": 0.4046, "step": 950 }, { "epoch": 0.5339696799550814, "grad_norm": 0.7622829079627991, "learning_rate": 9.81642148425411e-06, "loss": 0.4759, "step": 951 }, { "epoch": 0.5345311622683886, "grad_norm": 0.7644887566566467, "learning_rate": 9.815543308020482e-06, "loss": 0.496, "step": 952 }, { "epoch": 0.5350926445816957, "grad_norm": 0.5442538261413574, "learning_rate": 9.814663075820018e-06, "loss": 0.4729, "step": 953 }, { "epoch": 0.5356541268950028, "grad_norm": 0.7387640476226807, "learning_rate": 9.813780788028528e-06, "loss": 0.4757, "step": 954 }, { "epoch": 0.5362156092083099, "grad_norm": 0.6327847838401794, "learning_rate": 9.812896445022699e-06, "loss": 0.442, "step": 955 }, { "epoch": 0.5367770915216171, "grad_norm": 0.5609349608421326, "learning_rate": 9.812010047180095e-06, "loss": 0.4564, "step": 956 }, { "epoch": 0.5373385738349242, "grad_norm": 0.6269439458847046, "learning_rate": 9.811121594879161e-06, "loss": 0.4344, "step": 957 }, { "epoch": 0.5379000561482313, "grad_norm": 0.8040373921394348, "learning_rate": 9.810231088499213e-06, "loss": 0.474, "step": 958 }, { "epoch": 0.5384615384615384, "grad_norm": 0.7410895228385925, "learning_rate": 9.809338528420449e-06, "loss": 0.4715, "step": 959 }, { "epoch": 0.5390230207748455, "grad_norm": 0.5630190372467041, "learning_rate": 9.808443915023942e-06, "loss": 0.4907, "step": 960 }, { "epoch": 0.5395845030881528, "grad_norm": 0.776658833026886, "learning_rate": 9.807547248691642e-06, "loss": 0.4745, "step": 961 }, { "epoch": 0.5401459854014599, "grad_norm": 0.6973573565483093, "learning_rate": 9.806648529806376e-06, "loss": 0.4396, "step": 962 }, { "epoch": 0.540707467714767, "grad_norm": 0.5954681634902954, "learning_rate": 9.805747758751843e-06, "loss": 0.4785, "step": 963 }, { "epoch": 0.5412689500280741, "grad_norm": 0.6725640892982483, "learning_rate": 9.804844935912624e-06, "loss": 0.4914, "step": 964 }, { "epoch": 0.5418304323413813, "grad_norm": 0.7291205525398254, "learning_rate": 9.803940061674175e-06, "loss": 0.4621, "step": 965 }, { "epoch": 0.5423919146546884, "grad_norm": 0.581109881401062, "learning_rate": 9.803033136422826e-06, "loss": 0.4537, "step": 966 }, { "epoch": 0.5429533969679955, "grad_norm": 0.6495437026023865, "learning_rate": 9.80212416054578e-06, "loss": 0.488, "step": 967 }, { "epoch": 0.5435148792813026, "grad_norm": 0.6344974637031555, "learning_rate": 9.801213134431123e-06, "loss": 0.4964, "step": 968 }, { "epoch": 0.5440763615946098, "grad_norm": 0.649075448513031, "learning_rate": 9.80030005846781e-06, "loss": 0.4476, "step": 969 }, { "epoch": 0.5446378439079169, "grad_norm": 0.5911461114883423, "learning_rate": 9.799384933045673e-06, "loss": 0.4595, "step": 970 }, { "epoch": 0.545199326221224, "grad_norm": 0.6093629002571106, "learning_rate": 9.79846775855542e-06, "loss": 0.4659, "step": 971 }, { "epoch": 0.5457608085345311, "grad_norm": 0.7152195572853088, "learning_rate": 9.797548535388632e-06, "loss": 0.4994, "step": 972 }, { "epoch": 0.5463222908478383, "grad_norm": 0.5251650214195251, "learning_rate": 9.796627263937768e-06, "loss": 0.4788, "step": 973 }, { "epoch": 0.5468837731611454, "grad_norm": 0.5499521493911743, "learning_rate": 9.795703944596155e-06, "loss": 0.4851, "step": 974 }, { "epoch": 0.5474452554744526, "grad_norm": 0.6222467422485352, "learning_rate": 9.794778577758003e-06, "loss": 0.4528, "step": 975 }, { "epoch": 0.5480067377877597, "grad_norm": 0.630702793598175, "learning_rate": 9.79385116381839e-06, "loss": 0.4693, "step": 976 }, { "epoch": 0.5485682201010668, "grad_norm": 0.5477985143661499, "learning_rate": 9.792921703173271e-06, "loss": 0.4548, "step": 977 }, { "epoch": 0.549129702414374, "grad_norm": 0.5784381628036499, "learning_rate": 9.79199019621947e-06, "loss": 0.4685, "step": 978 }, { "epoch": 0.5496911847276811, "grad_norm": 0.5996346473693848, "learning_rate": 9.791056643354691e-06, "loss": 0.4446, "step": 979 }, { "epoch": 0.5502526670409882, "grad_norm": 0.47539910674095154, "learning_rate": 9.790121044977508e-06, "loss": 0.446, "step": 980 }, { "epoch": 0.5508141493542953, "grad_norm": 0.5196050405502319, "learning_rate": 9.78918340148737e-06, "loss": 0.4244, "step": 981 }, { "epoch": 0.5513756316676025, "grad_norm": 0.614954948425293, "learning_rate": 9.788243713284595e-06, "loss": 0.4666, "step": 982 }, { "epoch": 0.5519371139809096, "grad_norm": 0.5443697571754456, "learning_rate": 9.787301980770383e-06, "loss": 0.4819, "step": 983 }, { "epoch": 0.5524985962942167, "grad_norm": 0.5372692346572876, "learning_rate": 9.786358204346795e-06, "loss": 0.4637, "step": 984 }, { "epoch": 0.5530600786075238, "grad_norm": 0.5645728707313538, "learning_rate": 9.785412384416773e-06, "loss": 0.4624, "step": 985 }, { "epoch": 0.553621560920831, "grad_norm": 0.5956112742424011, "learning_rate": 9.78446452138413e-06, "loss": 0.4606, "step": 986 }, { "epoch": 0.5541830432341381, "grad_norm": 0.5440046787261963, "learning_rate": 9.783514615653548e-06, "loss": 0.4721, "step": 987 }, { "epoch": 0.5547445255474452, "grad_norm": 0.6512797474861145, "learning_rate": 9.782562667630586e-06, "loss": 0.4615, "step": 988 }, { "epoch": 0.5553060078607523, "grad_norm": 0.520784854888916, "learning_rate": 9.781608677721671e-06, "loss": 0.4796, "step": 989 }, { "epoch": 0.5558674901740596, "grad_norm": 0.5884256362915039, "learning_rate": 9.780652646334103e-06, "loss": 0.4534, "step": 990 }, { "epoch": 0.5564289724873667, "grad_norm": 0.5586499571800232, "learning_rate": 9.779694573876056e-06, "loss": 0.5073, "step": 991 }, { "epoch": 0.5569904548006738, "grad_norm": 0.6196976900100708, "learning_rate": 9.77873446075657e-06, "loss": 0.468, "step": 992 }, { "epoch": 0.5575519371139809, "grad_norm": 0.5831805467605591, "learning_rate": 9.77777230738556e-06, "loss": 0.4732, "step": 993 }, { "epoch": 0.5581134194272881, "grad_norm": 0.5547516345977783, "learning_rate": 9.776808114173812e-06, "loss": 0.4547, "step": 994 }, { "epoch": 0.5586749017405952, "grad_norm": 0.6667516231536865, "learning_rate": 9.775841881532985e-06, "loss": 0.4449, "step": 995 }, { "epoch": 0.5592363840539023, "grad_norm": 0.5846383571624756, "learning_rate": 9.774873609875602e-06, "loss": 0.4828, "step": 996 }, { "epoch": 0.5597978663672094, "grad_norm": 0.5610396265983582, "learning_rate": 9.773903299615064e-06, "loss": 0.513, "step": 997 }, { "epoch": 0.5603593486805165, "grad_norm": 0.5821402668952942, "learning_rate": 9.772930951165636e-06, "loss": 0.5007, "step": 998 }, { "epoch": 0.5609208309938237, "grad_norm": 0.5649592280387878, "learning_rate": 9.77195656494246e-06, "loss": 0.4428, "step": 999 }, { "epoch": 0.5614823133071308, "grad_norm": 0.5460188388824463, "learning_rate": 9.770980141361539e-06, "loss": 0.4775, "step": 1000 }, { "epoch": 0.5620437956204379, "grad_norm": 0.65071702003479, "learning_rate": 9.770001680839756e-06, "loss": 0.46, "step": 1001 }, { "epoch": 0.562605277933745, "grad_norm": 0.50941401720047, "learning_rate": 9.769021183794859e-06, "loss": 0.439, "step": 1002 }, { "epoch": 0.5631667602470523, "grad_norm": 0.5456126928329468, "learning_rate": 9.768038650645459e-06, "loss": 0.4624, "step": 1003 }, { "epoch": 0.5637282425603594, "grad_norm": 0.5477569103240967, "learning_rate": 9.767054081811049e-06, "loss": 0.4807, "step": 1004 }, { "epoch": 0.5642897248736665, "grad_norm": 0.6268385052680969, "learning_rate": 9.766067477711982e-06, "loss": 0.4903, "step": 1005 }, { "epoch": 0.5648512071869736, "grad_norm": 0.5345587730407715, "learning_rate": 9.765078838769482e-06, "loss": 0.4403, "step": 1006 }, { "epoch": 0.5654126895002808, "grad_norm": 0.5900137424468994, "learning_rate": 9.764088165405645e-06, "loss": 0.4316, "step": 1007 }, { "epoch": 0.5659741718135879, "grad_norm": 0.5045136213302612, "learning_rate": 9.76309545804343e-06, "loss": 0.4478, "step": 1008 }, { "epoch": 0.566535654126895, "grad_norm": 0.5249596834182739, "learning_rate": 9.762100717106668e-06, "loss": 0.4441, "step": 1009 }, { "epoch": 0.5670971364402021, "grad_norm": 0.5868927836418152, "learning_rate": 9.761103943020061e-06, "loss": 0.4524, "step": 1010 }, { "epoch": 0.5676586187535093, "grad_norm": 0.5595040321350098, "learning_rate": 9.76010513620917e-06, "loss": 0.4662, "step": 1011 }, { "epoch": 0.5682201010668164, "grad_norm": 0.5449414253234863, "learning_rate": 9.759104297100432e-06, "loss": 0.4532, "step": 1012 }, { "epoch": 0.5687815833801235, "grad_norm": 0.653796911239624, "learning_rate": 9.75810142612115e-06, "loss": 0.4942, "step": 1013 }, { "epoch": 0.5693430656934306, "grad_norm": 0.5582761764526367, "learning_rate": 9.757096523699495e-06, "loss": 0.4428, "step": 1014 }, { "epoch": 0.5699045480067377, "grad_norm": 0.5760213732719421, "learning_rate": 9.7560895902645e-06, "loss": 0.4546, "step": 1015 }, { "epoch": 0.570466030320045, "grad_norm": 0.5375726819038391, "learning_rate": 9.755080626246072e-06, "loss": 0.4773, "step": 1016 }, { "epoch": 0.571027512633352, "grad_norm": 0.6005684733390808, "learning_rate": 9.754069632074981e-06, "loss": 0.4677, "step": 1017 }, { "epoch": 0.5715889949466592, "grad_norm": 0.5297277569770813, "learning_rate": 9.753056608182866e-06, "loss": 0.4629, "step": 1018 }, { "epoch": 0.5721504772599663, "grad_norm": 0.5412373542785645, "learning_rate": 9.75204155500223e-06, "loss": 0.4878, "step": 1019 }, { "epoch": 0.5727119595732735, "grad_norm": 0.5823129415512085, "learning_rate": 9.751024472966441e-06, "loss": 0.4557, "step": 1020 }, { "epoch": 0.5732734418865806, "grad_norm": 0.625793993473053, "learning_rate": 9.750005362509742e-06, "loss": 0.4854, "step": 1021 }, { "epoch": 0.5738349241998877, "grad_norm": 0.566156268119812, "learning_rate": 9.748984224067232e-06, "loss": 0.4834, "step": 1022 }, { "epoch": 0.5743964065131948, "grad_norm": 0.6327447295188904, "learning_rate": 9.747961058074882e-06, "loss": 0.4684, "step": 1023 }, { "epoch": 0.574957888826502, "grad_norm": 0.5784967541694641, "learning_rate": 9.746935864969524e-06, "loss": 0.4417, "step": 1024 }, { "epoch": 0.5755193711398091, "grad_norm": 0.5933436751365662, "learning_rate": 9.74590864518886e-06, "loss": 0.4869, "step": 1025 }, { "epoch": 0.5760808534531162, "grad_norm": 0.5754256248474121, "learning_rate": 9.744879399171454e-06, "loss": 0.4649, "step": 1026 }, { "epoch": 0.5766423357664233, "grad_norm": 0.5909319519996643, "learning_rate": 9.743848127356738e-06, "loss": 0.4619, "step": 1027 }, { "epoch": 0.5772038180797305, "grad_norm": 0.5287562012672424, "learning_rate": 9.742814830185004e-06, "loss": 0.4801, "step": 1028 }, { "epoch": 0.5777653003930376, "grad_norm": 0.5605972409248352, "learning_rate": 9.741779508097416e-06, "loss": 0.473, "step": 1029 }, { "epoch": 0.5783267827063447, "grad_norm": 0.6046238541603088, "learning_rate": 9.740742161535995e-06, "loss": 0.4726, "step": 1030 }, { "epoch": 0.5788882650196518, "grad_norm": 0.6328635811805725, "learning_rate": 9.739702790943631e-06, "loss": 0.4466, "step": 1031 }, { "epoch": 0.5794497473329591, "grad_norm": 0.6207427978515625, "learning_rate": 9.738661396764077e-06, "loss": 0.4723, "step": 1032 }, { "epoch": 0.5800112296462662, "grad_norm": 0.5776227116584778, "learning_rate": 9.737617979441952e-06, "loss": 0.4781, "step": 1033 }, { "epoch": 0.5805727119595733, "grad_norm": 0.6213696599006653, "learning_rate": 9.736572539422733e-06, "loss": 0.4653, "step": 1034 }, { "epoch": 0.5811341942728804, "grad_norm": 0.7408880591392517, "learning_rate": 9.735525077152765e-06, "loss": 0.4551, "step": 1035 }, { "epoch": 0.5816956765861875, "grad_norm": 0.5356180667877197, "learning_rate": 9.734475593079259e-06, "loss": 0.4143, "step": 1036 }, { "epoch": 0.5822571588994947, "grad_norm": 0.6045482158660889, "learning_rate": 9.733424087650283e-06, "loss": 0.4625, "step": 1037 }, { "epoch": 0.5828186412128018, "grad_norm": 0.7720041871070862, "learning_rate": 9.73237056131477e-06, "loss": 0.4457, "step": 1038 }, { "epoch": 0.5833801235261089, "grad_norm": 0.5324476361274719, "learning_rate": 9.731315014522518e-06, "loss": 0.4698, "step": 1039 }, { "epoch": 0.583941605839416, "grad_norm": 0.677348792552948, "learning_rate": 9.730257447724185e-06, "loss": 0.4463, "step": 1040 }, { "epoch": 0.5845030881527232, "grad_norm": 0.6087179183959961, "learning_rate": 9.729197861371296e-06, "loss": 0.4773, "step": 1041 }, { "epoch": 0.5850645704660303, "grad_norm": 0.5901072025299072, "learning_rate": 9.72813625591623e-06, "loss": 0.4869, "step": 1042 }, { "epoch": 0.5856260527793374, "grad_norm": 0.6505506038665771, "learning_rate": 9.727072631812238e-06, "loss": 0.4883, "step": 1043 }, { "epoch": 0.5861875350926445, "grad_norm": 0.5609641075134277, "learning_rate": 9.726006989513424e-06, "loss": 0.4422, "step": 1044 }, { "epoch": 0.5867490174059518, "grad_norm": 0.5764316916465759, "learning_rate": 9.72493932947476e-06, "loss": 0.4552, "step": 1045 }, { "epoch": 0.5873104997192589, "grad_norm": 0.6611195802688599, "learning_rate": 9.723869652152076e-06, "loss": 0.5029, "step": 1046 }, { "epoch": 0.587871982032566, "grad_norm": 0.7669707536697388, "learning_rate": 9.722797958002064e-06, "loss": 0.4839, "step": 1047 }, { "epoch": 0.5884334643458731, "grad_norm": 0.5435432195663452, "learning_rate": 9.721724247482276e-06, "loss": 0.4324, "step": 1048 }, { "epoch": 0.5889949466591803, "grad_norm": 0.6291168928146362, "learning_rate": 9.72064852105113e-06, "loss": 0.4683, "step": 1049 }, { "epoch": 0.5895564289724874, "grad_norm": 0.6352596879005432, "learning_rate": 9.719570779167896e-06, "loss": 0.4332, "step": 1050 }, { "epoch": 0.5901179112857945, "grad_norm": 0.5494540333747864, "learning_rate": 9.718491022292713e-06, "loss": 0.4614, "step": 1051 }, { "epoch": 0.5906793935991016, "grad_norm": 0.5602513551712036, "learning_rate": 9.717409250886576e-06, "loss": 0.468, "step": 1052 }, { "epoch": 0.5912408759124088, "grad_norm": 0.5561378598213196, "learning_rate": 9.716325465411339e-06, "loss": 0.4525, "step": 1053 }, { "epoch": 0.5918023582257159, "grad_norm": 0.5595511198043823, "learning_rate": 9.715239666329718e-06, "loss": 0.4291, "step": 1054 }, { "epoch": 0.592363840539023, "grad_norm": 0.565251886844635, "learning_rate": 9.71415185410529e-06, "loss": 0.4537, "step": 1055 }, { "epoch": 0.5929253228523301, "grad_norm": 0.5593476295471191, "learning_rate": 9.713062029202487e-06, "loss": 0.433, "step": 1056 }, { "epoch": 0.5934868051656372, "grad_norm": 0.603686511516571, "learning_rate": 9.711970192086606e-06, "loss": 0.4461, "step": 1057 }, { "epoch": 0.5940482874789444, "grad_norm": 0.5838700532913208, "learning_rate": 9.7108763432238e-06, "loss": 0.4797, "step": 1058 }, { "epoch": 0.5946097697922516, "grad_norm": 0.5620068907737732, "learning_rate": 9.709780483081079e-06, "loss": 0.4768, "step": 1059 }, { "epoch": 0.5951712521055587, "grad_norm": 0.5839632749557495, "learning_rate": 9.708682612126315e-06, "loss": 0.4714, "step": 1060 }, { "epoch": 0.5957327344188658, "grad_norm": 0.5421780943870544, "learning_rate": 9.707582730828237e-06, "loss": 0.4315, "step": 1061 }, { "epoch": 0.596294216732173, "grad_norm": 0.6071469783782959, "learning_rate": 9.706480839656433e-06, "loss": 0.4569, "step": 1062 }, { "epoch": 0.5968556990454801, "grad_norm": 0.5571103692054749, "learning_rate": 9.705376939081347e-06, "loss": 0.4464, "step": 1063 }, { "epoch": 0.5974171813587872, "grad_norm": 0.5960492491722107, "learning_rate": 9.704271029574288e-06, "loss": 0.4527, "step": 1064 }, { "epoch": 0.5979786636720943, "grad_norm": 0.5700828433036804, "learning_rate": 9.70316311160741e-06, "loss": 0.477, "step": 1065 }, { "epoch": 0.5985401459854015, "grad_norm": 0.6553433537483215, "learning_rate": 9.702053185653736e-06, "loss": 0.4681, "step": 1066 }, { "epoch": 0.5991016282987086, "grad_norm": 0.5753922462463379, "learning_rate": 9.700941252187142e-06, "loss": 0.4888, "step": 1067 }, { "epoch": 0.5996631106120157, "grad_norm": 0.555130124092102, "learning_rate": 9.699827311682362e-06, "loss": 0.4751, "step": 1068 }, { "epoch": 0.6002245929253228, "grad_norm": 0.5406093597412109, "learning_rate": 9.698711364614982e-06, "loss": 0.4435, "step": 1069 }, { "epoch": 0.60078607523863, "grad_norm": 0.5663029551506042, "learning_rate": 9.697593411461453e-06, "loss": 0.43, "step": 1070 }, { "epoch": 0.6013475575519371, "grad_norm": 0.5444298982620239, "learning_rate": 9.696473452699075e-06, "loss": 0.4744, "step": 1071 }, { "epoch": 0.6019090398652442, "grad_norm": 0.5286763310432434, "learning_rate": 9.695351488806012e-06, "loss": 0.4676, "step": 1072 }, { "epoch": 0.6024705221785513, "grad_norm": 0.639815092086792, "learning_rate": 9.694227520261275e-06, "loss": 0.5137, "step": 1073 }, { "epoch": 0.6030320044918585, "grad_norm": 0.5698044300079346, "learning_rate": 9.693101547544738e-06, "loss": 0.455, "step": 1074 }, { "epoch": 0.6035934868051657, "grad_norm": 0.6056145429611206, "learning_rate": 9.691973571137128e-06, "loss": 0.4625, "step": 1075 }, { "epoch": 0.6041549691184728, "grad_norm": 0.5454999208450317, "learning_rate": 9.690843591520027e-06, "loss": 0.4623, "step": 1076 }, { "epoch": 0.6047164514317799, "grad_norm": 0.5458486080169678, "learning_rate": 9.689711609175874e-06, "loss": 0.4757, "step": 1077 }, { "epoch": 0.605277933745087, "grad_norm": 0.5272732377052307, "learning_rate": 9.688577624587961e-06, "loss": 0.434, "step": 1078 }, { "epoch": 0.6058394160583942, "grad_norm": 0.5480579137802124, "learning_rate": 9.687441638240434e-06, "loss": 0.4514, "step": 1079 }, { "epoch": 0.6064008983717013, "grad_norm": 0.5277768969535828, "learning_rate": 9.686303650618298e-06, "loss": 0.4344, "step": 1080 }, { "epoch": 0.6069623806850084, "grad_norm": 0.5433816313743591, "learning_rate": 9.685163662207411e-06, "loss": 0.4448, "step": 1081 }, { "epoch": 0.6075238629983155, "grad_norm": 0.5431236624717712, "learning_rate": 9.684021673494482e-06, "loss": 0.4757, "step": 1082 }, { "epoch": 0.6080853453116227, "grad_norm": 0.6209035515785217, "learning_rate": 9.682877684967076e-06, "loss": 0.5037, "step": 1083 }, { "epoch": 0.6086468276249298, "grad_norm": 0.4893648624420166, "learning_rate": 9.681731697113612e-06, "loss": 0.4651, "step": 1084 }, { "epoch": 0.6092083099382369, "grad_norm": 0.5628235340118408, "learning_rate": 9.680583710423364e-06, "loss": 0.4504, "step": 1085 }, { "epoch": 0.609769792251544, "grad_norm": 0.5369555354118347, "learning_rate": 9.679433725386456e-06, "loss": 0.4495, "step": 1086 }, { "epoch": 0.6103312745648513, "grad_norm": 0.5724145770072937, "learning_rate": 9.678281742493867e-06, "loss": 0.4633, "step": 1087 }, { "epoch": 0.6108927568781584, "grad_norm": 0.5574279427528381, "learning_rate": 9.67712776223743e-06, "loss": 0.4771, "step": 1088 }, { "epoch": 0.6114542391914655, "grad_norm": 0.61083984375, "learning_rate": 9.675971785109832e-06, "loss": 0.4408, "step": 1089 }, { "epoch": 0.6120157215047726, "grad_norm": 0.5071173310279846, "learning_rate": 9.674813811604608e-06, "loss": 0.4612, "step": 1090 }, { "epoch": 0.6125772038180798, "grad_norm": 0.48119714856147766, "learning_rate": 9.673653842216147e-06, "loss": 0.4577, "step": 1091 }, { "epoch": 0.6131386861313869, "grad_norm": 0.45652469992637634, "learning_rate": 9.672491877439692e-06, "loss": 0.4553, "step": 1092 }, { "epoch": 0.613700168444694, "grad_norm": 0.586669921875, "learning_rate": 9.671327917771336e-06, "loss": 0.4648, "step": 1093 }, { "epoch": 0.6142616507580011, "grad_norm": 0.576414942741394, "learning_rate": 9.670161963708025e-06, "loss": 0.458, "step": 1094 }, { "epoch": 0.6148231330713082, "grad_norm": 0.5987989902496338, "learning_rate": 9.668994015747557e-06, "loss": 0.4711, "step": 1095 }, { "epoch": 0.6153846153846154, "grad_norm": 0.5615461468696594, "learning_rate": 9.667824074388578e-06, "loss": 0.4761, "step": 1096 }, { "epoch": 0.6159460976979225, "grad_norm": 0.6199878454208374, "learning_rate": 9.66665214013059e-06, "loss": 0.4763, "step": 1097 }, { "epoch": 0.6165075800112296, "grad_norm": 0.5992336869239807, "learning_rate": 9.665478213473942e-06, "loss": 0.4774, "step": 1098 }, { "epoch": 0.6170690623245367, "grad_norm": 0.5347209572792053, "learning_rate": 9.664302294919832e-06, "loss": 0.4631, "step": 1099 }, { "epoch": 0.617630544637844, "grad_norm": 0.5263800621032715, "learning_rate": 9.663124384970318e-06, "loss": 0.421, "step": 1100 }, { "epoch": 0.618192026951151, "grad_norm": 0.5808508396148682, "learning_rate": 9.661944484128293e-06, "loss": 0.471, "step": 1101 }, { "epoch": 0.6187535092644582, "grad_norm": 0.6286934018135071, "learning_rate": 9.660762592897515e-06, "loss": 0.4615, "step": 1102 }, { "epoch": 0.6193149915777653, "grad_norm": 0.5730308890342712, "learning_rate": 9.659578711782583e-06, "loss": 0.4839, "step": 1103 }, { "epoch": 0.6198764738910725, "grad_norm": 0.579643726348877, "learning_rate": 9.658392841288948e-06, "loss": 0.4326, "step": 1104 }, { "epoch": 0.6204379562043796, "grad_norm": 0.7626729011535645, "learning_rate": 9.657204981922909e-06, "loss": 0.4755, "step": 1105 }, { "epoch": 0.6209994385176867, "grad_norm": 0.6715488433837891, "learning_rate": 9.656015134191618e-06, "loss": 0.4457, "step": 1106 }, { "epoch": 0.6215609208309938, "grad_norm": 0.616806149482727, "learning_rate": 9.65482329860307e-06, "loss": 0.4619, "step": 1107 }, { "epoch": 0.622122403144301, "grad_norm": 0.7228384017944336, "learning_rate": 9.653629475666115e-06, "loss": 0.4629, "step": 1108 }, { "epoch": 0.6226838854576081, "grad_norm": 0.7238160967826843, "learning_rate": 9.652433665890447e-06, "loss": 0.4932, "step": 1109 }, { "epoch": 0.6232453677709152, "grad_norm": 0.5795125365257263, "learning_rate": 9.651235869786608e-06, "loss": 0.4512, "step": 1110 }, { "epoch": 0.6238068500842223, "grad_norm": 0.6132017970085144, "learning_rate": 9.650036087865993e-06, "loss": 0.424, "step": 1111 }, { "epoch": 0.6243683323975294, "grad_norm": 0.5222129225730896, "learning_rate": 9.648834320640839e-06, "loss": 0.4634, "step": 1112 }, { "epoch": 0.6249298147108366, "grad_norm": 0.5482558608055115, "learning_rate": 9.647630568624233e-06, "loss": 0.4485, "step": 1113 }, { "epoch": 0.6254912970241437, "grad_norm": 0.5444743037223816, "learning_rate": 9.646424832330112e-06, "loss": 0.4634, "step": 1114 }, { "epoch": 0.6260527793374508, "grad_norm": 0.6434916853904724, "learning_rate": 9.645217112273257e-06, "loss": 0.4665, "step": 1115 }, { "epoch": 0.626614261650758, "grad_norm": 0.5776771306991577, "learning_rate": 9.644007408969295e-06, "loss": 0.4759, "step": 1116 }, { "epoch": 0.6271757439640652, "grad_norm": 0.5524722933769226, "learning_rate": 9.642795722934702e-06, "loss": 0.439, "step": 1117 }, { "epoch": 0.6277372262773723, "grad_norm": 0.5447648763656616, "learning_rate": 9.641582054686802e-06, "loss": 0.4425, "step": 1118 }, { "epoch": 0.6282987085906794, "grad_norm": 0.5822136998176575, "learning_rate": 9.640366404743762e-06, "loss": 0.4828, "step": 1119 }, { "epoch": 0.6288601909039865, "grad_norm": 0.5383198857307434, "learning_rate": 9.639148773624597e-06, "loss": 0.436, "step": 1120 }, { "epoch": 0.6294216732172937, "grad_norm": 0.582713782787323, "learning_rate": 9.637929161849164e-06, "loss": 0.4674, "step": 1121 }, { "epoch": 0.6299831555306008, "grad_norm": 0.5690926313400269, "learning_rate": 9.636707569938171e-06, "loss": 0.4791, "step": 1122 }, { "epoch": 0.6305446378439079, "grad_norm": 0.603235125541687, "learning_rate": 9.635483998413171e-06, "loss": 0.5139, "step": 1123 }, { "epoch": 0.631106120157215, "grad_norm": 0.5586082339286804, "learning_rate": 9.634258447796558e-06, "loss": 0.4771, "step": 1124 }, { "epoch": 0.6316676024705222, "grad_norm": 0.5353853106498718, "learning_rate": 9.633030918611574e-06, "loss": 0.4563, "step": 1125 }, { "epoch": 0.6322290847838293, "grad_norm": 0.5715048313140869, "learning_rate": 9.631801411382306e-06, "loss": 0.4825, "step": 1126 }, { "epoch": 0.6327905670971364, "grad_norm": 0.5280632376670837, "learning_rate": 9.630569926633682e-06, "loss": 0.4413, "step": 1127 }, { "epoch": 0.6333520494104435, "grad_norm": 0.558343231678009, "learning_rate": 9.629336464891481e-06, "loss": 0.456, "step": 1128 }, { "epoch": 0.6339135317237508, "grad_norm": 0.5541366934776306, "learning_rate": 9.628101026682317e-06, "loss": 0.45, "step": 1129 }, { "epoch": 0.6344750140370579, "grad_norm": 0.5903697609901428, "learning_rate": 9.626863612533658e-06, "loss": 0.4596, "step": 1130 }, { "epoch": 0.635036496350365, "grad_norm": 0.5170342922210693, "learning_rate": 9.625624222973807e-06, "loss": 0.4493, "step": 1131 }, { "epoch": 0.6355979786636721, "grad_norm": 0.6377504467964172, "learning_rate": 9.624382858531916e-06, "loss": 0.4578, "step": 1132 }, { "epoch": 0.6361594609769792, "grad_norm": 0.5181000828742981, "learning_rate": 9.623139519737974e-06, "loss": 0.4464, "step": 1133 }, { "epoch": 0.6367209432902864, "grad_norm": 0.7008993029594421, "learning_rate": 9.621894207122822e-06, "loss": 0.4513, "step": 1134 }, { "epoch": 0.6372824256035935, "grad_norm": 0.5334010124206543, "learning_rate": 9.620646921218134e-06, "loss": 0.4663, "step": 1135 }, { "epoch": 0.6378439079169006, "grad_norm": 0.5991860628128052, "learning_rate": 9.619397662556434e-06, "loss": 0.4803, "step": 1136 }, { "epoch": 0.6384053902302077, "grad_norm": 0.6661640405654907, "learning_rate": 9.618146431671086e-06, "loss": 0.4534, "step": 1137 }, { "epoch": 0.6389668725435149, "grad_norm": 0.5136328935623169, "learning_rate": 9.616893229096293e-06, "loss": 0.4698, "step": 1138 }, { "epoch": 0.639528354856822, "grad_norm": 0.5733746886253357, "learning_rate": 9.615638055367104e-06, "loss": 0.4213, "step": 1139 }, { "epoch": 0.6400898371701291, "grad_norm": 0.5699252486228943, "learning_rate": 9.614380911019406e-06, "loss": 0.4575, "step": 1140 }, { "epoch": 0.6406513194834362, "grad_norm": 0.5900522470474243, "learning_rate": 9.61312179658993e-06, "loss": 0.4219, "step": 1141 }, { "epoch": 0.6412128017967434, "grad_norm": 0.594200849533081, "learning_rate": 9.61186071261625e-06, "loss": 0.4446, "step": 1142 }, { "epoch": 0.6417742841100506, "grad_norm": 0.5775567293167114, "learning_rate": 9.610597659636773e-06, "loss": 0.4578, "step": 1143 }, { "epoch": 0.6423357664233577, "grad_norm": 0.6124618649482727, "learning_rate": 9.609332638190754e-06, "loss": 0.4782, "step": 1144 }, { "epoch": 0.6428972487366648, "grad_norm": 0.5934448838233948, "learning_rate": 9.608065648818287e-06, "loss": 0.4577, "step": 1145 }, { "epoch": 0.643458731049972, "grad_norm": 0.6391990780830383, "learning_rate": 9.606796692060306e-06, "loss": 0.4223, "step": 1146 }, { "epoch": 0.6440202133632791, "grad_norm": 0.5326588749885559, "learning_rate": 9.605525768458583e-06, "loss": 0.4591, "step": 1147 }, { "epoch": 0.6445816956765862, "grad_norm": 0.596813976764679, "learning_rate": 9.60425287855573e-06, "loss": 0.4715, "step": 1148 }, { "epoch": 0.6451431779898933, "grad_norm": 0.5741242170333862, "learning_rate": 9.602978022895201e-06, "loss": 0.4459, "step": 1149 }, { "epoch": 0.6457046603032004, "grad_norm": 0.6231598854064941, "learning_rate": 9.601701202021287e-06, "loss": 0.4843, "step": 1150 }, { "epoch": 0.6462661426165076, "grad_norm": 0.5514549016952515, "learning_rate": 9.600422416479121e-06, "loss": 0.4253, "step": 1151 }, { "epoch": 0.6468276249298147, "grad_norm": 0.5814647674560547, "learning_rate": 9.599141666814671e-06, "loss": 0.4778, "step": 1152 }, { "epoch": 0.6473891072431218, "grad_norm": 0.7013500928878784, "learning_rate": 9.597858953574745e-06, "loss": 0.4926, "step": 1153 }, { "epoch": 0.6479505895564289, "grad_norm": 0.5510992407798767, "learning_rate": 9.596574277306992e-06, "loss": 0.4547, "step": 1154 }, { "epoch": 0.6485120718697361, "grad_norm": 0.5795218348503113, "learning_rate": 9.595287638559891e-06, "loss": 0.4743, "step": 1155 }, { "epoch": 0.6490735541830432, "grad_norm": 0.5627750158309937, "learning_rate": 9.593999037882772e-06, "loss": 0.4597, "step": 1156 }, { "epoch": 0.6496350364963503, "grad_norm": 0.5246656537055969, "learning_rate": 9.592708475825789e-06, "loss": 0.482, "step": 1157 }, { "epoch": 0.6501965188096575, "grad_norm": 0.5935583710670471, "learning_rate": 9.591415952939944e-06, "loss": 0.4626, "step": 1158 }, { "epoch": 0.6507580011229647, "grad_norm": 0.52081298828125, "learning_rate": 9.59012146977707e-06, "loss": 0.4631, "step": 1159 }, { "epoch": 0.6513194834362718, "grad_norm": 0.5512584447860718, "learning_rate": 9.588825026889836e-06, "loss": 0.4774, "step": 1160 }, { "epoch": 0.6518809657495789, "grad_norm": 0.6117110252380371, "learning_rate": 9.587526624831755e-06, "loss": 0.4501, "step": 1161 }, { "epoch": 0.652442448062886, "grad_norm": 0.5450863838195801, "learning_rate": 9.58622626415717e-06, "loss": 0.4533, "step": 1162 }, { "epoch": 0.6530039303761932, "grad_norm": 0.5933632254600525, "learning_rate": 9.58492394542126e-06, "loss": 0.4668, "step": 1163 }, { "epoch": 0.6535654126895003, "grad_norm": 0.4940392076969147, "learning_rate": 9.583619669180046e-06, "loss": 0.4926, "step": 1164 }, { "epoch": 0.6541268950028074, "grad_norm": 0.5404666066169739, "learning_rate": 9.582313435990377e-06, "loss": 0.4517, "step": 1165 }, { "epoch": 0.6546883773161145, "grad_norm": 0.5048272013664246, "learning_rate": 9.581005246409945e-06, "loss": 0.435, "step": 1166 }, { "epoch": 0.6552498596294217, "grad_norm": 0.5008298754692078, "learning_rate": 9.57969510099727e-06, "loss": 0.4528, "step": 1167 }, { "epoch": 0.6558113419427288, "grad_norm": 0.5899841785430908, "learning_rate": 9.578383000311713e-06, "loss": 0.4889, "step": 1168 }, { "epoch": 0.6563728242560359, "grad_norm": 0.48561111092567444, "learning_rate": 9.577068944913465e-06, "loss": 0.4323, "step": 1169 }, { "epoch": 0.656934306569343, "grad_norm": 0.5683648586273193, "learning_rate": 9.575752935363557e-06, "loss": 0.4402, "step": 1170 }, { "epoch": 0.6574957888826501, "grad_norm": 0.5526260137557983, "learning_rate": 9.57443497222385e-06, "loss": 0.4437, "step": 1171 }, { "epoch": 0.6580572711959574, "grad_norm": 0.5811257362365723, "learning_rate": 9.573115056057038e-06, "loss": 0.4708, "step": 1172 }, { "epoch": 0.6586187535092645, "grad_norm": 0.5578041672706604, "learning_rate": 9.571793187426654e-06, "loss": 0.4454, "step": 1173 }, { "epoch": 0.6591802358225716, "grad_norm": 0.5278154611587524, "learning_rate": 9.570469366897062e-06, "loss": 0.437, "step": 1174 }, { "epoch": 0.6597417181358787, "grad_norm": 0.5925083160400391, "learning_rate": 9.569143595033456e-06, "loss": 0.4862, "step": 1175 }, { "epoch": 0.6603032004491859, "grad_norm": 0.5719252228736877, "learning_rate": 9.567815872401871e-06, "loss": 0.4797, "step": 1176 }, { "epoch": 0.660864682762493, "grad_norm": 0.5118212699890137, "learning_rate": 9.566486199569166e-06, "loss": 0.4352, "step": 1177 }, { "epoch": 0.6614261650758001, "grad_norm": 0.605605959892273, "learning_rate": 9.565154577103038e-06, "loss": 0.471, "step": 1178 }, { "epoch": 0.6619876473891072, "grad_norm": 0.5905898213386536, "learning_rate": 9.563821005572014e-06, "loss": 0.4647, "step": 1179 }, { "epoch": 0.6625491297024144, "grad_norm": 0.607215166091919, "learning_rate": 9.562485485545456e-06, "loss": 0.4628, "step": 1180 }, { "epoch": 0.6631106120157215, "grad_norm": 0.5246893167495728, "learning_rate": 9.561148017593556e-06, "loss": 0.4155, "step": 1181 }, { "epoch": 0.6636720943290286, "grad_norm": 0.7411627769470215, "learning_rate": 9.559808602287337e-06, "loss": 0.503, "step": 1182 }, { "epoch": 0.6642335766423357, "grad_norm": 0.5923908352851868, "learning_rate": 9.558467240198655e-06, "loss": 0.4421, "step": 1183 }, { "epoch": 0.664795058955643, "grad_norm": 0.5434683561325073, "learning_rate": 9.557123931900194e-06, "loss": 0.4678, "step": 1184 }, { "epoch": 0.66535654126895, "grad_norm": 0.7292578816413879, "learning_rate": 9.555778677965475e-06, "loss": 0.428, "step": 1185 }, { "epoch": 0.6659180235822572, "grad_norm": 0.5184193849563599, "learning_rate": 9.554431478968842e-06, "loss": 0.4682, "step": 1186 }, { "epoch": 0.6664795058955643, "grad_norm": 0.5059313178062439, "learning_rate": 9.553082335485477e-06, "loss": 0.4823, "step": 1187 }, { "epoch": 0.6670409882088714, "grad_norm": 0.6051994562149048, "learning_rate": 9.551731248091387e-06, "loss": 0.4823, "step": 1188 }, { "epoch": 0.6676024705221786, "grad_norm": 0.5834611058235168, "learning_rate": 9.55037821736341e-06, "loss": 0.4415, "step": 1189 }, { "epoch": 0.6681639528354857, "grad_norm": 0.6317632794380188, "learning_rate": 9.549023243879214e-06, "loss": 0.4623, "step": 1190 }, { "epoch": 0.6687254351487928, "grad_norm": 0.5633893013000488, "learning_rate": 9.547666328217299e-06, "loss": 0.4581, "step": 1191 }, { "epoch": 0.6692869174620999, "grad_norm": 0.5855648517608643, "learning_rate": 9.54630747095699e-06, "loss": 0.4534, "step": 1192 }, { "epoch": 0.6698483997754071, "grad_norm": 0.5339756608009338, "learning_rate": 9.544946672678442e-06, "loss": 0.4453, "step": 1193 }, { "epoch": 0.6704098820887142, "grad_norm": 0.5621201395988464, "learning_rate": 9.54358393396264e-06, "loss": 0.4712, "step": 1194 }, { "epoch": 0.6709713644020213, "grad_norm": 0.5428036451339722, "learning_rate": 9.5422192553914e-06, "loss": 0.4683, "step": 1195 }, { "epoch": 0.6715328467153284, "grad_norm": 0.5228599905967712, "learning_rate": 9.54085263754736e-06, "loss": 0.4434, "step": 1196 }, { "epoch": 0.6720943290286356, "grad_norm": 0.6405354142189026, "learning_rate": 9.539484081013987e-06, "loss": 0.4786, "step": 1197 }, { "epoch": 0.6726558113419427, "grad_norm": 0.5819910764694214, "learning_rate": 9.538113586375585e-06, "loss": 0.4554, "step": 1198 }, { "epoch": 0.6732172936552498, "grad_norm": 0.5222212076187134, "learning_rate": 9.536741154217273e-06, "loss": 0.4575, "step": 1199 }, { "epoch": 0.673778775968557, "grad_norm": 0.4966205656528473, "learning_rate": 9.535366785125002e-06, "loss": 0.4261, "step": 1200 }, { "epoch": 0.6743402582818642, "grad_norm": 0.560589075088501, "learning_rate": 9.533990479685552e-06, "loss": 0.4804, "step": 1201 }, { "epoch": 0.6749017405951713, "grad_norm": 0.5509025454521179, "learning_rate": 9.532612238486531e-06, "loss": 0.471, "step": 1202 }, { "epoch": 0.6754632229084784, "grad_norm": 0.6125383973121643, "learning_rate": 9.531232062116365e-06, "loss": 0.4976, "step": 1203 }, { "epoch": 0.6760247052217855, "grad_norm": 0.6587722301483154, "learning_rate": 9.529849951164317e-06, "loss": 0.4697, "step": 1204 }, { "epoch": 0.6765861875350927, "grad_norm": 0.5240868926048279, "learning_rate": 9.528465906220468e-06, "loss": 0.4908, "step": 1205 }, { "epoch": 0.6771476698483998, "grad_norm": 0.6025630235671997, "learning_rate": 9.527079927875728e-06, "loss": 0.4656, "step": 1206 }, { "epoch": 0.6777091521617069, "grad_norm": 0.5719497799873352, "learning_rate": 9.525692016721834e-06, "loss": 0.4437, "step": 1207 }, { "epoch": 0.678270634475014, "grad_norm": 0.5673864483833313, "learning_rate": 9.524302173351342e-06, "loss": 0.4541, "step": 1208 }, { "epoch": 0.6788321167883211, "grad_norm": 0.5462242364883423, "learning_rate": 9.52291039835764e-06, "loss": 0.4949, "step": 1209 }, { "epoch": 0.6793935991016283, "grad_norm": 0.6730013489723206, "learning_rate": 9.52151669233494e-06, "loss": 0.4855, "step": 1210 }, { "epoch": 0.6799550814149354, "grad_norm": 0.6641866564750671, "learning_rate": 9.520121055878272e-06, "loss": 0.4701, "step": 1211 }, { "epoch": 0.6805165637282425, "grad_norm": 0.6068594455718994, "learning_rate": 9.518723489583496e-06, "loss": 0.496, "step": 1212 }, { "epoch": 0.6810780460415496, "grad_norm": 0.7079132795333862, "learning_rate": 9.517323994047293e-06, "loss": 0.4489, "step": 1213 }, { "epoch": 0.6816395283548569, "grad_norm": 0.7262161374092102, "learning_rate": 9.515922569867174e-06, "loss": 0.4676, "step": 1214 }, { "epoch": 0.682201010668164, "grad_norm": 0.527073323726654, "learning_rate": 9.514519217641466e-06, "loss": 0.4717, "step": 1215 }, { "epoch": 0.6827624929814711, "grad_norm": 0.6086965799331665, "learning_rate": 9.513113937969318e-06, "loss": 0.4664, "step": 1216 }, { "epoch": 0.6833239752947782, "grad_norm": 0.5838804244995117, "learning_rate": 9.511706731450708e-06, "loss": 0.4481, "step": 1217 }, { "epoch": 0.6838854576080854, "grad_norm": 0.6770086884498596, "learning_rate": 9.510297598686436e-06, "loss": 0.4638, "step": 1218 }, { "epoch": 0.6844469399213925, "grad_norm": 0.48718568682670593, "learning_rate": 9.508886540278122e-06, "loss": 0.4351, "step": 1219 }, { "epoch": 0.6850084222346996, "grad_norm": 0.5561348795890808, "learning_rate": 9.507473556828205e-06, "loss": 0.4745, "step": 1220 }, { "epoch": 0.6855699045480067, "grad_norm": 0.8328141570091248, "learning_rate": 9.506058648939958e-06, "loss": 0.4889, "step": 1221 }, { "epoch": 0.6861313868613139, "grad_norm": 0.5892333984375, "learning_rate": 9.504641817217458e-06, "loss": 0.472, "step": 1222 }, { "epoch": 0.686692869174621, "grad_norm": 0.6542025208473206, "learning_rate": 9.50322306226562e-06, "loss": 0.5071, "step": 1223 }, { "epoch": 0.6872543514879281, "grad_norm": 0.5862738490104675, "learning_rate": 9.501802384690167e-06, "loss": 0.4803, "step": 1224 }, { "epoch": 0.6878158338012352, "grad_norm": 0.595966637134552, "learning_rate": 9.500379785097651e-06, "loss": 0.4643, "step": 1225 }, { "epoch": 0.6883773161145423, "grad_norm": 0.5346269011497498, "learning_rate": 9.498955264095442e-06, "loss": 0.4625, "step": 1226 }, { "epoch": 0.6889387984278496, "grad_norm": 0.6767905354499817, "learning_rate": 9.497528822291733e-06, "loss": 0.4654, "step": 1227 }, { "epoch": 0.6895002807411567, "grad_norm": 0.5100619792938232, "learning_rate": 9.496100460295529e-06, "loss": 0.4226, "step": 1228 }, { "epoch": 0.6900617630544638, "grad_norm": 0.559453547000885, "learning_rate": 9.494670178716668e-06, "loss": 0.4696, "step": 1229 }, { "epoch": 0.6906232453677709, "grad_norm": 0.7481603026390076, "learning_rate": 9.493237978165794e-06, "loss": 0.4971, "step": 1230 }, { "epoch": 0.6911847276810781, "grad_norm": 0.5388820171356201, "learning_rate": 9.491803859254378e-06, "loss": 0.4689, "step": 1231 }, { "epoch": 0.6917462099943852, "grad_norm": 0.6786739826202393, "learning_rate": 9.49036782259471e-06, "loss": 0.4671, "step": 1232 }, { "epoch": 0.6923076923076923, "grad_norm": 0.6215696930885315, "learning_rate": 9.488929868799894e-06, "loss": 0.4782, "step": 1233 }, { "epoch": 0.6928691746209994, "grad_norm": 0.7870856523513794, "learning_rate": 9.487489998483858e-06, "loss": 0.4627, "step": 1234 }, { "epoch": 0.6934306569343066, "grad_norm": 0.7406066656112671, "learning_rate": 9.486048212261346e-06, "loss": 0.4737, "step": 1235 }, { "epoch": 0.6939921392476137, "grad_norm": 0.5845910906791687, "learning_rate": 9.484604510747918e-06, "loss": 0.4868, "step": 1236 }, { "epoch": 0.6945536215609208, "grad_norm": 0.8205158710479736, "learning_rate": 9.483158894559953e-06, "loss": 0.4545, "step": 1237 }, { "epoch": 0.6951151038742279, "grad_norm": 0.6251784563064575, "learning_rate": 9.481711364314654e-06, "loss": 0.4372, "step": 1238 }, { "epoch": 0.6956765861875351, "grad_norm": 0.764625608921051, "learning_rate": 9.480261920630027e-06, "loss": 0.426, "step": 1239 }, { "epoch": 0.6962380685008422, "grad_norm": 0.6412047147750854, "learning_rate": 9.478810564124907e-06, "loss": 0.4672, "step": 1240 }, { "epoch": 0.6967995508141493, "grad_norm": 0.7032860517501831, "learning_rate": 9.477357295418943e-06, "loss": 0.4746, "step": 1241 }, { "epoch": 0.6973610331274565, "grad_norm": 0.6651809215545654, "learning_rate": 9.475902115132597e-06, "loss": 0.4556, "step": 1242 }, { "epoch": 0.6979225154407637, "grad_norm": 0.5158226490020752, "learning_rate": 9.474445023887149e-06, "loss": 0.4539, "step": 1243 }, { "epoch": 0.6984839977540708, "grad_norm": 0.6315792798995972, "learning_rate": 9.472986022304699e-06, "loss": 0.4886, "step": 1244 }, { "epoch": 0.6990454800673779, "grad_norm": 0.5327520966529846, "learning_rate": 9.471525111008154e-06, "loss": 0.4664, "step": 1245 }, { "epoch": 0.699606962380685, "grad_norm": 0.5212946534156799, "learning_rate": 9.470062290621245e-06, "loss": 0.4399, "step": 1246 }, { "epoch": 0.7001684446939921, "grad_norm": 0.5535264611244202, "learning_rate": 9.468597561768513e-06, "loss": 0.4513, "step": 1247 }, { "epoch": 0.7007299270072993, "grad_norm": 0.4984138011932373, "learning_rate": 9.467130925075314e-06, "loss": 0.4479, "step": 1248 }, { "epoch": 0.7012914093206064, "grad_norm": 0.47929468750953674, "learning_rate": 9.465662381167819e-06, "loss": 0.469, "step": 1249 }, { "epoch": 0.7018528916339135, "grad_norm": 0.49994197487831116, "learning_rate": 9.464191930673016e-06, "loss": 0.4396, "step": 1250 }, { "epoch": 0.7024143739472206, "grad_norm": 0.5546662211418152, "learning_rate": 9.462719574218706e-06, "loss": 0.4604, "step": 1251 }, { "epoch": 0.7029758562605278, "grad_norm": 0.5672272443771362, "learning_rate": 9.461245312433498e-06, "loss": 0.4752, "step": 1252 }, { "epoch": 0.7035373385738349, "grad_norm": 0.4997836649417877, "learning_rate": 9.459769145946822e-06, "loss": 0.4663, "step": 1253 }, { "epoch": 0.704098820887142, "grad_norm": 0.46755945682525635, "learning_rate": 9.458291075388918e-06, "loss": 0.42, "step": 1254 }, { "epoch": 0.7046603032004491, "grad_norm": 0.5607076287269592, "learning_rate": 9.45681110139084e-06, "loss": 0.4481, "step": 1255 }, { "epoch": 0.7052217855137564, "grad_norm": 0.5423195958137512, "learning_rate": 9.455329224584454e-06, "loss": 0.451, "step": 1256 }, { "epoch": 0.7057832678270635, "grad_norm": 0.5262753367424011, "learning_rate": 9.453845445602435e-06, "loss": 0.4611, "step": 1257 }, { "epoch": 0.7063447501403706, "grad_norm": 0.5991084575653076, "learning_rate": 9.452359765078277e-06, "loss": 0.451, "step": 1258 }, { "epoch": 0.7069062324536777, "grad_norm": 0.5477986335754395, "learning_rate": 9.45087218364628e-06, "loss": 0.4476, "step": 1259 }, { "epoch": 0.7074677147669849, "grad_norm": 0.5690117478370667, "learning_rate": 9.44938270194156e-06, "loss": 0.45, "step": 1260 }, { "epoch": 0.708029197080292, "grad_norm": 0.5245962142944336, "learning_rate": 9.44789132060004e-06, "loss": 0.4416, "step": 1261 }, { "epoch": 0.7085906793935991, "grad_norm": 0.550674319267273, "learning_rate": 9.446398040258457e-06, "loss": 0.4579, "step": 1262 }, { "epoch": 0.7091521617069062, "grad_norm": 0.5316531658172607, "learning_rate": 9.444902861554359e-06, "loss": 0.4562, "step": 1263 }, { "epoch": 0.7097136440202133, "grad_norm": 0.6420412063598633, "learning_rate": 9.443405785126101e-06, "loss": 0.494, "step": 1264 }, { "epoch": 0.7102751263335205, "grad_norm": 0.5398480296134949, "learning_rate": 9.441906811612853e-06, "loss": 0.4635, "step": 1265 }, { "epoch": 0.7108366086468276, "grad_norm": 0.49178600311279297, "learning_rate": 9.440405941654592e-06, "loss": 0.4339, "step": 1266 }, { "epoch": 0.7113980909601347, "grad_norm": 0.5783883333206177, "learning_rate": 9.438903175892104e-06, "loss": 0.4687, "step": 1267 }, { "epoch": 0.7119595732734418, "grad_norm": 0.6454637050628662, "learning_rate": 9.437398514966987e-06, "loss": 0.4715, "step": 1268 }, { "epoch": 0.712521055586749, "grad_norm": 0.5269718170166016, "learning_rate": 9.435891959521646e-06, "loss": 0.4589, "step": 1269 }, { "epoch": 0.7130825379000562, "grad_norm": 0.6342408061027527, "learning_rate": 9.434383510199297e-06, "loss": 0.476, "step": 1270 }, { "epoch": 0.7136440202133633, "grad_norm": 0.5853685140609741, "learning_rate": 9.432873167643962e-06, "loss": 0.4626, "step": 1271 }, { "epoch": 0.7142055025266704, "grad_norm": 0.5455940365791321, "learning_rate": 9.431360932500472e-06, "loss": 0.5006, "step": 1272 }, { "epoch": 0.7147669848399776, "grad_norm": 0.5551667213439941, "learning_rate": 9.429846805414468e-06, "loss": 0.4816, "step": 1273 }, { "epoch": 0.7153284671532847, "grad_norm": 0.631697416305542, "learning_rate": 9.428330787032396e-06, "loss": 0.4493, "step": 1274 }, { "epoch": 0.7158899494665918, "grad_norm": 0.5494170784950256, "learning_rate": 9.426812878001512e-06, "loss": 0.4448, "step": 1275 }, { "epoch": 0.7164514317798989, "grad_norm": 0.5930984020233154, "learning_rate": 9.425293078969876e-06, "loss": 0.5029, "step": 1276 }, { "epoch": 0.7170129140932061, "grad_norm": 0.5500475764274597, "learning_rate": 9.423771390586362e-06, "loss": 0.4517, "step": 1277 }, { "epoch": 0.7175743964065132, "grad_norm": 0.5515722632408142, "learning_rate": 9.42224781350064e-06, "loss": 0.4905, "step": 1278 }, { "epoch": 0.7181358787198203, "grad_norm": 0.6170773506164551, "learning_rate": 9.420722348363194e-06, "loss": 0.4765, "step": 1279 }, { "epoch": 0.7186973610331274, "grad_norm": 0.6060858964920044, "learning_rate": 9.419194995825311e-06, "loss": 0.449, "step": 1280 }, { "epoch": 0.7192588433464346, "grad_norm": 0.6308845281600952, "learning_rate": 9.417665756539087e-06, "loss": 0.4333, "step": 1281 }, { "epoch": 0.7198203256597417, "grad_norm": 0.5513525605201721, "learning_rate": 9.41613463115742e-06, "loss": 0.4462, "step": 1282 }, { "epoch": 0.7203818079730488, "grad_norm": 0.6369423866271973, "learning_rate": 9.414601620334014e-06, "loss": 0.4752, "step": 1283 }, { "epoch": 0.720943290286356, "grad_norm": 0.6138648986816406, "learning_rate": 9.41306672472338e-06, "loss": 0.4611, "step": 1284 }, { "epoch": 0.7215047725996631, "grad_norm": 0.5008145570755005, "learning_rate": 9.411529944980833e-06, "loss": 0.4365, "step": 1285 }, { "epoch": 0.7220662549129703, "grad_norm": 0.7049123048782349, "learning_rate": 9.40999128176249e-06, "loss": 0.4562, "step": 1286 }, { "epoch": 0.7226277372262774, "grad_norm": 0.5486630201339722, "learning_rate": 9.408450735725272e-06, "loss": 0.4518, "step": 1287 }, { "epoch": 0.7231892195395845, "grad_norm": 0.5764488577842712, "learning_rate": 9.406908307526912e-06, "loss": 0.4704, "step": 1288 }, { "epoch": 0.7237507018528916, "grad_norm": 0.6440374851226807, "learning_rate": 9.405363997825934e-06, "loss": 0.4708, "step": 1289 }, { "epoch": 0.7243121841661988, "grad_norm": 0.5477806925773621, "learning_rate": 9.403817807281674e-06, "loss": 0.4476, "step": 1290 }, { "epoch": 0.7248736664795059, "grad_norm": 0.6267554759979248, "learning_rate": 9.402269736554269e-06, "loss": 0.4744, "step": 1291 }, { "epoch": 0.725435148792813, "grad_norm": 0.6013818979263306, "learning_rate": 9.40071978630466e-06, "loss": 0.4693, "step": 1292 }, { "epoch": 0.7259966311061201, "grad_norm": 0.6204025745391846, "learning_rate": 9.399167957194584e-06, "loss": 0.4457, "step": 1293 }, { "epoch": 0.7265581134194273, "grad_norm": 0.4874211251735687, "learning_rate": 9.39761424988659e-06, "loss": 0.4454, "step": 1294 }, { "epoch": 0.7271195957327344, "grad_norm": 0.5593193769454956, "learning_rate": 9.396058665044019e-06, "loss": 0.4498, "step": 1295 }, { "epoch": 0.7276810780460415, "grad_norm": 0.5597581267356873, "learning_rate": 9.394501203331022e-06, "loss": 0.4699, "step": 1296 }, { "epoch": 0.7282425603593486, "grad_norm": 0.5066706538200378, "learning_rate": 9.392941865412547e-06, "loss": 0.4486, "step": 1297 }, { "epoch": 0.7288040426726559, "grad_norm": 0.6618567109107971, "learning_rate": 9.391380651954344e-06, "loss": 0.4388, "step": 1298 }, { "epoch": 0.729365524985963, "grad_norm": 0.53850919008255, "learning_rate": 9.389817563622965e-06, "loss": 0.4535, "step": 1299 }, { "epoch": 0.7299270072992701, "grad_norm": 0.530697226524353, "learning_rate": 9.388252601085757e-06, "loss": 0.444, "step": 1300 }, { "epoch": 0.7304884896125772, "grad_norm": 0.5055609345436096, "learning_rate": 9.386685765010874e-06, "loss": 0.4193, "step": 1301 }, { "epoch": 0.7310499719258844, "grad_norm": 0.6366611123085022, "learning_rate": 9.385117056067266e-06, "loss": 0.4669, "step": 1302 }, { "epoch": 0.7316114542391915, "grad_norm": 0.5178635716438293, "learning_rate": 9.383546474924685e-06, "loss": 0.4372, "step": 1303 }, { "epoch": 0.7321729365524986, "grad_norm": 0.5172827243804932, "learning_rate": 9.381974022253681e-06, "loss": 0.4528, "step": 1304 }, { "epoch": 0.7327344188658057, "grad_norm": 0.5615124106407166, "learning_rate": 9.380399698725603e-06, "loss": 0.4757, "step": 1305 }, { "epoch": 0.7332959011791128, "grad_norm": 0.6000403165817261, "learning_rate": 9.378823505012595e-06, "loss": 0.4562, "step": 1306 }, { "epoch": 0.73385738349242, "grad_norm": 0.5186808705329895, "learning_rate": 9.377245441787608e-06, "loss": 0.447, "step": 1307 }, { "epoch": 0.7344188658057271, "grad_norm": 0.4840991497039795, "learning_rate": 9.375665509724387e-06, "loss": 0.4218, "step": 1308 }, { "epoch": 0.7349803481190342, "grad_norm": 0.6083183884620667, "learning_rate": 9.37408370949747e-06, "loss": 0.4569, "step": 1309 }, { "epoch": 0.7355418304323413, "grad_norm": 0.5747784376144409, "learning_rate": 9.3725000417822e-06, "loss": 0.4311, "step": 1310 }, { "epoch": 0.7361033127456486, "grad_norm": 0.6220675110816956, "learning_rate": 9.370914507254714e-06, "loss": 0.4439, "step": 1311 }, { "epoch": 0.7366647950589557, "grad_norm": 0.5895352959632874, "learning_rate": 9.369327106591945e-06, "loss": 0.4462, "step": 1312 }, { "epoch": 0.7372262773722628, "grad_norm": 0.5037561655044556, "learning_rate": 9.367737840471626e-06, "loss": 0.44, "step": 1313 }, { "epoch": 0.7377877596855699, "grad_norm": 0.6078534126281738, "learning_rate": 9.366146709572284e-06, "loss": 0.4581, "step": 1314 }, { "epoch": 0.7383492419988771, "grad_norm": 0.6255765557289124, "learning_rate": 9.364553714573239e-06, "loss": 0.4822, "step": 1315 }, { "epoch": 0.7389107243121842, "grad_norm": 0.5324340462684631, "learning_rate": 9.362958856154616e-06, "loss": 0.4429, "step": 1316 }, { "epoch": 0.7394722066254913, "grad_norm": 0.619867205619812, "learning_rate": 9.361362134997327e-06, "loss": 0.4658, "step": 1317 }, { "epoch": 0.7400336889387984, "grad_norm": 0.5855786800384521, "learning_rate": 9.359763551783083e-06, "loss": 0.4567, "step": 1318 }, { "epoch": 0.7405951712521056, "grad_norm": 0.694292426109314, "learning_rate": 9.35816310719439e-06, "loss": 0.4663, "step": 1319 }, { "epoch": 0.7411566535654127, "grad_norm": 0.5926117300987244, "learning_rate": 9.356560801914545e-06, "loss": 0.4725, "step": 1320 }, { "epoch": 0.7417181358787198, "grad_norm": 0.46904122829437256, "learning_rate": 9.354956636627646e-06, "loss": 0.4619, "step": 1321 }, { "epoch": 0.7422796181920269, "grad_norm": 0.6549757122993469, "learning_rate": 9.353350612018581e-06, "loss": 0.4792, "step": 1322 }, { "epoch": 0.742841100505334, "grad_norm": 0.630196213722229, "learning_rate": 9.35174272877303e-06, "loss": 0.4486, "step": 1323 }, { "epoch": 0.7434025828186412, "grad_norm": 0.6072958111763, "learning_rate": 9.350132987577473e-06, "loss": 0.4439, "step": 1324 }, { "epoch": 0.7439640651319483, "grad_norm": 0.588195264339447, "learning_rate": 9.348521389119177e-06, "loss": 0.4396, "step": 1325 }, { "epoch": 0.7445255474452555, "grad_norm": 0.5675851702690125, "learning_rate": 9.346907934086202e-06, "loss": 0.4634, "step": 1326 }, { "epoch": 0.7450870297585626, "grad_norm": 0.5746546387672424, "learning_rate": 9.345292623167406e-06, "loss": 0.4431, "step": 1327 }, { "epoch": 0.7456485120718698, "grad_norm": 0.703713595867157, "learning_rate": 9.343675457052434e-06, "loss": 0.5295, "step": 1328 }, { "epoch": 0.7462099943851769, "grad_norm": 0.6251710057258606, "learning_rate": 9.342056436431727e-06, "loss": 0.4668, "step": 1329 }, { "epoch": 0.746771476698484, "grad_norm": 0.5376668572425842, "learning_rate": 9.340435561996515e-06, "loss": 0.4794, "step": 1330 }, { "epoch": 0.7473329590117911, "grad_norm": 0.639296293258667, "learning_rate": 9.338812834438822e-06, "loss": 0.4835, "step": 1331 }, { "epoch": 0.7478944413250983, "grad_norm": 0.5877411365509033, "learning_rate": 9.337188254451457e-06, "loss": 0.4436, "step": 1332 }, { "epoch": 0.7484559236384054, "grad_norm": 0.6473490595817566, "learning_rate": 9.33556182272803e-06, "loss": 0.498, "step": 1333 }, { "epoch": 0.7490174059517125, "grad_norm": 0.5271211862564087, "learning_rate": 9.333933539962934e-06, "loss": 0.4718, "step": 1334 }, { "epoch": 0.7495788882650196, "grad_norm": 0.5478256344795227, "learning_rate": 9.332303406851353e-06, "loss": 0.4454, "step": 1335 }, { "epoch": 0.7501403705783268, "grad_norm": 0.5389241576194763, "learning_rate": 9.330671424089264e-06, "loss": 0.4471, "step": 1336 }, { "epoch": 0.7507018528916339, "grad_norm": 0.5150645971298218, "learning_rate": 9.329037592373431e-06, "loss": 0.4768, "step": 1337 }, { "epoch": 0.751263335204941, "grad_norm": 0.5766171216964722, "learning_rate": 9.327401912401411e-06, "loss": 0.4571, "step": 1338 }, { "epoch": 0.7518248175182481, "grad_norm": 0.507463812828064, "learning_rate": 9.325764384871545e-06, "loss": 0.4472, "step": 1339 }, { "epoch": 0.7523862998315554, "grad_norm": 0.5171255469322205, "learning_rate": 9.324125010482966e-06, "loss": 0.4547, "step": 1340 }, { "epoch": 0.7529477821448625, "grad_norm": 0.5959138870239258, "learning_rate": 9.322483789935594e-06, "loss": 0.4376, "step": 1341 }, { "epoch": 0.7535092644581696, "grad_norm": 0.5095019340515137, "learning_rate": 9.320840723930142e-06, "loss": 0.4486, "step": 1342 }, { "epoch": 0.7540707467714767, "grad_norm": 0.6578831672668457, "learning_rate": 9.319195813168103e-06, "loss": 0.4243, "step": 1343 }, { "epoch": 0.7546322290847838, "grad_norm": 0.49208056926727295, "learning_rate": 9.317549058351762e-06, "loss": 0.4661, "step": 1344 }, { "epoch": 0.755193711398091, "grad_norm": 0.6653839945793152, "learning_rate": 9.315900460184191e-06, "loss": 0.4584, "step": 1345 }, { "epoch": 0.7557551937113981, "grad_norm": 0.5557628870010376, "learning_rate": 9.314250019369251e-06, "loss": 0.4577, "step": 1346 }, { "epoch": 0.7563166760247052, "grad_norm": 0.4990582764148712, "learning_rate": 9.312597736611586e-06, "loss": 0.4337, "step": 1347 }, { "epoch": 0.7568781583380123, "grad_norm": 0.5219115614891052, "learning_rate": 9.310943612616627e-06, "loss": 0.4533, "step": 1348 }, { "epoch": 0.7574396406513195, "grad_norm": 0.48139432072639465, "learning_rate": 9.309287648090595e-06, "loss": 0.4411, "step": 1349 }, { "epoch": 0.7580011229646266, "grad_norm": 0.5212274193763733, "learning_rate": 9.307629843740493e-06, "loss": 0.4341, "step": 1350 }, { "epoch": 0.7585626052779337, "grad_norm": 0.5187464952468872, "learning_rate": 9.305970200274108e-06, "loss": 0.4772, "step": 1351 }, { "epoch": 0.7591240875912408, "grad_norm": 0.5388540625572205, "learning_rate": 9.304308718400017e-06, "loss": 0.487, "step": 1352 }, { "epoch": 0.759685569904548, "grad_norm": 0.5465543270111084, "learning_rate": 9.302645398827579e-06, "loss": 0.4321, "step": 1353 }, { "epoch": 0.7602470522178552, "grad_norm": 0.6606138944625854, "learning_rate": 9.300980242266939e-06, "loss": 0.4716, "step": 1354 }, { "epoch": 0.7608085345311623, "grad_norm": 0.5538738965988159, "learning_rate": 9.299313249429024e-06, "loss": 0.4788, "step": 1355 }, { "epoch": 0.7613700168444694, "grad_norm": 0.5800716280937195, "learning_rate": 9.297644421025547e-06, "loss": 0.4759, "step": 1356 }, { "epoch": 0.7619314991577766, "grad_norm": 0.5862640738487244, "learning_rate": 9.295973757769005e-06, "loss": 0.4488, "step": 1357 }, { "epoch": 0.7624929814710837, "grad_norm": 0.5576216578483582, "learning_rate": 9.294301260372676e-06, "loss": 0.4745, "step": 1358 }, { "epoch": 0.7630544637843908, "grad_norm": 0.6026124358177185, "learning_rate": 9.292626929550623e-06, "loss": 0.4415, "step": 1359 }, { "epoch": 0.7636159460976979, "grad_norm": 0.5110349059104919, "learning_rate": 9.290950766017693e-06, "loss": 0.4496, "step": 1360 }, { "epoch": 0.764177428411005, "grad_norm": 0.5873545408248901, "learning_rate": 9.28927277048951e-06, "loss": 0.488, "step": 1361 }, { "epoch": 0.7647389107243122, "grad_norm": 0.5599154829978943, "learning_rate": 9.287592943682487e-06, "loss": 0.4668, "step": 1362 }, { "epoch": 0.7653003930376193, "grad_norm": 0.5813698768615723, "learning_rate": 9.285911286313816e-06, "loss": 0.4942, "step": 1363 }, { "epoch": 0.7658618753509264, "grad_norm": 0.5573318600654602, "learning_rate": 9.28422779910147e-06, "loss": 0.4406, "step": 1364 }, { "epoch": 0.7664233576642335, "grad_norm": 0.5301180481910706, "learning_rate": 9.2825424827642e-06, "loss": 0.4217, "step": 1365 }, { "epoch": 0.7669848399775407, "grad_norm": 0.5385721921920776, "learning_rate": 9.280855338021547e-06, "loss": 0.4393, "step": 1366 }, { "epoch": 0.7675463222908478, "grad_norm": 0.6286836266517639, "learning_rate": 9.279166365593824e-06, "loss": 0.4993, "step": 1367 }, { "epoch": 0.768107804604155, "grad_norm": 0.4794997274875641, "learning_rate": 9.277475566202127e-06, "loss": 0.4451, "step": 1368 }, { "epoch": 0.7686692869174621, "grad_norm": 0.5693125128746033, "learning_rate": 9.275782940568337e-06, "loss": 0.4452, "step": 1369 }, { "epoch": 0.7692307692307693, "grad_norm": 0.5873583555221558, "learning_rate": 9.274088489415105e-06, "loss": 0.4701, "step": 1370 }, { "epoch": 0.7697922515440764, "grad_norm": 0.530748724937439, "learning_rate": 9.272392213465868e-06, "loss": 0.4581, "step": 1371 }, { "epoch": 0.7703537338573835, "grad_norm": 0.5690277218818665, "learning_rate": 9.270694113444842e-06, "loss": 0.4463, "step": 1372 }, { "epoch": 0.7709152161706906, "grad_norm": 0.6244488954544067, "learning_rate": 9.268994190077018e-06, "loss": 0.4782, "step": 1373 }, { "epoch": 0.7714766984839978, "grad_norm": 0.5738887190818787, "learning_rate": 9.267292444088171e-06, "loss": 0.4572, "step": 1374 }, { "epoch": 0.7720381807973049, "grad_norm": 0.5999573469161987, "learning_rate": 9.26558887620485e-06, "loss": 0.4884, "step": 1375 }, { "epoch": 0.772599663110612, "grad_norm": 0.4721652567386627, "learning_rate": 9.263883487154382e-06, "loss": 0.4443, "step": 1376 }, { "epoch": 0.7731611454239191, "grad_norm": 0.5183330774307251, "learning_rate": 9.262176277664871e-06, "loss": 0.4339, "step": 1377 }, { "epoch": 0.7737226277372263, "grad_norm": 0.6509381532669067, "learning_rate": 9.260467248465204e-06, "loss": 0.4389, "step": 1378 }, { "epoch": 0.7742841100505334, "grad_norm": 0.49697422981262207, "learning_rate": 9.258756400285036e-06, "loss": 0.4145, "step": 1379 }, { "epoch": 0.7748455923638405, "grad_norm": 0.5263320207595825, "learning_rate": 9.257043733854808e-06, "loss": 0.4524, "step": 1380 }, { "epoch": 0.7754070746771476, "grad_norm": 0.607746958732605, "learning_rate": 9.255329249905726e-06, "loss": 0.4519, "step": 1381 }, { "epoch": 0.7759685569904547, "grad_norm": 0.6103513836860657, "learning_rate": 9.253612949169786e-06, "loss": 0.4342, "step": 1382 }, { "epoch": 0.776530039303762, "grad_norm": 0.6550548672676086, "learning_rate": 9.251894832379747e-06, "loss": 0.4705, "step": 1383 }, { "epoch": 0.7770915216170691, "grad_norm": 0.5749346017837524, "learning_rate": 9.250174900269151e-06, "loss": 0.4808, "step": 1384 }, { "epoch": 0.7776530039303762, "grad_norm": 0.6955148577690125, "learning_rate": 9.248453153572309e-06, "loss": 0.5053, "step": 1385 }, { "epoch": 0.7782144862436833, "grad_norm": 0.5365839004516602, "learning_rate": 9.246729593024316e-06, "loss": 0.493, "step": 1386 }, { "epoch": 0.7787759685569905, "grad_norm": 0.6456088423728943, "learning_rate": 9.24500421936103e-06, "loss": 0.4115, "step": 1387 }, { "epoch": 0.7793374508702976, "grad_norm": 0.5869093537330627, "learning_rate": 9.243277033319092e-06, "loss": 0.4591, "step": 1388 }, { "epoch": 0.7798989331836047, "grad_norm": 0.6063370704650879, "learning_rate": 9.241548035635912e-06, "loss": 0.486, "step": 1389 }, { "epoch": 0.7804604154969118, "grad_norm": 0.4981094002723694, "learning_rate": 9.239817227049676e-06, "loss": 0.4448, "step": 1390 }, { "epoch": 0.781021897810219, "grad_norm": 0.642355740070343, "learning_rate": 9.238084608299342e-06, "loss": 0.4432, "step": 1391 }, { "epoch": 0.7815833801235261, "grad_norm": 0.5500532388687134, "learning_rate": 9.23635018012464e-06, "loss": 0.4439, "step": 1392 }, { "epoch": 0.7821448624368332, "grad_norm": 0.6829796433448792, "learning_rate": 9.234613943266075e-06, "loss": 0.4414, "step": 1393 }, { "epoch": 0.7827063447501403, "grad_norm": 0.5591107606887817, "learning_rate": 9.232875898464923e-06, "loss": 0.4579, "step": 1394 }, { "epoch": 0.7832678270634476, "grad_norm": 0.5426923632621765, "learning_rate": 9.231136046463229e-06, "loss": 0.4515, "step": 1395 }, { "epoch": 0.7838293093767547, "grad_norm": 0.5366030931472778, "learning_rate": 9.229394388003815e-06, "loss": 0.453, "step": 1396 }, { "epoch": 0.7843907916900618, "grad_norm": 0.6888231039047241, "learning_rate": 9.22765092383027e-06, "loss": 0.5058, "step": 1397 }, { "epoch": 0.7849522740033689, "grad_norm": 0.582237184047699, "learning_rate": 9.225905654686957e-06, "loss": 0.4705, "step": 1398 }, { "epoch": 0.785513756316676, "grad_norm": 0.5525847673416138, "learning_rate": 9.224158581319005e-06, "loss": 0.4414, "step": 1399 }, { "epoch": 0.7860752386299832, "grad_norm": 0.5756683945655823, "learning_rate": 9.222409704472318e-06, "loss": 0.4604, "step": 1400 }, { "epoch": 0.7866367209432903, "grad_norm": 0.5517821311950684, "learning_rate": 9.22065902489357e-06, "loss": 0.4504, "step": 1401 }, { "epoch": 0.7871982032565974, "grad_norm": 0.5093729496002197, "learning_rate": 9.2189065433302e-06, "loss": 0.4623, "step": 1402 }, { "epoch": 0.7877596855699045, "grad_norm": 0.5511021614074707, "learning_rate": 9.217152260530422e-06, "loss": 0.4806, "step": 1403 }, { "epoch": 0.7883211678832117, "grad_norm": 0.5030124187469482, "learning_rate": 9.215396177243214e-06, "loss": 0.4542, "step": 1404 }, { "epoch": 0.7888826501965188, "grad_norm": 0.4617021679878235, "learning_rate": 9.213638294218326e-06, "loss": 0.4636, "step": 1405 }, { "epoch": 0.7894441325098259, "grad_norm": 0.6123405694961548, "learning_rate": 9.211878612206275e-06, "loss": 0.464, "step": 1406 }, { "epoch": 0.790005614823133, "grad_norm": 0.6046680212020874, "learning_rate": 9.210117131958348e-06, "loss": 0.4612, "step": 1407 }, { "epoch": 0.7905670971364402, "grad_norm": 0.5368892550468445, "learning_rate": 9.208353854226598e-06, "loss": 0.4675, "step": 1408 }, { "epoch": 0.7911285794497473, "grad_norm": 0.540223240852356, "learning_rate": 9.206588779763843e-06, "loss": 0.4303, "step": 1409 }, { "epoch": 0.7916900617630545, "grad_norm": 0.5481588244438171, "learning_rate": 9.204821909323673e-06, "loss": 0.4379, "step": 1410 }, { "epoch": 0.7922515440763616, "grad_norm": 0.5544547438621521, "learning_rate": 9.203053243660442e-06, "loss": 0.4254, "step": 1411 }, { "epoch": 0.7928130263896688, "grad_norm": 0.5125308632850647, "learning_rate": 9.201282783529273e-06, "loss": 0.4268, "step": 1412 }, { "epoch": 0.7933745087029759, "grad_norm": 0.606974184513092, "learning_rate": 9.199510529686051e-06, "loss": 0.4693, "step": 1413 }, { "epoch": 0.793935991016283, "grad_norm": 0.5916603803634644, "learning_rate": 9.197736482887429e-06, "loss": 0.4529, "step": 1414 }, { "epoch": 0.7944974733295901, "grad_norm": 0.5655441284179688, "learning_rate": 9.195960643890825e-06, "loss": 0.4584, "step": 1415 }, { "epoch": 0.7950589556428973, "grad_norm": 0.5558990836143494, "learning_rate": 9.194183013454423e-06, "loss": 0.4493, "step": 1416 }, { "epoch": 0.7956204379562044, "grad_norm": 0.5591253638267517, "learning_rate": 9.192403592337174e-06, "loss": 0.4365, "step": 1417 }, { "epoch": 0.7961819202695115, "grad_norm": 0.5988694429397583, "learning_rate": 9.190622381298788e-06, "loss": 0.4888, "step": 1418 }, { "epoch": 0.7967434025828186, "grad_norm": 0.6176961660385132, "learning_rate": 9.188839381099743e-06, "loss": 0.4854, "step": 1419 }, { "epoch": 0.7973048848961257, "grad_norm": 0.5134658217430115, "learning_rate": 9.18705459250128e-06, "loss": 0.4532, "step": 1420 }, { "epoch": 0.7978663672094329, "grad_norm": 0.4339836835861206, "learning_rate": 9.185268016265403e-06, "loss": 0.4262, "step": 1421 }, { "epoch": 0.79842784952274, "grad_norm": 0.5194941163063049, "learning_rate": 9.183479653154882e-06, "loss": 0.451, "step": 1422 }, { "epoch": 0.7989893318360471, "grad_norm": 0.5209234356880188, "learning_rate": 9.181689503933245e-06, "loss": 0.4529, "step": 1423 }, { "epoch": 0.7995508141493542, "grad_norm": 0.46940335631370544, "learning_rate": 9.179897569364787e-06, "loss": 0.4251, "step": 1424 }, { "epoch": 0.8001122964626615, "grad_norm": 0.5338002443313599, "learning_rate": 9.178103850214563e-06, "loss": 0.4753, "step": 1425 }, { "epoch": 0.8006737787759686, "grad_norm": 0.5030238032341003, "learning_rate": 9.176308347248391e-06, "loss": 0.4314, "step": 1426 }, { "epoch": 0.8012352610892757, "grad_norm": 0.527966320514679, "learning_rate": 9.174511061232849e-06, "loss": 0.4582, "step": 1427 }, { "epoch": 0.8017967434025828, "grad_norm": 0.563541054725647, "learning_rate": 9.17271199293528e-06, "loss": 0.4711, "step": 1428 }, { "epoch": 0.80235822571589, "grad_norm": 0.5749960541725159, "learning_rate": 9.170911143123781e-06, "loss": 0.434, "step": 1429 }, { "epoch": 0.8029197080291971, "grad_norm": 0.5923579931259155, "learning_rate": 9.169108512567216e-06, "loss": 0.4674, "step": 1430 }, { "epoch": 0.8034811903425042, "grad_norm": 0.5449298620223999, "learning_rate": 9.167304102035209e-06, "loss": 0.459, "step": 1431 }, { "epoch": 0.8040426726558113, "grad_norm": 0.6578465700149536, "learning_rate": 9.165497912298138e-06, "loss": 0.4642, "step": 1432 }, { "epoch": 0.8046041549691185, "grad_norm": 0.6105349063873291, "learning_rate": 9.163689944127146e-06, "loss": 0.4449, "step": 1433 }, { "epoch": 0.8051656372824256, "grad_norm": 0.5846604108810425, "learning_rate": 9.161880198294138e-06, "loss": 0.5027, "step": 1434 }, { "epoch": 0.8057271195957327, "grad_norm": 0.5238841772079468, "learning_rate": 9.160068675571769e-06, "loss": 0.4621, "step": 1435 }, { "epoch": 0.8062886019090398, "grad_norm": 0.6920968890190125, "learning_rate": 9.158255376733458e-06, "loss": 0.4415, "step": 1436 }, { "epoch": 0.8068500842223469, "grad_norm": 0.5769931077957153, "learning_rate": 9.156440302553386e-06, "loss": 0.4702, "step": 1437 }, { "epoch": 0.8074115665356542, "grad_norm": 0.5748425722122192, "learning_rate": 9.154623453806484e-06, "loss": 0.4724, "step": 1438 }, { "epoch": 0.8079730488489613, "grad_norm": 0.6257287859916687, "learning_rate": 9.152804831268445e-06, "loss": 0.446, "step": 1439 }, { "epoch": 0.8085345311622684, "grad_norm": 0.5788884162902832, "learning_rate": 9.150984435715719e-06, "loss": 0.4453, "step": 1440 }, { "epoch": 0.8090960134755755, "grad_norm": 0.5308550000190735, "learning_rate": 9.149162267925513e-06, "loss": 0.4848, "step": 1441 }, { "epoch": 0.8096574957888827, "grad_norm": 0.6025899648666382, "learning_rate": 9.147338328675792e-06, "loss": 0.4727, "step": 1442 }, { "epoch": 0.8102189781021898, "grad_norm": 0.6875593662261963, "learning_rate": 9.145512618745274e-06, "loss": 0.4644, "step": 1443 }, { "epoch": 0.8107804604154969, "grad_norm": 0.4982908070087433, "learning_rate": 9.143685138913435e-06, "loss": 0.4474, "step": 1444 }, { "epoch": 0.811341942728804, "grad_norm": 0.5529981255531311, "learning_rate": 9.141855889960506e-06, "loss": 0.4951, "step": 1445 }, { "epoch": 0.8119034250421112, "grad_norm": 0.6446360349655151, "learning_rate": 9.140024872667474e-06, "loss": 0.4923, "step": 1446 }, { "epoch": 0.8124649073554183, "grad_norm": 0.5971615314483643, "learning_rate": 9.13819208781608e-06, "loss": 0.4523, "step": 1447 }, { "epoch": 0.8130263896687254, "grad_norm": 0.5704857110977173, "learning_rate": 9.136357536188822e-06, "loss": 0.4484, "step": 1448 }, { "epoch": 0.8135878719820325, "grad_norm": 0.5727104544639587, "learning_rate": 9.134521218568949e-06, "loss": 0.4216, "step": 1449 }, { "epoch": 0.8141493542953397, "grad_norm": 0.6633376479148865, "learning_rate": 9.132683135740467e-06, "loss": 0.479, "step": 1450 }, { "epoch": 0.8147108366086468, "grad_norm": 0.5000513792037964, "learning_rate": 9.130843288488134e-06, "loss": 0.4287, "step": 1451 }, { "epoch": 0.815272318921954, "grad_norm": 0.7649497985839844, "learning_rate": 9.12900167759746e-06, "loss": 0.4711, "step": 1452 }, { "epoch": 0.8158338012352611, "grad_norm": 0.5525054335594177, "learning_rate": 9.127158303854711e-06, "loss": 0.4496, "step": 1453 }, { "epoch": 0.8163952835485683, "grad_norm": 0.5294052958488464, "learning_rate": 9.125313168046903e-06, "loss": 0.4468, "step": 1454 }, { "epoch": 0.8169567658618754, "grad_norm": 0.5733511447906494, "learning_rate": 9.123466270961807e-06, "loss": 0.4822, "step": 1455 }, { "epoch": 0.8175182481751825, "grad_norm": 0.5047231316566467, "learning_rate": 9.121617613387942e-06, "loss": 0.4813, "step": 1456 }, { "epoch": 0.8180797304884896, "grad_norm": 0.5453718304634094, "learning_rate": 9.119767196114585e-06, "loss": 0.4438, "step": 1457 }, { "epoch": 0.8186412128017967, "grad_norm": 0.6205439567565918, "learning_rate": 9.117915019931758e-06, "loss": 0.4605, "step": 1458 }, { "epoch": 0.8192026951151039, "grad_norm": 0.5980207324028015, "learning_rate": 9.116061085630234e-06, "loss": 0.4504, "step": 1459 }, { "epoch": 0.819764177428411, "grad_norm": 0.5843964219093323, "learning_rate": 9.114205394001545e-06, "loss": 0.474, "step": 1460 }, { "epoch": 0.8203256597417181, "grad_norm": 0.5029227137565613, "learning_rate": 9.112347945837961e-06, "loss": 0.4763, "step": 1461 }, { "epoch": 0.8208871420550252, "grad_norm": 0.5631417632102966, "learning_rate": 9.11048874193251e-06, "loss": 0.435, "step": 1462 }, { "epoch": 0.8214486243683324, "grad_norm": 0.573920726776123, "learning_rate": 9.108627783078971e-06, "loss": 0.4461, "step": 1463 }, { "epoch": 0.8220101066816395, "grad_norm": 0.5547534823417664, "learning_rate": 9.106765070071866e-06, "loss": 0.4725, "step": 1464 }, { "epoch": 0.8225715889949466, "grad_norm": 0.5956401824951172, "learning_rate": 9.104900603706468e-06, "loss": 0.4521, "step": 1465 }, { "epoch": 0.8231330713082537, "grad_norm": 0.6496511697769165, "learning_rate": 9.103034384778803e-06, "loss": 0.4424, "step": 1466 }, { "epoch": 0.823694553621561, "grad_norm": 0.5609480142593384, "learning_rate": 9.101166414085638e-06, "loss": 0.4747, "step": 1467 }, { "epoch": 0.8242560359348681, "grad_norm": 0.5053898096084595, "learning_rate": 9.099296692424495e-06, "loss": 0.4448, "step": 1468 }, { "epoch": 0.8248175182481752, "grad_norm": 0.6344479322433472, "learning_rate": 9.097425220593638e-06, "loss": 0.4382, "step": 1469 }, { "epoch": 0.8253790005614823, "grad_norm": 0.5566872358322144, "learning_rate": 9.095551999392083e-06, "loss": 0.475, "step": 1470 }, { "epoch": 0.8259404828747895, "grad_norm": 0.5421826243400574, "learning_rate": 9.093677029619585e-06, "loss": 0.432, "step": 1471 }, { "epoch": 0.8265019651880966, "grad_norm": 0.6402488350868225, "learning_rate": 9.091800312076657e-06, "loss": 0.4638, "step": 1472 }, { "epoch": 0.8270634475014037, "grad_norm": 0.6260296702384949, "learning_rate": 9.08992184756455e-06, "loss": 0.4531, "step": 1473 }, { "epoch": 0.8276249298147108, "grad_norm": 0.5451812744140625, "learning_rate": 9.088041636885261e-06, "loss": 0.4665, "step": 1474 }, { "epoch": 0.8281864121280179, "grad_norm": 0.5455039739608765, "learning_rate": 9.086159680841538e-06, "loss": 0.4781, "step": 1475 }, { "epoch": 0.8287478944413251, "grad_norm": 0.7128114700317383, "learning_rate": 9.084275980236868e-06, "loss": 0.4294, "step": 1476 }, { "epoch": 0.8293093767546322, "grad_norm": 0.48861777782440186, "learning_rate": 9.082390535875485e-06, "loss": 0.4328, "step": 1477 }, { "epoch": 0.8298708590679393, "grad_norm": 0.5063812136650085, "learning_rate": 9.080503348562371e-06, "loss": 0.4395, "step": 1478 }, { "epoch": 0.8304323413812464, "grad_norm": 0.5070540308952332, "learning_rate": 9.078614419103245e-06, "loss": 0.4804, "step": 1479 }, { "epoch": 0.8309938236945537, "grad_norm": 0.5226375460624695, "learning_rate": 9.076723748304578e-06, "loss": 0.4588, "step": 1480 }, { "epoch": 0.8315553060078608, "grad_norm": 0.504488468170166, "learning_rate": 9.074831336973577e-06, "loss": 0.4499, "step": 1481 }, { "epoch": 0.8321167883211679, "grad_norm": 0.48295632004737854, "learning_rate": 9.0729371859182e-06, "loss": 0.4405, "step": 1482 }, { "epoch": 0.832678270634475, "grad_norm": 0.4730626046657562, "learning_rate": 9.071041295947139e-06, "loss": 0.422, "step": 1483 }, { "epoch": 0.8332397529477822, "grad_norm": 0.48668524622917175, "learning_rate": 9.069143667869831e-06, "loss": 0.4585, "step": 1484 }, { "epoch": 0.8338012352610893, "grad_norm": 0.5848031044006348, "learning_rate": 9.067244302496462e-06, "loss": 0.4927, "step": 1485 }, { "epoch": 0.8343627175743964, "grad_norm": 0.5240735411643982, "learning_rate": 9.065343200637953e-06, "loss": 0.4848, "step": 1486 }, { "epoch": 0.8349241998877035, "grad_norm": 0.5512768030166626, "learning_rate": 9.063440363105965e-06, "loss": 0.4532, "step": 1487 }, { "epoch": 0.8354856822010107, "grad_norm": 0.5565248131752014, "learning_rate": 9.061535790712907e-06, "loss": 0.4496, "step": 1488 }, { "epoch": 0.8360471645143178, "grad_norm": 0.5250009298324585, "learning_rate": 9.05962948427192e-06, "loss": 0.4458, "step": 1489 }, { "epoch": 0.8366086468276249, "grad_norm": 0.5901839733123779, "learning_rate": 9.057721444596895e-06, "loss": 0.4684, "step": 1490 }, { "epoch": 0.837170129140932, "grad_norm": 0.4989749491214752, "learning_rate": 9.055811672502453e-06, "loss": 0.4855, "step": 1491 }, { "epoch": 0.8377316114542392, "grad_norm": 0.5766562819480896, "learning_rate": 9.053900168803963e-06, "loss": 0.4593, "step": 1492 }, { "epoch": 0.8382930937675463, "grad_norm": 0.5532837510108948, "learning_rate": 9.051986934317527e-06, "loss": 0.479, "step": 1493 }, { "epoch": 0.8388545760808535, "grad_norm": 0.5235521197319031, "learning_rate": 9.050071969859991e-06, "loss": 0.4643, "step": 1494 }, { "epoch": 0.8394160583941606, "grad_norm": 0.5751461386680603, "learning_rate": 9.048155276248938e-06, "loss": 0.4557, "step": 1495 }, { "epoch": 0.8399775407074677, "grad_norm": 0.6057397127151489, "learning_rate": 9.046236854302685e-06, "loss": 0.4263, "step": 1496 }, { "epoch": 0.8405390230207749, "grad_norm": 0.5290727615356445, "learning_rate": 9.044316704840294e-06, "loss": 0.4423, "step": 1497 }, { "epoch": 0.841100505334082, "grad_norm": 0.5421375632286072, "learning_rate": 9.042394828681562e-06, "loss": 0.4695, "step": 1498 }, { "epoch": 0.8416619876473891, "grad_norm": 0.6136300563812256, "learning_rate": 9.040471226647018e-06, "loss": 0.4356, "step": 1499 }, { "epoch": 0.8422234699606962, "grad_norm": 0.5705631971359253, "learning_rate": 9.038545899557936e-06, "loss": 0.4697, "step": 1500 }, { "epoch": 0.8427849522740034, "grad_norm": 0.5927669405937195, "learning_rate": 9.03661884823632e-06, "loss": 0.4519, "step": 1501 }, { "epoch": 0.8433464345873105, "grad_norm": 0.5225933790206909, "learning_rate": 9.034690073504915e-06, "loss": 0.4411, "step": 1502 }, { "epoch": 0.8439079169006176, "grad_norm": 0.5956157445907593, "learning_rate": 9.032759576187199e-06, "loss": 0.471, "step": 1503 }, { "epoch": 0.8444693992139247, "grad_norm": 0.5746314525604248, "learning_rate": 9.030827357107383e-06, "loss": 0.4616, "step": 1504 }, { "epoch": 0.8450308815272319, "grad_norm": 0.6368353962898254, "learning_rate": 9.028893417090421e-06, "loss": 0.438, "step": 1505 }, { "epoch": 0.845592363840539, "grad_norm": 0.533348560333252, "learning_rate": 9.026957756961994e-06, "loss": 0.4542, "step": 1506 }, { "epoch": 0.8461538461538461, "grad_norm": 0.5385999083518982, "learning_rate": 9.025020377548521e-06, "loss": 0.4405, "step": 1507 }, { "epoch": 0.8467153284671532, "grad_norm": 0.68436598777771, "learning_rate": 9.023081279677155e-06, "loss": 0.4716, "step": 1508 }, { "epoch": 0.8472768107804605, "grad_norm": 0.5565488338470459, "learning_rate": 9.02114046417578e-06, "loss": 0.4451, "step": 1509 }, { "epoch": 0.8478382930937676, "grad_norm": 0.535878598690033, "learning_rate": 9.019197931873018e-06, "loss": 0.4633, "step": 1510 }, { "epoch": 0.8483997754070747, "grad_norm": 0.5206401348114014, "learning_rate": 9.017253683598219e-06, "loss": 0.4424, "step": 1511 }, { "epoch": 0.8489612577203818, "grad_norm": 0.5218042135238647, "learning_rate": 9.015307720181468e-06, "loss": 0.4461, "step": 1512 }, { "epoch": 0.8495227400336889, "grad_norm": 0.47684064507484436, "learning_rate": 9.013360042453584e-06, "loss": 0.4796, "step": 1513 }, { "epoch": 0.8500842223469961, "grad_norm": 0.5645583271980286, "learning_rate": 9.011410651246115e-06, "loss": 0.4687, "step": 1514 }, { "epoch": 0.8506457046603032, "grad_norm": 0.5006386041641235, "learning_rate": 9.009459547391341e-06, "loss": 0.446, "step": 1515 }, { "epoch": 0.8512071869736103, "grad_norm": 0.6059245467185974, "learning_rate": 9.007506731722277e-06, "loss": 0.4633, "step": 1516 }, { "epoch": 0.8517686692869174, "grad_norm": 0.6273640394210815, "learning_rate": 9.005552205072663e-06, "loss": 0.4701, "step": 1517 }, { "epoch": 0.8523301516002246, "grad_norm": 0.4852025806903839, "learning_rate": 9.00359596827697e-06, "loss": 0.47, "step": 1518 }, { "epoch": 0.8528916339135317, "grad_norm": 0.5591435432434082, "learning_rate": 9.001638022170407e-06, "loss": 0.4843, "step": 1519 }, { "epoch": 0.8534531162268388, "grad_norm": 0.5715367794036865, "learning_rate": 8.999678367588905e-06, "loss": 0.4372, "step": 1520 }, { "epoch": 0.8540145985401459, "grad_norm": 0.4962327182292938, "learning_rate": 8.997717005369122e-06, "loss": 0.4649, "step": 1521 }, { "epoch": 0.8545760808534532, "grad_norm": 0.5018644332885742, "learning_rate": 8.995753936348458e-06, "loss": 0.4439, "step": 1522 }, { "epoch": 0.8551375631667603, "grad_norm": 0.551723301410675, "learning_rate": 8.993789161365025e-06, "loss": 0.4245, "step": 1523 }, { "epoch": 0.8556990454800674, "grad_norm": 0.519493818283081, "learning_rate": 8.991822681257676e-06, "loss": 0.4595, "step": 1524 }, { "epoch": 0.8562605277933745, "grad_norm": 0.47476792335510254, "learning_rate": 8.989854496865989e-06, "loss": 0.465, "step": 1525 }, { "epoch": 0.8568220101066817, "grad_norm": 0.585838258266449, "learning_rate": 8.987884609030264e-06, "loss": 0.4677, "step": 1526 }, { "epoch": 0.8573834924199888, "grad_norm": 0.5779849886894226, "learning_rate": 8.985913018591537e-06, "loss": 0.4967, "step": 1527 }, { "epoch": 0.8579449747332959, "grad_norm": 0.544863224029541, "learning_rate": 8.983939726391565e-06, "loss": 0.4671, "step": 1528 }, { "epoch": 0.858506457046603, "grad_norm": 0.48042669892311096, "learning_rate": 8.98196473327283e-06, "loss": 0.4616, "step": 1529 }, { "epoch": 0.8590679393599102, "grad_norm": 0.5272425413131714, "learning_rate": 8.979988040078549e-06, "loss": 0.4303, "step": 1530 }, { "epoch": 0.8596294216732173, "grad_norm": 0.6263166069984436, "learning_rate": 8.978009647652654e-06, "loss": 0.4676, "step": 1531 }, { "epoch": 0.8601909039865244, "grad_norm": 0.519582986831665, "learning_rate": 8.97602955683981e-06, "loss": 0.4781, "step": 1532 }, { "epoch": 0.8607523862998315, "grad_norm": 0.5481398701667786, "learning_rate": 8.974047768485406e-06, "loss": 0.4395, "step": 1533 }, { "epoch": 0.8613138686131386, "grad_norm": 0.5710074305534363, "learning_rate": 8.97206428343555e-06, "loss": 0.4357, "step": 1534 }, { "epoch": 0.8618753509264458, "grad_norm": 0.5662114024162292, "learning_rate": 8.970079102537082e-06, "loss": 0.4529, "step": 1535 }, { "epoch": 0.862436833239753, "grad_norm": 0.543968915939331, "learning_rate": 8.968092226637564e-06, "loss": 0.4687, "step": 1536 }, { "epoch": 0.8629983155530601, "grad_norm": 0.6427304744720459, "learning_rate": 8.966103656585277e-06, "loss": 0.4384, "step": 1537 }, { "epoch": 0.8635597978663672, "grad_norm": 0.6275960803031921, "learning_rate": 8.964113393229232e-06, "loss": 0.4642, "step": 1538 }, { "epoch": 0.8641212801796744, "grad_norm": 0.6437975764274597, "learning_rate": 8.96212143741916e-06, "loss": 0.4462, "step": 1539 }, { "epoch": 0.8646827624929815, "grad_norm": 0.5807445645332336, "learning_rate": 8.96012779000551e-06, "loss": 0.4298, "step": 1540 }, { "epoch": 0.8652442448062886, "grad_norm": 0.5348765850067139, "learning_rate": 8.958132451839464e-06, "loss": 0.4698, "step": 1541 }, { "epoch": 0.8658057271195957, "grad_norm": 0.5809811949729919, "learning_rate": 8.956135423772914e-06, "loss": 0.4674, "step": 1542 }, { "epoch": 0.8663672094329029, "grad_norm": 0.6516228914260864, "learning_rate": 8.954136706658481e-06, "loss": 0.4713, "step": 1543 }, { "epoch": 0.86692869174621, "grad_norm": 0.49846288561820984, "learning_rate": 8.952136301349505e-06, "loss": 0.4791, "step": 1544 }, { "epoch": 0.8674901740595171, "grad_norm": 0.4879606366157532, "learning_rate": 8.950134208700047e-06, "loss": 0.4221, "step": 1545 }, { "epoch": 0.8680516563728242, "grad_norm": 0.6306393146514893, "learning_rate": 8.948130429564889e-06, "loss": 0.471, "step": 1546 }, { "epoch": 0.8686131386861314, "grad_norm": 0.5403028130531311, "learning_rate": 8.94612496479953e-06, "loss": 0.4364, "step": 1547 }, { "epoch": 0.8691746209994385, "grad_norm": 0.5594610571861267, "learning_rate": 8.944117815260194e-06, "loss": 0.4747, "step": 1548 }, { "epoch": 0.8697361033127456, "grad_norm": 0.6345369815826416, "learning_rate": 8.942108981803818e-06, "loss": 0.4955, "step": 1549 }, { "epoch": 0.8702975856260527, "grad_norm": 0.5889030694961548, "learning_rate": 8.940098465288065e-06, "loss": 0.4568, "step": 1550 }, { "epoch": 0.87085906793936, "grad_norm": 0.593271791934967, "learning_rate": 8.93808626657131e-06, "loss": 0.4851, "step": 1551 }, { "epoch": 0.8714205502526671, "grad_norm": 0.5225347876548767, "learning_rate": 8.936072386512647e-06, "loss": 0.4452, "step": 1552 }, { "epoch": 0.8719820325659742, "grad_norm": 0.6119551658630371, "learning_rate": 8.934056825971896e-06, "loss": 0.4522, "step": 1553 }, { "epoch": 0.8725435148792813, "grad_norm": 0.5477710366249084, "learning_rate": 8.932039585809582e-06, "loss": 0.4335, "step": 1554 }, { "epoch": 0.8731049971925884, "grad_norm": 0.5385422706604004, "learning_rate": 8.930020666886956e-06, "loss": 0.4374, "step": 1555 }, { "epoch": 0.8736664795058956, "grad_norm": 0.6357563734054565, "learning_rate": 8.928000070065986e-06, "loss": 0.4504, "step": 1556 }, { "epoch": 0.8742279618192027, "grad_norm": 0.5183719992637634, "learning_rate": 8.925977796209347e-06, "loss": 0.4518, "step": 1557 }, { "epoch": 0.8747894441325098, "grad_norm": 0.5291733741760254, "learning_rate": 8.923953846180442e-06, "loss": 0.4322, "step": 1558 }, { "epoch": 0.8753509264458169, "grad_norm": 0.5403515696525574, "learning_rate": 8.921928220843382e-06, "loss": 0.4226, "step": 1559 }, { "epoch": 0.8759124087591241, "grad_norm": 0.5146910548210144, "learning_rate": 8.919900921062996e-06, "loss": 0.4396, "step": 1560 }, { "epoch": 0.8764738910724312, "grad_norm": 0.5308709144592285, "learning_rate": 8.917871947704828e-06, "loss": 0.4276, "step": 1561 }, { "epoch": 0.8770353733857383, "grad_norm": 0.5391326546669006, "learning_rate": 8.915841301635132e-06, "loss": 0.4772, "step": 1562 }, { "epoch": 0.8775968556990454, "grad_norm": 0.499345064163208, "learning_rate": 8.913808983720885e-06, "loss": 0.426, "step": 1563 }, { "epoch": 0.8781583380123527, "grad_norm": 0.49767017364501953, "learning_rate": 8.911774994829767e-06, "loss": 0.4496, "step": 1564 }, { "epoch": 0.8787198203256598, "grad_norm": 0.5295262932777405, "learning_rate": 8.909739335830183e-06, "loss": 0.4488, "step": 1565 }, { "epoch": 0.8792813026389669, "grad_norm": 0.560410737991333, "learning_rate": 8.907702007591243e-06, "loss": 0.4649, "step": 1566 }, { "epoch": 0.879842784952274, "grad_norm": 0.4895114302635193, "learning_rate": 8.90566301098277e-06, "loss": 0.4106, "step": 1567 }, { "epoch": 0.8804042672655812, "grad_norm": 0.5414503812789917, "learning_rate": 8.903622346875305e-06, "loss": 0.4822, "step": 1568 }, { "epoch": 0.8809657495788883, "grad_norm": 0.5560813546180725, "learning_rate": 8.901580016140092e-06, "loss": 0.4562, "step": 1569 }, { "epoch": 0.8815272318921954, "grad_norm": 0.552005410194397, "learning_rate": 8.899536019649096e-06, "loss": 0.441, "step": 1570 }, { "epoch": 0.8820887142055025, "grad_norm": 0.6236041188240051, "learning_rate": 8.897490358274987e-06, "loss": 0.4289, "step": 1571 }, { "epoch": 0.8826501965188096, "grad_norm": 0.7148640155792236, "learning_rate": 8.895443032891149e-06, "loss": 0.4554, "step": 1572 }, { "epoch": 0.8832116788321168, "grad_norm": 0.6140838265419006, "learning_rate": 8.893394044371672e-06, "loss": 0.4725, "step": 1573 }, { "epoch": 0.8837731611454239, "grad_norm": 0.5139850378036499, "learning_rate": 8.891343393591362e-06, "loss": 0.4343, "step": 1574 }, { "epoch": 0.884334643458731, "grad_norm": 0.612297534942627, "learning_rate": 8.889291081425733e-06, "loss": 0.4583, "step": 1575 }, { "epoch": 0.8848961257720381, "grad_norm": 0.5828324556350708, "learning_rate": 8.887237108751004e-06, "loss": 0.455, "step": 1576 }, { "epoch": 0.8854576080853453, "grad_norm": 0.638813316822052, "learning_rate": 8.885181476444106e-06, "loss": 0.4508, "step": 1577 }, { "epoch": 0.8860190903986525, "grad_norm": 0.5022094249725342, "learning_rate": 8.883124185382682e-06, "loss": 0.434, "step": 1578 }, { "epoch": 0.8865805727119596, "grad_norm": 0.5258540511131287, "learning_rate": 8.881065236445074e-06, "loss": 0.4311, "step": 1579 }, { "epoch": 0.8871420550252667, "grad_norm": 0.5354477763175964, "learning_rate": 8.879004630510345e-06, "loss": 0.4775, "step": 1580 }, { "epoch": 0.8877035373385739, "grad_norm": 0.5808379650115967, "learning_rate": 8.876942368458254e-06, "loss": 0.5055, "step": 1581 }, { "epoch": 0.888265019651881, "grad_norm": 0.6544892191886902, "learning_rate": 8.87487845116927e-06, "loss": 0.4769, "step": 1582 }, { "epoch": 0.8888265019651881, "grad_norm": 0.49900054931640625, "learning_rate": 8.872812879524574e-06, "loss": 0.4455, "step": 1583 }, { "epoch": 0.8893879842784952, "grad_norm": 0.7507695555686951, "learning_rate": 8.870745654406045e-06, "loss": 0.4949, "step": 1584 }, { "epoch": 0.8899494665918024, "grad_norm": 0.5257821679115295, "learning_rate": 8.868676776696276e-06, "loss": 0.4186, "step": 1585 }, { "epoch": 0.8905109489051095, "grad_norm": 0.6141937375068665, "learning_rate": 8.866606247278559e-06, "loss": 0.4403, "step": 1586 }, { "epoch": 0.8910724312184166, "grad_norm": 0.5718101263046265, "learning_rate": 8.864534067036894e-06, "loss": 0.4384, "step": 1587 }, { "epoch": 0.8916339135317237, "grad_norm": 0.5891246199607849, "learning_rate": 8.862460236855988e-06, "loss": 0.4465, "step": 1588 }, { "epoch": 0.8921953958450309, "grad_norm": 0.5219178199768066, "learning_rate": 8.860384757621247e-06, "loss": 0.4695, "step": 1589 }, { "epoch": 0.892756878158338, "grad_norm": 0.5391274690628052, "learning_rate": 8.858307630218786e-06, "loss": 0.4727, "step": 1590 }, { "epoch": 0.8933183604716451, "grad_norm": 0.5333481431007385, "learning_rate": 8.856228855535423e-06, "loss": 0.461, "step": 1591 }, { "epoch": 0.8938798427849522, "grad_norm": 0.49042749404907227, "learning_rate": 8.854148434458676e-06, "loss": 0.4459, "step": 1592 }, { "epoch": 0.8944413250982594, "grad_norm": 0.5676397085189819, "learning_rate": 8.852066367876768e-06, "loss": 0.4775, "step": 1593 }, { "epoch": 0.8950028074115666, "grad_norm": 0.5553759336471558, "learning_rate": 8.849982656678626e-06, "loss": 0.431, "step": 1594 }, { "epoch": 0.8955642897248737, "grad_norm": 0.5148587822914124, "learning_rate": 8.847897301753879e-06, "loss": 0.4489, "step": 1595 }, { "epoch": 0.8961257720381808, "grad_norm": 0.6019876599311829, "learning_rate": 8.845810303992852e-06, "loss": 0.4314, "step": 1596 }, { "epoch": 0.8966872543514879, "grad_norm": 0.6373450756072998, "learning_rate": 8.84372166428658e-06, "loss": 0.4555, "step": 1597 }, { "epoch": 0.8972487366647951, "grad_norm": 0.5373201370239258, "learning_rate": 8.841631383526795e-06, "loss": 0.4681, "step": 1598 }, { "epoch": 0.8978102189781022, "grad_norm": 0.6211251020431519, "learning_rate": 8.839539462605927e-06, "loss": 0.4273, "step": 1599 }, { "epoch": 0.8983717012914093, "grad_norm": 0.5041552186012268, "learning_rate": 8.83744590241711e-06, "loss": 0.4217, "step": 1600 }, { "epoch": 0.8989331836047164, "grad_norm": 0.5117703080177307, "learning_rate": 8.835350703854177e-06, "loss": 0.4469, "step": 1601 }, { "epoch": 0.8994946659180236, "grad_norm": 0.5214657783508301, "learning_rate": 8.833253867811662e-06, "loss": 0.4309, "step": 1602 }, { "epoch": 0.9000561482313307, "grad_norm": 0.5231374502182007, "learning_rate": 8.831155395184793e-06, "loss": 0.4548, "step": 1603 }, { "epoch": 0.9006176305446378, "grad_norm": 0.5635905861854553, "learning_rate": 8.829055286869502e-06, "loss": 0.4699, "step": 1604 }, { "epoch": 0.9011791128579449, "grad_norm": 0.5029401779174805, "learning_rate": 8.826953543762418e-06, "loss": 0.4583, "step": 1605 }, { "epoch": 0.9017405951712522, "grad_norm": 0.4946396052837372, "learning_rate": 8.824850166760866e-06, "loss": 0.4349, "step": 1606 }, { "epoch": 0.9023020774845593, "grad_norm": 0.5754432082176208, "learning_rate": 8.82274515676287e-06, "loss": 0.4539, "step": 1607 }, { "epoch": 0.9028635597978664, "grad_norm": 0.548443615436554, "learning_rate": 8.820638514667153e-06, "loss": 0.4607, "step": 1608 }, { "epoch": 0.9034250421111735, "grad_norm": 0.5190774202346802, "learning_rate": 8.81853024137313e-06, "loss": 0.4824, "step": 1609 }, { "epoch": 0.9039865244244806, "grad_norm": 0.4984028935432434, "learning_rate": 8.816420337780917e-06, "loss": 0.4474, "step": 1610 }, { "epoch": 0.9045480067377878, "grad_norm": 0.6978410482406616, "learning_rate": 8.814308804791326e-06, "loss": 0.4863, "step": 1611 }, { "epoch": 0.9051094890510949, "grad_norm": 0.5415452718734741, "learning_rate": 8.812195643305862e-06, "loss": 0.4537, "step": 1612 }, { "epoch": 0.905670971364402, "grad_norm": 0.5150550007820129, "learning_rate": 8.810080854226724e-06, "loss": 0.4642, "step": 1613 }, { "epoch": 0.9062324536777091, "grad_norm": 0.4988594651222229, "learning_rate": 8.807964438456813e-06, "loss": 0.4507, "step": 1614 }, { "epoch": 0.9067939359910163, "grad_norm": 0.5350676774978638, "learning_rate": 8.805846396899716e-06, "loss": 0.4647, "step": 1615 }, { "epoch": 0.9073554183043234, "grad_norm": 0.4837181866168976, "learning_rate": 8.803726730459719e-06, "loss": 0.4695, "step": 1616 }, { "epoch": 0.9079169006176305, "grad_norm": 0.5787271857261658, "learning_rate": 8.801605440041801e-06, "loss": 0.4562, "step": 1617 }, { "epoch": 0.9084783829309376, "grad_norm": 0.5609124898910522, "learning_rate": 8.799482526551633e-06, "loss": 0.458, "step": 1618 }, { "epoch": 0.9090398652442448, "grad_norm": 0.5027602910995483, "learning_rate": 8.797357990895582e-06, "loss": 0.4421, "step": 1619 }, { "epoch": 0.909601347557552, "grad_norm": 0.5441383719444275, "learning_rate": 8.795231833980705e-06, "loss": 0.4614, "step": 1620 }, { "epoch": 0.9101628298708591, "grad_norm": 0.49796417355537415, "learning_rate": 8.793104056714748e-06, "loss": 0.4438, "step": 1621 }, { "epoch": 0.9107243121841662, "grad_norm": 0.4766249656677246, "learning_rate": 8.790974660006158e-06, "loss": 0.428, "step": 1622 }, { "epoch": 0.9112857944974734, "grad_norm": 0.5331078171730042, "learning_rate": 8.788843644764063e-06, "loss": 0.4248, "step": 1623 }, { "epoch": 0.9118472768107805, "grad_norm": 0.49658915400505066, "learning_rate": 8.786711011898292e-06, "loss": 0.4559, "step": 1624 }, { "epoch": 0.9124087591240876, "grad_norm": 0.5765029191970825, "learning_rate": 8.784576762319358e-06, "loss": 0.4736, "step": 1625 }, { "epoch": 0.9129702414373947, "grad_norm": 0.5564265251159668, "learning_rate": 8.782440896938462e-06, "loss": 0.4242, "step": 1626 }, { "epoch": 0.9135317237507019, "grad_norm": 0.5147901773452759, "learning_rate": 8.780303416667502e-06, "loss": 0.4666, "step": 1627 }, { "epoch": 0.914093206064009, "grad_norm": 0.46161654591560364, "learning_rate": 8.778164322419061e-06, "loss": 0.4148, "step": 1628 }, { "epoch": 0.9146546883773161, "grad_norm": 0.6060371398925781, "learning_rate": 8.776023615106415e-06, "loss": 0.464, "step": 1629 }, { "epoch": 0.9152161706906232, "grad_norm": 0.5654277205467224, "learning_rate": 8.773881295643522e-06, "loss": 0.4497, "step": 1630 }, { "epoch": 0.9157776530039303, "grad_norm": 0.5280105471611023, "learning_rate": 8.771737364945037e-06, "loss": 0.4261, "step": 1631 }, { "epoch": 0.9163391353172375, "grad_norm": 0.6600010991096497, "learning_rate": 8.769591823926292e-06, "loss": 0.4647, "step": 1632 }, { "epoch": 0.9169006176305446, "grad_norm": 0.6624104380607605, "learning_rate": 8.767444673503319e-06, "loss": 0.4618, "step": 1633 }, { "epoch": 0.9174620999438517, "grad_norm": 0.5148696899414062, "learning_rate": 8.765295914592824e-06, "loss": 0.4216, "step": 1634 }, { "epoch": 0.9180235822571589, "grad_norm": 0.47854405641555786, "learning_rate": 8.763145548112214e-06, "loss": 0.4464, "step": 1635 }, { "epoch": 0.9185850645704661, "grad_norm": 0.5234184265136719, "learning_rate": 8.760993574979567e-06, "loss": 0.4473, "step": 1636 }, { "epoch": 0.9191465468837732, "grad_norm": 0.5473552346229553, "learning_rate": 8.758839996113661e-06, "loss": 0.4483, "step": 1637 }, { "epoch": 0.9197080291970803, "grad_norm": 0.5336732864379883, "learning_rate": 8.75668481243395e-06, "loss": 0.4702, "step": 1638 }, { "epoch": 0.9202695115103874, "grad_norm": 0.5227993130683899, "learning_rate": 8.75452802486058e-06, "loss": 0.4692, "step": 1639 }, { "epoch": 0.9208309938236946, "grad_norm": 0.5239608883857727, "learning_rate": 8.752369634314375e-06, "loss": 0.4708, "step": 1640 }, { "epoch": 0.9213924761370017, "grad_norm": 0.585780918598175, "learning_rate": 8.750209641716848e-06, "loss": 0.4755, "step": 1641 }, { "epoch": 0.9219539584503088, "grad_norm": 0.5174302458763123, "learning_rate": 8.748048047990191e-06, "loss": 0.4865, "step": 1642 }, { "epoch": 0.9225154407636159, "grad_norm": 0.5513038039207458, "learning_rate": 8.74588485405729e-06, "loss": 0.44, "step": 1643 }, { "epoch": 0.9230769230769231, "grad_norm": 0.6292861700057983, "learning_rate": 8.743720060841703e-06, "loss": 0.4711, "step": 1644 }, { "epoch": 0.9236384053902302, "grad_norm": 0.6442723274230957, "learning_rate": 8.741553669267677e-06, "loss": 0.4475, "step": 1645 }, { "epoch": 0.9241998877035373, "grad_norm": 0.5038877725601196, "learning_rate": 8.739385680260138e-06, "loss": 0.4687, "step": 1646 }, { "epoch": 0.9247613700168444, "grad_norm": 0.5155378580093384, "learning_rate": 8.737216094744698e-06, "loss": 0.4351, "step": 1647 }, { "epoch": 0.9253228523301515, "grad_norm": 0.5607654452323914, "learning_rate": 8.735044913647646e-06, "loss": 0.4644, "step": 1648 }, { "epoch": 0.9258843346434588, "grad_norm": 0.5438370108604431, "learning_rate": 8.732872137895953e-06, "loss": 0.4322, "step": 1649 }, { "epoch": 0.9264458169567659, "grad_norm": 0.5665859580039978, "learning_rate": 8.730697768417278e-06, "loss": 0.4479, "step": 1650 }, { "epoch": 0.927007299270073, "grad_norm": 0.5508242249488831, "learning_rate": 8.728521806139952e-06, "loss": 0.4324, "step": 1651 }, { "epoch": 0.9275687815833801, "grad_norm": 0.5982184410095215, "learning_rate": 8.726344251992986e-06, "loss": 0.4643, "step": 1652 }, { "epoch": 0.9281302638966873, "grad_norm": 0.6305740475654602, "learning_rate": 8.724165106906078e-06, "loss": 0.475, "step": 1653 }, { "epoch": 0.9286917462099944, "grad_norm": 0.4956609904766083, "learning_rate": 8.721984371809598e-06, "loss": 0.4183, "step": 1654 }, { "epoch": 0.9292532285233015, "grad_norm": 0.5849156975746155, "learning_rate": 8.719802047634596e-06, "loss": 0.4399, "step": 1655 }, { "epoch": 0.9298147108366086, "grad_norm": 0.5464709997177124, "learning_rate": 8.717618135312807e-06, "loss": 0.4539, "step": 1656 }, { "epoch": 0.9303761931499158, "grad_norm": 0.46168237924575806, "learning_rate": 8.715432635776634e-06, "loss": 0.425, "step": 1657 }, { "epoch": 0.9309376754632229, "grad_norm": 0.5800828337669373, "learning_rate": 8.713245549959166e-06, "loss": 0.4474, "step": 1658 }, { "epoch": 0.93149915777653, "grad_norm": 0.5385411977767944, "learning_rate": 8.711056878794164e-06, "loss": 0.4646, "step": 1659 }, { "epoch": 0.9320606400898371, "grad_norm": 0.502737283706665, "learning_rate": 8.70886662321607e-06, "loss": 0.4368, "step": 1660 }, { "epoch": 0.9326221224031443, "grad_norm": 0.6090850830078125, "learning_rate": 8.706674784159997e-06, "loss": 0.4601, "step": 1661 }, { "epoch": 0.9331836047164515, "grad_norm": 0.558006763458252, "learning_rate": 8.704481362561739e-06, "loss": 0.4559, "step": 1662 }, { "epoch": 0.9337450870297586, "grad_norm": 0.5412368178367615, "learning_rate": 8.702286359357766e-06, "loss": 0.4505, "step": 1663 }, { "epoch": 0.9343065693430657, "grad_norm": 0.5355163216590881, "learning_rate": 8.70008977548522e-06, "loss": 0.4274, "step": 1664 }, { "epoch": 0.9348680516563729, "grad_norm": 0.5276992917060852, "learning_rate": 8.697891611881916e-06, "loss": 0.4796, "step": 1665 }, { "epoch": 0.93542953396968, "grad_norm": 0.6763882637023926, "learning_rate": 8.69569186948635e-06, "loss": 0.5062, "step": 1666 }, { "epoch": 0.9359910162829871, "grad_norm": 0.5065687894821167, "learning_rate": 8.693490549237686e-06, "loss": 0.4706, "step": 1667 }, { "epoch": 0.9365524985962942, "grad_norm": 0.5658678412437439, "learning_rate": 8.691287652075769e-06, "loss": 0.4325, "step": 1668 }, { "epoch": 0.9371139809096013, "grad_norm": 0.5728349685668945, "learning_rate": 8.689083178941107e-06, "loss": 0.4492, "step": 1669 }, { "epoch": 0.9376754632229085, "grad_norm": 0.557280421257019, "learning_rate": 8.686877130774889e-06, "loss": 0.4294, "step": 1670 }, { "epoch": 0.9382369455362156, "grad_norm": 0.5375300049781799, "learning_rate": 8.684669508518973e-06, "loss": 0.4413, "step": 1671 }, { "epoch": 0.9387984278495227, "grad_norm": 0.549645721912384, "learning_rate": 8.682460313115891e-06, "loss": 0.4287, "step": 1672 }, { "epoch": 0.9393599101628298, "grad_norm": 0.6498605012893677, "learning_rate": 8.680249545508845e-06, "loss": 0.4815, "step": 1673 }, { "epoch": 0.939921392476137, "grad_norm": 0.49336421489715576, "learning_rate": 8.678037206641709e-06, "loss": 0.4607, "step": 1674 }, { "epoch": 0.9404828747894441, "grad_norm": 0.6199778914451599, "learning_rate": 8.675823297459027e-06, "loss": 0.4592, "step": 1675 }, { "epoch": 0.9410443571027512, "grad_norm": 0.6004332304000854, "learning_rate": 8.673607818906014e-06, "loss": 0.4623, "step": 1676 }, { "epoch": 0.9416058394160584, "grad_norm": 0.5742245316505432, "learning_rate": 8.671390771928554e-06, "loss": 0.426, "step": 1677 }, { "epoch": 0.9421673217293656, "grad_norm": 0.6202951073646545, "learning_rate": 8.669172157473205e-06, "loss": 0.4509, "step": 1678 }, { "epoch": 0.9427288040426727, "grad_norm": 0.5261647701263428, "learning_rate": 8.666951976487188e-06, "loss": 0.4439, "step": 1679 }, { "epoch": 0.9432902863559798, "grad_norm": 0.5129465460777283, "learning_rate": 8.664730229918395e-06, "loss": 0.4712, "step": 1680 }, { "epoch": 0.9438517686692869, "grad_norm": 0.5814812183380127, "learning_rate": 8.66250691871539e-06, "loss": 0.4349, "step": 1681 }, { "epoch": 0.9444132509825941, "grad_norm": 0.6136846542358398, "learning_rate": 8.660282043827399e-06, "loss": 0.4325, "step": 1682 }, { "epoch": 0.9449747332959012, "grad_norm": 0.5020133852958679, "learning_rate": 8.658055606204322e-06, "loss": 0.4424, "step": 1683 }, { "epoch": 0.9455362156092083, "grad_norm": 0.59068363904953, "learning_rate": 8.655827606796722e-06, "loss": 0.4573, "step": 1684 }, { "epoch": 0.9460976979225154, "grad_norm": 0.5725700259208679, "learning_rate": 8.653598046555832e-06, "loss": 0.4678, "step": 1685 }, { "epoch": 0.9466591802358225, "grad_norm": 0.5570755004882812, "learning_rate": 8.651366926433541e-06, "loss": 0.4567, "step": 1686 }, { "epoch": 0.9472206625491297, "grad_norm": 0.5643100738525391, "learning_rate": 8.649134247382421e-06, "loss": 0.4491, "step": 1687 }, { "epoch": 0.9477821448624368, "grad_norm": 0.616324782371521, "learning_rate": 8.646900010355696e-06, "loss": 0.4598, "step": 1688 }, { "epoch": 0.9483436271757439, "grad_norm": 0.5976985096931458, "learning_rate": 8.644664216307263e-06, "loss": 0.4648, "step": 1689 }, { "epoch": 0.948905109489051, "grad_norm": 0.5394681692123413, "learning_rate": 8.642426866191677e-06, "loss": 0.4349, "step": 1690 }, { "epoch": 0.9494665918023583, "grad_norm": 0.5924339890480042, "learning_rate": 8.640187960964163e-06, "loss": 0.4454, "step": 1691 }, { "epoch": 0.9500280741156654, "grad_norm": 0.5375018119812012, "learning_rate": 8.637947501580608e-06, "loss": 0.4562, "step": 1692 }, { "epoch": 0.9505895564289725, "grad_norm": 0.5938693881034851, "learning_rate": 8.635705488997561e-06, "loss": 0.4701, "step": 1693 }, { "epoch": 0.9511510387422796, "grad_norm": 0.5258591771125793, "learning_rate": 8.633461924172237e-06, "loss": 0.417, "step": 1694 }, { "epoch": 0.9517125210555868, "grad_norm": 0.5276675224304199, "learning_rate": 8.631216808062513e-06, "loss": 0.43, "step": 1695 }, { "epoch": 0.9522740033688939, "grad_norm": 0.5334724187850952, "learning_rate": 8.628970141626927e-06, "loss": 0.4635, "step": 1696 }, { "epoch": 0.952835485682201, "grad_norm": 0.6838250160217285, "learning_rate": 8.62672192582468e-06, "loss": 0.4644, "step": 1697 }, { "epoch": 0.9533969679955081, "grad_norm": 0.5691825151443481, "learning_rate": 8.62447216161563e-06, "loss": 0.4551, "step": 1698 }, { "epoch": 0.9539584503088153, "grad_norm": 0.583110511302948, "learning_rate": 8.622220849960305e-06, "loss": 0.404, "step": 1699 }, { "epoch": 0.9545199326221224, "grad_norm": 0.6356167793273926, "learning_rate": 8.619967991819887e-06, "loss": 0.4326, "step": 1700 }, { "epoch": 0.9550814149354295, "grad_norm": 0.5792338252067566, "learning_rate": 8.61771358815622e-06, "loss": 0.43, "step": 1701 }, { "epoch": 0.9556428972487366, "grad_norm": 0.6051939129829407, "learning_rate": 8.61545763993181e-06, "loss": 0.4293, "step": 1702 }, { "epoch": 0.9562043795620438, "grad_norm": 0.5443390607833862, "learning_rate": 8.613200148109815e-06, "loss": 0.4774, "step": 1703 }, { "epoch": 0.956765861875351, "grad_norm": 0.5755391120910645, "learning_rate": 8.610941113654064e-06, "loss": 0.4519, "step": 1704 }, { "epoch": 0.9573273441886581, "grad_norm": 0.6586125493049622, "learning_rate": 8.608680537529032e-06, "loss": 0.4278, "step": 1705 }, { "epoch": 0.9578888265019652, "grad_norm": 0.49623075127601624, "learning_rate": 8.606418420699862e-06, "loss": 0.4657, "step": 1706 }, { "epoch": 0.9584503088152723, "grad_norm": 0.6355677247047424, "learning_rate": 8.604154764132351e-06, "loss": 0.4554, "step": 1707 }, { "epoch": 0.9590117911285795, "grad_norm": 0.5773447751998901, "learning_rate": 8.60188956879295e-06, "loss": 0.4426, "step": 1708 }, { "epoch": 0.9595732734418866, "grad_norm": 0.5046952366828918, "learning_rate": 8.599622835648774e-06, "loss": 0.4775, "step": 1709 }, { "epoch": 0.9601347557551937, "grad_norm": 0.6949230432510376, "learning_rate": 8.59735456566759e-06, "loss": 0.4859, "step": 1710 }, { "epoch": 0.9606962380685008, "grad_norm": 0.5830551981925964, "learning_rate": 8.59508475981782e-06, "loss": 0.3937, "step": 1711 }, { "epoch": 0.961257720381808, "grad_norm": 0.6057324409484863, "learning_rate": 8.592813419068547e-06, "loss": 0.4582, "step": 1712 }, { "epoch": 0.9618192026951151, "grad_norm": 0.5556498765945435, "learning_rate": 8.590540544389501e-06, "loss": 0.4272, "step": 1713 }, { "epoch": 0.9623806850084222, "grad_norm": 0.5346919894218445, "learning_rate": 8.588266136751078e-06, "loss": 0.4098, "step": 1714 }, { "epoch": 0.9629421673217293, "grad_norm": 0.5504812598228455, "learning_rate": 8.58599019712432e-06, "loss": 0.4613, "step": 1715 }, { "epoch": 0.9635036496350365, "grad_norm": 0.5354852080345154, "learning_rate": 8.583712726480924e-06, "loss": 0.4528, "step": 1716 }, { "epoch": 0.9640651319483436, "grad_norm": 0.5496824383735657, "learning_rate": 8.581433725793245e-06, "loss": 0.4341, "step": 1717 }, { "epoch": 0.9646266142616507, "grad_norm": 0.5240011215209961, "learning_rate": 8.579153196034284e-06, "loss": 0.4347, "step": 1718 }, { "epoch": 0.9651880965749579, "grad_norm": 0.5651429891586304, "learning_rate": 8.576871138177704e-06, "loss": 0.4268, "step": 1719 }, { "epoch": 0.9657495788882651, "grad_norm": 0.5496369004249573, "learning_rate": 8.574587553197814e-06, "loss": 0.4229, "step": 1720 }, { "epoch": 0.9663110612015722, "grad_norm": 0.5108523368835449, "learning_rate": 8.572302442069578e-06, "loss": 0.4492, "step": 1721 }, { "epoch": 0.9668725435148793, "grad_norm": 0.5223454833030701, "learning_rate": 8.570015805768606e-06, "loss": 0.4444, "step": 1722 }, { "epoch": 0.9674340258281864, "grad_norm": 0.5420830249786377, "learning_rate": 8.56772764527117e-06, "loss": 0.466, "step": 1723 }, { "epoch": 0.9679955081414935, "grad_norm": 0.5116471648216248, "learning_rate": 8.565437961554179e-06, "loss": 0.4459, "step": 1724 }, { "epoch": 0.9685569904548007, "grad_norm": 0.5153878927230835, "learning_rate": 8.563146755595204e-06, "loss": 0.4424, "step": 1725 }, { "epoch": 0.9691184727681078, "grad_norm": 0.4957164525985718, "learning_rate": 8.560854028372458e-06, "loss": 0.4501, "step": 1726 }, { "epoch": 0.9696799550814149, "grad_norm": 0.4773913025856018, "learning_rate": 8.558559780864813e-06, "loss": 0.4441, "step": 1727 }, { "epoch": 0.970241437394722, "grad_norm": 0.5001927018165588, "learning_rate": 8.556264014051778e-06, "loss": 0.4386, "step": 1728 }, { "epoch": 0.9708029197080292, "grad_norm": 0.5303502082824707, "learning_rate": 8.553966728913518e-06, "loss": 0.4258, "step": 1729 }, { "epoch": 0.9713644020213363, "grad_norm": 0.4586193263530731, "learning_rate": 8.551667926430847e-06, "loss": 0.4299, "step": 1730 }, { "epoch": 0.9719258843346434, "grad_norm": 0.48762816190719604, "learning_rate": 8.549367607585223e-06, "loss": 0.4356, "step": 1731 }, { "epoch": 0.9724873666479505, "grad_norm": 0.5060698390007019, "learning_rate": 8.547065773358754e-06, "loss": 0.4173, "step": 1732 }, { "epoch": 0.9730488489612578, "grad_norm": 0.524591863155365, "learning_rate": 8.544762424734195e-06, "loss": 0.4463, "step": 1733 }, { "epoch": 0.9736103312745649, "grad_norm": 0.5740014314651489, "learning_rate": 8.542457562694941e-06, "loss": 0.4719, "step": 1734 }, { "epoch": 0.974171813587872, "grad_norm": 0.5199464559555054, "learning_rate": 8.540151188225047e-06, "loss": 0.4603, "step": 1735 }, { "epoch": 0.9747332959011791, "grad_norm": 0.5600287318229675, "learning_rate": 8.537843302309199e-06, "loss": 0.4796, "step": 1736 }, { "epoch": 0.9752947782144863, "grad_norm": 0.6243382692337036, "learning_rate": 8.535533905932739e-06, "loss": 0.4439, "step": 1737 }, { "epoch": 0.9758562605277934, "grad_norm": 0.6192986965179443, "learning_rate": 8.533223000081647e-06, "loss": 0.4724, "step": 1738 }, { "epoch": 0.9764177428411005, "grad_norm": 0.5536543130874634, "learning_rate": 8.530910585742552e-06, "loss": 0.466, "step": 1739 }, { "epoch": 0.9769792251544076, "grad_norm": 0.5135395526885986, "learning_rate": 8.528596663902722e-06, "loss": 0.4553, "step": 1740 }, { "epoch": 0.9775407074677148, "grad_norm": 0.52460116147995, "learning_rate": 8.526281235550076e-06, "loss": 0.448, "step": 1741 }, { "epoch": 0.9781021897810219, "grad_norm": 0.5339475274085999, "learning_rate": 8.52396430167317e-06, "loss": 0.4341, "step": 1742 }, { "epoch": 0.978663672094329, "grad_norm": 0.6007175445556641, "learning_rate": 8.521645863261205e-06, "loss": 0.4643, "step": 1743 }, { "epoch": 0.9792251544076361, "grad_norm": 0.555406928062439, "learning_rate": 8.519325921304023e-06, "loss": 0.4638, "step": 1744 }, { "epoch": 0.9797866367209432, "grad_norm": 0.526346743106842, "learning_rate": 8.517004476792113e-06, "loss": 0.4783, "step": 1745 }, { "epoch": 0.9803481190342505, "grad_norm": 0.4996754825115204, "learning_rate": 8.514681530716595e-06, "loss": 0.4367, "step": 1746 }, { "epoch": 0.9809096013475576, "grad_norm": 0.5538592338562012, "learning_rate": 8.512357084069242e-06, "loss": 0.4339, "step": 1747 }, { "epoch": 0.9814710836608647, "grad_norm": 0.4997721016407013, "learning_rate": 8.510031137842462e-06, "loss": 0.4636, "step": 1748 }, { "epoch": 0.9820325659741718, "grad_norm": 0.4847489893436432, "learning_rate": 8.5077036930293e-06, "loss": 0.4381, "step": 1749 }, { "epoch": 0.982594048287479, "grad_norm": 0.6043828725814819, "learning_rate": 8.505374750623448e-06, "loss": 0.4508, "step": 1750 }, { "epoch": 0.9831555306007861, "grad_norm": 0.4631125032901764, "learning_rate": 8.503044311619232e-06, "loss": 0.4435, "step": 1751 }, { "epoch": 0.9837170129140932, "grad_norm": 0.5343685746192932, "learning_rate": 8.500712377011618e-06, "loss": 0.4398, "step": 1752 }, { "epoch": 0.9842784952274003, "grad_norm": 0.6258784532546997, "learning_rate": 8.498378947796215e-06, "loss": 0.4291, "step": 1753 }, { "epoch": 0.9848399775407075, "grad_norm": 0.5559399127960205, "learning_rate": 8.49604402496926e-06, "loss": 0.4482, "step": 1754 }, { "epoch": 0.9854014598540146, "grad_norm": 0.6324735879898071, "learning_rate": 8.49370760952764e-06, "loss": 0.4815, "step": 1755 }, { "epoch": 0.9859629421673217, "grad_norm": 0.6012822985649109, "learning_rate": 8.491369702468872e-06, "loss": 0.4672, "step": 1756 }, { "epoch": 0.9865244244806288, "grad_norm": 0.6459401249885559, "learning_rate": 8.489030304791108e-06, "loss": 0.4863, "step": 1757 }, { "epoch": 0.987085906793936, "grad_norm": 0.5164533257484436, "learning_rate": 8.486689417493144e-06, "loss": 0.4375, "step": 1758 }, { "epoch": 0.9876473891072431, "grad_norm": 0.5305478572845459, "learning_rate": 8.484347041574403e-06, "loss": 0.4496, "step": 1759 }, { "epoch": 0.9882088714205502, "grad_norm": 0.5302371978759766, "learning_rate": 8.482003178034953e-06, "loss": 0.4722, "step": 1760 }, { "epoch": 0.9887703537338574, "grad_norm": 0.5576367378234863, "learning_rate": 8.479657827875486e-06, "loss": 0.4707, "step": 1761 }, { "epoch": 0.9893318360471645, "grad_norm": 0.48697003722190857, "learning_rate": 8.477310992097341e-06, "loss": 0.4435, "step": 1762 }, { "epoch": 0.9898933183604717, "grad_norm": 0.4890134036540985, "learning_rate": 8.474962671702483e-06, "loss": 0.4394, "step": 1763 }, { "epoch": 0.9904548006737788, "grad_norm": 0.4730072319507599, "learning_rate": 8.472612867693513e-06, "loss": 0.4298, "step": 1764 }, { "epoch": 0.9910162829870859, "grad_norm": 0.5538580417633057, "learning_rate": 8.470261581073664e-06, "loss": 0.4509, "step": 1765 }, { "epoch": 0.991577765300393, "grad_norm": 0.46759334206581116, "learning_rate": 8.467908812846805e-06, "loss": 0.4486, "step": 1766 }, { "epoch": 0.9921392476137002, "grad_norm": 0.5807069540023804, "learning_rate": 8.465554564017437e-06, "loss": 0.4486, "step": 1767 }, { "epoch": 0.9927007299270073, "grad_norm": 0.48510828614234924, "learning_rate": 8.463198835590691e-06, "loss": 0.479, "step": 1768 }, { "epoch": 0.9932622122403144, "grad_norm": 0.48767560720443726, "learning_rate": 8.460841628572331e-06, "loss": 0.4345, "step": 1769 }, { "epoch": 0.9938236945536215, "grad_norm": 0.562622606754303, "learning_rate": 8.45848294396875e-06, "loss": 0.476, "step": 1770 }, { "epoch": 0.9943851768669287, "grad_norm": 0.5500049591064453, "learning_rate": 8.45612278278698e-06, "loss": 0.4281, "step": 1771 }, { "epoch": 0.9949466591802358, "grad_norm": 0.5098548531532288, "learning_rate": 8.453761146034672e-06, "loss": 0.4452, "step": 1772 }, { "epoch": 0.9955081414935429, "grad_norm": 0.4974106550216675, "learning_rate": 8.451398034720112e-06, "loss": 0.4173, "step": 1773 }, { "epoch": 0.99606962380685, "grad_norm": 0.6325101852416992, "learning_rate": 8.449033449852219e-06, "loss": 0.4514, "step": 1774 }, { "epoch": 0.9966311061201573, "grad_norm": 0.515326738357544, "learning_rate": 8.446667392440537e-06, "loss": 0.4715, "step": 1775 }, { "epoch": 0.9971925884334644, "grad_norm": 0.5121425986289978, "learning_rate": 8.44429986349524e-06, "loss": 0.4754, "step": 1776 }, { "epoch": 0.9977540707467715, "grad_norm": 0.5035082101821899, "learning_rate": 8.44193086402713e-06, "loss": 0.4291, "step": 1777 }, { "epoch": 0.9983155530600786, "grad_norm": 0.519241213798523, "learning_rate": 8.439560395047637e-06, "loss": 0.4434, "step": 1778 }, { "epoch": 0.9988770353733858, "grad_norm": 0.5394167304039001, "learning_rate": 8.437188457568818e-06, "loss": 0.449, "step": 1779 }, { "epoch": 0.9994385176866929, "grad_norm": 0.5054137706756592, "learning_rate": 8.434815052603355e-06, "loss": 0.4369, "step": 1780 }, { "epoch": 1.0, "grad_norm": 0.5585078597068787, "learning_rate": 8.432440181164562e-06, "loss": 0.4092, "step": 1781 }, { "epoch": 1.000561482313307, "grad_norm": 0.5802502632141113, "learning_rate": 8.430063844266376e-06, "loss": 0.4388, "step": 1782 }, { "epoch": 1.0011229646266142, "grad_norm": 0.5512112379074097, "learning_rate": 8.427686042923358e-06, "loss": 0.4338, "step": 1783 }, { "epoch": 1.0016844469399213, "grad_norm": 0.5063155293464661, "learning_rate": 8.425306778150695e-06, "loss": 0.3875, "step": 1784 }, { "epoch": 1.0022459292532284, "grad_norm": 0.5037193894386292, "learning_rate": 8.422926050964202e-06, "loss": 0.4084, "step": 1785 }, { "epoch": 1.0028074115665357, "grad_norm": 0.5268838405609131, "learning_rate": 8.420543862380313e-06, "loss": 0.4091, "step": 1786 }, { "epoch": 1.0033688938798428, "grad_norm": 0.4434437155723572, "learning_rate": 8.418160213416091e-06, "loss": 0.425, "step": 1787 }, { "epoch": 1.00393037619315, "grad_norm": 0.4974170923233032, "learning_rate": 8.415775105089221e-06, "loss": 0.4278, "step": 1788 }, { "epoch": 1.004491858506457, "grad_norm": 0.5440188050270081, "learning_rate": 8.41338853841801e-06, "loss": 0.3959, "step": 1789 }, { "epoch": 1.0050533408197642, "grad_norm": 0.5489115118980408, "learning_rate": 8.411000514421386e-06, "loss": 0.4171, "step": 1790 }, { "epoch": 1.0056148231330713, "grad_norm": 0.5737612247467041, "learning_rate": 8.408611034118902e-06, "loss": 0.4322, "step": 1791 }, { "epoch": 1.0061763054463784, "grad_norm": 0.5377710461616516, "learning_rate": 8.406220098530734e-06, "loss": 0.4216, "step": 1792 }, { "epoch": 1.0067377877596855, "grad_norm": 0.5649153590202332, "learning_rate": 8.403827708677676e-06, "loss": 0.4012, "step": 1793 }, { "epoch": 1.0072992700729928, "grad_norm": 0.5426728129386902, "learning_rate": 8.401433865581143e-06, "loss": 0.4192, "step": 1794 }, { "epoch": 1.0078607523863, "grad_norm": 0.4989478588104248, "learning_rate": 8.399038570263174e-06, "loss": 0.4073, "step": 1795 }, { "epoch": 1.008422234699607, "grad_norm": 0.583908200263977, "learning_rate": 8.396641823746425e-06, "loss": 0.4251, "step": 1796 }, { "epoch": 1.0089837170129141, "grad_norm": 0.5246431827545166, "learning_rate": 8.394243627054172e-06, "loss": 0.3773, "step": 1797 }, { "epoch": 1.0095451993262212, "grad_norm": 0.5340030789375305, "learning_rate": 8.39184398121031e-06, "loss": 0.3874, "step": 1798 }, { "epoch": 1.0101066816395283, "grad_norm": 0.5250126719474792, "learning_rate": 8.389442887239356e-06, "loss": 0.4038, "step": 1799 }, { "epoch": 1.0106681639528354, "grad_norm": 0.5591611266136169, "learning_rate": 8.387040346166439e-06, "loss": 0.4162, "step": 1800 }, { "epoch": 1.0112296462661425, "grad_norm": 0.5341269969940186, "learning_rate": 8.38463635901731e-06, "loss": 0.4463, "step": 1801 }, { "epoch": 1.0117911285794496, "grad_norm": 0.5111761093139648, "learning_rate": 8.382230926818336e-06, "loss": 0.4274, "step": 1802 }, { "epoch": 1.012352610892757, "grad_norm": 0.6053972840309143, "learning_rate": 8.379824050596503e-06, "loss": 0.414, "step": 1803 }, { "epoch": 1.012914093206064, "grad_norm": 0.5024489760398865, "learning_rate": 8.377415731379412e-06, "loss": 0.4535, "step": 1804 }, { "epoch": 1.0134755755193712, "grad_norm": 0.5494826436042786, "learning_rate": 8.37500597019528e-06, "loss": 0.4087, "step": 1805 }, { "epoch": 1.0140370578326783, "grad_norm": 0.5013167262077332, "learning_rate": 8.372594768072941e-06, "loss": 0.4064, "step": 1806 }, { "epoch": 1.0145985401459854, "grad_norm": 0.5304994583129883, "learning_rate": 8.370182126041842e-06, "loss": 0.3997, "step": 1807 }, { "epoch": 1.0151600224592925, "grad_norm": 0.4634072482585907, "learning_rate": 8.367768045132044e-06, "loss": 0.3796, "step": 1808 }, { "epoch": 1.0157215047725996, "grad_norm": 0.4934869408607483, "learning_rate": 8.365352526374225e-06, "loss": 0.4075, "step": 1809 }, { "epoch": 1.0162829870859067, "grad_norm": 0.500310480594635, "learning_rate": 8.362935570799677e-06, "loss": 0.38, "step": 1810 }, { "epoch": 1.016844469399214, "grad_norm": 0.535349428653717, "learning_rate": 8.360517179440307e-06, "loss": 0.4109, "step": 1811 }, { "epoch": 1.0174059517125211, "grad_norm": 0.4929479956626892, "learning_rate": 8.358097353328627e-06, "loss": 0.4174, "step": 1812 }, { "epoch": 1.0179674340258282, "grad_norm": 0.4728625416755676, "learning_rate": 8.355676093497768e-06, "loss": 0.3944, "step": 1813 }, { "epoch": 1.0185289163391353, "grad_norm": 0.5490107536315918, "learning_rate": 8.353253400981479e-06, "loss": 0.4009, "step": 1814 }, { "epoch": 1.0190903986524424, "grad_norm": 0.4801093637943268, "learning_rate": 8.350829276814103e-06, "loss": 0.4015, "step": 1815 }, { "epoch": 1.0196518809657495, "grad_norm": 0.507047712802887, "learning_rate": 8.348403722030613e-06, "loss": 0.439, "step": 1816 }, { "epoch": 1.0202133632790567, "grad_norm": 0.5060343146324158, "learning_rate": 8.345976737666583e-06, "loss": 0.4045, "step": 1817 }, { "epoch": 1.0207748455923638, "grad_norm": 0.5770832300186157, "learning_rate": 8.343548324758197e-06, "loss": 0.4101, "step": 1818 }, { "epoch": 1.0213363279056709, "grad_norm": 0.5726828575134277, "learning_rate": 8.341118484342253e-06, "loss": 0.4167, "step": 1819 }, { "epoch": 1.0218978102189782, "grad_norm": 0.46681153774261475, "learning_rate": 8.338687217456156e-06, "loss": 0.4139, "step": 1820 }, { "epoch": 1.0224592925322853, "grad_norm": 0.5296029448509216, "learning_rate": 8.336254525137921e-06, "loss": 0.4077, "step": 1821 }, { "epoch": 1.0230207748455924, "grad_norm": 0.5514135360717773, "learning_rate": 8.333820408426173e-06, "loss": 0.4529, "step": 1822 }, { "epoch": 1.0235822571588995, "grad_norm": 0.5004760026931763, "learning_rate": 8.331384868360137e-06, "loss": 0.3918, "step": 1823 }, { "epoch": 1.0241437394722066, "grad_norm": 0.5264907479286194, "learning_rate": 8.328947905979658e-06, "loss": 0.4214, "step": 1824 }, { "epoch": 1.0247052217855137, "grad_norm": 0.4597678780555725, "learning_rate": 8.326509522325181e-06, "loss": 0.4305, "step": 1825 }, { "epoch": 1.0252667040988208, "grad_norm": 0.52338045835495, "learning_rate": 8.324069718437758e-06, "loss": 0.411, "step": 1826 }, { "epoch": 1.025828186412128, "grad_norm": 0.5264785289764404, "learning_rate": 8.321628495359047e-06, "loss": 0.406, "step": 1827 }, { "epoch": 1.0263896687254352, "grad_norm": 0.492105096578598, "learning_rate": 8.319185854131316e-06, "loss": 0.4261, "step": 1828 }, { "epoch": 1.0269511510387423, "grad_norm": 0.5020854473114014, "learning_rate": 8.316741795797435e-06, "loss": 0.4068, "step": 1829 }, { "epoch": 1.0275126333520495, "grad_norm": 0.47505536675453186, "learning_rate": 8.31429632140088e-06, "loss": 0.4181, "step": 1830 }, { "epoch": 1.0280741156653566, "grad_norm": 0.4756008982658386, "learning_rate": 8.31184943198573e-06, "loss": 0.408, "step": 1831 }, { "epoch": 1.0286355979786637, "grad_norm": 0.4827822744846344, "learning_rate": 8.309401128596672e-06, "loss": 0.4201, "step": 1832 }, { "epoch": 1.0291970802919708, "grad_norm": 0.4935355484485626, "learning_rate": 8.306951412278992e-06, "loss": 0.4001, "step": 1833 }, { "epoch": 1.0297585626052779, "grad_norm": 0.5075011253356934, "learning_rate": 8.30450028407858e-06, "loss": 0.4347, "step": 1834 }, { "epoch": 1.030320044918585, "grad_norm": 0.4842337667942047, "learning_rate": 8.302047745041935e-06, "loss": 0.4253, "step": 1835 }, { "epoch": 1.0308815272318923, "grad_norm": 0.49825409054756165, "learning_rate": 8.29959379621615e-06, "loss": 0.4303, "step": 1836 }, { "epoch": 1.0314430095451994, "grad_norm": 0.4965313673019409, "learning_rate": 8.297138438648923e-06, "loss": 0.4305, "step": 1837 }, { "epoch": 1.0320044918585065, "grad_norm": 0.48708540201187134, "learning_rate": 8.294681673388559e-06, "loss": 0.4184, "step": 1838 }, { "epoch": 1.0325659741718136, "grad_norm": 0.4587324857711792, "learning_rate": 8.292223501483952e-06, "loss": 0.4129, "step": 1839 }, { "epoch": 1.0331274564851207, "grad_norm": 0.4917663037776947, "learning_rate": 8.289763923984607e-06, "loss": 0.4073, "step": 1840 }, { "epoch": 1.0336889387984278, "grad_norm": 0.5251502990722656, "learning_rate": 8.287302941940626e-06, "loss": 0.4317, "step": 1841 }, { "epoch": 1.034250421111735, "grad_norm": 0.5086632966995239, "learning_rate": 8.284840556402709e-06, "loss": 0.4137, "step": 1842 }, { "epoch": 1.034811903425042, "grad_norm": 0.48324599862098694, "learning_rate": 8.282376768422157e-06, "loss": 0.4348, "step": 1843 }, { "epoch": 1.0353733857383491, "grad_norm": 0.5433419942855835, "learning_rate": 8.279911579050871e-06, "loss": 0.4298, "step": 1844 }, { "epoch": 1.0359348680516565, "grad_norm": 0.4430234134197235, "learning_rate": 8.277444989341345e-06, "loss": 0.4172, "step": 1845 }, { "epoch": 1.0364963503649636, "grad_norm": 0.5131403803825378, "learning_rate": 8.274977000346677e-06, "loss": 0.4103, "step": 1846 }, { "epoch": 1.0370578326782707, "grad_norm": 0.4808197617530823, "learning_rate": 8.27250761312056e-06, "loss": 0.4046, "step": 1847 }, { "epoch": 1.0376193149915778, "grad_norm": 0.4779209792613983, "learning_rate": 8.270036828717283e-06, "loss": 0.4127, "step": 1848 }, { "epoch": 1.0381807973048849, "grad_norm": 0.48014238476753235, "learning_rate": 8.267564648191731e-06, "loss": 0.4428, "step": 1849 }, { "epoch": 1.038742279618192, "grad_norm": 0.5258110761642456, "learning_rate": 8.265091072599389e-06, "loss": 0.4331, "step": 1850 }, { "epoch": 1.039303761931499, "grad_norm": 0.5102416276931763, "learning_rate": 8.262616102996333e-06, "loss": 0.3922, "step": 1851 }, { "epoch": 1.0398652442448062, "grad_norm": 0.5470871329307556, "learning_rate": 8.260139740439237e-06, "loss": 0.4249, "step": 1852 }, { "epoch": 1.0404267265581135, "grad_norm": 0.5384639501571655, "learning_rate": 8.257661985985367e-06, "loss": 0.4231, "step": 1853 }, { "epoch": 1.0409882088714206, "grad_norm": 0.49336767196655273, "learning_rate": 8.25518284069259e-06, "loss": 0.4121, "step": 1854 }, { "epoch": 1.0415496911847277, "grad_norm": 0.5390675067901611, "learning_rate": 8.252702305619357e-06, "loss": 0.4062, "step": 1855 }, { "epoch": 1.0421111734980348, "grad_norm": 0.5427564978599548, "learning_rate": 8.25022038182472e-06, "loss": 0.401, "step": 1856 }, { "epoch": 1.042672655811342, "grad_norm": 0.6042490005493164, "learning_rate": 8.24773707036832e-06, "loss": 0.4116, "step": 1857 }, { "epoch": 1.043234138124649, "grad_norm": 0.4949966371059418, "learning_rate": 8.245252372310392e-06, "loss": 0.445, "step": 1858 }, { "epoch": 1.0437956204379562, "grad_norm": 0.5909536480903625, "learning_rate": 8.242766288711763e-06, "loss": 0.4198, "step": 1859 }, { "epoch": 1.0443571027512633, "grad_norm": 0.44258999824523926, "learning_rate": 8.240278820633852e-06, "loss": 0.4088, "step": 1860 }, { "epoch": 1.0449185850645704, "grad_norm": 0.5449712872505188, "learning_rate": 8.237789969138667e-06, "loss": 0.4326, "step": 1861 }, { "epoch": 1.0454800673778777, "grad_norm": 0.4835590124130249, "learning_rate": 8.235299735288806e-06, "loss": 0.3895, "step": 1862 }, { "epoch": 1.0460415496911848, "grad_norm": 0.5078809261322021, "learning_rate": 8.232808120147463e-06, "loss": 0.4284, "step": 1863 }, { "epoch": 1.046603032004492, "grad_norm": 0.5492277145385742, "learning_rate": 8.230315124778417e-06, "loss": 0.3848, "step": 1864 }, { "epoch": 1.047164514317799, "grad_norm": 0.5024139881134033, "learning_rate": 8.227820750246035e-06, "loss": 0.3996, "step": 1865 }, { "epoch": 1.047725996631106, "grad_norm": 0.5362954139709473, "learning_rate": 8.225324997615278e-06, "loss": 0.4304, "step": 1866 }, { "epoch": 1.0482874789444132, "grad_norm": 0.549905002117157, "learning_rate": 8.22282786795169e-06, "loss": 0.4259, "step": 1867 }, { "epoch": 1.0488489612577203, "grad_norm": 0.5265644788742065, "learning_rate": 8.220329362321407e-06, "loss": 0.4275, "step": 1868 }, { "epoch": 1.0494104435710274, "grad_norm": 0.5751123428344727, "learning_rate": 8.217829481791148e-06, "loss": 0.44, "step": 1869 }, { "epoch": 1.0499719258843347, "grad_norm": 0.4960867166519165, "learning_rate": 8.215328227428225e-06, "loss": 0.4262, "step": 1870 }, { "epoch": 1.0505334081976418, "grad_norm": 0.4770427346229553, "learning_rate": 8.21282560030053e-06, "loss": 0.4413, "step": 1871 }, { "epoch": 1.051094890510949, "grad_norm": 0.5607216954231262, "learning_rate": 8.210321601476548e-06, "loss": 0.4341, "step": 1872 }, { "epoch": 1.051656372824256, "grad_norm": 0.5410726070404053, "learning_rate": 8.207816232025345e-06, "loss": 0.3923, "step": 1873 }, { "epoch": 1.0522178551375632, "grad_norm": 0.48735636472702026, "learning_rate": 8.205309493016571e-06, "loss": 0.4044, "step": 1874 }, { "epoch": 1.0527793374508703, "grad_norm": 0.5416375398635864, "learning_rate": 8.202801385520464e-06, "loss": 0.4343, "step": 1875 }, { "epoch": 1.0533408197641774, "grad_norm": 0.5760716199874878, "learning_rate": 8.200291910607848e-06, "loss": 0.4223, "step": 1876 }, { "epoch": 1.0539023020774845, "grad_norm": 0.5495180487632751, "learning_rate": 8.197781069350124e-06, "loss": 0.4341, "step": 1877 }, { "epoch": 1.0544637843907916, "grad_norm": 0.5034540891647339, "learning_rate": 8.195268862819282e-06, "loss": 0.4038, "step": 1878 }, { "epoch": 1.055025266704099, "grad_norm": 0.500170886516571, "learning_rate": 8.192755292087893e-06, "loss": 0.4139, "step": 1879 }, { "epoch": 1.055586749017406, "grad_norm": 0.5428920388221741, "learning_rate": 8.19024035822911e-06, "loss": 0.4078, "step": 1880 }, { "epoch": 1.0561482313307131, "grad_norm": 0.47413793206214905, "learning_rate": 8.187724062316672e-06, "loss": 0.4027, "step": 1881 }, { "epoch": 1.0567097136440202, "grad_norm": 0.4432065188884735, "learning_rate": 8.18520640542489e-06, "loss": 0.4094, "step": 1882 }, { "epoch": 1.0572711959573273, "grad_norm": 0.4607677757740021, "learning_rate": 8.182687388628669e-06, "loss": 0.4444, "step": 1883 }, { "epoch": 1.0578326782706344, "grad_norm": 0.4630781412124634, "learning_rate": 8.180167013003484e-06, "loss": 0.4109, "step": 1884 }, { "epoch": 1.0583941605839415, "grad_norm": 0.5584591627120972, "learning_rate": 8.177645279625394e-06, "loss": 0.4271, "step": 1885 }, { "epoch": 1.0589556428972486, "grad_norm": 0.4838709235191345, "learning_rate": 8.175122189571041e-06, "loss": 0.4161, "step": 1886 }, { "epoch": 1.059517125210556, "grad_norm": 0.5227084755897522, "learning_rate": 8.172597743917639e-06, "loss": 0.4292, "step": 1887 }, { "epoch": 1.060078607523863, "grad_norm": 0.512566089630127, "learning_rate": 8.170071943742985e-06, "loss": 0.4199, "step": 1888 }, { "epoch": 1.0606400898371702, "grad_norm": 0.5207642912864685, "learning_rate": 8.167544790125457e-06, "loss": 0.4203, "step": 1889 }, { "epoch": 1.0612015721504773, "grad_norm": 0.48312050104141235, "learning_rate": 8.165016284144005e-06, "loss": 0.402, "step": 1890 }, { "epoch": 1.0617630544637844, "grad_norm": 0.49226248264312744, "learning_rate": 8.16248642687816e-06, "loss": 0.4174, "step": 1891 }, { "epoch": 1.0623245367770915, "grad_norm": 0.5567193031311035, "learning_rate": 8.159955219408029e-06, "loss": 0.415, "step": 1892 }, { "epoch": 1.0628860190903986, "grad_norm": 0.47890806198120117, "learning_rate": 8.157422662814295e-06, "loss": 0.3993, "step": 1893 }, { "epoch": 1.0634475014037057, "grad_norm": 0.4624917507171631, "learning_rate": 8.15488875817822e-06, "loss": 0.4155, "step": 1894 }, { "epoch": 1.064008983717013, "grad_norm": 0.5134278535842896, "learning_rate": 8.152353506581637e-06, "loss": 0.3908, "step": 1895 }, { "epoch": 1.0645704660303201, "grad_norm": 0.4823552668094635, "learning_rate": 8.149816909106957e-06, "loss": 0.4228, "step": 1896 }, { "epoch": 1.0651319483436272, "grad_norm": 0.4746827483177185, "learning_rate": 8.147278966837164e-06, "loss": 0.3954, "step": 1897 }, { "epoch": 1.0656934306569343, "grad_norm": 0.5278778672218323, "learning_rate": 8.144739680855818e-06, "loss": 0.4207, "step": 1898 }, { "epoch": 1.0662549129702414, "grad_norm": 0.5264816880226135, "learning_rate": 8.14219905224705e-06, "loss": 0.4226, "step": 1899 }, { "epoch": 1.0668163952835485, "grad_norm": 0.48531341552734375, "learning_rate": 8.139657082095568e-06, "loss": 0.415, "step": 1900 }, { "epoch": 1.0673778775968557, "grad_norm": 0.5639722943305969, "learning_rate": 8.13711377148665e-06, "loss": 0.4243, "step": 1901 }, { "epoch": 1.0679393599101628, "grad_norm": 0.47640880942344666, "learning_rate": 8.134569121506146e-06, "loss": 0.4209, "step": 1902 }, { "epoch": 1.0685008422234699, "grad_norm": 0.49432089924812317, "learning_rate": 8.13202313324048e-06, "loss": 0.4069, "step": 1903 }, { "epoch": 1.0690623245367772, "grad_norm": 0.5322180390357971, "learning_rate": 8.129475807776645e-06, "loss": 0.4219, "step": 1904 }, { "epoch": 1.0696238068500843, "grad_norm": 0.5288822054862976, "learning_rate": 8.126927146202209e-06, "loss": 0.4172, "step": 1905 }, { "epoch": 1.0701852891633914, "grad_norm": 0.4993772804737091, "learning_rate": 8.124377149605302e-06, "loss": 0.4007, "step": 1906 }, { "epoch": 1.0707467714766985, "grad_norm": 0.5714595913887024, "learning_rate": 8.121825819074633e-06, "loss": 0.4022, "step": 1907 }, { "epoch": 1.0713082537900056, "grad_norm": 0.5524416565895081, "learning_rate": 8.119273155699479e-06, "loss": 0.4073, "step": 1908 }, { "epoch": 1.0718697361033127, "grad_norm": 0.5041775703430176, "learning_rate": 8.11671916056968e-06, "loss": 0.439, "step": 1909 }, { "epoch": 1.0724312184166198, "grad_norm": 0.49918264150619507, "learning_rate": 8.11416383477565e-06, "loss": 0.4256, "step": 1910 }, { "epoch": 1.072992700729927, "grad_norm": 0.5585694313049316, "learning_rate": 8.11160717940837e-06, "loss": 0.428, "step": 1911 }, { "epoch": 1.073554183043234, "grad_norm": 0.47638818621635437, "learning_rate": 8.109049195559388e-06, "loss": 0.4235, "step": 1912 }, { "epoch": 1.0741156653565413, "grad_norm": 0.5193788409233093, "learning_rate": 8.106489884320823e-06, "loss": 0.3906, "step": 1913 }, { "epoch": 1.0746771476698485, "grad_norm": 0.5140933990478516, "learning_rate": 8.103929246785351e-06, "loss": 0.4162, "step": 1914 }, { "epoch": 1.0752386299831556, "grad_norm": 0.5628550052642822, "learning_rate": 8.101367284046227e-06, "loss": 0.4271, "step": 1915 }, { "epoch": 1.0758001122964627, "grad_norm": 0.5461277365684509, "learning_rate": 8.098803997197262e-06, "loss": 0.4265, "step": 1916 }, { "epoch": 1.0763615946097698, "grad_norm": 0.5391279458999634, "learning_rate": 8.096239387332834e-06, "loss": 0.414, "step": 1917 }, { "epoch": 1.0769230769230769, "grad_norm": 0.5068195462226868, "learning_rate": 8.09367345554789e-06, "loss": 0.422, "step": 1918 }, { "epoch": 1.077484559236384, "grad_norm": 0.5427720546722412, "learning_rate": 8.09110620293794e-06, "loss": 0.3938, "step": 1919 }, { "epoch": 1.078046041549691, "grad_norm": 0.5949094295501709, "learning_rate": 8.088537630599053e-06, "loss": 0.4347, "step": 1920 }, { "epoch": 1.0786075238629984, "grad_norm": 0.45548275113105774, "learning_rate": 8.085967739627872e-06, "loss": 0.4192, "step": 1921 }, { "epoch": 1.0791690061763055, "grad_norm": 0.5322239398956299, "learning_rate": 8.083396531121588e-06, "loss": 0.4151, "step": 1922 }, { "epoch": 1.0797304884896126, "grad_norm": 0.5532917380332947, "learning_rate": 8.080824006177968e-06, "loss": 0.4201, "step": 1923 }, { "epoch": 1.0802919708029197, "grad_norm": 0.5066618323326111, "learning_rate": 8.078250165895333e-06, "loss": 0.4209, "step": 1924 }, { "epoch": 1.0808534531162268, "grad_norm": 0.5203927159309387, "learning_rate": 8.075675011372572e-06, "loss": 0.4079, "step": 1925 }, { "epoch": 1.081414935429534, "grad_norm": 0.4953920543193817, "learning_rate": 8.073098543709128e-06, "loss": 0.4186, "step": 1926 }, { "epoch": 1.081976417742841, "grad_norm": 0.49889636039733887, "learning_rate": 8.07052076400501e-06, "loss": 0.4307, "step": 1927 }, { "epoch": 1.0825379000561481, "grad_norm": 0.47502103447914124, "learning_rate": 8.067941673360786e-06, "loss": 0.3874, "step": 1928 }, { "epoch": 1.0830993823694555, "grad_norm": 0.5377910733222961, "learning_rate": 8.06536127287758e-06, "loss": 0.3985, "step": 1929 }, { "epoch": 1.0836608646827626, "grad_norm": 0.4664503037929535, "learning_rate": 8.062779563657081e-06, "loss": 0.4249, "step": 1930 }, { "epoch": 1.0842223469960697, "grad_norm": 0.49945494532585144, "learning_rate": 8.060196546801535e-06, "loss": 0.441, "step": 1931 }, { "epoch": 1.0847838293093768, "grad_norm": 0.5170819759368896, "learning_rate": 8.05761222341374e-06, "loss": 0.419, "step": 1932 }, { "epoch": 1.0853453116226839, "grad_norm": 0.4593645930290222, "learning_rate": 8.055026594597064e-06, "loss": 0.3939, "step": 1933 }, { "epoch": 1.085906793935991, "grad_norm": 0.5811879634857178, "learning_rate": 8.05243966145542e-06, "loss": 0.4417, "step": 1934 }, { "epoch": 1.086468276249298, "grad_norm": 0.48221758008003235, "learning_rate": 8.049851425093285e-06, "loss": 0.4144, "step": 1935 }, { "epoch": 1.0870297585626052, "grad_norm": 0.5970203280448914, "learning_rate": 8.047261886615691e-06, "loss": 0.4561, "step": 1936 }, { "epoch": 1.0875912408759123, "grad_norm": 0.5133828520774841, "learning_rate": 8.044671047128227e-06, "loss": 0.4033, "step": 1937 }, { "epoch": 1.0881527231892196, "grad_norm": 0.5676122307777405, "learning_rate": 8.042078907737034e-06, "loss": 0.4097, "step": 1938 }, { "epoch": 1.0887142055025267, "grad_norm": 0.6458877921104431, "learning_rate": 8.039485469548812e-06, "loss": 0.4118, "step": 1939 }, { "epoch": 1.0892756878158338, "grad_norm": 0.4689127206802368, "learning_rate": 8.036890733670811e-06, "loss": 0.4044, "step": 1940 }, { "epoch": 1.089837170129141, "grad_norm": 0.5602192282676697, "learning_rate": 8.03429470121084e-06, "loss": 0.3937, "step": 1941 }, { "epoch": 1.090398652442448, "grad_norm": 0.5874186754226685, "learning_rate": 8.031697373277259e-06, "loss": 0.4143, "step": 1942 }, { "epoch": 1.0909601347557552, "grad_norm": 0.5409570336341858, "learning_rate": 8.02909875097898e-06, "loss": 0.3935, "step": 1943 }, { "epoch": 1.0915216170690623, "grad_norm": 0.5350224375724792, "learning_rate": 8.02649883542547e-06, "loss": 0.4187, "step": 1944 }, { "epoch": 1.0920830993823694, "grad_norm": 0.6205257773399353, "learning_rate": 8.023897627726745e-06, "loss": 0.4127, "step": 1945 }, { "epoch": 1.0926445816956767, "grad_norm": 0.5965462327003479, "learning_rate": 8.021295128993379e-06, "loss": 0.4243, "step": 1946 }, { "epoch": 1.0932060640089838, "grad_norm": 0.6641983985900879, "learning_rate": 8.018691340336488e-06, "loss": 0.4625, "step": 1947 }, { "epoch": 1.093767546322291, "grad_norm": 0.49646368622779846, "learning_rate": 8.016086262867749e-06, "loss": 0.4022, "step": 1948 }, { "epoch": 1.094329028635598, "grad_norm": 0.5253635048866272, "learning_rate": 8.013479897699378e-06, "loss": 0.4117, "step": 1949 }, { "epoch": 1.094890510948905, "grad_norm": 0.5543665289878845, "learning_rate": 8.010872245944153e-06, "loss": 0.4197, "step": 1950 }, { "epoch": 1.0954519932622122, "grad_norm": 0.5241546034812927, "learning_rate": 8.008263308715389e-06, "loss": 0.4016, "step": 1951 }, { "epoch": 1.0960134755755193, "grad_norm": 0.621350884437561, "learning_rate": 8.00565308712696e-06, "loss": 0.3968, "step": 1952 }, { "epoch": 1.0965749578888264, "grad_norm": 0.47330138087272644, "learning_rate": 8.003041582293282e-06, "loss": 0.4027, "step": 1953 }, { "epoch": 1.0971364402021337, "grad_norm": 0.5569093823432922, "learning_rate": 8.000428795329323e-06, "loss": 0.429, "step": 1954 }, { "epoch": 1.0976979225154408, "grad_norm": 0.585268497467041, "learning_rate": 7.997814727350595e-06, "loss": 0.407, "step": 1955 }, { "epoch": 1.098259404828748, "grad_norm": 0.48603856563568115, "learning_rate": 7.995199379473157e-06, "loss": 0.4127, "step": 1956 }, { "epoch": 1.098820887142055, "grad_norm": 0.5305773019790649, "learning_rate": 7.99258275281362e-06, "loss": 0.4196, "step": 1957 }, { "epoch": 1.0993823694553622, "grad_norm": 0.5504736304283142, "learning_rate": 7.989964848489134e-06, "loss": 0.3926, "step": 1958 }, { "epoch": 1.0999438517686693, "grad_norm": 0.49020475149154663, "learning_rate": 7.987345667617395e-06, "loss": 0.4229, "step": 1959 }, { "epoch": 1.1005053340819764, "grad_norm": 0.5225605964660645, "learning_rate": 7.984725211316651e-06, "loss": 0.4181, "step": 1960 }, { "epoch": 1.1010668163952835, "grad_norm": 0.48225343227386475, "learning_rate": 7.982103480705688e-06, "loss": 0.3943, "step": 1961 }, { "epoch": 1.1016282987085906, "grad_norm": 0.5457044839859009, "learning_rate": 7.979480476903836e-06, "loss": 0.4216, "step": 1962 }, { "epoch": 1.102189781021898, "grad_norm": 0.5735048055648804, "learning_rate": 7.976856201030974e-06, "loss": 0.441, "step": 1963 }, { "epoch": 1.102751263335205, "grad_norm": 0.5243262648582458, "learning_rate": 7.974230654207516e-06, "loss": 0.4533, "step": 1964 }, { "epoch": 1.1033127456485121, "grad_norm": 0.5576698780059814, "learning_rate": 7.971603837554427e-06, "loss": 0.3943, "step": 1965 }, { "epoch": 1.1038742279618192, "grad_norm": 0.557949423789978, "learning_rate": 7.96897575219321e-06, "loss": 0.4245, "step": 1966 }, { "epoch": 1.1044357102751263, "grad_norm": 0.5036898255348206, "learning_rate": 7.96634639924591e-06, "loss": 0.4112, "step": 1967 }, { "epoch": 1.1049971925884334, "grad_norm": 0.5233035683631897, "learning_rate": 7.96371577983511e-06, "loss": 0.4327, "step": 1968 }, { "epoch": 1.1055586749017405, "grad_norm": 0.5547006130218506, "learning_rate": 7.961083895083941e-06, "loss": 0.4291, "step": 1969 }, { "epoch": 1.1061201572150476, "grad_norm": 0.5753288865089417, "learning_rate": 7.958450746116066e-06, "loss": 0.4548, "step": 1970 }, { "epoch": 1.1066816395283547, "grad_norm": 0.6026153564453125, "learning_rate": 7.955816334055698e-06, "loss": 0.426, "step": 1971 }, { "epoch": 1.107243121841662, "grad_norm": 0.5029599666595459, "learning_rate": 7.953180660027577e-06, "loss": 0.4009, "step": 1972 }, { "epoch": 1.1078046041549692, "grad_norm": 0.5910788178443909, "learning_rate": 7.95054372515699e-06, "loss": 0.387, "step": 1973 }, { "epoch": 1.1083660864682763, "grad_norm": 0.5144153833389282, "learning_rate": 7.947905530569762e-06, "loss": 0.3924, "step": 1974 }, { "epoch": 1.1089275687815834, "grad_norm": 0.5477709174156189, "learning_rate": 7.945266077392254e-06, "loss": 0.3931, "step": 1975 }, { "epoch": 1.1094890510948905, "grad_norm": 0.571784496307373, "learning_rate": 7.94262536675136e-06, "loss": 0.3962, "step": 1976 }, { "epoch": 1.1100505334081976, "grad_norm": 0.5439711809158325, "learning_rate": 7.93998339977452e-06, "loss": 0.395, "step": 1977 }, { "epoch": 1.1106120157215047, "grad_norm": 0.5303637981414795, "learning_rate": 7.937340177589704e-06, "loss": 0.3914, "step": 1978 }, { "epoch": 1.1111734980348118, "grad_norm": 0.6184371709823608, "learning_rate": 7.93469570132542e-06, "loss": 0.4219, "step": 1979 }, { "epoch": 1.1117349803481191, "grad_norm": 0.5080086588859558, "learning_rate": 7.93204997211071e-06, "loss": 0.3977, "step": 1980 }, { "epoch": 1.1122964626614262, "grad_norm": 0.6062014102935791, "learning_rate": 7.929402991075155e-06, "loss": 0.4422, "step": 1981 }, { "epoch": 1.1128579449747333, "grad_norm": 0.5875247120857239, "learning_rate": 7.926754759348863e-06, "loss": 0.4276, "step": 1982 }, { "epoch": 1.1134194272880404, "grad_norm": 0.5666197538375854, "learning_rate": 7.924105278062481e-06, "loss": 0.3948, "step": 1983 }, { "epoch": 1.1139809096013475, "grad_norm": 0.5711553692817688, "learning_rate": 7.921454548347193e-06, "loss": 0.4142, "step": 1984 }, { "epoch": 1.1145423919146547, "grad_norm": 0.5309496521949768, "learning_rate": 7.918802571334709e-06, "loss": 0.443, "step": 1985 }, { "epoch": 1.1151038742279618, "grad_norm": 0.49552464485168457, "learning_rate": 7.916149348157273e-06, "loss": 0.4004, "step": 1986 }, { "epoch": 1.1156653565412689, "grad_norm": 0.5888410806655884, "learning_rate": 7.913494879947667e-06, "loss": 0.4156, "step": 1987 }, { "epoch": 1.1162268388545762, "grad_norm": 0.586715817451477, "learning_rate": 7.910839167839195e-06, "loss": 0.4433, "step": 1988 }, { "epoch": 1.1167883211678833, "grad_norm": 0.5508003830909729, "learning_rate": 7.9081822129657e-06, "loss": 0.4125, "step": 1989 }, { "epoch": 1.1173498034811904, "grad_norm": 0.5964674353599548, "learning_rate": 7.905524016461551e-06, "loss": 0.4133, "step": 1990 }, { "epoch": 1.1179112857944975, "grad_norm": 0.5447354912757874, "learning_rate": 7.90286457946165e-06, "loss": 0.4274, "step": 1991 }, { "epoch": 1.1184727681078046, "grad_norm": 0.5806409120559692, "learning_rate": 7.900203903101427e-06, "loss": 0.4224, "step": 1992 }, { "epoch": 1.1190342504211117, "grad_norm": 0.507507860660553, "learning_rate": 7.897541988516842e-06, "loss": 0.4188, "step": 1993 }, { "epoch": 1.1195957327344188, "grad_norm": 0.5014584064483643, "learning_rate": 7.894878836844382e-06, "loss": 0.3987, "step": 1994 }, { "epoch": 1.120157215047726, "grad_norm": 0.5712757706642151, "learning_rate": 7.892214449221064e-06, "loss": 0.4199, "step": 1995 }, { "epoch": 1.120718697361033, "grad_norm": 0.49095577001571655, "learning_rate": 7.889548826784434e-06, "loss": 0.4166, "step": 1996 }, { "epoch": 1.1212801796743403, "grad_norm": 0.48579245805740356, "learning_rate": 7.88688197067256e-06, "loss": 0.408, "step": 1997 }, { "epoch": 1.1218416619876475, "grad_norm": 0.5053645372390747, "learning_rate": 7.884213882024041e-06, "loss": 0.4013, "step": 1998 }, { "epoch": 1.1224031443009546, "grad_norm": 0.4645758867263794, "learning_rate": 7.881544561978001e-06, "loss": 0.4325, "step": 1999 }, { "epoch": 1.1229646266142617, "grad_norm": 0.4977056682109833, "learning_rate": 7.878874011674093e-06, "loss": 0.4342, "step": 2000 }, { "epoch": 1.1235261089275688, "grad_norm": 0.5103041529655457, "learning_rate": 7.876202232252488e-06, "loss": 0.436, "step": 2001 }, { "epoch": 1.1240875912408759, "grad_norm": 0.5053243637084961, "learning_rate": 7.873529224853886e-06, "loss": 0.3963, "step": 2002 }, { "epoch": 1.124649073554183, "grad_norm": 0.5729970335960388, "learning_rate": 7.870854990619515e-06, "loss": 0.4162, "step": 2003 }, { "epoch": 1.12521055586749, "grad_norm": 0.48785483837127686, "learning_rate": 7.86817953069112e-06, "loss": 0.4206, "step": 2004 }, { "epoch": 1.1257720381807972, "grad_norm": 0.5160909295082092, "learning_rate": 7.865502846210973e-06, "loss": 0.3987, "step": 2005 }, { "epoch": 1.1263335204941045, "grad_norm": 0.5563057065010071, "learning_rate": 7.862824938321867e-06, "loss": 0.4442, "step": 2006 }, { "epoch": 1.1268950028074116, "grad_norm": 0.5113658905029297, "learning_rate": 7.860145808167121e-06, "loss": 0.4169, "step": 2007 }, { "epoch": 1.1274564851207187, "grad_norm": 0.44491392374038696, "learning_rate": 7.857465456890571e-06, "loss": 0.4094, "step": 2008 }, { "epoch": 1.1280179674340258, "grad_norm": 0.517547607421875, "learning_rate": 7.854783885636579e-06, "loss": 0.4035, "step": 2009 }, { "epoch": 1.128579449747333, "grad_norm": 0.47145116329193115, "learning_rate": 7.852101095550023e-06, "loss": 0.4296, "step": 2010 }, { "epoch": 1.12914093206064, "grad_norm": 0.5687405467033386, "learning_rate": 7.849417087776303e-06, "loss": 0.4209, "step": 2011 }, { "epoch": 1.1297024143739471, "grad_norm": 0.5423445105552673, "learning_rate": 7.846731863461343e-06, "loss": 0.4311, "step": 2012 }, { "epoch": 1.1302638966872545, "grad_norm": 0.4790567457675934, "learning_rate": 7.844045423751582e-06, "loss": 0.3951, "step": 2013 }, { "epoch": 1.1308253790005616, "grad_norm": 0.5334677696228027, "learning_rate": 7.841357769793978e-06, "loss": 0.4233, "step": 2014 }, { "epoch": 1.1313868613138687, "grad_norm": 0.5402255654335022, "learning_rate": 7.838668902736012e-06, "loss": 0.3851, "step": 2015 }, { "epoch": 1.1319483436271758, "grad_norm": 0.49191123247146606, "learning_rate": 7.835978823725673e-06, "loss": 0.3811, "step": 2016 }, { "epoch": 1.1325098259404829, "grad_norm": 0.5528040528297424, "learning_rate": 7.833287533911478e-06, "loss": 0.4464, "step": 2017 }, { "epoch": 1.13307130825379, "grad_norm": 0.5265504121780396, "learning_rate": 7.830595034442456e-06, "loss": 0.4435, "step": 2018 }, { "epoch": 1.133632790567097, "grad_norm": 0.47636568546295166, "learning_rate": 7.827901326468154e-06, "loss": 0.4343, "step": 2019 }, { "epoch": 1.1341942728804042, "grad_norm": 0.49692270159721375, "learning_rate": 7.825206411138634e-06, "loss": 0.4306, "step": 2020 }, { "epoch": 1.1347557551937113, "grad_norm": 0.517039954662323, "learning_rate": 7.822510289604472e-06, "loss": 0.4312, "step": 2021 }, { "epoch": 1.1353172375070186, "grad_norm": 0.46873557567596436, "learning_rate": 7.819812963016764e-06, "loss": 0.4184, "step": 2022 }, { "epoch": 1.1358787198203257, "grad_norm": 0.4910273253917694, "learning_rate": 7.817114432527114e-06, "loss": 0.4305, "step": 2023 }, { "epoch": 1.1364402021336328, "grad_norm": 0.6583677530288696, "learning_rate": 7.814414699287642e-06, "loss": 0.4348, "step": 2024 }, { "epoch": 1.13700168444694, "grad_norm": 0.4975959062576294, "learning_rate": 7.811713764450988e-06, "loss": 0.4477, "step": 2025 }, { "epoch": 1.137563166760247, "grad_norm": 0.5325098633766174, "learning_rate": 7.809011629170294e-06, "loss": 0.402, "step": 2026 }, { "epoch": 1.1381246490735542, "grad_norm": 0.5150450468063354, "learning_rate": 7.806308294599223e-06, "loss": 0.3976, "step": 2027 }, { "epoch": 1.1386861313868613, "grad_norm": 0.5084819793701172, "learning_rate": 7.803603761891947e-06, "loss": 0.4297, "step": 2028 }, { "epoch": 1.1392476137001684, "grad_norm": 0.5029119849205017, "learning_rate": 7.80089803220315e-06, "loss": 0.4413, "step": 2029 }, { "epoch": 1.1398090960134755, "grad_norm": 0.5253236293792725, "learning_rate": 7.798191106688026e-06, "loss": 0.4095, "step": 2030 }, { "epoch": 1.1403705783267828, "grad_norm": 0.5016337037086487, "learning_rate": 7.795482986502281e-06, "loss": 0.4211, "step": 2031 }, { "epoch": 1.14093206064009, "grad_norm": 0.5694748759269714, "learning_rate": 7.79277367280213e-06, "loss": 0.4404, "step": 2032 }, { "epoch": 1.141493542953397, "grad_norm": 0.5009506344795227, "learning_rate": 7.790063166744298e-06, "loss": 0.3911, "step": 2033 }, { "epoch": 1.142055025266704, "grad_norm": 0.5947183966636658, "learning_rate": 7.787351469486018e-06, "loss": 0.3943, "step": 2034 }, { "epoch": 1.1426165075800112, "grad_norm": 0.5407971143722534, "learning_rate": 7.784638582185034e-06, "loss": 0.3802, "step": 2035 }, { "epoch": 1.1431779898933183, "grad_norm": 0.541781485080719, "learning_rate": 7.781924505999597e-06, "loss": 0.4328, "step": 2036 }, { "epoch": 1.1437394722066254, "grad_norm": 0.576844334602356, "learning_rate": 7.779209242088464e-06, "loss": 0.419, "step": 2037 }, { "epoch": 1.1443009545199327, "grad_norm": 0.45913127064704895, "learning_rate": 7.776492791610901e-06, "loss": 0.4156, "step": 2038 }, { "epoch": 1.1448624368332398, "grad_norm": 0.49453234672546387, "learning_rate": 7.773775155726679e-06, "loss": 0.3887, "step": 2039 }, { "epoch": 1.145423919146547, "grad_norm": 0.6302667856216431, "learning_rate": 7.771056335596079e-06, "loss": 0.4293, "step": 2040 }, { "epoch": 1.145985401459854, "grad_norm": 0.5287615656852722, "learning_rate": 7.76833633237988e-06, "loss": 0.4059, "step": 2041 }, { "epoch": 1.1465468837731612, "grad_norm": 0.5220057964324951, "learning_rate": 7.765615147239375e-06, "loss": 0.4467, "step": 2042 }, { "epoch": 1.1471083660864683, "grad_norm": 0.5223116874694824, "learning_rate": 7.762892781336355e-06, "loss": 0.4366, "step": 2043 }, { "epoch": 1.1476698483997754, "grad_norm": 0.533878743648529, "learning_rate": 7.76016923583312e-06, "loss": 0.4226, "step": 2044 }, { "epoch": 1.1482313307130825, "grad_norm": 0.4907231032848358, "learning_rate": 7.757444511892468e-06, "loss": 0.3844, "step": 2045 }, { "epoch": 1.1487928130263896, "grad_norm": 0.4951094686985016, "learning_rate": 7.754718610677705e-06, "loss": 0.3901, "step": 2046 }, { "epoch": 1.149354295339697, "grad_norm": 0.4536187946796417, "learning_rate": 7.751991533352637e-06, "loss": 0.4108, "step": 2047 }, { "epoch": 1.149915777653004, "grad_norm": 0.5641949772834778, "learning_rate": 7.749263281081574e-06, "loss": 0.414, "step": 2048 }, { "epoch": 1.1504772599663111, "grad_norm": 0.5944583415985107, "learning_rate": 7.746533855029326e-06, "loss": 0.3915, "step": 2049 }, { "epoch": 1.1510387422796182, "grad_norm": 0.4928952753543854, "learning_rate": 7.743803256361205e-06, "loss": 0.4336, "step": 2050 }, { "epoch": 1.1516002245929253, "grad_norm": 0.5940431952476501, "learning_rate": 7.741071486243024e-06, "loss": 0.4396, "step": 2051 }, { "epoch": 1.1521617069062324, "grad_norm": 0.578167200088501, "learning_rate": 7.738338545841095e-06, "loss": 0.4414, "step": 2052 }, { "epoch": 1.1527231892195395, "grad_norm": 0.5161982178688049, "learning_rate": 7.73560443632223e-06, "loss": 0.3958, "step": 2053 }, { "epoch": 1.1532846715328466, "grad_norm": 0.5393643379211426, "learning_rate": 7.732869158853742e-06, "loss": 0.4047, "step": 2054 }, { "epoch": 1.1538461538461537, "grad_norm": 0.5238351225852966, "learning_rate": 7.730132714603437e-06, "loss": 0.4221, "step": 2055 }, { "epoch": 1.154407636159461, "grad_norm": 0.5141897201538086, "learning_rate": 7.72739510473963e-06, "loss": 0.4061, "step": 2056 }, { "epoch": 1.1549691184727682, "grad_norm": 0.5260945558547974, "learning_rate": 7.724656330431119e-06, "loss": 0.4254, "step": 2057 }, { "epoch": 1.1555306007860753, "grad_norm": 0.5580913424491882, "learning_rate": 7.721916392847213e-06, "loss": 0.4304, "step": 2058 }, { "epoch": 1.1560920830993824, "grad_norm": 0.5133140683174133, "learning_rate": 7.719175293157707e-06, "loss": 0.4171, "step": 2059 }, { "epoch": 1.1566535654126895, "grad_norm": 0.6189917922019958, "learning_rate": 7.716433032532902e-06, "loss": 0.4189, "step": 2060 }, { "epoch": 1.1572150477259966, "grad_norm": 0.5834705829620361, "learning_rate": 7.713689612143584e-06, "loss": 0.3953, "step": 2061 }, { "epoch": 1.1577765300393037, "grad_norm": 0.5560048222541809, "learning_rate": 7.710945033161044e-06, "loss": 0.4133, "step": 2062 }, { "epoch": 1.1583380123526108, "grad_norm": 0.5528165102005005, "learning_rate": 7.70819929675706e-06, "loss": 0.3849, "step": 2063 }, { "epoch": 1.158899494665918, "grad_norm": 0.6154552102088928, "learning_rate": 7.70545240410391e-06, "loss": 0.4565, "step": 2064 }, { "epoch": 1.1594609769792252, "grad_norm": 0.5722680687904358, "learning_rate": 7.702704356374362e-06, "loss": 0.425, "step": 2065 }, { "epoch": 1.1600224592925323, "grad_norm": 0.4815792143344879, "learning_rate": 7.699955154741679e-06, "loss": 0.408, "step": 2066 }, { "epoch": 1.1605839416058394, "grad_norm": 0.5901631116867065, "learning_rate": 7.697204800379612e-06, "loss": 0.439, "step": 2067 }, { "epoch": 1.1611454239191465, "grad_norm": 0.5379826426506042, "learning_rate": 7.694453294462414e-06, "loss": 0.4339, "step": 2068 }, { "epoch": 1.1617069062324537, "grad_norm": 0.5296193957328796, "learning_rate": 7.691700638164819e-06, "loss": 0.4446, "step": 2069 }, { "epoch": 1.1622683885457608, "grad_norm": 0.5163171887397766, "learning_rate": 7.688946832662057e-06, "loss": 0.3754, "step": 2070 }, { "epoch": 1.1628298708590679, "grad_norm": 0.5095730423927307, "learning_rate": 7.686191879129853e-06, "loss": 0.4491, "step": 2071 }, { "epoch": 1.1633913531723752, "grad_norm": 0.5541089177131653, "learning_rate": 7.683435778744413e-06, "loss": 0.4308, "step": 2072 }, { "epoch": 1.1639528354856823, "grad_norm": 0.5838501453399658, "learning_rate": 7.68067853268244e-06, "loss": 0.4309, "step": 2073 }, { "epoch": 1.1645143177989894, "grad_norm": 0.4986182153224945, "learning_rate": 7.67792014212112e-06, "loss": 0.4061, "step": 2074 }, { "epoch": 1.1650758001122965, "grad_norm": 0.49857833981513977, "learning_rate": 7.675160608238135e-06, "loss": 0.4204, "step": 2075 }, { "epoch": 1.1656372824256036, "grad_norm": 0.5711730122566223, "learning_rate": 7.672399932211648e-06, "loss": 0.4232, "step": 2076 }, { "epoch": 1.1661987647389107, "grad_norm": 0.4910397231578827, "learning_rate": 7.669638115220315e-06, "loss": 0.3927, "step": 2077 }, { "epoch": 1.1667602470522178, "grad_norm": 0.5007229447364807, "learning_rate": 7.666875158443277e-06, "loss": 0.4106, "step": 2078 }, { "epoch": 1.167321729365525, "grad_norm": 0.5504438281059265, "learning_rate": 7.664111063060158e-06, "loss": 0.4283, "step": 2079 }, { "epoch": 1.167883211678832, "grad_norm": 0.4420378506183624, "learning_rate": 7.661345830251075e-06, "loss": 0.4153, "step": 2080 }, { "epoch": 1.1684446939921393, "grad_norm": 0.4269157350063324, "learning_rate": 7.658579461196627e-06, "loss": 0.3762, "step": 2081 }, { "epoch": 1.1690061763054465, "grad_norm": 0.47395795583724976, "learning_rate": 7.6558119570779e-06, "loss": 0.4117, "step": 2082 }, { "epoch": 1.1695676586187536, "grad_norm": 0.5551823973655701, "learning_rate": 7.653043319076458e-06, "loss": 0.389, "step": 2083 }, { "epoch": 1.1701291409320607, "grad_norm": 0.46689698100090027, "learning_rate": 7.650273548374358e-06, "loss": 0.375, "step": 2084 }, { "epoch": 1.1706906232453678, "grad_norm": 0.4669168293476105, "learning_rate": 7.647502646154135e-06, "loss": 0.3975, "step": 2085 }, { "epoch": 1.1712521055586749, "grad_norm": 0.50274658203125, "learning_rate": 7.644730613598809e-06, "loss": 0.402, "step": 2086 }, { "epoch": 1.171813587871982, "grad_norm": 0.5429737567901611, "learning_rate": 7.641957451891886e-06, "loss": 0.4482, "step": 2087 }, { "epoch": 1.172375070185289, "grad_norm": 0.4829143285751343, "learning_rate": 7.639183162217345e-06, "loss": 0.3972, "step": 2088 }, { "epoch": 1.1729365524985962, "grad_norm": 0.5160886645317078, "learning_rate": 7.636407745759655e-06, "loss": 0.4106, "step": 2089 }, { "epoch": 1.1734980348119035, "grad_norm": 0.4962663948535919, "learning_rate": 7.633631203703762e-06, "loss": 0.3995, "step": 2090 }, { "epoch": 1.1740595171252106, "grad_norm": 0.522302508354187, "learning_rate": 7.630853537235095e-06, "loss": 0.4246, "step": 2091 }, { "epoch": 1.1746209994385177, "grad_norm": 0.5145089030265808, "learning_rate": 7.62807474753956e-06, "loss": 0.4127, "step": 2092 }, { "epoch": 1.1751824817518248, "grad_norm": 0.6500824689865112, "learning_rate": 7.625294835803548e-06, "loss": 0.4609, "step": 2093 }, { "epoch": 1.175743964065132, "grad_norm": 0.548811137676239, "learning_rate": 7.622513803213918e-06, "loss": 0.4288, "step": 2094 }, { "epoch": 1.176305446378439, "grad_norm": 0.5396094918251038, "learning_rate": 7.6197316509580225e-06, "loss": 0.4162, "step": 2095 }, { "epoch": 1.1768669286917461, "grad_norm": 0.5253773927688599, "learning_rate": 7.616948380223681e-06, "loss": 0.4071, "step": 2096 }, { "epoch": 1.1774284110050532, "grad_norm": 0.5094903707504272, "learning_rate": 7.614163992199195e-06, "loss": 0.4527, "step": 2097 }, { "epoch": 1.1779898933183603, "grad_norm": 0.4888249933719635, "learning_rate": 7.6113784880733424e-06, "loss": 0.4262, "step": 2098 }, { "epoch": 1.1785513756316677, "grad_norm": 0.5420947074890137, "learning_rate": 7.608591869035373e-06, "loss": 0.4097, "step": 2099 }, { "epoch": 1.1791128579449748, "grad_norm": 0.5180853605270386, "learning_rate": 7.605804136275021e-06, "loss": 0.4179, "step": 2100 }, { "epoch": 1.1796743402582819, "grad_norm": 0.56271892786026, "learning_rate": 7.603015290982493e-06, "loss": 0.4517, "step": 2101 }, { "epoch": 1.180235822571589, "grad_norm": 0.4795737564563751, "learning_rate": 7.600225334348465e-06, "loss": 0.3851, "step": 2102 }, { "epoch": 1.180797304884896, "grad_norm": 0.5167601108551025, "learning_rate": 7.597434267564097e-06, "loss": 0.3941, "step": 2103 }, { "epoch": 1.1813587871982032, "grad_norm": 0.6041990518569946, "learning_rate": 7.594642091821014e-06, "loss": 0.4222, "step": 2104 }, { "epoch": 1.1819202695115103, "grad_norm": 0.4996815621852875, "learning_rate": 7.5918488083113176e-06, "loss": 0.4358, "step": 2105 }, { "epoch": 1.1824817518248176, "grad_norm": 0.49069076776504517, "learning_rate": 7.589054418227588e-06, "loss": 0.4345, "step": 2106 }, { "epoch": 1.1830432341381247, "grad_norm": 0.5001227855682373, "learning_rate": 7.586258922762869e-06, "loss": 0.4215, "step": 2107 }, { "epoch": 1.1836047164514318, "grad_norm": 0.4886295199394226, "learning_rate": 7.583462323110682e-06, "loss": 0.43, "step": 2108 }, { "epoch": 1.184166198764739, "grad_norm": 0.47369587421417236, "learning_rate": 7.580664620465018e-06, "loss": 0.4392, "step": 2109 }, { "epoch": 1.184727681078046, "grad_norm": 0.45745131373405457, "learning_rate": 7.577865816020338e-06, "loss": 0.4173, "step": 2110 }, { "epoch": 1.1852891633913532, "grad_norm": 0.46906885504722595, "learning_rate": 7.5750659109715755e-06, "loss": 0.3959, "step": 2111 }, { "epoch": 1.1858506457046603, "grad_norm": 0.49499890208244324, "learning_rate": 7.572264906514133e-06, "loss": 0.3951, "step": 2112 }, { "epoch": 1.1864121280179674, "grad_norm": 0.4188763499259949, "learning_rate": 7.569462803843881e-06, "loss": 0.4158, "step": 2113 }, { "epoch": 1.1869736103312745, "grad_norm": 0.5147024393081665, "learning_rate": 7.566659604157163e-06, "loss": 0.4435, "step": 2114 }, { "epoch": 1.1875350926445818, "grad_norm": 0.47234949469566345, "learning_rate": 7.563855308650783e-06, "loss": 0.4252, "step": 2115 }, { "epoch": 1.188096574957889, "grad_norm": 0.45643264055252075, "learning_rate": 7.561049918522023e-06, "loss": 0.4047, "step": 2116 }, { "epoch": 1.188658057271196, "grad_norm": 0.4591674506664276, "learning_rate": 7.558243434968626e-06, "loss": 0.419, "step": 2117 }, { "epoch": 1.189219539584503, "grad_norm": 0.49865037202835083, "learning_rate": 7.5554358591888e-06, "loss": 0.4126, "step": 2118 }, { "epoch": 1.1897810218978102, "grad_norm": 0.5086956024169922, "learning_rate": 7.5526271923812265e-06, "loss": 0.4061, "step": 2119 }, { "epoch": 1.1903425042111173, "grad_norm": 0.5123899579048157, "learning_rate": 7.549817435745045e-06, "loss": 0.4422, "step": 2120 }, { "epoch": 1.1909039865244244, "grad_norm": 0.5126532316207886, "learning_rate": 7.5470065904798665e-06, "loss": 0.4407, "step": 2121 }, { "epoch": 1.1914654688377315, "grad_norm": 0.5463150143623352, "learning_rate": 7.544194657785764e-06, "loss": 0.4209, "step": 2122 }, { "epoch": 1.1920269511510386, "grad_norm": 0.5575338006019592, "learning_rate": 7.541381638863273e-06, "loss": 0.399, "step": 2123 }, { "epoch": 1.192588433464346, "grad_norm": 0.4747096300125122, "learning_rate": 7.538567534913397e-06, "loss": 0.4075, "step": 2124 }, { "epoch": 1.193149915777653, "grad_norm": 0.6180269122123718, "learning_rate": 7.5357523471376e-06, "loss": 0.4524, "step": 2125 }, { "epoch": 1.1937113980909602, "grad_norm": 0.4883875846862793, "learning_rate": 7.532936076737807e-06, "loss": 0.3997, "step": 2126 }, { "epoch": 1.1942728804042673, "grad_norm": 0.5369228720664978, "learning_rate": 7.530118724916411e-06, "loss": 0.3907, "step": 2127 }, { "epoch": 1.1948343627175744, "grad_norm": 0.5602598190307617, "learning_rate": 7.52730029287626e-06, "loss": 0.4175, "step": 2128 }, { "epoch": 1.1953958450308815, "grad_norm": 0.5378056168556213, "learning_rate": 7.52448078182067e-06, "loss": 0.4327, "step": 2129 }, { "epoch": 1.1959573273441886, "grad_norm": 0.5111677050590515, "learning_rate": 7.521660192953411e-06, "loss": 0.4256, "step": 2130 }, { "epoch": 1.196518809657496, "grad_norm": 0.5200886130332947, "learning_rate": 7.518838527478717e-06, "loss": 0.403, "step": 2131 }, { "epoch": 1.197080291970803, "grad_norm": 0.5186847448348999, "learning_rate": 7.516015786601281e-06, "loss": 0.4282, "step": 2132 }, { "epoch": 1.1976417742841101, "grad_norm": 0.5155947804450989, "learning_rate": 7.513191971526255e-06, "loss": 0.4254, "step": 2133 }, { "epoch": 1.1982032565974172, "grad_norm": 0.5824740529060364, "learning_rate": 7.5103670834592505e-06, "loss": 0.4068, "step": 2134 }, { "epoch": 1.1987647389107243, "grad_norm": 0.4757175147533417, "learning_rate": 7.507541123606333e-06, "loss": 0.3915, "step": 2135 }, { "epoch": 1.1993262212240314, "grad_norm": 0.5659790635108948, "learning_rate": 7.504714093174033e-06, "loss": 0.4142, "step": 2136 }, { "epoch": 1.1998877035373385, "grad_norm": 0.5317938923835754, "learning_rate": 7.501885993369332e-06, "loss": 0.4249, "step": 2137 }, { "epoch": 1.2004491858506456, "grad_norm": 0.5971941947937012, "learning_rate": 7.499056825399669e-06, "loss": 0.4005, "step": 2138 }, { "epoch": 1.2010106681639527, "grad_norm": 0.5792484879493713, "learning_rate": 7.496226590472944e-06, "loss": 0.4196, "step": 2139 }, { "epoch": 1.20157215047726, "grad_norm": 0.5066475868225098, "learning_rate": 7.493395289797501e-06, "loss": 0.427, "step": 2140 }, { "epoch": 1.2021336327905672, "grad_norm": 0.4950944781303406, "learning_rate": 7.490562924582154e-06, "loss": 0.4455, "step": 2141 }, { "epoch": 1.2026951151038743, "grad_norm": 0.5343753099441528, "learning_rate": 7.487729496036161e-06, "loss": 0.418, "step": 2142 }, { "epoch": 1.2032565974171814, "grad_norm": 0.48987632989883423, "learning_rate": 7.4848950053692374e-06, "loss": 0.3915, "step": 2143 }, { "epoch": 1.2038180797304885, "grad_norm": 0.47648268938064575, "learning_rate": 7.482059453791551e-06, "loss": 0.3935, "step": 2144 }, { "epoch": 1.2043795620437956, "grad_norm": 0.4961659014225006, "learning_rate": 7.479222842513723e-06, "loss": 0.3914, "step": 2145 }, { "epoch": 1.2049410443571027, "grad_norm": 0.5157271027565002, "learning_rate": 7.476385172746828e-06, "loss": 0.4241, "step": 2146 }, { "epoch": 1.2055025266704098, "grad_norm": 0.43103930354118347, "learning_rate": 7.473546445702393e-06, "loss": 0.4257, "step": 2147 }, { "epoch": 1.206064008983717, "grad_norm": 0.4358638525009155, "learning_rate": 7.470706662592394e-06, "loss": 0.4054, "step": 2148 }, { "epoch": 1.2066254912970242, "grad_norm": 0.4578574299812317, "learning_rate": 7.467865824629259e-06, "loss": 0.4423, "step": 2149 }, { "epoch": 1.2071869736103313, "grad_norm": 0.45918840169906616, "learning_rate": 7.465023933025866e-06, "loss": 0.4085, "step": 2150 }, { "epoch": 1.2077484559236384, "grad_norm": 0.4927962124347687, "learning_rate": 7.462180988995542e-06, "loss": 0.4134, "step": 2151 }, { "epoch": 1.2083099382369455, "grad_norm": 0.4466961622238159, "learning_rate": 7.459336993752068e-06, "loss": 0.4158, "step": 2152 }, { "epoch": 1.2088714205502527, "grad_norm": 0.4951990246772766, "learning_rate": 7.45649194850967e-06, "loss": 0.4169, "step": 2153 }, { "epoch": 1.2094329028635598, "grad_norm": 0.4992975890636444, "learning_rate": 7.45364585448302e-06, "loss": 0.403, "step": 2154 }, { "epoch": 1.2099943851768669, "grad_norm": 0.47195833921432495, "learning_rate": 7.450798712887244e-06, "loss": 0.4405, "step": 2155 }, { "epoch": 1.210555867490174, "grad_norm": 0.5563965439796448, "learning_rate": 7.447950524937907e-06, "loss": 0.4087, "step": 2156 }, { "epoch": 1.211117349803481, "grad_norm": 0.49630215764045715, "learning_rate": 7.445101291851029e-06, "loss": 0.4003, "step": 2157 }, { "epoch": 1.2116788321167884, "grad_norm": 0.5352805256843567, "learning_rate": 7.4422510148430725e-06, "loss": 0.4221, "step": 2158 }, { "epoch": 1.2122403144300955, "grad_norm": 0.5240042805671692, "learning_rate": 7.439399695130946e-06, "loss": 0.4107, "step": 2159 }, { "epoch": 1.2128017967434026, "grad_norm": 0.4872782826423645, "learning_rate": 7.436547333932002e-06, "loss": 0.4146, "step": 2160 }, { "epoch": 1.2133632790567097, "grad_norm": 0.5136293768882751, "learning_rate": 7.433693932464037e-06, "loss": 0.4219, "step": 2161 }, { "epoch": 1.2139247613700168, "grad_norm": 0.5083343982696533, "learning_rate": 7.430839491945297e-06, "loss": 0.4084, "step": 2162 }, { "epoch": 1.214486243683324, "grad_norm": 0.5461674332618713, "learning_rate": 7.4279840135944666e-06, "loss": 0.3995, "step": 2163 }, { "epoch": 1.215047725996631, "grad_norm": 0.49180030822753906, "learning_rate": 7.425127498630674e-06, "loss": 0.4013, "step": 2164 }, { "epoch": 1.2156092083099383, "grad_norm": 0.5343416929244995, "learning_rate": 7.422269948273492e-06, "loss": 0.4161, "step": 2165 }, { "epoch": 1.2161706906232455, "grad_norm": 0.5039750933647156, "learning_rate": 7.419411363742932e-06, "loss": 0.4188, "step": 2166 }, { "epoch": 1.2167321729365526, "grad_norm": 0.5260903239250183, "learning_rate": 7.4165517462594525e-06, "loss": 0.4339, "step": 2167 }, { "epoch": 1.2172936552498597, "grad_norm": 0.45139387249946594, "learning_rate": 7.413691097043949e-06, "loss": 0.4151, "step": 2168 }, { "epoch": 1.2178551375631668, "grad_norm": 0.5131839513778687, "learning_rate": 7.4108294173177565e-06, "loss": 0.4213, "step": 2169 }, { "epoch": 1.2184166198764739, "grad_norm": 0.527319073677063, "learning_rate": 7.407966708302653e-06, "loss": 0.4084, "step": 2170 }, { "epoch": 1.218978102189781, "grad_norm": 0.48122331500053406, "learning_rate": 7.405102971220852e-06, "loss": 0.388, "step": 2171 }, { "epoch": 1.219539584503088, "grad_norm": 0.5171840786933899, "learning_rate": 7.402238207295014e-06, "loss": 0.4146, "step": 2172 }, { "epoch": 1.2201010668163952, "grad_norm": 0.550398588180542, "learning_rate": 7.3993724177482274e-06, "loss": 0.3903, "step": 2173 }, { "epoch": 1.2206625491297025, "grad_norm": 0.6553874015808105, "learning_rate": 7.396505603804027e-06, "loss": 0.4439, "step": 2174 }, { "epoch": 1.2212240314430096, "grad_norm": 0.5159474611282349, "learning_rate": 7.393637766686379e-06, "loss": 0.4212, "step": 2175 }, { "epoch": 1.2217855137563167, "grad_norm": 0.5311527252197266, "learning_rate": 7.39076890761969e-06, "loss": 0.3987, "step": 2176 }, { "epoch": 1.2223469960696238, "grad_norm": 0.6776444911956787, "learning_rate": 7.387899027828802e-06, "loss": 0.4313, "step": 2177 }, { "epoch": 1.222908478382931, "grad_norm": 0.4612668752670288, "learning_rate": 7.3850281285389925e-06, "loss": 0.4152, "step": 2178 }, { "epoch": 1.223469960696238, "grad_norm": 0.5412721633911133, "learning_rate": 7.382156210975975e-06, "loss": 0.3999, "step": 2179 }, { "epoch": 1.2240314430095451, "grad_norm": 0.5807458758354187, "learning_rate": 7.379283276365896e-06, "loss": 0.4114, "step": 2180 }, { "epoch": 1.2245929253228522, "grad_norm": 0.5088834166526794, "learning_rate": 7.376409325935338e-06, "loss": 0.4151, "step": 2181 }, { "epoch": 1.2251544076361593, "grad_norm": 0.5163597464561462, "learning_rate": 7.373534360911317e-06, "loss": 0.401, "step": 2182 }, { "epoch": 1.2257158899494667, "grad_norm": 0.5706794857978821, "learning_rate": 7.370658382521281e-06, "loss": 0.4083, "step": 2183 }, { "epoch": 1.2262773722627738, "grad_norm": 0.555849015712738, "learning_rate": 7.367781391993112e-06, "loss": 0.3924, "step": 2184 }, { "epoch": 1.2268388545760809, "grad_norm": 0.5844675898551941, "learning_rate": 7.364903390555124e-06, "loss": 0.4374, "step": 2185 }, { "epoch": 1.227400336889388, "grad_norm": 0.563957691192627, "learning_rate": 7.36202437943606e-06, "loss": 0.4193, "step": 2186 }, { "epoch": 1.227961819202695, "grad_norm": 0.48953425884246826, "learning_rate": 7.359144359865099e-06, "loss": 0.4272, "step": 2187 }, { "epoch": 1.2285233015160022, "grad_norm": 0.499451220035553, "learning_rate": 7.356263333071845e-06, "loss": 0.4098, "step": 2188 }, { "epoch": 1.2290847838293093, "grad_norm": 0.5633230805397034, "learning_rate": 7.353381300286339e-06, "loss": 0.434, "step": 2189 }, { "epoch": 1.2296462661426166, "grad_norm": 0.5084733963012695, "learning_rate": 7.350498262739043e-06, "loss": 0.3887, "step": 2190 }, { "epoch": 1.2302077484559237, "grad_norm": 0.5786779522895813, "learning_rate": 7.347614221660855e-06, "loss": 0.4481, "step": 2191 }, { "epoch": 1.2307692307692308, "grad_norm": 0.5812886357307434, "learning_rate": 7.344729178283098e-06, "loss": 0.4377, "step": 2192 }, { "epoch": 1.231330713082538, "grad_norm": 0.5053824186325073, "learning_rate": 7.341843133837522e-06, "loss": 0.4446, "step": 2193 }, { "epoch": 1.231892195395845, "grad_norm": 0.4857684373855591, "learning_rate": 7.338956089556308e-06, "loss": 0.379, "step": 2194 }, { "epoch": 1.2324536777091522, "grad_norm": 0.6545397043228149, "learning_rate": 7.336068046672062e-06, "loss": 0.4298, "step": 2195 }, { "epoch": 1.2330151600224593, "grad_norm": 0.5481657385826111, "learning_rate": 7.333179006417816e-06, "loss": 0.4143, "step": 2196 }, { "epoch": 1.2335766423357664, "grad_norm": 0.5298793315887451, "learning_rate": 7.330288970027027e-06, "loss": 0.4032, "step": 2197 }, { "epoch": 1.2341381246490735, "grad_norm": 0.5697639584541321, "learning_rate": 7.327397938733578e-06, "loss": 0.4121, "step": 2198 }, { "epoch": 1.2346996069623808, "grad_norm": 0.561829686164856, "learning_rate": 7.32450591377178e-06, "loss": 0.4182, "step": 2199 }, { "epoch": 1.235261089275688, "grad_norm": 0.5126230120658875, "learning_rate": 7.321612896376364e-06, "loss": 0.4358, "step": 2200 }, { "epoch": 1.235822571588995, "grad_norm": 0.514999270439148, "learning_rate": 7.318718887782485e-06, "loss": 0.4368, "step": 2201 }, { "epoch": 1.236384053902302, "grad_norm": 0.5988884568214417, "learning_rate": 7.315823889225724e-06, "loss": 0.4282, "step": 2202 }, { "epoch": 1.2369455362156092, "grad_norm": 0.44250452518463135, "learning_rate": 7.312927901942082e-06, "loss": 0.4105, "step": 2203 }, { "epoch": 1.2375070185289163, "grad_norm": 0.49952033162117004, "learning_rate": 7.310030927167984e-06, "loss": 0.4318, "step": 2204 }, { "epoch": 1.2380685008422234, "grad_norm": 0.5482261180877686, "learning_rate": 7.307132966140273e-06, "loss": 0.4371, "step": 2205 }, { "epoch": 1.2386299831555305, "grad_norm": 0.5049880743026733, "learning_rate": 7.304234020096219e-06, "loss": 0.3957, "step": 2206 }, { "epoch": 1.2391914654688376, "grad_norm": 0.535776674747467, "learning_rate": 7.30133409027351e-06, "loss": 0.4104, "step": 2207 }, { "epoch": 1.239752947782145, "grad_norm": 0.4794069528579712, "learning_rate": 7.298433177910248e-06, "loss": 0.4044, "step": 2208 }, { "epoch": 1.240314430095452, "grad_norm": 0.5652307271957397, "learning_rate": 7.295531284244966e-06, "loss": 0.4149, "step": 2209 }, { "epoch": 1.2408759124087592, "grad_norm": 0.47027143836021423, "learning_rate": 7.292628410516606e-06, "loss": 0.4084, "step": 2210 }, { "epoch": 1.2414373947220663, "grad_norm": 0.4699520468711853, "learning_rate": 7.289724557964534e-06, "loss": 0.4255, "step": 2211 }, { "epoch": 1.2419988770353734, "grad_norm": 0.47844117879867554, "learning_rate": 7.2868197278285316e-06, "loss": 0.4004, "step": 2212 }, { "epoch": 1.2425603593486805, "grad_norm": 0.47982779145240784, "learning_rate": 7.283913921348796e-06, "loss": 0.4006, "step": 2213 }, { "epoch": 1.2431218416619876, "grad_norm": 0.4938255846500397, "learning_rate": 7.281007139765947e-06, "loss": 0.3908, "step": 2214 }, { "epoch": 1.2436833239752947, "grad_norm": 0.5376873016357422, "learning_rate": 7.278099384321017e-06, "loss": 0.444, "step": 2215 }, { "epoch": 1.2442448062886018, "grad_norm": 0.47243788838386536, "learning_rate": 7.275190656255453e-06, "loss": 0.4054, "step": 2216 }, { "epoch": 1.2448062886019091, "grad_norm": 0.4534139633178711, "learning_rate": 7.272280956811119e-06, "loss": 0.3783, "step": 2217 }, { "epoch": 1.2453677709152162, "grad_norm": 0.5173442959785461, "learning_rate": 7.269370287230293e-06, "loss": 0.4258, "step": 2218 }, { "epoch": 1.2459292532285233, "grad_norm": 0.5071815252304077, "learning_rate": 7.266458648755669e-06, "loss": 0.4266, "step": 2219 }, { "epoch": 1.2464907355418304, "grad_norm": 0.5061998963356018, "learning_rate": 7.263546042630352e-06, "loss": 0.4258, "step": 2220 }, { "epoch": 1.2470522178551375, "grad_norm": 0.47375917434692383, "learning_rate": 7.260632470097863e-06, "loss": 0.4375, "step": 2221 }, { "epoch": 1.2476137001684446, "grad_norm": 0.5053783059120178, "learning_rate": 7.257717932402131e-06, "loss": 0.3951, "step": 2222 }, { "epoch": 1.2481751824817517, "grad_norm": 0.4946967363357544, "learning_rate": 7.254802430787502e-06, "loss": 0.423, "step": 2223 }, { "epoch": 1.248736664795059, "grad_norm": 0.49850380420684814, "learning_rate": 7.251885966498733e-06, "loss": 0.427, "step": 2224 }, { "epoch": 1.2492981471083662, "grad_norm": 0.5020861625671387, "learning_rate": 7.248968540780989e-06, "loss": 0.3801, "step": 2225 }, { "epoch": 1.2498596294216733, "grad_norm": 0.5606759786605835, "learning_rate": 7.246050154879846e-06, "loss": 0.4273, "step": 2226 }, { "epoch": 1.2504211117349804, "grad_norm": 0.45905980467796326, "learning_rate": 7.2431308100412945e-06, "loss": 0.4156, "step": 2227 }, { "epoch": 1.2509825940482875, "grad_norm": 0.49171188473701477, "learning_rate": 7.240210507511728e-06, "loss": 0.4326, "step": 2228 }, { "epoch": 1.2515440763615946, "grad_norm": 0.5476842522621155, "learning_rate": 7.2372892485379524e-06, "loss": 0.4308, "step": 2229 }, { "epoch": 1.2521055586749017, "grad_norm": 0.49568989872932434, "learning_rate": 7.234367034367182e-06, "loss": 0.392, "step": 2230 }, { "epoch": 1.2526670409882088, "grad_norm": 0.4881848394870758, "learning_rate": 7.2314438662470384e-06, "loss": 0.4218, "step": 2231 }, { "epoch": 1.253228523301516, "grad_norm": 0.5499840378761292, "learning_rate": 7.22851974542555e-06, "loss": 0.3928, "step": 2232 }, { "epoch": 1.2537900056148232, "grad_norm": 0.5031362175941467, "learning_rate": 7.225594673151152e-06, "loss": 0.4292, "step": 2233 }, { "epoch": 1.2543514879281303, "grad_norm": 0.5085735321044922, "learning_rate": 7.222668650672686e-06, "loss": 0.4266, "step": 2234 }, { "epoch": 1.2549129702414374, "grad_norm": 0.5640439987182617, "learning_rate": 7.2197416792394e-06, "loss": 0.3991, "step": 2235 }, { "epoch": 1.2554744525547445, "grad_norm": 0.5348339080810547, "learning_rate": 7.216813760100948e-06, "loss": 0.4192, "step": 2236 }, { "epoch": 1.2560359348680517, "grad_norm": 0.4770994186401367, "learning_rate": 7.213884894507385e-06, "loss": 0.4066, "step": 2237 }, { "epoch": 1.2565974171813588, "grad_norm": 0.545295774936676, "learning_rate": 7.210955083709173e-06, "loss": 0.4023, "step": 2238 }, { "epoch": 1.2571588994946659, "grad_norm": 0.46659526228904724, "learning_rate": 7.2080243289571765e-06, "loss": 0.4103, "step": 2239 }, { "epoch": 1.2577203818079732, "grad_norm": 0.5339896082878113, "learning_rate": 7.205092631502664e-06, "loss": 0.4273, "step": 2240 }, { "epoch": 1.25828186412128, "grad_norm": 0.5512104034423828, "learning_rate": 7.2021599925973064e-06, "loss": 0.4383, "step": 2241 }, { "epoch": 1.2588433464345874, "grad_norm": 0.4883364140987396, "learning_rate": 7.199226413493174e-06, "loss": 0.396, "step": 2242 }, { "epoch": 1.2594048287478945, "grad_norm": 0.5859366655349731, "learning_rate": 7.196291895442744e-06, "loss": 0.4199, "step": 2243 }, { "epoch": 1.2599663110612016, "grad_norm": 0.5207738876342773, "learning_rate": 7.1933564396988865e-06, "loss": 0.3799, "step": 2244 }, { "epoch": 1.2605277933745087, "grad_norm": 0.5000956058502197, "learning_rate": 7.1904200475148785e-06, "loss": 0.413, "step": 2245 }, { "epoch": 1.2610892756878158, "grad_norm": 0.5112067461013794, "learning_rate": 7.1874827201443965e-06, "loss": 0.4171, "step": 2246 }, { "epoch": 1.261650758001123, "grad_norm": 0.5438712239265442, "learning_rate": 7.1845444588415125e-06, "loss": 0.4275, "step": 2247 }, { "epoch": 1.26221224031443, "grad_norm": 0.4975585341453552, "learning_rate": 7.181605264860699e-06, "loss": 0.3914, "step": 2248 }, { "epoch": 1.2627737226277373, "grad_norm": 0.546332836151123, "learning_rate": 7.178665139456826e-06, "loss": 0.4355, "step": 2249 }, { "epoch": 1.2633352049410442, "grad_norm": 0.45223015546798706, "learning_rate": 7.175724083885166e-06, "loss": 0.4185, "step": 2250 }, { "epoch": 1.2638966872543516, "grad_norm": 0.5042740702629089, "learning_rate": 7.1727820994013815e-06, "loss": 0.4044, "step": 2251 }, { "epoch": 1.2644581695676587, "grad_norm": 0.5668632388114929, "learning_rate": 7.169839187261537e-06, "loss": 0.4046, "step": 2252 }, { "epoch": 1.2650196518809658, "grad_norm": 0.4577108323574066, "learning_rate": 7.166895348722088e-06, "loss": 0.4226, "step": 2253 }, { "epoch": 1.2655811341942729, "grad_norm": 0.44466274976730347, "learning_rate": 7.1639505850398895e-06, "loss": 0.4371, "step": 2254 }, { "epoch": 1.26614261650758, "grad_norm": 0.5353525876998901, "learning_rate": 7.161004897472192e-06, "loss": 0.4312, "step": 2255 }, { "epoch": 1.266704098820887, "grad_norm": 0.4743967652320862, "learning_rate": 7.158058287276638e-06, "loss": 0.3828, "step": 2256 }, { "epoch": 1.2672655811341942, "grad_norm": 0.5462449789047241, "learning_rate": 7.155110755711264e-06, "loss": 0.4381, "step": 2257 }, { "epoch": 1.2678270634475015, "grad_norm": 0.4949532747268677, "learning_rate": 7.1521623040345026e-06, "loss": 0.4003, "step": 2258 }, { "epoch": 1.2683885457608086, "grad_norm": 0.5131034851074219, "learning_rate": 7.149212933505172e-06, "loss": 0.411, "step": 2259 }, { "epoch": 1.2689500280741157, "grad_norm": 0.5547118186950684, "learning_rate": 7.146262645382494e-06, "loss": 0.4129, "step": 2260 }, { "epoch": 1.2695115103874228, "grad_norm": 0.5451356768608093, "learning_rate": 7.143311440926075e-06, "loss": 0.4027, "step": 2261 }, { "epoch": 1.27007299270073, "grad_norm": 0.48254749178886414, "learning_rate": 7.140359321395911e-06, "loss": 0.4171, "step": 2262 }, { "epoch": 1.270634475014037, "grad_norm": 0.5528948307037354, "learning_rate": 7.137406288052393e-06, "loss": 0.4365, "step": 2263 }, { "epoch": 1.2711959573273441, "grad_norm": 0.4674314260482788, "learning_rate": 7.1344523421563e-06, "loss": 0.4072, "step": 2264 }, { "epoch": 1.2717574396406512, "grad_norm": 0.4806506335735321, "learning_rate": 7.131497484968802e-06, "loss": 0.4282, "step": 2265 }, { "epoch": 1.2723189219539583, "grad_norm": 0.4717147946357727, "learning_rate": 7.128541717751458e-06, "loss": 0.4011, "step": 2266 }, { "epoch": 1.2728804042672657, "grad_norm": 0.4287640154361725, "learning_rate": 7.125585041766213e-06, "loss": 0.4303, "step": 2267 }, { "epoch": 1.2734418865805728, "grad_norm": 0.5432797074317932, "learning_rate": 7.122627458275403e-06, "loss": 0.4088, "step": 2268 }, { "epoch": 1.2740033688938799, "grad_norm": 0.48291534185409546, "learning_rate": 7.119668968541749e-06, "loss": 0.4367, "step": 2269 }, { "epoch": 1.274564851207187, "grad_norm": 0.4776771664619446, "learning_rate": 7.116709573828361e-06, "loss": 0.4332, "step": 2270 }, { "epoch": 1.275126333520494, "grad_norm": 0.6455519795417786, "learning_rate": 7.113749275398735e-06, "loss": 0.4199, "step": 2271 }, { "epoch": 1.2756878158338012, "grad_norm": 0.49318987131118774, "learning_rate": 7.110788074516751e-06, "loss": 0.4143, "step": 2272 }, { "epoch": 1.2762492981471083, "grad_norm": 0.5682706832885742, "learning_rate": 7.107825972446678e-06, "loss": 0.4061, "step": 2273 }, { "epoch": 1.2768107804604156, "grad_norm": 0.5450190305709839, "learning_rate": 7.104862970453163e-06, "loss": 0.4149, "step": 2274 }, { "epoch": 1.2773722627737225, "grad_norm": 0.48313021659851074, "learning_rate": 7.101899069801245e-06, "loss": 0.3997, "step": 2275 }, { "epoch": 1.2779337450870298, "grad_norm": 0.548506498336792, "learning_rate": 7.098934271756346e-06, "loss": 0.4129, "step": 2276 }, { "epoch": 1.278495227400337, "grad_norm": 0.5613232254981995, "learning_rate": 7.095968577584262e-06, "loss": 0.3983, "step": 2277 }, { "epoch": 1.279056709713644, "grad_norm": 0.49210283160209656, "learning_rate": 7.093001988551183e-06, "loss": 0.398, "step": 2278 }, { "epoch": 1.2796181920269512, "grad_norm": 0.5286835432052612, "learning_rate": 7.090034505923673e-06, "loss": 0.4295, "step": 2279 }, { "epoch": 1.2801796743402583, "grad_norm": 0.521195650100708, "learning_rate": 7.087066130968681e-06, "loss": 0.4191, "step": 2280 }, { "epoch": 1.2807411566535654, "grad_norm": 0.5399542450904846, "learning_rate": 7.08409686495354e-06, "loss": 0.42, "step": 2281 }, { "epoch": 1.2813026389668725, "grad_norm": 0.43719175457954407, "learning_rate": 7.081126709145956e-06, "loss": 0.4084, "step": 2282 }, { "epoch": 1.2818641212801798, "grad_norm": 0.6423871517181396, "learning_rate": 7.078155664814022e-06, "loss": 0.4489, "step": 2283 }, { "epoch": 1.2824256035934867, "grad_norm": 0.5535472631454468, "learning_rate": 7.075183733226203e-06, "loss": 0.4266, "step": 2284 }, { "epoch": 1.282987085906794, "grad_norm": 0.4398556649684906, "learning_rate": 7.072210915651351e-06, "loss": 0.405, "step": 2285 }, { "epoch": 1.283548568220101, "grad_norm": 0.4561077058315277, "learning_rate": 7.069237213358691e-06, "loss": 0.4162, "step": 2286 }, { "epoch": 1.2841100505334082, "grad_norm": 0.5086630582809448, "learning_rate": 7.066262627617827e-06, "loss": 0.4156, "step": 2287 }, { "epoch": 1.2846715328467153, "grad_norm": 0.5686905384063721, "learning_rate": 7.063287159698741e-06, "loss": 0.4222, "step": 2288 }, { "epoch": 1.2852330151600224, "grad_norm": 0.5070647597312927, "learning_rate": 7.060310810871789e-06, "loss": 0.4242, "step": 2289 }, { "epoch": 1.2857944974733295, "grad_norm": 0.49537941813468933, "learning_rate": 7.057333582407706e-06, "loss": 0.4137, "step": 2290 }, { "epoch": 1.2863559797866366, "grad_norm": 0.5869025588035583, "learning_rate": 7.054355475577602e-06, "loss": 0.4211, "step": 2291 }, { "epoch": 1.286917462099944, "grad_norm": 0.49570149183273315, "learning_rate": 7.051376491652963e-06, "loss": 0.3938, "step": 2292 }, { "epoch": 1.287478944413251, "grad_norm": 0.4581998586654663, "learning_rate": 7.048396631905644e-06, "loss": 0.4495, "step": 2293 }, { "epoch": 1.2880404267265582, "grad_norm": 0.518488347530365, "learning_rate": 7.0454158976078815e-06, "loss": 0.393, "step": 2294 }, { "epoch": 1.2886019090398653, "grad_norm": 0.5205323100090027, "learning_rate": 7.04243429003228e-06, "loss": 0.4139, "step": 2295 }, { "epoch": 1.2891633913531724, "grad_norm": 0.5017574429512024, "learning_rate": 7.039451810451818e-06, "loss": 0.4154, "step": 2296 }, { "epoch": 1.2897248736664795, "grad_norm": 0.44734612107276917, "learning_rate": 7.03646846013985e-06, "loss": 0.4339, "step": 2297 }, { "epoch": 1.2902863559797866, "grad_norm": 0.5417091846466064, "learning_rate": 7.033484240370097e-06, "loss": 0.4287, "step": 2298 }, { "epoch": 1.2908478382930937, "grad_norm": 0.5139449238777161, "learning_rate": 7.030499152416653e-06, "loss": 0.4306, "step": 2299 }, { "epoch": 1.2914093206064008, "grad_norm": 0.4821740388870239, "learning_rate": 7.027513197553984e-06, "loss": 0.4067, "step": 2300 }, { "epoch": 1.2919708029197081, "grad_norm": 0.4908732771873474, "learning_rate": 7.024526377056925e-06, "loss": 0.4079, "step": 2301 }, { "epoch": 1.2925322852330152, "grad_norm": 0.4590698182582855, "learning_rate": 7.021538692200681e-06, "loss": 0.4173, "step": 2302 }, { "epoch": 1.2930937675463223, "grad_norm": 0.5274698734283447, "learning_rate": 7.018550144260827e-06, "loss": 0.4346, "step": 2303 }, { "epoch": 1.2936552498596294, "grad_norm": 0.49613624811172485, "learning_rate": 7.015560734513302e-06, "loss": 0.4063, "step": 2304 }, { "epoch": 1.2942167321729365, "grad_norm": 0.46040135622024536, "learning_rate": 7.012570464234418e-06, "loss": 0.3987, "step": 2305 }, { "epoch": 1.2947782144862436, "grad_norm": 0.502091646194458, "learning_rate": 7.009579334700854e-06, "loss": 0.4108, "step": 2306 }, { "epoch": 1.2953396967995507, "grad_norm": 0.5264946222305298, "learning_rate": 7.006587347189655e-06, "loss": 0.4282, "step": 2307 }, { "epoch": 1.295901179112858, "grad_norm": 0.4731038510799408, "learning_rate": 7.003594502978228e-06, "loss": 0.3832, "step": 2308 }, { "epoch": 1.296462661426165, "grad_norm": 0.5192720890045166, "learning_rate": 7.000600803344353e-06, "loss": 0.4172, "step": 2309 }, { "epoch": 1.2970241437394723, "grad_norm": 0.468107134103775, "learning_rate": 6.997606249566169e-06, "loss": 0.3874, "step": 2310 }, { "epoch": 1.2975856260527794, "grad_norm": 0.4744458794593811, "learning_rate": 6.994610842922185e-06, "loss": 0.4204, "step": 2311 }, { "epoch": 1.2981471083660865, "grad_norm": 0.5958382487297058, "learning_rate": 6.991614584691271e-06, "loss": 0.4121, "step": 2312 }, { "epoch": 1.2987085906793936, "grad_norm": 0.48883625864982605, "learning_rate": 6.9886174761526605e-06, "loss": 0.4236, "step": 2313 }, { "epoch": 1.2992700729927007, "grad_norm": 0.4534575939178467, "learning_rate": 6.985619518585951e-06, "loss": 0.3856, "step": 2314 }, { "epoch": 1.2998315553060078, "grad_norm": 0.5653669834136963, "learning_rate": 6.982620713271101e-06, "loss": 0.441, "step": 2315 }, { "epoch": 1.300393037619315, "grad_norm": 0.5111190676689148, "learning_rate": 6.979621061488435e-06, "loss": 0.41, "step": 2316 }, { "epoch": 1.3009545199326222, "grad_norm": 0.4485512375831604, "learning_rate": 6.976620564518634e-06, "loss": 0.4016, "step": 2317 }, { "epoch": 1.3015160022459291, "grad_norm": 0.5551222562789917, "learning_rate": 6.9736192236427426e-06, "loss": 0.3999, "step": 2318 }, { "epoch": 1.3020774845592364, "grad_norm": 0.4460165202617645, "learning_rate": 6.970617040142163e-06, "loss": 0.4067, "step": 2319 }, { "epoch": 1.3026389668725435, "grad_norm": 0.494144082069397, "learning_rate": 6.967614015298663e-06, "loss": 0.3976, "step": 2320 }, { "epoch": 1.3032004491858507, "grad_norm": 0.4851943254470825, "learning_rate": 6.9646101503943616e-06, "loss": 0.4474, "step": 2321 }, { "epoch": 1.3037619314991578, "grad_norm": 0.5012516379356384, "learning_rate": 6.961605446711743e-06, "loss": 0.4298, "step": 2322 }, { "epoch": 1.3043234138124649, "grad_norm": 0.496948778629303, "learning_rate": 6.958599905533647e-06, "loss": 0.4005, "step": 2323 }, { "epoch": 1.304884896125772, "grad_norm": 0.5322234034538269, "learning_rate": 6.955593528143271e-06, "loss": 0.4373, "step": 2324 }, { "epoch": 1.305446378439079, "grad_norm": 0.4528132677078247, "learning_rate": 6.952586315824168e-06, "loss": 0.407, "step": 2325 }, { "epoch": 1.3060078607523864, "grad_norm": 0.4934799373149872, "learning_rate": 6.949578269860251e-06, "loss": 0.4394, "step": 2326 }, { "epoch": 1.3065693430656935, "grad_norm": 0.5181715488433838, "learning_rate": 6.946569391535785e-06, "loss": 0.4175, "step": 2327 }, { "epoch": 1.3071308253790006, "grad_norm": 0.44103237986564636, "learning_rate": 6.943559682135395e-06, "loss": 0.4048, "step": 2328 }, { "epoch": 1.3076923076923077, "grad_norm": 0.42129823565483093, "learning_rate": 6.940549142944054e-06, "loss": 0.4107, "step": 2329 }, { "epoch": 1.3082537900056148, "grad_norm": 0.4197305142879486, "learning_rate": 6.937537775247098e-06, "loss": 0.3912, "step": 2330 }, { "epoch": 1.308815272318922, "grad_norm": 0.4710829555988312, "learning_rate": 6.934525580330207e-06, "loss": 0.4126, "step": 2331 }, { "epoch": 1.309376754632229, "grad_norm": 0.4407337009906769, "learning_rate": 6.931512559479423e-06, "loss": 0.4105, "step": 2332 }, { "epoch": 1.3099382369455363, "grad_norm": 0.48404064774513245, "learning_rate": 6.928498713981137e-06, "loss": 0.4401, "step": 2333 }, { "epoch": 1.3104997192588432, "grad_norm": 0.4693131148815155, "learning_rate": 6.925484045122091e-06, "loss": 0.3893, "step": 2334 }, { "epoch": 1.3110612015721506, "grad_norm": 0.4565127491950989, "learning_rate": 6.922468554189377e-06, "loss": 0.4447, "step": 2335 }, { "epoch": 1.3116226838854577, "grad_norm": 0.47112134099006653, "learning_rate": 6.9194522424704435e-06, "loss": 0.3799, "step": 2336 }, { "epoch": 1.3121841661987648, "grad_norm": 0.48831236362457275, "learning_rate": 6.916435111253086e-06, "loss": 0.4004, "step": 2337 }, { "epoch": 1.3127456485120719, "grad_norm": 0.49125897884368896, "learning_rate": 6.913417161825449e-06, "loss": 0.4049, "step": 2338 }, { "epoch": 1.313307130825379, "grad_norm": 0.47520408034324646, "learning_rate": 6.91039839547603e-06, "loss": 0.4104, "step": 2339 }, { "epoch": 1.313868613138686, "grad_norm": 0.4507128894329071, "learning_rate": 6.9073788134936705e-06, "loss": 0.4233, "step": 2340 }, { "epoch": 1.3144300954519932, "grad_norm": 0.4753039479255676, "learning_rate": 6.904358417167562e-06, "loss": 0.4052, "step": 2341 }, { "epoch": 1.3149915777653005, "grad_norm": 0.4944312572479248, "learning_rate": 6.901337207787248e-06, "loss": 0.3987, "step": 2342 }, { "epoch": 1.3155530600786074, "grad_norm": 0.4504777193069458, "learning_rate": 6.898315186642612e-06, "loss": 0.4357, "step": 2343 }, { "epoch": 1.3161145423919147, "grad_norm": 0.4876493811607361, "learning_rate": 6.895292355023888e-06, "loss": 0.4118, "step": 2344 }, { "epoch": 1.3166760247052218, "grad_norm": 0.4948068857192993, "learning_rate": 6.892268714221657e-06, "loss": 0.3909, "step": 2345 }, { "epoch": 1.317237507018529, "grad_norm": 0.5089834332466125, "learning_rate": 6.889244265526842e-06, "loss": 0.3974, "step": 2346 }, { "epoch": 1.317798989331836, "grad_norm": 0.4604131877422333, "learning_rate": 6.886219010230716e-06, "loss": 0.41, "step": 2347 }, { "epoch": 1.3183604716451431, "grad_norm": 0.5662000179290771, "learning_rate": 6.88319294962489e-06, "loss": 0.4314, "step": 2348 }, { "epoch": 1.3189219539584502, "grad_norm": 0.49495163559913635, "learning_rate": 6.880166085001324e-06, "loss": 0.413, "step": 2349 }, { "epoch": 1.3194834362717573, "grad_norm": 0.5426608324050903, "learning_rate": 6.877138417652318e-06, "loss": 0.4055, "step": 2350 }, { "epoch": 1.3200449185850647, "grad_norm": 0.46896204352378845, "learning_rate": 6.874109948870518e-06, "loss": 0.376, "step": 2351 }, { "epoch": 1.3206064008983718, "grad_norm": 0.48468056321144104, "learning_rate": 6.871080679948908e-06, "loss": 0.4126, "step": 2352 }, { "epoch": 1.3211678832116789, "grad_norm": 0.5372284650802612, "learning_rate": 6.868050612180817e-06, "loss": 0.4105, "step": 2353 }, { "epoch": 1.321729365524986, "grad_norm": 0.5698116421699524, "learning_rate": 6.865019746859915e-06, "loss": 0.4435, "step": 2354 }, { "epoch": 1.322290847838293, "grad_norm": 0.4691462516784668, "learning_rate": 6.861988085280212e-06, "loss": 0.4587, "step": 2355 }, { "epoch": 1.3228523301516002, "grad_norm": 0.5432417392730713, "learning_rate": 6.8589556287360524e-06, "loss": 0.3919, "step": 2356 }, { "epoch": 1.3234138124649073, "grad_norm": 0.46617037057876587, "learning_rate": 6.855922378522131e-06, "loss": 0.3877, "step": 2357 }, { "epoch": 1.3239752947782144, "grad_norm": 0.5166692137718201, "learning_rate": 6.852888335933472e-06, "loss": 0.4321, "step": 2358 }, { "epoch": 1.3245367770915215, "grad_norm": 0.52180016040802, "learning_rate": 6.849853502265442e-06, "loss": 0.4482, "step": 2359 }, { "epoch": 1.3250982594048288, "grad_norm": 0.4724474549293518, "learning_rate": 6.8468178788137455e-06, "loss": 0.4082, "step": 2360 }, { "epoch": 1.325659741718136, "grad_norm": 0.5313926339149475, "learning_rate": 6.843781466874424e-06, "loss": 0.4616, "step": 2361 }, { "epoch": 1.326221224031443, "grad_norm": 0.4649287760257721, "learning_rate": 6.840744267743852e-06, "loss": 0.3943, "step": 2362 }, { "epoch": 1.3267827063447502, "grad_norm": 0.49747326970100403, "learning_rate": 6.837706282718746e-06, "loss": 0.4335, "step": 2363 }, { "epoch": 1.3273441886580573, "grad_norm": 0.4860000014305115, "learning_rate": 6.8346675130961545e-06, "loss": 0.4178, "step": 2364 }, { "epoch": 1.3279056709713644, "grad_norm": 0.47700801491737366, "learning_rate": 6.831627960173461e-06, "loss": 0.412, "step": 2365 }, { "epoch": 1.3284671532846715, "grad_norm": 0.45901769399642944, "learning_rate": 6.828587625248386e-06, "loss": 0.4072, "step": 2366 }, { "epoch": 1.3290286355979788, "grad_norm": 0.4908572733402252, "learning_rate": 6.825546509618979e-06, "loss": 0.3926, "step": 2367 }, { "epoch": 1.3295901179112857, "grad_norm": 0.49247944355010986, "learning_rate": 6.822504614583628e-06, "loss": 0.432, "step": 2368 }, { "epoch": 1.330151600224593, "grad_norm": 0.5272449851036072, "learning_rate": 6.819461941441051e-06, "loss": 0.428, "step": 2369 }, { "epoch": 1.3307130825379, "grad_norm": 0.4953324496746063, "learning_rate": 6.816418491490299e-06, "loss": 0.4047, "step": 2370 }, { "epoch": 1.3312745648512072, "grad_norm": 0.4999220073223114, "learning_rate": 6.8133742660307535e-06, "loss": 0.4009, "step": 2371 }, { "epoch": 1.3318360471645143, "grad_norm": 0.5575302243232727, "learning_rate": 6.810329266362127e-06, "loss": 0.4129, "step": 2372 }, { "epoch": 1.3323975294778214, "grad_norm": 0.5387337803840637, "learning_rate": 6.807283493784467e-06, "loss": 0.4046, "step": 2373 }, { "epoch": 1.3329590117911285, "grad_norm": 0.525605320930481, "learning_rate": 6.804236949598144e-06, "loss": 0.4213, "step": 2374 }, { "epoch": 1.3335204941044356, "grad_norm": 0.5409771203994751, "learning_rate": 6.801189635103864e-06, "loss": 0.4434, "step": 2375 }, { "epoch": 1.334081976417743, "grad_norm": 0.4825245141983032, "learning_rate": 6.798141551602659e-06, "loss": 0.438, "step": 2376 }, { "epoch": 1.3346434587310498, "grad_norm": 0.5054187178611755, "learning_rate": 6.795092700395885e-06, "loss": 0.3924, "step": 2377 }, { "epoch": 1.3352049410443572, "grad_norm": 0.46611300110816956, "learning_rate": 6.792043082785238e-06, "loss": 0.3889, "step": 2378 }, { "epoch": 1.3357664233576643, "grad_norm": 0.46360138058662415, "learning_rate": 6.7889927000727296e-06, "loss": 0.4189, "step": 2379 }, { "epoch": 1.3363279056709714, "grad_norm": 0.43949562311172485, "learning_rate": 6.785941553560702e-06, "loss": 0.3991, "step": 2380 }, { "epoch": 1.3368893879842785, "grad_norm": 0.4679587185382843, "learning_rate": 6.782889644551824e-06, "loss": 0.414, "step": 2381 }, { "epoch": 1.3374508702975856, "grad_norm": 0.5008673667907715, "learning_rate": 6.779836974349089e-06, "loss": 0.4172, "step": 2382 }, { "epoch": 1.3380123526108927, "grad_norm": 0.528759777545929, "learning_rate": 6.776783544255817e-06, "loss": 0.4269, "step": 2383 }, { "epoch": 1.3385738349241998, "grad_norm": 0.48454025387763977, "learning_rate": 6.773729355575651e-06, "loss": 0.4106, "step": 2384 }, { "epoch": 1.3391353172375071, "grad_norm": 0.4879207909107208, "learning_rate": 6.770674409612559e-06, "loss": 0.4125, "step": 2385 }, { "epoch": 1.3396967995508142, "grad_norm": 0.5467033386230469, "learning_rate": 6.7676187076708285e-06, "loss": 0.3855, "step": 2386 }, { "epoch": 1.3402582818641213, "grad_norm": 0.521888256072998, "learning_rate": 6.764562251055076e-06, "loss": 0.4058, "step": 2387 }, { "epoch": 1.3408197641774284, "grad_norm": 0.5185530781745911, "learning_rate": 6.7615050410702355e-06, "loss": 0.4055, "step": 2388 }, { "epoch": 1.3413812464907355, "grad_norm": 0.5451387166976929, "learning_rate": 6.758447079021565e-06, "loss": 0.4101, "step": 2389 }, { "epoch": 1.3419427288040426, "grad_norm": 0.5141934156417847, "learning_rate": 6.7553883662146425e-06, "loss": 0.4123, "step": 2390 }, { "epoch": 1.3425042111173497, "grad_norm": 0.5906288027763367, "learning_rate": 6.752328903955368e-06, "loss": 0.4044, "step": 2391 }, { "epoch": 1.343065693430657, "grad_norm": 0.5337412357330322, "learning_rate": 6.749268693549955e-06, "loss": 0.4253, "step": 2392 }, { "epoch": 1.343627175743964, "grad_norm": 0.4686107635498047, "learning_rate": 6.746207736304948e-06, "loss": 0.4245, "step": 2393 }, { "epoch": 1.3441886580572713, "grad_norm": 0.5017719864845276, "learning_rate": 6.743146033527201e-06, "loss": 0.4338, "step": 2394 }, { "epoch": 1.3447501403705784, "grad_norm": 0.45971277356147766, "learning_rate": 6.740083586523891e-06, "loss": 0.3885, "step": 2395 }, { "epoch": 1.3453116226838855, "grad_norm": 0.48577770590782166, "learning_rate": 6.73702039660251e-06, "loss": 0.4327, "step": 2396 }, { "epoch": 1.3458731049971926, "grad_norm": 0.4243365526199341, "learning_rate": 6.7339564650708655e-06, "loss": 0.4059, "step": 2397 }, { "epoch": 1.3464345873104997, "grad_norm": 0.4439323842525482, "learning_rate": 6.730891793237088e-06, "loss": 0.3828, "step": 2398 }, { "epoch": 1.3469960696238068, "grad_norm": 0.5373735427856445, "learning_rate": 6.7278263824096204e-06, "loss": 0.4153, "step": 2399 }, { "epoch": 1.347557551937114, "grad_norm": 0.423995703458786, "learning_rate": 6.724760233897221e-06, "loss": 0.4049, "step": 2400 }, { "epoch": 1.3481190342504212, "grad_norm": 0.4947342276573181, "learning_rate": 6.721693349008961e-06, "loss": 0.4076, "step": 2401 }, { "epoch": 1.3486805165637281, "grad_norm": 0.5393979549407959, "learning_rate": 6.718625729054229e-06, "loss": 0.4334, "step": 2402 }, { "epoch": 1.3492419988770354, "grad_norm": 0.5007150173187256, "learning_rate": 6.715557375342728e-06, "loss": 0.4101, "step": 2403 }, { "epoch": 1.3498034811903425, "grad_norm": 0.4898264408111572, "learning_rate": 6.712488289184473e-06, "loss": 0.4158, "step": 2404 }, { "epoch": 1.3503649635036497, "grad_norm": 0.5217766165733337, "learning_rate": 6.709418471889791e-06, "loss": 0.4128, "step": 2405 }, { "epoch": 1.3509264458169568, "grad_norm": 0.5358155369758606, "learning_rate": 6.70634792476932e-06, "loss": 0.4396, "step": 2406 }, { "epoch": 1.3514879281302639, "grad_norm": 0.5844485759735107, "learning_rate": 6.703276649134013e-06, "loss": 0.4334, "step": 2407 }, { "epoch": 1.352049410443571, "grad_norm": 0.4522865116596222, "learning_rate": 6.700204646295132e-06, "loss": 0.443, "step": 2408 }, { "epoch": 1.352610892756878, "grad_norm": 0.5129997730255127, "learning_rate": 6.6971319175642504e-06, "loss": 0.4156, "step": 2409 }, { "epoch": 1.3531723750701854, "grad_norm": 0.4676356613636017, "learning_rate": 6.694058464253251e-06, "loss": 0.3779, "step": 2410 }, { "epoch": 1.3537338573834925, "grad_norm": 0.45738160610198975, "learning_rate": 6.6909842876743266e-06, "loss": 0.4003, "step": 2411 }, { "epoch": 1.3542953396967996, "grad_norm": 0.5066654086112976, "learning_rate": 6.687909389139977e-06, "loss": 0.4058, "step": 2412 }, { "epoch": 1.3548568220101067, "grad_norm": 0.47321197390556335, "learning_rate": 6.6848337699630095e-06, "loss": 0.4004, "step": 2413 }, { "epoch": 1.3554183043234138, "grad_norm": 0.4661482870578766, "learning_rate": 6.681757431456544e-06, "loss": 0.4096, "step": 2414 }, { "epoch": 1.355979786636721, "grad_norm": 0.48615676164627075, "learning_rate": 6.678680374934005e-06, "loss": 0.3966, "step": 2415 }, { "epoch": 1.356541268950028, "grad_norm": 0.5596820712089539, "learning_rate": 6.67560260170912e-06, "loss": 0.4309, "step": 2416 }, { "epoch": 1.3571027512633351, "grad_norm": 0.5101399421691895, "learning_rate": 6.672524113095927e-06, "loss": 0.4234, "step": 2417 }, { "epoch": 1.3576642335766422, "grad_norm": 0.4518723785877228, "learning_rate": 6.669444910408768e-06, "loss": 0.3923, "step": 2418 }, { "epoch": 1.3582257158899496, "grad_norm": 0.5397061109542847, "learning_rate": 6.66636499496229e-06, "loss": 0.4307, "step": 2419 }, { "epoch": 1.3587871982032567, "grad_norm": 0.5172988772392273, "learning_rate": 6.663284368071444e-06, "loss": 0.3939, "step": 2420 }, { "epoch": 1.3593486805165638, "grad_norm": 0.5412387251853943, "learning_rate": 6.660203031051484e-06, "loss": 0.3873, "step": 2421 }, { "epoch": 1.3599101628298709, "grad_norm": 0.44654330611228943, "learning_rate": 6.65712098521797e-06, "loss": 0.3968, "step": 2422 }, { "epoch": 1.360471645143178, "grad_norm": 0.5227207541465759, "learning_rate": 6.654038231886759e-06, "loss": 0.3937, "step": 2423 }, { "epoch": 1.361033127456485, "grad_norm": 0.6204281449317932, "learning_rate": 6.650954772374019e-06, "loss": 0.4002, "step": 2424 }, { "epoch": 1.3615946097697922, "grad_norm": 0.5136566758155823, "learning_rate": 6.64787060799621e-06, "loss": 0.4302, "step": 2425 }, { "epoch": 1.3621560920830995, "grad_norm": 0.4691891372203827, "learning_rate": 6.644785740070098e-06, "loss": 0.4096, "step": 2426 }, { "epoch": 1.3627175743964064, "grad_norm": 0.6360399127006531, "learning_rate": 6.6417001699127495e-06, "loss": 0.439, "step": 2427 }, { "epoch": 1.3632790567097137, "grad_norm": 0.5790189504623413, "learning_rate": 6.638613898841529e-06, "loss": 0.439, "step": 2428 }, { "epoch": 1.3638405390230208, "grad_norm": 0.5123319625854492, "learning_rate": 6.6355269281741005e-06, "loss": 0.4186, "step": 2429 }, { "epoch": 1.364402021336328, "grad_norm": 0.47878482937812805, "learning_rate": 6.63243925922843e-06, "loss": 0.4356, "step": 2430 }, { "epoch": 1.364963503649635, "grad_norm": 0.4484623074531555, "learning_rate": 6.629350893322778e-06, "loss": 0.377, "step": 2431 }, { "epoch": 1.3655249859629421, "grad_norm": 0.49843910336494446, "learning_rate": 6.6262618317757015e-06, "loss": 0.4086, "step": 2432 }, { "epoch": 1.3660864682762492, "grad_norm": 0.5211622714996338, "learning_rate": 6.623172075906057e-06, "loss": 0.4208, "step": 2433 }, { "epoch": 1.3666479505895563, "grad_norm": 0.44202306866645813, "learning_rate": 6.620081627032998e-06, "loss": 0.3992, "step": 2434 }, { "epoch": 1.3672094329028637, "grad_norm": 0.43675631284713745, "learning_rate": 6.616990486475972e-06, "loss": 0.3885, "step": 2435 }, { "epoch": 1.3677709152161706, "grad_norm": 0.5505504012107849, "learning_rate": 6.613898655554725e-06, "loss": 0.4017, "step": 2436 }, { "epoch": 1.3683323975294779, "grad_norm": 0.5169909000396729, "learning_rate": 6.610806135589292e-06, "loss": 0.3756, "step": 2437 }, { "epoch": 1.368893879842785, "grad_norm": 0.5119092464447021, "learning_rate": 6.607712927900005e-06, "loss": 0.4278, "step": 2438 }, { "epoch": 1.369455362156092, "grad_norm": 0.44342586398124695, "learning_rate": 6.6046190338074945e-06, "loss": 0.4085, "step": 2439 }, { "epoch": 1.3700168444693992, "grad_norm": 0.5567947030067444, "learning_rate": 6.601524454632677e-06, "loss": 0.3912, "step": 2440 }, { "epoch": 1.3705783267827063, "grad_norm": 0.5975965261459351, "learning_rate": 6.598429191696763e-06, "loss": 0.3967, "step": 2441 }, { "epoch": 1.3711398090960134, "grad_norm": 0.486075758934021, "learning_rate": 6.595333246321259e-06, "loss": 0.4116, "step": 2442 }, { "epoch": 1.3717012914093205, "grad_norm": 0.5400604009628296, "learning_rate": 6.592236619827958e-06, "loss": 0.4384, "step": 2443 }, { "epoch": 1.3722627737226278, "grad_norm": 0.46090590953826904, "learning_rate": 6.589139313538946e-06, "loss": 0.4075, "step": 2444 }, { "epoch": 1.372824256035935, "grad_norm": 0.5076034665107727, "learning_rate": 6.5860413287766e-06, "loss": 0.3926, "step": 2445 }, { "epoch": 1.373385738349242, "grad_norm": 0.5295721292495728, "learning_rate": 6.582942666863585e-06, "loss": 0.4018, "step": 2446 }, { "epoch": 1.3739472206625492, "grad_norm": 0.44913050532341003, "learning_rate": 6.579843329122856e-06, "loss": 0.3975, "step": 2447 }, { "epoch": 1.3745087029758563, "grad_norm": 0.47400012612342834, "learning_rate": 6.576743316877656e-06, "loss": 0.4221, "step": 2448 }, { "epoch": 1.3750701852891634, "grad_norm": 0.4545968174934387, "learning_rate": 6.573642631451515e-06, "loss": 0.3935, "step": 2449 }, { "epoch": 1.3756316676024705, "grad_norm": 0.5204000473022461, "learning_rate": 6.570541274168255e-06, "loss": 0.4338, "step": 2450 }, { "epoch": 1.3761931499157778, "grad_norm": 0.4803692102432251, "learning_rate": 6.567439246351979e-06, "loss": 0.4104, "step": 2451 }, { "epoch": 1.3767546322290847, "grad_norm": 0.4477470815181732, "learning_rate": 6.56433654932708e-06, "loss": 0.3813, "step": 2452 }, { "epoch": 1.377316114542392, "grad_norm": 0.5240747928619385, "learning_rate": 6.561233184418235e-06, "loss": 0.3999, "step": 2453 }, { "epoch": 1.377877596855699, "grad_norm": 0.44326770305633545, "learning_rate": 6.558129152950406e-06, "loss": 0.4159, "step": 2454 }, { "epoch": 1.3784390791690062, "grad_norm": 0.4282318353652954, "learning_rate": 6.555024456248843e-06, "loss": 0.4158, "step": 2455 }, { "epoch": 1.3790005614823133, "grad_norm": 0.4659128785133362, "learning_rate": 6.551919095639075e-06, "loss": 0.414, "step": 2456 }, { "epoch": 1.3795620437956204, "grad_norm": 0.4493613839149475, "learning_rate": 6.5488130724469165e-06, "loss": 0.3874, "step": 2457 }, { "epoch": 1.3801235261089275, "grad_norm": 0.48977822065353394, "learning_rate": 6.5457063879984675e-06, "loss": 0.4077, "step": 2458 }, { "epoch": 1.3806850084222346, "grad_norm": 0.504636824131012, "learning_rate": 6.542599043620103e-06, "loss": 0.3988, "step": 2459 }, { "epoch": 1.381246490735542, "grad_norm": 0.45984435081481934, "learning_rate": 6.539491040638493e-06, "loss": 0.4253, "step": 2460 }, { "epoch": 1.3818079730488488, "grad_norm": 0.4424978196620941, "learning_rate": 6.536382380380573e-06, "loss": 0.4351, "step": 2461 }, { "epoch": 1.3823694553621562, "grad_norm": 0.44156181812286377, "learning_rate": 6.533273064173568e-06, "loss": 0.4089, "step": 2462 }, { "epoch": 1.3829309376754633, "grad_norm": 0.4563598036766052, "learning_rate": 6.530163093344986e-06, "loss": 0.4161, "step": 2463 }, { "epoch": 1.3834924199887704, "grad_norm": 0.40550291538238525, "learning_rate": 6.527052469222602e-06, "loss": 0.3988, "step": 2464 }, { "epoch": 1.3840539023020775, "grad_norm": 0.4330405294895172, "learning_rate": 6.523941193134486e-06, "loss": 0.3666, "step": 2465 }, { "epoch": 1.3846153846153846, "grad_norm": 0.43351271748542786, "learning_rate": 6.520829266408974e-06, "loss": 0.4288, "step": 2466 }, { "epoch": 1.3851768669286917, "grad_norm": 0.48039746284484863, "learning_rate": 6.517716690374687e-06, "loss": 0.4216, "step": 2467 }, { "epoch": 1.3857383492419988, "grad_norm": 0.4818114638328552, "learning_rate": 6.5146034663605175e-06, "loss": 0.4418, "step": 2468 }, { "epoch": 1.3862998315553061, "grad_norm": 0.4515441060066223, "learning_rate": 6.511489595695637e-06, "loss": 0.4227, "step": 2469 }, { "epoch": 1.3868613138686132, "grad_norm": 0.5270153284072876, "learning_rate": 6.508375079709497e-06, "loss": 0.3896, "step": 2470 }, { "epoch": 1.3874227961819203, "grad_norm": 0.4738202393054962, "learning_rate": 6.505259919731819e-06, "loss": 0.4148, "step": 2471 }, { "epoch": 1.3879842784952274, "grad_norm": 0.46457260847091675, "learning_rate": 6.502144117092602e-06, "loss": 0.4197, "step": 2472 }, { "epoch": 1.3885457608085345, "grad_norm": 0.5096597671508789, "learning_rate": 6.499027673122118e-06, "loss": 0.4154, "step": 2473 }, { "epoch": 1.3891072431218416, "grad_norm": 0.3999055027961731, "learning_rate": 6.4959105891509134e-06, "loss": 0.3912, "step": 2474 }, { "epoch": 1.3896687254351487, "grad_norm": 0.4494934678077698, "learning_rate": 6.4927928665098106e-06, "loss": 0.3901, "step": 2475 }, { "epoch": 1.3902302077484558, "grad_norm": 0.4591536819934845, "learning_rate": 6.489674506529902e-06, "loss": 0.3748, "step": 2476 }, { "epoch": 1.390791690061763, "grad_norm": 0.5042945146560669, "learning_rate": 6.48655551054255e-06, "loss": 0.4296, "step": 2477 }, { "epoch": 1.3913531723750703, "grad_norm": 0.49176403880119324, "learning_rate": 6.483435879879392e-06, "loss": 0.4, "step": 2478 }, { "epoch": 1.3919146546883774, "grad_norm": 0.523596465587616, "learning_rate": 6.480315615872335e-06, "loss": 0.4086, "step": 2479 }, { "epoch": 1.3924761370016845, "grad_norm": 0.45499861240386963, "learning_rate": 6.477194719853556e-06, "loss": 0.3976, "step": 2480 }, { "epoch": 1.3930376193149916, "grad_norm": 0.4863879382610321, "learning_rate": 6.474073193155507e-06, "loss": 0.4003, "step": 2481 }, { "epoch": 1.3935991016282987, "grad_norm": 0.4563393294811249, "learning_rate": 6.4709510371109e-06, "loss": 0.3881, "step": 2482 }, { "epoch": 1.3941605839416058, "grad_norm": 0.4936552941799164, "learning_rate": 6.467828253052721e-06, "loss": 0.4405, "step": 2483 }, { "epoch": 1.394722066254913, "grad_norm": 0.43933409452438354, "learning_rate": 6.464704842314225e-06, "loss": 0.3895, "step": 2484 }, { "epoch": 1.3952835485682202, "grad_norm": 0.49271029233932495, "learning_rate": 6.461580806228933e-06, "loss": 0.4219, "step": 2485 }, { "epoch": 1.3958450308815271, "grad_norm": 0.4861069321632385, "learning_rate": 6.458456146130634e-06, "loss": 0.4372, "step": 2486 }, { "epoch": 1.3964065131948344, "grad_norm": 0.4685143828392029, "learning_rate": 6.455330863353379e-06, "loss": 0.362, "step": 2487 }, { "epoch": 1.3969679955081415, "grad_norm": 0.4401159882545471, "learning_rate": 6.452204959231493e-06, "loss": 0.4053, "step": 2488 }, { "epoch": 1.3975294778214487, "grad_norm": 0.46158865094184875, "learning_rate": 6.449078435099558e-06, "loss": 0.4087, "step": 2489 }, { "epoch": 1.3980909601347558, "grad_norm": 0.48446211218833923, "learning_rate": 6.445951292292426e-06, "loss": 0.3966, "step": 2490 }, { "epoch": 1.3986524424480629, "grad_norm": 0.44426655769348145, "learning_rate": 6.442823532145212e-06, "loss": 0.4332, "step": 2491 }, { "epoch": 1.39921392476137, "grad_norm": 0.45490148663520813, "learning_rate": 6.439695155993293e-06, "loss": 0.3866, "step": 2492 }, { "epoch": 1.399775407074677, "grad_norm": 0.4434986412525177, "learning_rate": 6.436566165172311e-06, "loss": 0.405, "step": 2493 }, { "epoch": 1.4003368893879844, "grad_norm": 0.4817710220813751, "learning_rate": 6.433436561018169e-06, "loss": 0.4174, "step": 2494 }, { "epoch": 1.4008983717012913, "grad_norm": 0.5321172475814819, "learning_rate": 6.430306344867033e-06, "loss": 0.41, "step": 2495 }, { "epoch": 1.4014598540145986, "grad_norm": 0.4156673848628998, "learning_rate": 6.427175518055329e-06, "loss": 0.3962, "step": 2496 }, { "epoch": 1.4020213363279057, "grad_norm": 0.4827881157398224, "learning_rate": 6.424044081919745e-06, "loss": 0.4255, "step": 2497 }, { "epoch": 1.4025828186412128, "grad_norm": 0.4610586166381836, "learning_rate": 6.4209120377972276e-06, "loss": 0.3971, "step": 2498 }, { "epoch": 1.40314430095452, "grad_norm": 0.5154926180839539, "learning_rate": 6.417779387024985e-06, "loss": 0.4324, "step": 2499 }, { "epoch": 1.403705783267827, "grad_norm": 0.46711093187332153, "learning_rate": 6.4146461309404825e-06, "loss": 0.4121, "step": 2500 }, { "epoch": 1.4042672655811341, "grad_norm": 0.4468764662742615, "learning_rate": 6.411512270881445e-06, "loss": 0.3966, "step": 2501 }, { "epoch": 1.4048287478944412, "grad_norm": 0.5048316121101379, "learning_rate": 6.408377808185856e-06, "loss": 0.4054, "step": 2502 }, { "epoch": 1.4053902302077486, "grad_norm": 0.5671415328979492, "learning_rate": 6.405242744191954e-06, "loss": 0.394, "step": 2503 }, { "epoch": 1.4059517125210557, "grad_norm": 0.5056939125061035, "learning_rate": 6.402107080238236e-06, "loss": 0.4096, "step": 2504 }, { "epoch": 1.4065131948343628, "grad_norm": 0.48976603150367737, "learning_rate": 6.398970817663453e-06, "loss": 0.4244, "step": 2505 }, { "epoch": 1.4070746771476699, "grad_norm": 0.5154252052307129, "learning_rate": 6.395833957806614e-06, "loss": 0.3723, "step": 2506 }, { "epoch": 1.407636159460977, "grad_norm": 0.5458301305770874, "learning_rate": 6.392696502006984e-06, "loss": 0.4324, "step": 2507 }, { "epoch": 1.408197641774284, "grad_norm": 0.5327669382095337, "learning_rate": 6.38955845160408e-06, "loss": 0.4238, "step": 2508 }, { "epoch": 1.4087591240875912, "grad_norm": 0.5153898596763611, "learning_rate": 6.386419807937671e-06, "loss": 0.4263, "step": 2509 }, { "epoch": 1.4093206064008983, "grad_norm": 0.5682222247123718, "learning_rate": 6.3832805723477856e-06, "loss": 0.4418, "step": 2510 }, { "epoch": 1.4098820887142054, "grad_norm": 0.5253010392189026, "learning_rate": 6.380140746174697e-06, "loss": 0.4035, "step": 2511 }, { "epoch": 1.4104435710275127, "grad_norm": 0.501616358757019, "learning_rate": 6.3770003307589375e-06, "loss": 0.4161, "step": 2512 }, { "epoch": 1.4110050533408198, "grad_norm": 0.5079493522644043, "learning_rate": 6.373859327441288e-06, "loss": 0.4246, "step": 2513 }, { "epoch": 1.411566535654127, "grad_norm": 0.6233018040657043, "learning_rate": 6.370717737562781e-06, "loss": 0.4185, "step": 2514 }, { "epoch": 1.412128017967434, "grad_norm": 0.48761850595474243, "learning_rate": 6.367575562464698e-06, "loss": 0.4233, "step": 2515 }, { "epoch": 1.4126895002807411, "grad_norm": 0.5050312280654907, "learning_rate": 6.364432803488571e-06, "loss": 0.3995, "step": 2516 }, { "epoch": 1.4132509825940482, "grad_norm": 0.5511537194252014, "learning_rate": 6.361289461976185e-06, "loss": 0.429, "step": 2517 }, { "epoch": 1.4138124649073553, "grad_norm": 0.6259415149688721, "learning_rate": 6.358145539269568e-06, "loss": 0.4132, "step": 2518 }, { "epoch": 1.4143739472206627, "grad_norm": 0.49633052945137024, "learning_rate": 6.355001036710998e-06, "loss": 0.391, "step": 2519 }, { "epoch": 1.4149354295339696, "grad_norm": 0.5090000629425049, "learning_rate": 6.351855955643004e-06, "loss": 0.4071, "step": 2520 }, { "epoch": 1.4154969118472769, "grad_norm": 0.5932656526565552, "learning_rate": 6.348710297408355e-06, "loss": 0.3997, "step": 2521 }, { "epoch": 1.416058394160584, "grad_norm": 0.5211352705955505, "learning_rate": 6.345564063350074e-06, "loss": 0.4124, "step": 2522 }, { "epoch": 1.416619876473891, "grad_norm": 0.4861297905445099, "learning_rate": 6.342417254811425e-06, "loss": 0.4039, "step": 2523 }, { "epoch": 1.4171813587871982, "grad_norm": 0.5383680462837219, "learning_rate": 6.339269873135921e-06, "loss": 0.3921, "step": 2524 }, { "epoch": 1.4177428411005053, "grad_norm": 0.518095850944519, "learning_rate": 6.336121919667313e-06, "loss": 0.3814, "step": 2525 }, { "epoch": 1.4183043234138124, "grad_norm": 0.5070397257804871, "learning_rate": 6.332973395749603e-06, "loss": 0.4285, "step": 2526 }, { "epoch": 1.4188658057271195, "grad_norm": 0.5894259810447693, "learning_rate": 6.329824302727036e-06, "loss": 0.4081, "step": 2527 }, { "epoch": 1.4194272880404268, "grad_norm": 0.52408367395401, "learning_rate": 6.326674641944095e-06, "loss": 0.4255, "step": 2528 }, { "epoch": 1.4199887703537337, "grad_norm": 0.4775222837924957, "learning_rate": 6.32352441474551e-06, "loss": 0.3821, "step": 2529 }, { "epoch": 1.420550252667041, "grad_norm": 0.5389823317527771, "learning_rate": 6.32037362247625e-06, "loss": 0.3876, "step": 2530 }, { "epoch": 1.4211117349803482, "grad_norm": 0.6135439872741699, "learning_rate": 6.317222266481528e-06, "loss": 0.4111, "step": 2531 }, { "epoch": 1.4216732172936553, "grad_norm": 0.6134797930717468, "learning_rate": 6.3140703481067954e-06, "loss": 0.4445, "step": 2532 }, { "epoch": 1.4222346996069624, "grad_norm": 0.5451945662498474, "learning_rate": 6.310917868697746e-06, "loss": 0.4093, "step": 2533 }, { "epoch": 1.4227961819202695, "grad_norm": 0.5537818074226379, "learning_rate": 6.307764829600309e-06, "loss": 0.4164, "step": 2534 }, { "epoch": 1.4233576642335766, "grad_norm": 0.6562954783439636, "learning_rate": 6.304611232160658e-06, "loss": 0.4128, "step": 2535 }, { "epoch": 1.4239191465468837, "grad_norm": 0.5243217349052429, "learning_rate": 6.3014570777252e-06, "loss": 0.4121, "step": 2536 }, { "epoch": 1.424480628860191, "grad_norm": 0.5213106274604797, "learning_rate": 6.298302367640581e-06, "loss": 0.4041, "step": 2537 }, { "epoch": 1.425042111173498, "grad_norm": 0.6501502990722656, "learning_rate": 6.295147103253689e-06, "loss": 0.4353, "step": 2538 }, { "epoch": 1.4256035934868052, "grad_norm": 0.5742585062980652, "learning_rate": 6.291991285911643e-06, "loss": 0.4423, "step": 2539 }, { "epoch": 1.4261650758001123, "grad_norm": 0.49370378255844116, "learning_rate": 6.2888349169618e-06, "loss": 0.425, "step": 2540 }, { "epoch": 1.4267265581134194, "grad_norm": 0.47029197216033936, "learning_rate": 6.285677997751751e-06, "loss": 0.3878, "step": 2541 }, { "epoch": 1.4272880404267265, "grad_norm": 0.6205877661705017, "learning_rate": 6.282520529629326e-06, "loss": 0.4278, "step": 2542 }, { "epoch": 1.4278495227400336, "grad_norm": 0.6817514300346375, "learning_rate": 6.279362513942584e-06, "loss": 0.4013, "step": 2543 }, { "epoch": 1.428411005053341, "grad_norm": 0.4882749319076538, "learning_rate": 6.276203952039823e-06, "loss": 0.4185, "step": 2544 }, { "epoch": 1.4289724873666478, "grad_norm": 0.5179117918014526, "learning_rate": 6.2730448452695715e-06, "loss": 0.439, "step": 2545 }, { "epoch": 1.4295339696799552, "grad_norm": 0.5694510340690613, "learning_rate": 6.269885194980589e-06, "loss": 0.401, "step": 2546 }, { "epoch": 1.4300954519932623, "grad_norm": 0.5455119013786316, "learning_rate": 6.266725002521869e-06, "loss": 0.425, "step": 2547 }, { "epoch": 1.4306569343065694, "grad_norm": 0.5096943974494934, "learning_rate": 6.2635642692426406e-06, "loss": 0.381, "step": 2548 }, { "epoch": 1.4312184166198765, "grad_norm": 0.5067031383514404, "learning_rate": 6.260402996492354e-06, "loss": 0.3785, "step": 2549 }, { "epoch": 1.4317798989331836, "grad_norm": 0.5150525569915771, "learning_rate": 6.257241185620698e-06, "loss": 0.4047, "step": 2550 }, { "epoch": 1.4323413812464907, "grad_norm": 0.5062898397445679, "learning_rate": 6.254078837977589e-06, "loss": 0.4303, "step": 2551 }, { "epoch": 1.4329028635597978, "grad_norm": 0.5258253216743469, "learning_rate": 6.250915954913169e-06, "loss": 0.4241, "step": 2552 }, { "epoch": 1.4334643458731051, "grad_norm": 0.5098314881324768, "learning_rate": 6.247752537777816e-06, "loss": 0.4444, "step": 2553 }, { "epoch": 1.434025828186412, "grad_norm": 0.43601736426353455, "learning_rate": 6.244588587922129e-06, "loss": 0.4165, "step": 2554 }, { "epoch": 1.4345873104997193, "grad_norm": 0.5474600791931152, "learning_rate": 6.241424106696938e-06, "loss": 0.4576, "step": 2555 }, { "epoch": 1.4351487928130264, "grad_norm": 0.4979509115219116, "learning_rate": 6.238259095453299e-06, "loss": 0.4348, "step": 2556 }, { "epoch": 1.4357102751263335, "grad_norm": 0.458364874124527, "learning_rate": 6.235093555542491e-06, "loss": 0.4076, "step": 2557 }, { "epoch": 1.4362717574396406, "grad_norm": 0.5219268798828125, "learning_rate": 6.2319274883160265e-06, "loss": 0.4292, "step": 2558 }, { "epoch": 1.4368332397529477, "grad_norm": 0.4617058038711548, "learning_rate": 6.228760895125636e-06, "loss": 0.4048, "step": 2559 }, { "epoch": 1.4373947220662548, "grad_norm": 0.49331289529800415, "learning_rate": 6.22559377732328e-06, "loss": 0.4013, "step": 2560 }, { "epoch": 1.437956204379562, "grad_norm": 0.49030694365501404, "learning_rate": 6.222426136261136e-06, "loss": 0.4182, "step": 2561 }, { "epoch": 1.4385176866928693, "grad_norm": 0.47536104917526245, "learning_rate": 6.219257973291611e-06, "loss": 0.3806, "step": 2562 }, { "epoch": 1.4390791690061764, "grad_norm": 0.4411625862121582, "learning_rate": 6.216089289767334e-06, "loss": 0.4266, "step": 2563 }, { "epoch": 1.4396406513194835, "grad_norm": 0.4477655589580536, "learning_rate": 6.212920087041155e-06, "loss": 0.3885, "step": 2564 }, { "epoch": 1.4402021336327906, "grad_norm": 0.47718074917793274, "learning_rate": 6.2097503664661425e-06, "loss": 0.4243, "step": 2565 }, { "epoch": 1.4407636159460977, "grad_norm": 0.41429123282432556, "learning_rate": 6.2065801293955926e-06, "loss": 0.4057, "step": 2566 }, { "epoch": 1.4413250982594048, "grad_norm": 0.46331480145454407, "learning_rate": 6.203409377183017e-06, "loss": 0.4055, "step": 2567 }, { "epoch": 1.441886580572712, "grad_norm": 0.4730745553970337, "learning_rate": 6.200238111182152e-06, "loss": 0.4068, "step": 2568 }, { "epoch": 1.442448062886019, "grad_norm": 0.45934417843818665, "learning_rate": 6.197066332746946e-06, "loss": 0.4229, "step": 2569 }, { "epoch": 1.4430095451993261, "grad_norm": 0.4481833279132843, "learning_rate": 6.193894043231574e-06, "loss": 0.4107, "step": 2570 }, { "epoch": 1.4435710275126334, "grad_norm": 0.44085678458213806, "learning_rate": 6.190721243990423e-06, "loss": 0.41, "step": 2571 }, { "epoch": 1.4441325098259405, "grad_norm": 0.47856926918029785, "learning_rate": 6.187547936378101e-06, "loss": 0.4382, "step": 2572 }, { "epoch": 1.4446939921392477, "grad_norm": 0.5142543315887451, "learning_rate": 6.184374121749434e-06, "loss": 0.3914, "step": 2573 }, { "epoch": 1.4452554744525548, "grad_norm": 0.4981826841831207, "learning_rate": 6.181199801459461e-06, "loss": 0.4338, "step": 2574 }, { "epoch": 1.4458169567658619, "grad_norm": 0.4764862358570099, "learning_rate": 6.178024976863439e-06, "loss": 0.4189, "step": 2575 }, { "epoch": 1.446378439079169, "grad_norm": 0.5844174027442932, "learning_rate": 6.174849649316841e-06, "loss": 0.3964, "step": 2576 }, { "epoch": 1.446939921392476, "grad_norm": 0.5116150975227356, "learning_rate": 6.171673820175352e-06, "loss": 0.4114, "step": 2577 }, { "epoch": 1.4475014037057834, "grad_norm": 0.46716657280921936, "learning_rate": 6.168497490794874e-06, "loss": 0.4061, "step": 2578 }, { "epoch": 1.4480628860190903, "grad_norm": 0.4792223274707794, "learning_rate": 6.165320662531521e-06, "loss": 0.4435, "step": 2579 }, { "epoch": 1.4486243683323976, "grad_norm": 0.5233532786369324, "learning_rate": 6.162143336741621e-06, "loss": 0.4221, "step": 2580 }, { "epoch": 1.4491858506457047, "grad_norm": 0.5693509578704834, "learning_rate": 6.158965514781714e-06, "loss": 0.4469, "step": 2581 }, { "epoch": 1.4497473329590118, "grad_norm": 0.5504201650619507, "learning_rate": 6.155787198008551e-06, "loss": 0.4412, "step": 2582 }, { "epoch": 1.450308815272319, "grad_norm": 0.4965769648551941, "learning_rate": 6.1526083877790935e-06, "loss": 0.4115, "step": 2583 }, { "epoch": 1.450870297585626, "grad_norm": 0.44934168457984924, "learning_rate": 6.149429085450519e-06, "loss": 0.4479, "step": 2584 }, { "epoch": 1.4514317798989331, "grad_norm": 0.4832610487937927, "learning_rate": 6.146249292380209e-06, "loss": 0.3897, "step": 2585 }, { "epoch": 1.4519932622122402, "grad_norm": 0.48569589853286743, "learning_rate": 6.143069009925756e-06, "loss": 0.4072, "step": 2586 }, { "epoch": 1.4525547445255476, "grad_norm": 0.4683602750301361, "learning_rate": 6.139888239444963e-06, "loss": 0.3971, "step": 2587 }, { "epoch": 1.4531162268388544, "grad_norm": 0.49349409341812134, "learning_rate": 6.13670698229584e-06, "loss": 0.4436, "step": 2588 }, { "epoch": 1.4536777091521618, "grad_norm": 0.47699305415153503, "learning_rate": 6.133525239836607e-06, "loss": 0.4103, "step": 2589 }, { "epoch": 1.4542391914654689, "grad_norm": 0.4753575325012207, "learning_rate": 6.130343013425688e-06, "loss": 0.4247, "step": 2590 }, { "epoch": 1.454800673778776, "grad_norm": 0.45627859234809875, "learning_rate": 6.127160304421717e-06, "loss": 0.422, "step": 2591 }, { "epoch": 1.455362156092083, "grad_norm": 0.49792903661727905, "learning_rate": 6.123977114183529e-06, "loss": 0.4028, "step": 2592 }, { "epoch": 1.4559236384053902, "grad_norm": 0.5456176400184631, "learning_rate": 6.12079344407017e-06, "loss": 0.4133, "step": 2593 }, { "epoch": 1.4564851207186973, "grad_norm": 0.48308849334716797, "learning_rate": 6.1176092954408875e-06, "loss": 0.421, "step": 2594 }, { "epoch": 1.4570466030320044, "grad_norm": 0.46064579486846924, "learning_rate": 6.114424669655136e-06, "loss": 0.4027, "step": 2595 }, { "epoch": 1.4576080853453117, "grad_norm": 0.5029228329658508, "learning_rate": 6.11123956807257e-06, "loss": 0.4044, "step": 2596 }, { "epoch": 1.4581695676586188, "grad_norm": 0.5297603607177734, "learning_rate": 6.1080539920530505e-06, "loss": 0.4006, "step": 2597 }, { "epoch": 1.458731049971926, "grad_norm": 0.4890080392360687, "learning_rate": 6.104867942956638e-06, "loss": 0.4297, "step": 2598 }, { "epoch": 1.459292532285233, "grad_norm": 0.46451836824417114, "learning_rate": 6.1016814221435985e-06, "loss": 0.3997, "step": 2599 }, { "epoch": 1.4598540145985401, "grad_norm": 0.49664559960365295, "learning_rate": 6.0984944309743975e-06, "loss": 0.4127, "step": 2600 }, { "epoch": 1.4604154969118472, "grad_norm": 0.49918389320373535, "learning_rate": 6.095306970809701e-06, "loss": 0.3899, "step": 2601 }, { "epoch": 1.4609769792251543, "grad_norm": 0.5096874833106995, "learning_rate": 6.092119043010377e-06, "loss": 0.4301, "step": 2602 }, { "epoch": 1.4615384615384617, "grad_norm": 0.4758189916610718, "learning_rate": 6.088930648937488e-06, "loss": 0.4229, "step": 2603 }, { "epoch": 1.4620999438517686, "grad_norm": 0.4680514335632324, "learning_rate": 6.085741789952304e-06, "loss": 0.3941, "step": 2604 }, { "epoch": 1.4626614261650759, "grad_norm": 0.4895431101322174, "learning_rate": 6.082552467416286e-06, "loss": 0.4116, "step": 2605 }, { "epoch": 1.463222908478383, "grad_norm": 0.5171424150466919, "learning_rate": 6.0793626826910964e-06, "loss": 0.4545, "step": 2606 }, { "epoch": 1.46378439079169, "grad_norm": 0.44223588705062866, "learning_rate": 6.076172437138595e-06, "loss": 0.4231, "step": 2607 }, { "epoch": 1.4643458731049972, "grad_norm": 0.43383660912513733, "learning_rate": 6.072981732120837e-06, "loss": 0.3959, "step": 2608 }, { "epoch": 1.4649073554183043, "grad_norm": 0.47773054242134094, "learning_rate": 6.069790569000074e-06, "loss": 0.422, "step": 2609 }, { "epoch": 1.4654688377316114, "grad_norm": 0.47213923931121826, "learning_rate": 6.066598949138757e-06, "loss": 0.4269, "step": 2610 }, { "epoch": 1.4660303200449185, "grad_norm": 0.49825653433799744, "learning_rate": 6.063406873899524e-06, "loss": 0.4164, "step": 2611 }, { "epoch": 1.4665918023582258, "grad_norm": 0.42086169123649597, "learning_rate": 6.060214344645214e-06, "loss": 0.3857, "step": 2612 }, { "epoch": 1.4671532846715327, "grad_norm": 0.4929948151111603, "learning_rate": 6.0570213627388575e-06, "loss": 0.4139, "step": 2613 }, { "epoch": 1.46771476698484, "grad_norm": 0.5134188532829285, "learning_rate": 6.053827929543679e-06, "loss": 0.4343, "step": 2614 }, { "epoch": 1.4682762492981472, "grad_norm": 0.4437432289123535, "learning_rate": 6.0506340464230965e-06, "loss": 0.3859, "step": 2615 }, { "epoch": 1.4688377316114543, "grad_norm": 0.4882189631462097, "learning_rate": 6.047439714740717e-06, "loss": 0.406, "step": 2616 }, { "epoch": 1.4693992139247614, "grad_norm": 0.472851037979126, "learning_rate": 6.044244935860341e-06, "loss": 0.3866, "step": 2617 }, { "epoch": 1.4699606962380685, "grad_norm": 0.5842772126197815, "learning_rate": 6.041049711145962e-06, "loss": 0.422, "step": 2618 }, { "epoch": 1.4705221785513756, "grad_norm": 0.4845850467681885, "learning_rate": 6.037854041961759e-06, "loss": 0.4089, "step": 2619 }, { "epoch": 1.4710836608646827, "grad_norm": 0.5116607546806335, "learning_rate": 6.0346579296721056e-06, "loss": 0.4134, "step": 2620 }, { "epoch": 1.47164514317799, "grad_norm": 0.43995535373687744, "learning_rate": 6.0314613756415605e-06, "loss": 0.4178, "step": 2621 }, { "epoch": 1.472206625491297, "grad_norm": 0.4697747528553009, "learning_rate": 6.028264381234875e-06, "loss": 0.4151, "step": 2622 }, { "epoch": 1.4727681078046042, "grad_norm": 0.4559278190135956, "learning_rate": 6.025066947816985e-06, "loss": 0.4368, "step": 2623 }, { "epoch": 1.4733295901179113, "grad_norm": 0.48013144731521606, "learning_rate": 6.021869076753014e-06, "loss": 0.4517, "step": 2624 }, { "epoch": 1.4738910724312184, "grad_norm": 0.5096118450164795, "learning_rate": 6.0186707694082765e-06, "loss": 0.4203, "step": 2625 }, { "epoch": 1.4744525547445255, "grad_norm": 0.45945602655410767, "learning_rate": 6.015472027148269e-06, "loss": 0.4191, "step": 2626 }, { "epoch": 1.4750140370578326, "grad_norm": 0.5178208947181702, "learning_rate": 6.012272851338675e-06, "loss": 0.441, "step": 2627 }, { "epoch": 1.4755755193711397, "grad_norm": 0.4781286418437958, "learning_rate": 6.0090732433453635e-06, "loss": 0.4192, "step": 2628 }, { "epoch": 1.4761370016844468, "grad_norm": 0.5056989789009094, "learning_rate": 6.005873204534385e-06, "loss": 0.4046, "step": 2629 }, { "epoch": 1.4766984839977542, "grad_norm": 0.5295119881629944, "learning_rate": 6.002672736271979e-06, "loss": 0.3988, "step": 2630 }, { "epoch": 1.4772599663110613, "grad_norm": 0.49670809507369995, "learning_rate": 5.999471839924568e-06, "loss": 0.3866, "step": 2631 }, { "epoch": 1.4778214486243684, "grad_norm": 0.4495619833469391, "learning_rate": 5.996270516858751e-06, "loss": 0.43, "step": 2632 }, { "epoch": 1.4783829309376755, "grad_norm": 0.49083709716796875, "learning_rate": 5.993068768441315e-06, "loss": 0.4183, "step": 2633 }, { "epoch": 1.4789444132509826, "grad_norm": 0.4391896426677704, "learning_rate": 5.989866596039226e-06, "loss": 0.4034, "step": 2634 }, { "epoch": 1.4795058955642897, "grad_norm": 0.47032156586647034, "learning_rate": 5.986664001019633e-06, "loss": 0.3901, "step": 2635 }, { "epoch": 1.4800673778775968, "grad_norm": 0.479828417301178, "learning_rate": 5.983460984749866e-06, "loss": 0.4313, "step": 2636 }, { "epoch": 1.4806288601909041, "grad_norm": 0.49140918254852295, "learning_rate": 5.980257548597431e-06, "loss": 0.4169, "step": 2637 }, { "epoch": 1.481190342504211, "grad_norm": 0.47926145792007446, "learning_rate": 5.977053693930016e-06, "loss": 0.3847, "step": 2638 }, { "epoch": 1.4817518248175183, "grad_norm": 0.526287853717804, "learning_rate": 5.973849422115486e-06, "loss": 0.394, "step": 2639 }, { "epoch": 1.4823133071308254, "grad_norm": 0.4800287187099457, "learning_rate": 5.970644734521889e-06, "loss": 0.406, "step": 2640 }, { "epoch": 1.4828747894441325, "grad_norm": 0.5072662830352783, "learning_rate": 5.9674396325174435e-06, "loss": 0.4392, "step": 2641 }, { "epoch": 1.4834362717574396, "grad_norm": 0.44998058676719666, "learning_rate": 5.964234117470548e-06, "loss": 0.4342, "step": 2642 }, { "epoch": 1.4839977540707467, "grad_norm": 0.5272728204727173, "learning_rate": 5.961028190749781e-06, "loss": 0.4216, "step": 2643 }, { "epoch": 1.4845592363840538, "grad_norm": 0.5088030695915222, "learning_rate": 5.95782185372389e-06, "loss": 0.4137, "step": 2644 }, { "epoch": 1.485120718697361, "grad_norm": 0.4407382607460022, "learning_rate": 5.954615107761802e-06, "loss": 0.427, "step": 2645 }, { "epoch": 1.4856822010106683, "grad_norm": 0.47395938634872437, "learning_rate": 5.9514079542326174e-06, "loss": 0.3959, "step": 2646 }, { "epoch": 1.4862436833239752, "grad_norm": 0.5348685383796692, "learning_rate": 5.948200394505612e-06, "loss": 0.4189, "step": 2647 }, { "epoch": 1.4868051656372825, "grad_norm": 0.5027997493743896, "learning_rate": 5.944992429950234e-06, "loss": 0.3915, "step": 2648 }, { "epoch": 1.4873666479505896, "grad_norm": 0.477328360080719, "learning_rate": 5.941784061936101e-06, "loss": 0.4292, "step": 2649 }, { "epoch": 1.4879281302638967, "grad_norm": 0.4825415015220642, "learning_rate": 5.93857529183301e-06, "loss": 0.4071, "step": 2650 }, { "epoch": 1.4884896125772038, "grad_norm": 0.6222770810127258, "learning_rate": 5.935366121010924e-06, "loss": 0.4056, "step": 2651 }, { "epoch": 1.489051094890511, "grad_norm": 0.5186034440994263, "learning_rate": 5.93215655083998e-06, "loss": 0.4211, "step": 2652 }, { "epoch": 1.489612577203818, "grad_norm": 0.5049124360084534, "learning_rate": 5.928946582690483e-06, "loss": 0.3942, "step": 2653 }, { "epoch": 1.4901740595171251, "grad_norm": 0.47175630927085876, "learning_rate": 5.925736217932909e-06, "loss": 0.4007, "step": 2654 }, { "epoch": 1.4907355418304324, "grad_norm": 0.5241733193397522, "learning_rate": 5.922525457937907e-06, "loss": 0.4192, "step": 2655 }, { "epoch": 1.4912970241437395, "grad_norm": 0.5339150428771973, "learning_rate": 5.919314304076288e-06, "loss": 0.3853, "step": 2656 }, { "epoch": 1.4918585064570467, "grad_norm": 0.5289257764816284, "learning_rate": 5.916102757719036e-06, "loss": 0.4368, "step": 2657 }, { "epoch": 1.4924199887703538, "grad_norm": 0.5263919234275818, "learning_rate": 5.9128908202372996e-06, "loss": 0.3739, "step": 2658 }, { "epoch": 1.4929814710836609, "grad_norm": 0.5130220055580139, "learning_rate": 5.909678493002396e-06, "loss": 0.43, "step": 2659 }, { "epoch": 1.493542953396968, "grad_norm": 0.5227622389793396, "learning_rate": 5.906465777385812e-06, "loss": 0.4145, "step": 2660 }, { "epoch": 1.494104435710275, "grad_norm": 0.43976661562919617, "learning_rate": 5.903252674759193e-06, "loss": 0.4095, "step": 2661 }, { "epoch": 1.4946659180235822, "grad_norm": 0.5314767360687256, "learning_rate": 5.900039186494356e-06, "loss": 0.3977, "step": 2662 }, { "epoch": 1.4952274003368893, "grad_norm": 0.5195698738098145, "learning_rate": 5.8968253139632804e-06, "loss": 0.4065, "step": 2663 }, { "epoch": 1.4957888826501966, "grad_norm": 0.463034987449646, "learning_rate": 5.893611058538108e-06, "loss": 0.4175, "step": 2664 }, { "epoch": 1.4963503649635037, "grad_norm": 0.473894864320755, "learning_rate": 5.890396421591144e-06, "loss": 0.4141, "step": 2665 }, { "epoch": 1.4969118472768108, "grad_norm": 0.5561984777450562, "learning_rate": 5.887181404494861e-06, "loss": 0.4257, "step": 2666 }, { "epoch": 1.497473329590118, "grad_norm": 0.4867593050003052, "learning_rate": 5.883966008621892e-06, "loss": 0.4101, "step": 2667 }, { "epoch": 1.498034811903425, "grad_norm": 0.4899985194206238, "learning_rate": 5.8807502353450275e-06, "loss": 0.4503, "step": 2668 }, { "epoch": 1.4985962942167321, "grad_norm": 0.5057079195976257, "learning_rate": 5.877534086037223e-06, "loss": 0.3938, "step": 2669 }, { "epoch": 1.4991577765300392, "grad_norm": 0.5048499703407288, "learning_rate": 5.874317562071593e-06, "loss": 0.4288, "step": 2670 }, { "epoch": 1.4997192588433466, "grad_norm": 0.5287804007530212, "learning_rate": 5.871100664821416e-06, "loss": 0.4232, "step": 2671 }, { "epoch": 1.5002807411566534, "grad_norm": 0.4869365394115448, "learning_rate": 5.867883395660124e-06, "loss": 0.4303, "step": 2672 }, { "epoch": 1.5008422234699608, "grad_norm": 0.49428948760032654, "learning_rate": 5.864665755961312e-06, "loss": 0.4055, "step": 2673 }, { "epoch": 1.5014037057832679, "grad_norm": 0.5351567268371582, "learning_rate": 5.861447747098731e-06, "loss": 0.4182, "step": 2674 }, { "epoch": 1.501965188096575, "grad_norm": 0.5468021035194397, "learning_rate": 5.858229370446288e-06, "loss": 0.4331, "step": 2675 }, { "epoch": 1.502526670409882, "grad_norm": 0.4860740005970001, "learning_rate": 5.855010627378052e-06, "loss": 0.4047, "step": 2676 }, { "epoch": 1.5030881527231892, "grad_norm": 0.4298393428325653, "learning_rate": 5.851791519268246e-06, "loss": 0.4028, "step": 2677 }, { "epoch": 1.5036496350364965, "grad_norm": 0.5536957383155823, "learning_rate": 5.848572047491246e-06, "loss": 0.4124, "step": 2678 }, { "epoch": 1.5042111173498034, "grad_norm": 0.49050024151802063, "learning_rate": 5.8453522134215905e-06, "loss": 0.4007, "step": 2679 }, { "epoch": 1.5047725996631107, "grad_norm": 0.44671642780303955, "learning_rate": 5.8421320184339615e-06, "loss": 0.4203, "step": 2680 }, { "epoch": 1.5053340819764176, "grad_norm": 0.49920564889907837, "learning_rate": 5.838911463903206e-06, "loss": 0.3984, "step": 2681 }, { "epoch": 1.505895564289725, "grad_norm": 0.5130435228347778, "learning_rate": 5.8356905512043205e-06, "loss": 0.4424, "step": 2682 }, { "epoch": 1.506457046603032, "grad_norm": 0.521113395690918, "learning_rate": 5.832469281712452e-06, "loss": 0.4458, "step": 2683 }, { "epoch": 1.5070185289163391, "grad_norm": 0.5454095005989075, "learning_rate": 5.829247656802902e-06, "loss": 0.4118, "step": 2684 }, { "epoch": 1.5075800112296462, "grad_norm": 0.45253902673721313, "learning_rate": 5.826025677851124e-06, "loss": 0.4228, "step": 2685 }, { "epoch": 1.5081414935429533, "grad_norm": 0.43319347500801086, "learning_rate": 5.822803346232723e-06, "loss": 0.4019, "step": 2686 }, { "epoch": 1.5087029758562607, "grad_norm": 0.5300816297531128, "learning_rate": 5.819580663323452e-06, "loss": 0.4018, "step": 2687 }, { "epoch": 1.5092644581695676, "grad_norm": 0.48219263553619385, "learning_rate": 5.816357630499219e-06, "loss": 0.4322, "step": 2688 }, { "epoch": 1.5098259404828749, "grad_norm": 0.4556834101676941, "learning_rate": 5.813134249136075e-06, "loss": 0.4327, "step": 2689 }, { "epoch": 1.5103874227961818, "grad_norm": 0.43715348839759827, "learning_rate": 5.809910520610224e-06, "loss": 0.3999, "step": 2690 }, { "epoch": 1.510948905109489, "grad_norm": 0.48745688796043396, "learning_rate": 5.8066864462980175e-06, "loss": 0.4133, "step": 2691 }, { "epoch": 1.5115103874227962, "grad_norm": 0.4538288712501526, "learning_rate": 5.8034620275759545e-06, "loss": 0.4043, "step": 2692 }, { "epoch": 1.5120718697361033, "grad_norm": 0.4486751854419708, "learning_rate": 5.800237265820682e-06, "loss": 0.3733, "step": 2693 }, { "epoch": 1.5126333520494104, "grad_norm": 0.4580402076244354, "learning_rate": 5.79701216240899e-06, "loss": 0.3895, "step": 2694 }, { "epoch": 1.5131948343627175, "grad_norm": 0.47053155303001404, "learning_rate": 5.793786718717816e-06, "loss": 0.4079, "step": 2695 }, { "epoch": 1.5137563166760248, "grad_norm": 0.4270652234554291, "learning_rate": 5.790560936124248e-06, "loss": 0.3955, "step": 2696 }, { "epoch": 1.5143177989893317, "grad_norm": 0.5126726031303406, "learning_rate": 5.787334816005511e-06, "loss": 0.4531, "step": 2697 }, { "epoch": 1.514879281302639, "grad_norm": 0.47525957226753235, "learning_rate": 5.784108359738977e-06, "loss": 0.4199, "step": 2698 }, { "epoch": 1.5154407636159462, "grad_norm": 0.4771634042263031, "learning_rate": 5.780881568702163e-06, "loss": 0.4077, "step": 2699 }, { "epoch": 1.5160022459292533, "grad_norm": 0.4769246280193329, "learning_rate": 5.777654444272727e-06, "loss": 0.3858, "step": 2700 }, { "epoch": 1.5165637282425604, "grad_norm": 0.5304968953132629, "learning_rate": 5.774426987828471e-06, "loss": 0.3943, "step": 2701 }, { "epoch": 1.5171252105558675, "grad_norm": 0.42300844192504883, "learning_rate": 5.7711992007473376e-06, "loss": 0.4029, "step": 2702 }, { "epoch": 1.5176866928691746, "grad_norm": 0.4159699082374573, "learning_rate": 5.7679710844074095e-06, "loss": 0.3817, "step": 2703 }, { "epoch": 1.5182481751824817, "grad_norm": 0.45403096079826355, "learning_rate": 5.764742640186914e-06, "loss": 0.4077, "step": 2704 }, { "epoch": 1.518809657495789, "grad_norm": 0.4810454845428467, "learning_rate": 5.761513869464211e-06, "loss": 0.4393, "step": 2705 }, { "epoch": 1.5193711398090959, "grad_norm": 0.43134114146232605, "learning_rate": 5.758284773617809e-06, "loss": 0.4087, "step": 2706 }, { "epoch": 1.5199326221224032, "grad_norm": 0.4534789025783539, "learning_rate": 5.7550553540263484e-06, "loss": 0.4096, "step": 2707 }, { "epoch": 1.5204941044357103, "grad_norm": 0.4480615258216858, "learning_rate": 5.751825612068613e-06, "loss": 0.4066, "step": 2708 }, { "epoch": 1.5210555867490174, "grad_norm": 0.4971943199634552, "learning_rate": 5.748595549123518e-06, "loss": 0.427, "step": 2709 }, { "epoch": 1.5216170690623245, "grad_norm": 0.48154327273368835, "learning_rate": 5.74536516657012e-06, "loss": 0.4124, "step": 2710 }, { "epoch": 1.5221785513756316, "grad_norm": 0.4475409686565399, "learning_rate": 5.742134465787612e-06, "loss": 0.3922, "step": 2711 }, { "epoch": 1.522740033688939, "grad_norm": 0.4766024053096771, "learning_rate": 5.738903448155322e-06, "loss": 0.4296, "step": 2712 }, { "epoch": 1.5233015160022458, "grad_norm": 0.5287604928016663, "learning_rate": 5.735672115052713e-06, "loss": 0.4207, "step": 2713 }, { "epoch": 1.5238629983155532, "grad_norm": 0.45321783423423767, "learning_rate": 5.732440467859383e-06, "loss": 0.3933, "step": 2714 }, { "epoch": 1.52442448062886, "grad_norm": 0.42413485050201416, "learning_rate": 5.729208507955063e-06, "loss": 0.4075, "step": 2715 }, { "epoch": 1.5249859629421674, "grad_norm": 0.45245495438575745, "learning_rate": 5.7259762367196185e-06, "loss": 0.427, "step": 2716 }, { "epoch": 1.5255474452554745, "grad_norm": 0.46632418036460876, "learning_rate": 5.72274365553305e-06, "loss": 0.4251, "step": 2717 }, { "epoch": 1.5261089275687816, "grad_norm": 0.48158490657806396, "learning_rate": 5.719510765775487e-06, "loss": 0.4135, "step": 2718 }, { "epoch": 1.5266704098820887, "grad_norm": 0.46908995509147644, "learning_rate": 5.7162775688271935e-06, "loss": 0.4159, "step": 2719 }, { "epoch": 1.5272318921953958, "grad_norm": 0.5107643604278564, "learning_rate": 5.71304406606856e-06, "loss": 0.4152, "step": 2720 }, { "epoch": 1.5277933745087031, "grad_norm": 0.44720458984375, "learning_rate": 5.709810258880113e-06, "loss": 0.4032, "step": 2721 }, { "epoch": 1.52835485682201, "grad_norm": 0.5090492367744446, "learning_rate": 5.7065761486425075e-06, "loss": 0.4188, "step": 2722 }, { "epoch": 1.5289163391353173, "grad_norm": 0.48685452342033386, "learning_rate": 5.7033417367365255e-06, "loss": 0.4258, "step": 2723 }, { "epoch": 1.5294778214486242, "grad_norm": 0.4590862989425659, "learning_rate": 5.70010702454308e-06, "loss": 0.4118, "step": 2724 }, { "epoch": 1.5300393037619315, "grad_norm": 0.47051262855529785, "learning_rate": 5.696872013443213e-06, "loss": 0.4163, "step": 2725 }, { "epoch": 1.5306007860752386, "grad_norm": 0.5416069030761719, "learning_rate": 5.693636704818088e-06, "loss": 0.4392, "step": 2726 }, { "epoch": 1.5311622683885457, "grad_norm": 0.5014942288398743, "learning_rate": 5.6904011000490076e-06, "loss": 0.4255, "step": 2727 }, { "epoch": 1.5317237507018528, "grad_norm": 0.4545884132385254, "learning_rate": 5.687165200517388e-06, "loss": 0.4226, "step": 2728 }, { "epoch": 1.53228523301516, "grad_norm": 0.47400912642478943, "learning_rate": 5.683929007604778e-06, "loss": 0.44, "step": 2729 }, { "epoch": 1.5328467153284673, "grad_norm": 0.5009153485298157, "learning_rate": 5.680692522692852e-06, "loss": 0.4245, "step": 2730 }, { "epoch": 1.5334081976417742, "grad_norm": 0.4527222812175751, "learning_rate": 5.677455747163405e-06, "loss": 0.395, "step": 2731 }, { "epoch": 1.5339696799550815, "grad_norm": 0.5582462549209595, "learning_rate": 5.67421868239836e-06, "loss": 0.4154, "step": 2732 }, { "epoch": 1.5345311622683886, "grad_norm": 0.5130447745323181, "learning_rate": 5.670981329779763e-06, "loss": 0.4323, "step": 2733 }, { "epoch": 1.5350926445816957, "grad_norm": 0.454712837934494, "learning_rate": 5.667743690689781e-06, "loss": 0.3988, "step": 2734 }, { "epoch": 1.5356541268950028, "grad_norm": 0.5275452733039856, "learning_rate": 5.664505766510704e-06, "loss": 0.4346, "step": 2735 }, { "epoch": 1.53621560920831, "grad_norm": 0.47667261958122253, "learning_rate": 5.661267558624942e-06, "loss": 0.3952, "step": 2736 }, { "epoch": 1.5367770915216172, "grad_norm": 0.5080882906913757, "learning_rate": 5.658029068415032e-06, "loss": 0.397, "step": 2737 }, { "epoch": 1.5373385738349241, "grad_norm": 0.47965583205223083, "learning_rate": 5.654790297263626e-06, "loss": 0.4003, "step": 2738 }, { "epoch": 1.5379000561482314, "grad_norm": 0.5060041546821594, "learning_rate": 5.651551246553497e-06, "loss": 0.4204, "step": 2739 }, { "epoch": 1.5384615384615383, "grad_norm": 0.5170806646347046, "learning_rate": 5.648311917667538e-06, "loss": 0.3968, "step": 2740 }, { "epoch": 1.5390230207748457, "grad_norm": 0.49806421995162964, "learning_rate": 5.6450723119887615e-06, "loss": 0.4068, "step": 2741 }, { "epoch": 1.5395845030881528, "grad_norm": 0.4538702070713043, "learning_rate": 5.641832430900297e-06, "loss": 0.4263, "step": 2742 }, { "epoch": 1.5401459854014599, "grad_norm": 0.48034462332725525, "learning_rate": 5.6385922757853915e-06, "loss": 0.4113, "step": 2743 }, { "epoch": 1.540707467714767, "grad_norm": 0.5430519580841064, "learning_rate": 5.635351848027411e-06, "loss": 0.4107, "step": 2744 }, { "epoch": 1.541268950028074, "grad_norm": 0.5084047913551331, "learning_rate": 5.632111149009834e-06, "loss": 0.4124, "step": 2745 }, { "epoch": 1.5418304323413814, "grad_norm": 0.4749096632003784, "learning_rate": 5.628870180116258e-06, "loss": 0.4268, "step": 2746 }, { "epoch": 1.5423919146546883, "grad_norm": 0.5261681079864502, "learning_rate": 5.625628942730397e-06, "loss": 0.4579, "step": 2747 }, { "epoch": 1.5429533969679956, "grad_norm": 0.480488121509552, "learning_rate": 5.622387438236076e-06, "loss": 0.3764, "step": 2748 }, { "epoch": 1.5435148792813025, "grad_norm": 0.5112077593803406, "learning_rate": 5.619145668017234e-06, "loss": 0.4084, "step": 2749 }, { "epoch": 1.5440763615946098, "grad_norm": 0.4784593880176544, "learning_rate": 5.615903633457926e-06, "loss": 0.4202, "step": 2750 }, { "epoch": 1.544637843907917, "grad_norm": 0.5422596335411072, "learning_rate": 5.612661335942321e-06, "loss": 0.4242, "step": 2751 }, { "epoch": 1.545199326221224, "grad_norm": 0.49345022439956665, "learning_rate": 5.609418776854695e-06, "loss": 0.4061, "step": 2752 }, { "epoch": 1.5457608085345311, "grad_norm": 0.5214236974716187, "learning_rate": 5.6061759575794406e-06, "loss": 0.4097, "step": 2753 }, { "epoch": 1.5463222908478382, "grad_norm": 0.4291330873966217, "learning_rate": 5.6029328795010595e-06, "loss": 0.4168, "step": 2754 }, { "epoch": 1.5468837731611456, "grad_norm": 0.5131253600120544, "learning_rate": 5.599689544004164e-06, "loss": 0.4295, "step": 2755 }, { "epoch": 1.5474452554744524, "grad_norm": 0.5008382797241211, "learning_rate": 5.596445952473477e-06, "loss": 0.4471, "step": 2756 }, { "epoch": 1.5480067377877598, "grad_norm": 0.4298574924468994, "learning_rate": 5.593202106293828e-06, "loss": 0.4117, "step": 2757 }, { "epoch": 1.5485682201010667, "grad_norm": 0.5366486310958862, "learning_rate": 5.58995800685016e-06, "loss": 0.428, "step": 2758 }, { "epoch": 1.549129702414374, "grad_norm": 0.4657226502895355, "learning_rate": 5.586713655527521e-06, "loss": 0.4175, "step": 2759 }, { "epoch": 1.549691184727681, "grad_norm": 0.5086181163787842, "learning_rate": 5.583469053711065e-06, "loss": 0.4289, "step": 2760 }, { "epoch": 1.5502526670409882, "grad_norm": 0.4286278784275055, "learning_rate": 5.580224202786057e-06, "loss": 0.3996, "step": 2761 }, { "epoch": 1.5508141493542953, "grad_norm": 0.48896804451942444, "learning_rate": 5.576979104137863e-06, "loss": 0.422, "step": 2762 }, { "epoch": 1.5513756316676024, "grad_norm": 0.4609198272228241, "learning_rate": 5.5737337591519635e-06, "loss": 0.4019, "step": 2763 }, { "epoch": 1.5519371139809097, "grad_norm": 0.4457937479019165, "learning_rate": 5.570488169213934e-06, "loss": 0.4058, "step": 2764 }, { "epoch": 1.5524985962942166, "grad_norm": 0.4274764060974121, "learning_rate": 5.567242335709463e-06, "loss": 0.4201, "step": 2765 }, { "epoch": 1.553060078607524, "grad_norm": 0.41251134872436523, "learning_rate": 5.563996260024337e-06, "loss": 0.416, "step": 2766 }, { "epoch": 1.553621560920831, "grad_norm": 0.469808429479599, "learning_rate": 5.560749943544448e-06, "loss": 0.4253, "step": 2767 }, { "epoch": 1.5541830432341381, "grad_norm": 0.4612409472465515, "learning_rate": 5.557503387655792e-06, "loss": 0.3755, "step": 2768 }, { "epoch": 1.5547445255474452, "grad_norm": 0.43067264556884766, "learning_rate": 5.554256593744466e-06, "loss": 0.3869, "step": 2769 }, { "epoch": 1.5553060078607523, "grad_norm": 0.451578289270401, "learning_rate": 5.551009563196669e-06, "loss": 0.4154, "step": 2770 }, { "epoch": 1.5558674901740597, "grad_norm": 0.44021522998809814, "learning_rate": 5.5477622973987e-06, "loss": 0.4149, "step": 2771 }, { "epoch": 1.5564289724873666, "grad_norm": 0.4548647403717041, "learning_rate": 5.5445147977369596e-06, "loss": 0.4236, "step": 2772 }, { "epoch": 1.5569904548006739, "grad_norm": 0.4977158010005951, "learning_rate": 5.541267065597949e-06, "loss": 0.424, "step": 2773 }, { "epoch": 1.5575519371139808, "grad_norm": 0.4442746341228485, "learning_rate": 5.538019102368268e-06, "loss": 0.3932, "step": 2774 }, { "epoch": 1.558113419427288, "grad_norm": 0.46836021542549133, "learning_rate": 5.534770909434614e-06, "loss": 0.4306, "step": 2775 }, { "epoch": 1.5586749017405952, "grad_norm": 0.4348883628845215, "learning_rate": 5.531522488183781e-06, "loss": 0.3986, "step": 2776 }, { "epoch": 1.5592363840539023, "grad_norm": 0.48984792828559875, "learning_rate": 5.528273840002665e-06, "loss": 0.3956, "step": 2777 }, { "epoch": 1.5597978663672094, "grad_norm": 0.4655041992664337, "learning_rate": 5.525024966278256e-06, "loss": 0.4004, "step": 2778 }, { "epoch": 1.5603593486805165, "grad_norm": 0.5412930250167847, "learning_rate": 5.521775868397641e-06, "loss": 0.43, "step": 2779 }, { "epoch": 1.5609208309938238, "grad_norm": 0.4587423801422119, "learning_rate": 5.518526547748002e-06, "loss": 0.3893, "step": 2780 }, { "epoch": 1.5614823133071307, "grad_norm": 0.45847997069358826, "learning_rate": 5.5152770057166185e-06, "loss": 0.4121, "step": 2781 }, { "epoch": 1.562043795620438, "grad_norm": 0.45227140188217163, "learning_rate": 5.5120272436908584e-06, "loss": 0.3805, "step": 2782 }, { "epoch": 1.562605277933745, "grad_norm": 0.49777576327323914, "learning_rate": 5.508777263058192e-06, "loss": 0.4123, "step": 2783 }, { "epoch": 1.5631667602470523, "grad_norm": 0.45478928089141846, "learning_rate": 5.5055270652061774e-06, "loss": 0.4055, "step": 2784 }, { "epoch": 1.5637282425603594, "grad_norm": 0.4476781487464905, "learning_rate": 5.502276651522466e-06, "loss": 0.369, "step": 2785 }, { "epoch": 1.5642897248736665, "grad_norm": 0.5102748870849609, "learning_rate": 5.499026023394803e-06, "loss": 0.4445, "step": 2786 }, { "epoch": 1.5648512071869736, "grad_norm": 0.4438459873199463, "learning_rate": 5.495775182211023e-06, "loss": 0.3878, "step": 2787 }, { "epoch": 1.5654126895002807, "grad_norm": 0.45419952273368835, "learning_rate": 5.492524129359054e-06, "loss": 0.4291, "step": 2788 }, { "epoch": 1.565974171813588, "grad_norm": 0.48405060172080994, "learning_rate": 5.489272866226912e-06, "loss": 0.4017, "step": 2789 }, { "epoch": 1.5665356541268949, "grad_norm": 0.49647337198257446, "learning_rate": 5.486021394202706e-06, "loss": 0.4138, "step": 2790 }, { "epoch": 1.5670971364402022, "grad_norm": 0.4996411204338074, "learning_rate": 5.4827697146746315e-06, "loss": 0.4192, "step": 2791 }, { "epoch": 1.5676586187535093, "grad_norm": 0.4659585654735565, "learning_rate": 5.479517829030972e-06, "loss": 0.3946, "step": 2792 }, { "epoch": 1.5682201010668164, "grad_norm": 0.4266340434551239, "learning_rate": 5.476265738660102e-06, "loss": 0.4121, "step": 2793 }, { "epoch": 1.5687815833801235, "grad_norm": 0.5081726312637329, "learning_rate": 5.473013444950478e-06, "loss": 0.3961, "step": 2794 }, { "epoch": 1.5693430656934306, "grad_norm": 0.4832826554775238, "learning_rate": 5.46976094929065e-06, "loss": 0.3968, "step": 2795 }, { "epoch": 1.5699045480067377, "grad_norm": 0.4697575271129608, "learning_rate": 5.466508253069253e-06, "loss": 0.4317, "step": 2796 }, { "epoch": 1.5704660303200448, "grad_norm": 0.49464836716651917, "learning_rate": 5.463255357675001e-06, "loss": 0.4327, "step": 2797 }, { "epoch": 1.5710275126333522, "grad_norm": 0.47959232330322266, "learning_rate": 5.460002264496702e-06, "loss": 0.3934, "step": 2798 }, { "epoch": 1.571588994946659, "grad_norm": 0.4571303427219391, "learning_rate": 5.456748974923242e-06, "loss": 0.3766, "step": 2799 }, { "epoch": 1.5721504772599664, "grad_norm": 0.4796956479549408, "learning_rate": 5.453495490343593e-06, "loss": 0.4004, "step": 2800 }, { "epoch": 1.5727119595732735, "grad_norm": 0.5166484117507935, "learning_rate": 5.450241812146812e-06, "loss": 0.463, "step": 2801 }, { "epoch": 1.5732734418865806, "grad_norm": 0.4627852737903595, "learning_rate": 5.446987941722037e-06, "loss": 0.4432, "step": 2802 }, { "epoch": 1.5738349241998877, "grad_norm": 0.47753027081489563, "learning_rate": 5.443733880458487e-06, "loss": 0.4133, "step": 2803 }, { "epoch": 1.5743964065131948, "grad_norm": 0.5281411409378052, "learning_rate": 5.440479629745463e-06, "loss": 0.4094, "step": 2804 }, { "epoch": 1.5749578888265021, "grad_norm": 0.48680081963539124, "learning_rate": 5.437225190972348e-06, "loss": 0.3918, "step": 2805 }, { "epoch": 1.575519371139809, "grad_norm": 0.43399813771247864, "learning_rate": 5.433970565528607e-06, "loss": 0.421, "step": 2806 }, { "epoch": 1.5760808534531163, "grad_norm": 0.4555703401565552, "learning_rate": 5.430715754803781e-06, "loss": 0.4167, "step": 2807 }, { "epoch": 1.5766423357664232, "grad_norm": 0.3938028812408447, "learning_rate": 5.42746076018749e-06, "loss": 0.3787, "step": 2808 }, { "epoch": 1.5772038180797305, "grad_norm": 0.4728699326515198, "learning_rate": 5.424205583069434e-06, "loss": 0.4402, "step": 2809 }, { "epoch": 1.5777653003930376, "grad_norm": 0.47372913360595703, "learning_rate": 5.4209502248393945e-06, "loss": 0.4116, "step": 2810 }, { "epoch": 1.5783267827063447, "grad_norm": 0.4547557830810547, "learning_rate": 5.417694686887224e-06, "loss": 0.4001, "step": 2811 }, { "epoch": 1.5788882650196518, "grad_norm": 0.4399188756942749, "learning_rate": 5.4144389706028544e-06, "loss": 0.4249, "step": 2812 }, { "epoch": 1.579449747332959, "grad_norm": 0.45898914337158203, "learning_rate": 5.411183077376296e-06, "loss": 0.4234, "step": 2813 }, { "epoch": 1.5800112296462663, "grad_norm": 0.5544984340667725, "learning_rate": 5.40792700859763e-06, "loss": 0.3899, "step": 2814 }, { "epoch": 1.5805727119595732, "grad_norm": 0.41261276602745056, "learning_rate": 5.404670765657017e-06, "loss": 0.3978, "step": 2815 }, { "epoch": 1.5811341942728805, "grad_norm": 0.4841651916503906, "learning_rate": 5.4014143499446915e-06, "loss": 0.4261, "step": 2816 }, { "epoch": 1.5816956765861874, "grad_norm": 0.4829533100128174, "learning_rate": 5.3981577628509565e-06, "loss": 0.3946, "step": 2817 }, { "epoch": 1.5822571588994947, "grad_norm": 0.46713101863861084, "learning_rate": 5.394901005766193e-06, "loss": 0.4345, "step": 2818 }, { "epoch": 1.5828186412128018, "grad_norm": 0.48720359802246094, "learning_rate": 5.391644080080855e-06, "loss": 0.4234, "step": 2819 }, { "epoch": 1.583380123526109, "grad_norm": 0.515562891960144, "learning_rate": 5.388386987185467e-06, "loss": 0.3848, "step": 2820 }, { "epoch": 1.583941605839416, "grad_norm": 0.46186310052871704, "learning_rate": 5.385129728470624e-06, "loss": 0.4013, "step": 2821 }, { "epoch": 1.5845030881527231, "grad_norm": 0.4690632224082947, "learning_rate": 5.3818723053269936e-06, "loss": 0.4107, "step": 2822 }, { "epoch": 1.5850645704660304, "grad_norm": 0.5415593981742859, "learning_rate": 5.378614719145312e-06, "loss": 0.3987, "step": 2823 }, { "epoch": 1.5856260527793373, "grad_norm": 0.4939579963684082, "learning_rate": 5.375356971316385e-06, "loss": 0.3655, "step": 2824 }, { "epoch": 1.5861875350926447, "grad_norm": 0.49558764696121216, "learning_rate": 5.3720990632310895e-06, "loss": 0.3759, "step": 2825 }, { "epoch": 1.5867490174059518, "grad_norm": 0.4456428587436676, "learning_rate": 5.36884099628037e-06, "loss": 0.415, "step": 2826 }, { "epoch": 1.5873104997192589, "grad_norm": 0.47432810068130493, "learning_rate": 5.3655827718552375e-06, "loss": 0.4261, "step": 2827 }, { "epoch": 1.587871982032566, "grad_norm": 0.45385265350341797, "learning_rate": 5.362324391346771e-06, "loss": 0.4243, "step": 2828 }, { "epoch": 1.588433464345873, "grad_norm": 0.47518986463546753, "learning_rate": 5.359065856146114e-06, "loss": 0.4069, "step": 2829 }, { "epoch": 1.5889949466591804, "grad_norm": 0.46783047914505005, "learning_rate": 5.355807167644483e-06, "loss": 0.4238, "step": 2830 }, { "epoch": 1.5895564289724873, "grad_norm": 0.45356878638267517, "learning_rate": 5.3525483272331506e-06, "loss": 0.4156, "step": 2831 }, { "epoch": 1.5901179112857946, "grad_norm": 0.47621074318885803, "learning_rate": 5.3492893363034614e-06, "loss": 0.415, "step": 2832 }, { "epoch": 1.5906793935991015, "grad_norm": 0.5432513356208801, "learning_rate": 5.346030196246822e-06, "loss": 0.4062, "step": 2833 }, { "epoch": 1.5912408759124088, "grad_norm": 0.6042305827140808, "learning_rate": 5.342770908454701e-06, "loss": 0.4238, "step": 2834 }, { "epoch": 1.591802358225716, "grad_norm": 0.4659605026245117, "learning_rate": 5.33951147431863e-06, "loss": 0.4211, "step": 2835 }, { "epoch": 1.592363840539023, "grad_norm": 0.5042184591293335, "learning_rate": 5.336251895230208e-06, "loss": 0.4069, "step": 2836 }, { "epoch": 1.5929253228523301, "grad_norm": 0.5664698481559753, "learning_rate": 5.3329921725810904e-06, "loss": 0.4042, "step": 2837 }, { "epoch": 1.5934868051656372, "grad_norm": 0.4395785927772522, "learning_rate": 5.329732307762997e-06, "loss": 0.42, "step": 2838 }, { "epoch": 1.5940482874789446, "grad_norm": 0.5392475128173828, "learning_rate": 5.326472302167707e-06, "loss": 0.4188, "step": 2839 }, { "epoch": 1.5946097697922514, "grad_norm": 0.4984365403652191, "learning_rate": 5.323212157187058e-06, "loss": 0.4379, "step": 2840 }, { "epoch": 1.5951712521055588, "grad_norm": 0.48665568232536316, "learning_rate": 5.3199518742129504e-06, "loss": 0.3553, "step": 2841 }, { "epoch": 1.5957327344188657, "grad_norm": 0.49841761589050293, "learning_rate": 5.316691454637342e-06, "loss": 0.3874, "step": 2842 }, { "epoch": 1.596294216732173, "grad_norm": 0.5760215520858765, "learning_rate": 5.313430899852251e-06, "loss": 0.4223, "step": 2843 }, { "epoch": 1.59685569904548, "grad_norm": 0.593207597732544, "learning_rate": 5.310170211249748e-06, "loss": 0.4178, "step": 2844 }, { "epoch": 1.5974171813587872, "grad_norm": 0.4684610366821289, "learning_rate": 5.3069093902219645e-06, "loss": 0.3925, "step": 2845 }, { "epoch": 1.5979786636720943, "grad_norm": 0.5076104402542114, "learning_rate": 5.3036484381610905e-06, "loss": 0.3693, "step": 2846 }, { "epoch": 1.5985401459854014, "grad_norm": 0.6500234007835388, "learning_rate": 5.30038735645937e-06, "loss": 0.4026, "step": 2847 }, { "epoch": 1.5991016282987087, "grad_norm": 0.49410244822502136, "learning_rate": 5.297126146509098e-06, "loss": 0.4132, "step": 2848 }, { "epoch": 1.5996631106120156, "grad_norm": 0.45980745553970337, "learning_rate": 5.2938648097026324e-06, "loss": 0.393, "step": 2849 }, { "epoch": 1.600224592925323, "grad_norm": 0.47610440850257874, "learning_rate": 5.290603347432377e-06, "loss": 0.4241, "step": 2850 }, { "epoch": 1.60078607523863, "grad_norm": 0.5461520552635193, "learning_rate": 5.287341761090796e-06, "loss": 0.3996, "step": 2851 }, { "epoch": 1.6013475575519371, "grad_norm": 0.5530164837837219, "learning_rate": 5.284080052070404e-06, "loss": 0.4077, "step": 2852 }, { "epoch": 1.6019090398652442, "grad_norm": 0.4826066792011261, "learning_rate": 5.280818221763767e-06, "loss": 0.3949, "step": 2853 }, { "epoch": 1.6024705221785513, "grad_norm": 0.5117914080619812, "learning_rate": 5.2775562715635035e-06, "loss": 0.4055, "step": 2854 }, { "epoch": 1.6030320044918585, "grad_norm": 0.48357197642326355, "learning_rate": 5.274294202862282e-06, "loss": 0.4161, "step": 2855 }, { "epoch": 1.6035934868051656, "grad_norm": 0.4496392011642456, "learning_rate": 5.271032017052825e-06, "loss": 0.3723, "step": 2856 }, { "epoch": 1.6041549691184729, "grad_norm": 0.4907981753349304, "learning_rate": 5.2677697155279016e-06, "loss": 0.4398, "step": 2857 }, { "epoch": 1.6047164514317798, "grad_norm": 0.5115631222724915, "learning_rate": 5.264507299680331e-06, "loss": 0.4378, "step": 2858 }, { "epoch": 1.605277933745087, "grad_norm": 0.49068135023117065, "learning_rate": 5.261244770902985e-06, "loss": 0.4226, "step": 2859 }, { "epoch": 1.6058394160583942, "grad_norm": 0.43436363339424133, "learning_rate": 5.257982130588775e-06, "loss": 0.3821, "step": 2860 }, { "epoch": 1.6064008983717013, "grad_norm": 0.41269081830978394, "learning_rate": 5.2547193801306695e-06, "loss": 0.405, "step": 2861 }, { "epoch": 1.6069623806850084, "grad_norm": 0.5010202527046204, "learning_rate": 5.25145652092168e-06, "loss": 0.4176, "step": 2862 }, { "epoch": 1.6075238629983155, "grad_norm": 0.4928915202617645, "learning_rate": 5.248193554354861e-06, "loss": 0.428, "step": 2863 }, { "epoch": 1.6080853453116228, "grad_norm": 0.42168277502059937, "learning_rate": 5.24493048182332e-06, "loss": 0.4214, "step": 2864 }, { "epoch": 1.6086468276249297, "grad_norm": 0.5237216949462891, "learning_rate": 5.241667304720202e-06, "loss": 0.4258, "step": 2865 }, { "epoch": 1.609208309938237, "grad_norm": 0.5652816891670227, "learning_rate": 5.238404024438704e-06, "loss": 0.3972, "step": 2866 }, { "epoch": 1.609769792251544, "grad_norm": 0.5239914655685425, "learning_rate": 5.235140642372062e-06, "loss": 0.4109, "step": 2867 }, { "epoch": 1.6103312745648513, "grad_norm": 0.5099974870681763, "learning_rate": 5.231877159913558e-06, "loss": 0.42, "step": 2868 }, { "epoch": 1.6108927568781584, "grad_norm": 0.515823483467102, "learning_rate": 5.228613578456514e-06, "loss": 0.4182, "step": 2869 }, { "epoch": 1.6114542391914655, "grad_norm": 0.5376139283180237, "learning_rate": 5.225349899394296e-06, "loss": 0.4242, "step": 2870 }, { "epoch": 1.6120157215047726, "grad_norm": 0.47177654504776, "learning_rate": 5.222086124120317e-06, "loss": 0.3985, "step": 2871 }, { "epoch": 1.6125772038180797, "grad_norm": 0.48990753293037415, "learning_rate": 5.218822254028019e-06, "loss": 0.4091, "step": 2872 }, { "epoch": 1.613138686131387, "grad_norm": 0.5065432786941528, "learning_rate": 5.215558290510897e-06, "loss": 0.4295, "step": 2873 }, { "epoch": 1.6137001684446939, "grad_norm": 0.48345208168029785, "learning_rate": 5.2122942349624786e-06, "loss": 0.3992, "step": 2874 }, { "epoch": 1.6142616507580012, "grad_norm": 0.47070255875587463, "learning_rate": 5.20903008877633e-06, "loss": 0.4158, "step": 2875 }, { "epoch": 1.614823133071308, "grad_norm": 0.4584297239780426, "learning_rate": 5.2057658533460594e-06, "loss": 0.3842, "step": 2876 }, { "epoch": 1.6153846153846154, "grad_norm": 0.5120691061019897, "learning_rate": 5.202501530065315e-06, "loss": 0.4339, "step": 2877 }, { "epoch": 1.6159460976979225, "grad_norm": 0.49277687072753906, "learning_rate": 5.199237120327777e-06, "loss": 0.4002, "step": 2878 }, { "epoch": 1.6165075800112296, "grad_norm": 0.43218758702278137, "learning_rate": 5.195972625527166e-06, "loss": 0.3848, "step": 2879 }, { "epoch": 1.6170690623245367, "grad_norm": 0.4564712643623352, "learning_rate": 5.192708047057239e-06, "loss": 0.4143, "step": 2880 }, { "epoch": 1.6176305446378438, "grad_norm": 0.4326903223991394, "learning_rate": 5.1894433863117855e-06, "loss": 0.4146, "step": 2881 }, { "epoch": 1.6181920269511512, "grad_norm": 0.45420464873313904, "learning_rate": 5.186178644684634e-06, "loss": 0.4476, "step": 2882 }, { "epoch": 1.618753509264458, "grad_norm": 0.4658478796482086, "learning_rate": 5.182913823569647e-06, "loss": 0.4116, "step": 2883 }, { "epoch": 1.6193149915777654, "grad_norm": 0.4520869553089142, "learning_rate": 5.1796489243607195e-06, "loss": 0.4077, "step": 2884 }, { "epoch": 1.6198764738910725, "grad_norm": 0.41459476947784424, "learning_rate": 5.176383948451779e-06, "loss": 0.4121, "step": 2885 }, { "epoch": 1.6204379562043796, "grad_norm": 0.4194946587085724, "learning_rate": 5.173118897236787e-06, "loss": 0.3931, "step": 2886 }, { "epoch": 1.6209994385176867, "grad_norm": 0.41628143191337585, "learning_rate": 5.16985377210974e-06, "loss": 0.4035, "step": 2887 }, { "epoch": 1.6215609208309938, "grad_norm": 0.4547686278820038, "learning_rate": 5.1665885744646595e-06, "loss": 0.399, "step": 2888 }, { "epoch": 1.6221224031443011, "grad_norm": 0.42845579981803894, "learning_rate": 5.163323305695604e-06, "loss": 0.4363, "step": 2889 }, { "epoch": 1.622683885457608, "grad_norm": 0.42708948254585266, "learning_rate": 5.16005796719666e-06, "loss": 0.4077, "step": 2890 }, { "epoch": 1.6232453677709153, "grad_norm": 0.4233963191509247, "learning_rate": 5.156792560361943e-06, "loss": 0.3839, "step": 2891 }, { "epoch": 1.6238068500842222, "grad_norm": 0.45739468932151794, "learning_rate": 5.153527086585599e-06, "loss": 0.3973, "step": 2892 }, { "epoch": 1.6243683323975295, "grad_norm": 0.5048028826713562, "learning_rate": 5.150261547261803e-06, "loss": 0.3915, "step": 2893 }, { "epoch": 1.6249298147108366, "grad_norm": 0.4352138042449951, "learning_rate": 5.146995943784755e-06, "loss": 0.4196, "step": 2894 }, { "epoch": 1.6254912970241437, "grad_norm": 0.4384513199329376, "learning_rate": 5.143730277548685e-06, "loss": 0.4239, "step": 2895 }, { "epoch": 1.6260527793374508, "grad_norm": 0.4540598690509796, "learning_rate": 5.14046454994785e-06, "loss": 0.3999, "step": 2896 }, { "epoch": 1.626614261650758, "grad_norm": 0.4578467011451721, "learning_rate": 5.137198762376534e-06, "loss": 0.3951, "step": 2897 }, { "epoch": 1.6271757439640653, "grad_norm": 0.44841906428337097, "learning_rate": 5.133932916229042e-06, "loss": 0.4155, "step": 2898 }, { "epoch": 1.6277372262773722, "grad_norm": 0.45158928632736206, "learning_rate": 5.130667012899709e-06, "loss": 0.4247, "step": 2899 }, { "epoch": 1.6282987085906795, "grad_norm": 0.44502583146095276, "learning_rate": 5.127401053782894e-06, "loss": 0.4265, "step": 2900 }, { "epoch": 1.6288601909039864, "grad_norm": 0.46806061267852783, "learning_rate": 5.124135040272973e-06, "loss": 0.3892, "step": 2901 }, { "epoch": 1.6294216732172937, "grad_norm": 0.5299411416053772, "learning_rate": 5.1208689737643544e-06, "loss": 0.4265, "step": 2902 }, { "epoch": 1.6299831555306008, "grad_norm": 0.4489661157131195, "learning_rate": 5.117602855651466e-06, "loss": 0.3834, "step": 2903 }, { "epoch": 1.630544637843908, "grad_norm": 0.4487694203853607, "learning_rate": 5.114336687328757e-06, "loss": 0.3956, "step": 2904 }, { "epoch": 1.631106120157215, "grad_norm": 0.4438643753528595, "learning_rate": 5.111070470190696e-06, "loss": 0.3998, "step": 2905 }, { "epoch": 1.6316676024705221, "grad_norm": 0.47123056650161743, "learning_rate": 5.1078042056317734e-06, "loss": 0.3749, "step": 2906 }, { "epoch": 1.6322290847838294, "grad_norm": 0.446480393409729, "learning_rate": 5.1045378950465054e-06, "loss": 0.4102, "step": 2907 }, { "epoch": 1.6327905670971363, "grad_norm": 0.4276769459247589, "learning_rate": 5.10127153982942e-06, "loss": 0.4044, "step": 2908 }, { "epoch": 1.6333520494104437, "grad_norm": 0.4605846405029297, "learning_rate": 5.098005141375069e-06, "loss": 0.4204, "step": 2909 }, { "epoch": 1.6339135317237508, "grad_norm": 0.4815738797187805, "learning_rate": 5.09473870107802e-06, "loss": 0.403, "step": 2910 }, { "epoch": 1.6344750140370579, "grad_norm": 0.47587549686431885, "learning_rate": 5.091472220332859e-06, "loss": 0.4261, "step": 2911 }, { "epoch": 1.635036496350365, "grad_norm": 0.5117761492729187, "learning_rate": 5.088205700534195e-06, "loss": 0.4032, "step": 2912 }, { "epoch": 1.635597978663672, "grad_norm": 0.5064557790756226, "learning_rate": 5.084939143076643e-06, "loss": 0.4135, "step": 2913 }, { "epoch": 1.6361594609769792, "grad_norm": 0.4580407440662384, "learning_rate": 5.0816725493548435e-06, "loss": 0.4319, "step": 2914 }, { "epoch": 1.6367209432902863, "grad_norm": 0.45609143376350403, "learning_rate": 5.0784059207634476e-06, "loss": 0.3885, "step": 2915 }, { "epoch": 1.6372824256035936, "grad_norm": 0.49566811323165894, "learning_rate": 5.075139258697119e-06, "loss": 0.4122, "step": 2916 }, { "epoch": 1.6378439079169005, "grad_norm": 0.47673654556274414, "learning_rate": 5.071872564550545e-06, "loss": 0.4107, "step": 2917 }, { "epoch": 1.6384053902302078, "grad_norm": 0.44110265374183655, "learning_rate": 5.0686058397184175e-06, "loss": 0.4262, "step": 2918 }, { "epoch": 1.638966872543515, "grad_norm": 0.4307021200656891, "learning_rate": 5.065339085595446e-06, "loss": 0.417, "step": 2919 }, { "epoch": 1.639528354856822, "grad_norm": 0.47207725048065186, "learning_rate": 5.062072303576351e-06, "loss": 0.3982, "step": 2920 }, { "epoch": 1.6400898371701291, "grad_norm": 0.5180945992469788, "learning_rate": 5.058805495055864e-06, "loss": 0.3912, "step": 2921 }, { "epoch": 1.6406513194834362, "grad_norm": 0.43232983350753784, "learning_rate": 5.055538661428728e-06, "loss": 0.3481, "step": 2922 }, { "epoch": 1.6412128017967436, "grad_norm": 0.4230382442474365, "learning_rate": 5.052271804089701e-06, "loss": 0.4228, "step": 2923 }, { "epoch": 1.6417742841100504, "grad_norm": 0.44703248143196106, "learning_rate": 5.049004924433546e-06, "loss": 0.413, "step": 2924 }, { "epoch": 1.6423357664233578, "grad_norm": 0.49935221672058105, "learning_rate": 5.045738023855037e-06, "loss": 0.4274, "step": 2925 }, { "epoch": 1.6428972487366647, "grad_norm": 0.5068000555038452, "learning_rate": 5.042471103748958e-06, "loss": 0.3954, "step": 2926 }, { "epoch": 1.643458731049972, "grad_norm": 0.48123276233673096, "learning_rate": 5.0392041655101e-06, "loss": 0.422, "step": 2927 }, { "epoch": 1.644020213363279, "grad_norm": 0.4509470760822296, "learning_rate": 5.035937210533264e-06, "loss": 0.4157, "step": 2928 }, { "epoch": 1.6445816956765862, "grad_norm": 0.5060197710990906, "learning_rate": 5.032670240213255e-06, "loss": 0.3863, "step": 2929 }, { "epoch": 1.6451431779898933, "grad_norm": 0.5071704983711243, "learning_rate": 5.029403255944885e-06, "loss": 0.4209, "step": 2930 }, { "epoch": 1.6457046603032004, "grad_norm": 0.42698150873184204, "learning_rate": 5.026136259122974e-06, "loss": 0.3801, "step": 2931 }, { "epoch": 1.6462661426165077, "grad_norm": 0.4631955623626709, "learning_rate": 5.0228692511423474e-06, "loss": 0.3744, "step": 2932 }, { "epoch": 1.6468276249298146, "grad_norm": 0.4901924729347229, "learning_rate": 5.0196022333978346e-06, "loss": 0.3681, "step": 2933 }, { "epoch": 1.647389107243122, "grad_norm": 0.45722779631614685, "learning_rate": 5.016335207284266e-06, "loss": 0.42, "step": 2934 }, { "epoch": 1.6479505895564288, "grad_norm": 0.441645085811615, "learning_rate": 5.0130681741964814e-06, "loss": 0.38, "step": 2935 }, { "epoch": 1.6485120718697361, "grad_norm": 0.4614783227443695, "learning_rate": 5.00980113552932e-06, "loss": 0.4191, "step": 2936 }, { "epoch": 1.6490735541830432, "grad_norm": 0.4185820519924164, "learning_rate": 5.006534092677622e-06, "loss": 0.4166, "step": 2937 }, { "epoch": 1.6496350364963503, "grad_norm": 0.4963092505931854, "learning_rate": 5.003267047036234e-06, "loss": 0.4473, "step": 2938 }, { "epoch": 1.6501965188096575, "grad_norm": 0.4471431076526642, "learning_rate": 5e-06, "loss": 0.4307, "step": 2939 }, { "epoch": 1.6507580011229646, "grad_norm": 0.50930255651474, "learning_rate": 4.996732952963767e-06, "loss": 0.42, "step": 2940 }, { "epoch": 1.6513194834362719, "grad_norm": 0.47226378321647644, "learning_rate": 4.993465907322379e-06, "loss": 0.4257, "step": 2941 }, { "epoch": 1.6518809657495788, "grad_norm": 0.40871843695640564, "learning_rate": 4.990198864470681e-06, "loss": 0.393, "step": 2942 }, { "epoch": 1.652442448062886, "grad_norm": 0.5282956957817078, "learning_rate": 4.986931825803519e-06, "loss": 0.4087, "step": 2943 }, { "epoch": 1.6530039303761932, "grad_norm": 0.42189231514930725, "learning_rate": 4.983664792715735e-06, "loss": 0.4301, "step": 2944 }, { "epoch": 1.6535654126895003, "grad_norm": 0.4329470694065094, "learning_rate": 4.980397766602167e-06, "loss": 0.3681, "step": 2945 }, { "epoch": 1.6541268950028074, "grad_norm": 0.487389475107193, "learning_rate": 4.977130748857653e-06, "loss": 0.3923, "step": 2946 }, { "epoch": 1.6546883773161145, "grad_norm": 0.46916714310646057, "learning_rate": 4.9738637408770265e-06, "loss": 0.3875, "step": 2947 }, { "epoch": 1.6552498596294218, "grad_norm": 0.4261088967323303, "learning_rate": 4.970596744055117e-06, "loss": 0.4364, "step": 2948 }, { "epoch": 1.6558113419427287, "grad_norm": 0.4375821352005005, "learning_rate": 4.967329759786748e-06, "loss": 0.4141, "step": 2949 }, { "epoch": 1.656372824256036, "grad_norm": 0.4839567542076111, "learning_rate": 4.964062789466738e-06, "loss": 0.4215, "step": 2950 }, { "epoch": 1.656934306569343, "grad_norm": 0.5030179619789124, "learning_rate": 4.960795834489902e-06, "loss": 0.4175, "step": 2951 }, { "epoch": 1.6574957888826503, "grad_norm": 0.45250752568244934, "learning_rate": 4.957528896251043e-06, "loss": 0.3784, "step": 2952 }, { "epoch": 1.6580572711959574, "grad_norm": 0.44792675971984863, "learning_rate": 4.954261976144964e-06, "loss": 0.4312, "step": 2953 }, { "epoch": 1.6586187535092645, "grad_norm": 0.44798770546913147, "learning_rate": 4.9509950755664555e-06, "loss": 0.4162, "step": 2954 }, { "epoch": 1.6591802358225716, "grad_norm": 0.4642179608345032, "learning_rate": 4.9477281959103e-06, "loss": 0.4134, "step": 2955 }, { "epoch": 1.6597417181358787, "grad_norm": 0.4623255431652069, "learning_rate": 4.944461338571274e-06, "loss": 0.4115, "step": 2956 }, { "epoch": 1.660303200449186, "grad_norm": 0.39184698462486267, "learning_rate": 4.941194504944138e-06, "loss": 0.3829, "step": 2957 }, { "epoch": 1.6608646827624929, "grad_norm": 0.46130380034446716, "learning_rate": 4.937927696423651e-06, "loss": 0.4122, "step": 2958 }, { "epoch": 1.6614261650758002, "grad_norm": 0.4919210374355316, "learning_rate": 4.934660914404555e-06, "loss": 0.4186, "step": 2959 }, { "epoch": 1.661987647389107, "grad_norm": 0.42103704810142517, "learning_rate": 4.931394160281583e-06, "loss": 0.4081, "step": 2960 }, { "epoch": 1.6625491297024144, "grad_norm": 0.40581560134887695, "learning_rate": 4.928127435449456e-06, "loss": 0.3865, "step": 2961 }, { "epoch": 1.6631106120157215, "grad_norm": 0.4729568660259247, "learning_rate": 4.924860741302882e-06, "loss": 0.3987, "step": 2962 }, { "epoch": 1.6636720943290286, "grad_norm": 0.4986543357372284, "learning_rate": 4.921594079236555e-06, "loss": 0.3868, "step": 2963 }, { "epoch": 1.6642335766423357, "grad_norm": 0.44405749440193176, "learning_rate": 4.918327450645158e-06, "loss": 0.4035, "step": 2964 }, { "epoch": 1.6647950589556428, "grad_norm": 0.504423201084137, "learning_rate": 4.915060856923358e-06, "loss": 0.4453, "step": 2965 }, { "epoch": 1.6653565412689502, "grad_norm": 0.4692177176475525, "learning_rate": 4.911794299465807e-06, "loss": 0.406, "step": 2966 }, { "epoch": 1.665918023582257, "grad_norm": 0.4810151755809784, "learning_rate": 4.9085277796671415e-06, "loss": 0.4048, "step": 2967 }, { "epoch": 1.6664795058955644, "grad_norm": 0.5002745389938354, "learning_rate": 4.905261298921981e-06, "loss": 0.3735, "step": 2968 }, { "epoch": 1.6670409882088713, "grad_norm": 0.5013208985328674, "learning_rate": 4.901994858624933e-06, "loss": 0.3986, "step": 2969 }, { "epoch": 1.6676024705221786, "grad_norm": 0.5266544222831726, "learning_rate": 4.898728460170582e-06, "loss": 0.4313, "step": 2970 }, { "epoch": 1.6681639528354857, "grad_norm": 0.560392439365387, "learning_rate": 4.895462104953497e-06, "loss": 0.4263, "step": 2971 }, { "epoch": 1.6687254351487928, "grad_norm": 0.463030070066452, "learning_rate": 4.892195794368228e-06, "loss": 0.416, "step": 2972 }, { "epoch": 1.6692869174621, "grad_norm": 0.46573176980018616, "learning_rate": 4.888929529809306e-06, "loss": 0.4103, "step": 2973 }, { "epoch": 1.669848399775407, "grad_norm": 0.49644675850868225, "learning_rate": 4.885663312671245e-06, "loss": 0.3706, "step": 2974 }, { "epoch": 1.6704098820887143, "grad_norm": 0.41501185297966003, "learning_rate": 4.8823971443485344e-06, "loss": 0.402, "step": 2975 }, { "epoch": 1.6709713644020212, "grad_norm": 0.4661504626274109, "learning_rate": 4.879131026235646e-06, "loss": 0.4349, "step": 2976 }, { "epoch": 1.6715328467153285, "grad_norm": 0.46247637271881104, "learning_rate": 4.875864959727029e-06, "loss": 0.389, "step": 2977 }, { "epoch": 1.6720943290286356, "grad_norm": 0.4385911226272583, "learning_rate": 4.872598946217109e-06, "loss": 0.4194, "step": 2978 }, { "epoch": 1.6726558113419427, "grad_norm": 0.46083372831344604, "learning_rate": 4.869332987100292e-06, "loss": 0.4343, "step": 2979 }, { "epoch": 1.6732172936552498, "grad_norm": 0.4415045976638794, "learning_rate": 4.866067083770959e-06, "loss": 0.4249, "step": 2980 }, { "epoch": 1.673778775968557, "grad_norm": 0.4653138518333435, "learning_rate": 4.8628012376234675e-06, "loss": 0.4286, "step": 2981 }, { "epoch": 1.6743402582818643, "grad_norm": 0.4997231662273407, "learning_rate": 4.859535450052151e-06, "loss": 0.4166, "step": 2982 }, { "epoch": 1.6749017405951712, "grad_norm": 0.49927666783332825, "learning_rate": 4.856269722451316e-06, "loss": 0.4008, "step": 2983 }, { "epoch": 1.6754632229084785, "grad_norm": 0.492965966463089, "learning_rate": 4.853004056215247e-06, "loss": 0.4531, "step": 2984 }, { "epoch": 1.6760247052217854, "grad_norm": 0.42217421531677246, "learning_rate": 4.849738452738199e-06, "loss": 0.3982, "step": 2985 }, { "epoch": 1.6765861875350927, "grad_norm": 0.5471200346946716, "learning_rate": 4.846472913414403e-06, "loss": 0.4137, "step": 2986 }, { "epoch": 1.6771476698483998, "grad_norm": 0.507124125957489, "learning_rate": 4.843207439638059e-06, "loss": 0.3989, "step": 2987 }, { "epoch": 1.677709152161707, "grad_norm": 0.47814443707466125, "learning_rate": 4.8399420328033405e-06, "loss": 0.3788, "step": 2988 }, { "epoch": 1.678270634475014, "grad_norm": 0.4348815381526947, "learning_rate": 4.836676694304397e-06, "loss": 0.3908, "step": 2989 }, { "epoch": 1.6788321167883211, "grad_norm": 0.5392429232597351, "learning_rate": 4.833411425535341e-06, "loss": 0.3966, "step": 2990 }, { "epoch": 1.6793935991016284, "grad_norm": 0.4999266564846039, "learning_rate": 4.830146227890262e-06, "loss": 0.4074, "step": 2991 }, { "epoch": 1.6799550814149353, "grad_norm": 0.40511733293533325, "learning_rate": 4.826881102763214e-06, "loss": 0.3897, "step": 2992 }, { "epoch": 1.6805165637282427, "grad_norm": 0.45610982179641724, "learning_rate": 4.823616051548222e-06, "loss": 0.4068, "step": 2993 }, { "epoch": 1.6810780460415495, "grad_norm": 0.4712795913219452, "learning_rate": 4.820351075639282e-06, "loss": 0.4212, "step": 2994 }, { "epoch": 1.6816395283548569, "grad_norm": 0.4800291955471039, "learning_rate": 4.8170861764303536e-06, "loss": 0.4092, "step": 2995 }, { "epoch": 1.682201010668164, "grad_norm": 0.42504072189331055, "learning_rate": 4.8138213553153665e-06, "loss": 0.4008, "step": 2996 }, { "epoch": 1.682762492981471, "grad_norm": 0.5098683834075928, "learning_rate": 4.810556613688216e-06, "loss": 0.4084, "step": 2997 }, { "epoch": 1.6833239752947782, "grad_norm": 0.43878474831581116, "learning_rate": 4.807291952942762e-06, "loss": 0.4028, "step": 2998 }, { "epoch": 1.6838854576080853, "grad_norm": 0.4292124807834625, "learning_rate": 4.804027374472835e-06, "loss": 0.3911, "step": 2999 }, { "epoch": 1.6844469399213926, "grad_norm": 0.45616427063941956, "learning_rate": 4.800762879672224e-06, "loss": 0.4183, "step": 3000 }, { "epoch": 1.6850084222346995, "grad_norm": 0.4917950928211212, "learning_rate": 4.797498469934687e-06, "loss": 0.4108, "step": 3001 }, { "epoch": 1.6855699045480068, "grad_norm": 0.4437319040298462, "learning_rate": 4.794234146653942e-06, "loss": 0.3979, "step": 3002 }, { "epoch": 1.686131386861314, "grad_norm": 0.47453102469444275, "learning_rate": 4.790969911223672e-06, "loss": 0.4226, "step": 3003 }, { "epoch": 1.686692869174621, "grad_norm": 0.41817912459373474, "learning_rate": 4.787705765037524e-06, "loss": 0.4129, "step": 3004 }, { "epoch": 1.6872543514879281, "grad_norm": 0.4933673143386841, "learning_rate": 4.784441709489104e-06, "loss": 0.4028, "step": 3005 }, { "epoch": 1.6878158338012352, "grad_norm": 0.4635257124900818, "learning_rate": 4.7811777459719814e-06, "loss": 0.4257, "step": 3006 }, { "epoch": 1.6883773161145423, "grad_norm": 0.4490947425365448, "learning_rate": 4.777913875879685e-06, "loss": 0.4428, "step": 3007 }, { "epoch": 1.6889387984278494, "grad_norm": 0.43560564517974854, "learning_rate": 4.774650100605705e-06, "loss": 0.3811, "step": 3008 }, { "epoch": 1.6895002807411568, "grad_norm": 0.4636662006378174, "learning_rate": 4.771386421543487e-06, "loss": 0.407, "step": 3009 }, { "epoch": 1.6900617630544637, "grad_norm": 0.44347715377807617, "learning_rate": 4.768122840086444e-06, "loss": 0.4127, "step": 3010 }, { "epoch": 1.690623245367771, "grad_norm": 0.4254954159259796, "learning_rate": 4.76485935762794e-06, "loss": 0.3753, "step": 3011 }, { "epoch": 1.691184727681078, "grad_norm": 0.4495353400707245, "learning_rate": 4.761595975561298e-06, "loss": 0.4015, "step": 3012 }, { "epoch": 1.6917462099943852, "grad_norm": 0.42858609557151794, "learning_rate": 4.7583326952798e-06, "loss": 0.3926, "step": 3013 }, { "epoch": 1.6923076923076923, "grad_norm": 0.46917739510536194, "learning_rate": 4.755069518176682e-06, "loss": 0.41, "step": 3014 }, { "epoch": 1.6928691746209994, "grad_norm": 0.4934830069541931, "learning_rate": 4.7518064456451405e-06, "loss": 0.4187, "step": 3015 }, { "epoch": 1.6934306569343067, "grad_norm": 0.49045249819755554, "learning_rate": 4.748543479078323e-06, "loss": 0.3976, "step": 3016 }, { "epoch": 1.6939921392476136, "grad_norm": 0.45181405544281006, "learning_rate": 4.745280619869332e-06, "loss": 0.4061, "step": 3017 }, { "epoch": 1.694553621560921, "grad_norm": 0.49484965205192566, "learning_rate": 4.742017869411227e-06, "loss": 0.4191, "step": 3018 }, { "epoch": 1.6951151038742278, "grad_norm": 0.49434173107147217, "learning_rate": 4.738755229097017e-06, "loss": 0.3951, "step": 3019 }, { "epoch": 1.6956765861875351, "grad_norm": 0.5108085870742798, "learning_rate": 4.735492700319669e-06, "loss": 0.4244, "step": 3020 }, { "epoch": 1.6962380685008422, "grad_norm": 0.453757643699646, "learning_rate": 4.7322302844721e-06, "loss": 0.3946, "step": 3021 }, { "epoch": 1.6967995508141493, "grad_norm": 0.5382628440856934, "learning_rate": 4.728967982947177e-06, "loss": 0.4069, "step": 3022 }, { "epoch": 1.6973610331274565, "grad_norm": 0.5079814195632935, "learning_rate": 4.725705797137719e-06, "loss": 0.4119, "step": 3023 }, { "epoch": 1.6979225154407636, "grad_norm": 0.46715033054351807, "learning_rate": 4.722443728436498e-06, "loss": 0.3978, "step": 3024 }, { "epoch": 1.6984839977540709, "grad_norm": 0.45861250162124634, "learning_rate": 4.719181778236234e-06, "loss": 0.4191, "step": 3025 }, { "epoch": 1.6990454800673778, "grad_norm": 0.5090296268463135, "learning_rate": 4.715919947929597e-06, "loss": 0.4177, "step": 3026 }, { "epoch": 1.699606962380685, "grad_norm": 0.4529528021812439, "learning_rate": 4.712658238909205e-06, "loss": 0.4, "step": 3027 }, { "epoch": 1.700168444693992, "grad_norm": 0.41510480642318726, "learning_rate": 4.709396652567624e-06, "loss": 0.4222, "step": 3028 }, { "epoch": 1.7007299270072993, "grad_norm": 0.49254822731018066, "learning_rate": 4.706135190297369e-06, "loss": 0.3963, "step": 3029 }, { "epoch": 1.7012914093206064, "grad_norm": 0.48484885692596436, "learning_rate": 4.702873853490903e-06, "loss": 0.4163, "step": 3030 }, { "epoch": 1.7018528916339135, "grad_norm": 0.4573042690753937, "learning_rate": 4.6996126435406316e-06, "loss": 0.4225, "step": 3031 }, { "epoch": 1.7024143739472206, "grad_norm": 0.5008995532989502, "learning_rate": 4.69635156183891e-06, "loss": 0.3883, "step": 3032 }, { "epoch": 1.7029758562605277, "grad_norm": 0.4240192472934723, "learning_rate": 4.693090609778036e-06, "loss": 0.356, "step": 3033 }, { "epoch": 1.703537338573835, "grad_norm": 0.4717586040496826, "learning_rate": 4.689829788750254e-06, "loss": 0.4106, "step": 3034 }, { "epoch": 1.704098820887142, "grad_norm": 0.5114845633506775, "learning_rate": 4.686569100147751e-06, "loss": 0.4387, "step": 3035 }, { "epoch": 1.7046603032004493, "grad_norm": 0.44235721230506897, "learning_rate": 4.6833085453626585e-06, "loss": 0.4189, "step": 3036 }, { "epoch": 1.7052217855137564, "grad_norm": 0.44206908345222473, "learning_rate": 4.680048125787051e-06, "loss": 0.4036, "step": 3037 }, { "epoch": 1.7057832678270635, "grad_norm": 0.44659727811813354, "learning_rate": 4.6767878428129445e-06, "loss": 0.4151, "step": 3038 }, { "epoch": 1.7063447501403706, "grad_norm": 0.4323214590549469, "learning_rate": 4.673527697832295e-06, "loss": 0.3936, "step": 3039 }, { "epoch": 1.7069062324536777, "grad_norm": 0.44958266615867615, "learning_rate": 4.670267692237004e-06, "loss": 0.419, "step": 3040 }, { "epoch": 1.707467714766985, "grad_norm": 0.4668327569961548, "learning_rate": 4.66700782741891e-06, "loss": 0.3808, "step": 3041 }, { "epoch": 1.7080291970802919, "grad_norm": 0.47179803252220154, "learning_rate": 4.663748104769793e-06, "loss": 0.3962, "step": 3042 }, { "epoch": 1.7085906793935992, "grad_norm": 0.4487510323524475, "learning_rate": 4.660488525681371e-06, "loss": 0.3829, "step": 3043 }, { "epoch": 1.709152161706906, "grad_norm": 0.46822589635849, "learning_rate": 4.657229091545301e-06, "loss": 0.3933, "step": 3044 }, { "epoch": 1.7097136440202134, "grad_norm": 0.48829707503318787, "learning_rate": 4.65396980375318e-06, "loss": 0.4266, "step": 3045 }, { "epoch": 1.7102751263335205, "grad_norm": 0.4533199369907379, "learning_rate": 4.65071066369654e-06, "loss": 0.4371, "step": 3046 }, { "epoch": 1.7108366086468276, "grad_norm": 0.4405719041824341, "learning_rate": 4.64745167276685e-06, "loss": 0.3899, "step": 3047 }, { "epoch": 1.7113980909601347, "grad_norm": 0.40198132395744324, "learning_rate": 4.64419283235552e-06, "loss": 0.4244, "step": 3048 }, { "epoch": 1.7119595732734418, "grad_norm": 0.422249436378479, "learning_rate": 4.640934143853888e-06, "loss": 0.3898, "step": 3049 }, { "epoch": 1.7125210555867492, "grad_norm": 0.5015997886657715, "learning_rate": 4.637675608653231e-06, "loss": 0.4173, "step": 3050 }, { "epoch": 1.713082537900056, "grad_norm": 0.45107266306877136, "learning_rate": 4.634417228144764e-06, "loss": 0.4193, "step": 3051 }, { "epoch": 1.7136440202133634, "grad_norm": 0.48217517137527466, "learning_rate": 4.631159003719631e-06, "loss": 0.4255, "step": 3052 }, { "epoch": 1.7142055025266703, "grad_norm": 0.5021740198135376, "learning_rate": 4.627900936768911e-06, "loss": 0.4136, "step": 3053 }, { "epoch": 1.7147669848399776, "grad_norm": 0.4072937071323395, "learning_rate": 4.624643028683616e-06, "loss": 0.4057, "step": 3054 }, { "epoch": 1.7153284671532847, "grad_norm": 0.4634387493133545, "learning_rate": 4.62138528085469e-06, "loss": 0.3978, "step": 3055 }, { "epoch": 1.7158899494665918, "grad_norm": 0.4801906943321228, "learning_rate": 4.618127694673008e-06, "loss": 0.3898, "step": 3056 }, { "epoch": 1.716451431779899, "grad_norm": 0.5117997527122498, "learning_rate": 4.614870271529377e-06, "loss": 0.4421, "step": 3057 }, { "epoch": 1.717012914093206, "grad_norm": 0.44854360818862915, "learning_rate": 4.611613012814534e-06, "loss": 0.4223, "step": 3058 }, { "epoch": 1.7175743964065133, "grad_norm": 0.47580769658088684, "learning_rate": 4.6083559199191464e-06, "loss": 0.4088, "step": 3059 }, { "epoch": 1.7181358787198202, "grad_norm": 0.46741098165512085, "learning_rate": 4.605098994233808e-06, "loss": 0.4096, "step": 3060 }, { "epoch": 1.7186973610331275, "grad_norm": 0.48473885655403137, "learning_rate": 4.601842237149045e-06, "loss": 0.4324, "step": 3061 }, { "epoch": 1.7192588433464346, "grad_norm": 0.43490636348724365, "learning_rate": 4.59858565005531e-06, "loss": 0.46, "step": 3062 }, { "epoch": 1.7198203256597417, "grad_norm": 0.4507758319377899, "learning_rate": 4.595329234342984e-06, "loss": 0.4242, "step": 3063 }, { "epoch": 1.7203818079730488, "grad_norm": 0.4314849376678467, "learning_rate": 4.592072991402371e-06, "loss": 0.4454, "step": 3064 }, { "epoch": 1.720943290286356, "grad_norm": 0.45819351077079773, "learning_rate": 4.588816922623705e-06, "loss": 0.3834, "step": 3065 }, { "epoch": 1.721504772599663, "grad_norm": 0.4208221435546875, "learning_rate": 4.585561029397146e-06, "loss": 0.4263, "step": 3066 }, { "epoch": 1.7220662549129702, "grad_norm": 0.4710971415042877, "learning_rate": 4.5823053131127785e-06, "loss": 0.4051, "step": 3067 }, { "epoch": 1.7226277372262775, "grad_norm": 0.46845743060112, "learning_rate": 4.579049775160608e-06, "loss": 0.3977, "step": 3068 }, { "epoch": 1.7231892195395844, "grad_norm": 0.4319361448287964, "learning_rate": 4.575794416930568e-06, "loss": 0.4058, "step": 3069 }, { "epoch": 1.7237507018528917, "grad_norm": 0.43493372201919556, "learning_rate": 4.5725392398125125e-06, "loss": 0.438, "step": 3070 }, { "epoch": 1.7243121841661988, "grad_norm": 0.4844881296157837, "learning_rate": 4.569284245196222e-06, "loss": 0.3991, "step": 3071 }, { "epoch": 1.724873666479506, "grad_norm": 0.42476630210876465, "learning_rate": 4.566029434471395e-06, "loss": 0.3829, "step": 3072 }, { "epoch": 1.725435148792813, "grad_norm": 0.4343009889125824, "learning_rate": 4.562774809027652e-06, "loss": 0.4158, "step": 3073 }, { "epoch": 1.7259966311061201, "grad_norm": 0.4561391770839691, "learning_rate": 4.559520370254539e-06, "loss": 0.3992, "step": 3074 }, { "epoch": 1.7265581134194274, "grad_norm": 0.40941286087036133, "learning_rate": 4.5562661195415144e-06, "loss": 0.3959, "step": 3075 }, { "epoch": 1.7271195957327343, "grad_norm": 0.49267786741256714, "learning_rate": 4.553012058277964e-06, "loss": 0.4241, "step": 3076 }, { "epoch": 1.7276810780460417, "grad_norm": 0.41129302978515625, "learning_rate": 4.5497581878531884e-06, "loss": 0.3812, "step": 3077 }, { "epoch": 1.7282425603593485, "grad_norm": 0.44510412216186523, "learning_rate": 4.546504509656408e-06, "loss": 0.3882, "step": 3078 }, { "epoch": 1.7288040426726559, "grad_norm": 0.5204919576644897, "learning_rate": 4.54325102507676e-06, "loss": 0.4042, "step": 3079 }, { "epoch": 1.729365524985963, "grad_norm": 0.4444963037967682, "learning_rate": 4.539997735503299e-06, "loss": 0.4123, "step": 3080 }, { "epoch": 1.72992700729927, "grad_norm": 0.41502824425697327, "learning_rate": 4.536744642325e-06, "loss": 0.4283, "step": 3081 }, { "epoch": 1.7304884896125772, "grad_norm": 0.5113120675086975, "learning_rate": 4.533491746930749e-06, "loss": 0.4216, "step": 3082 }, { "epoch": 1.7310499719258843, "grad_norm": 0.46891263127326965, "learning_rate": 4.5302390507093505e-06, "loss": 0.4211, "step": 3083 }, { "epoch": 1.7316114542391916, "grad_norm": 0.4254365861415863, "learning_rate": 4.526986555049524e-06, "loss": 0.4086, "step": 3084 }, { "epoch": 1.7321729365524985, "grad_norm": 0.45282188057899475, "learning_rate": 4.523734261339901e-06, "loss": 0.417, "step": 3085 }, { "epoch": 1.7327344188658058, "grad_norm": 0.5152555704116821, "learning_rate": 4.520482170969029e-06, "loss": 0.438, "step": 3086 }, { "epoch": 1.7332959011791127, "grad_norm": 0.4169726073741913, "learning_rate": 4.51723028532537e-06, "loss": 0.4001, "step": 3087 }, { "epoch": 1.73385738349242, "grad_norm": 0.4316422939300537, "learning_rate": 4.513978605797295e-06, "loss": 0.4193, "step": 3088 }, { "epoch": 1.7344188658057271, "grad_norm": 0.44613805413246155, "learning_rate": 4.5107271337730894e-06, "loss": 0.4368, "step": 3089 }, { "epoch": 1.7349803481190342, "grad_norm": 0.45939046144485474, "learning_rate": 4.507475870640947e-06, "loss": 0.4174, "step": 3090 }, { "epoch": 1.7355418304323413, "grad_norm": 0.4755656123161316, "learning_rate": 4.504224817788979e-06, "loss": 0.4437, "step": 3091 }, { "epoch": 1.7361033127456484, "grad_norm": 0.43809059262275696, "learning_rate": 4.500973976605199e-06, "loss": 0.417, "step": 3092 }, { "epoch": 1.7366647950589558, "grad_norm": 0.42930686473846436, "learning_rate": 4.4977233484775366e-06, "loss": 0.4024, "step": 3093 }, { "epoch": 1.7372262773722627, "grad_norm": 0.477896124124527, "learning_rate": 4.494472934793825e-06, "loss": 0.4, "step": 3094 }, { "epoch": 1.73778775968557, "grad_norm": 0.4524000883102417, "learning_rate": 4.491222736941811e-06, "loss": 0.3824, "step": 3095 }, { "epoch": 1.738349241998877, "grad_norm": 0.4591808319091797, "learning_rate": 4.487972756309143e-06, "loss": 0.4024, "step": 3096 }, { "epoch": 1.7389107243121842, "grad_norm": 0.453867644071579, "learning_rate": 4.484722994283385e-06, "loss": 0.4146, "step": 3097 }, { "epoch": 1.7394722066254913, "grad_norm": 0.5368629097938538, "learning_rate": 4.481473452252e-06, "loss": 0.4183, "step": 3098 }, { "epoch": 1.7400336889387984, "grad_norm": 0.4630209803581238, "learning_rate": 4.478224131602362e-06, "loss": 0.413, "step": 3099 }, { "epoch": 1.7405951712521057, "grad_norm": 0.4406249523162842, "learning_rate": 4.4749750337217465e-06, "loss": 0.3925, "step": 3100 }, { "epoch": 1.7411566535654126, "grad_norm": 0.5253549218177795, "learning_rate": 4.471726159997337e-06, "loss": 0.4226, "step": 3101 }, { "epoch": 1.74171813587872, "grad_norm": 0.4806996285915375, "learning_rate": 4.468477511816221e-06, "loss": 0.4189, "step": 3102 }, { "epoch": 1.7422796181920268, "grad_norm": 0.4519580900669098, "learning_rate": 4.46522909056539e-06, "loss": 0.4357, "step": 3103 }, { "epoch": 1.7428411005053341, "grad_norm": 0.416126012802124, "learning_rate": 4.461980897631734e-06, "loss": 0.3929, "step": 3104 }, { "epoch": 1.7434025828186412, "grad_norm": 0.4838379919528961, "learning_rate": 4.458732934402052e-06, "loss": 0.4156, "step": 3105 }, { "epoch": 1.7439640651319483, "grad_norm": 0.4748612642288208, "learning_rate": 4.455485202263041e-06, "loss": 0.3765, "step": 3106 }, { "epoch": 1.7445255474452555, "grad_norm": 0.4700273871421814, "learning_rate": 4.452237702601302e-06, "loss": 0.4081, "step": 3107 }, { "epoch": 1.7450870297585626, "grad_norm": 0.40249818563461304, "learning_rate": 4.448990436803333e-06, "loss": 0.4107, "step": 3108 }, { "epoch": 1.7456485120718699, "grad_norm": 0.46347829699516296, "learning_rate": 4.445743406255537e-06, "loss": 0.4427, "step": 3109 }, { "epoch": 1.7462099943851768, "grad_norm": 0.478247731924057, "learning_rate": 4.442496612344211e-06, "loss": 0.4098, "step": 3110 }, { "epoch": 1.746771476698484, "grad_norm": 0.4486679136753082, "learning_rate": 4.439250056455554e-06, "loss": 0.4346, "step": 3111 }, { "epoch": 1.747332959011791, "grad_norm": 0.49756643176078796, "learning_rate": 4.4360037399756656e-06, "loss": 0.4564, "step": 3112 }, { "epoch": 1.7478944413250983, "grad_norm": 0.44801390171051025, "learning_rate": 4.43275766429054e-06, "loss": 0.3972, "step": 3113 }, { "epoch": 1.7484559236384054, "grad_norm": 0.40255245566368103, "learning_rate": 4.429511830786068e-06, "loss": 0.3983, "step": 3114 }, { "epoch": 1.7490174059517125, "grad_norm": 0.4589289724826813, "learning_rate": 4.42626624084804e-06, "loss": 0.4163, "step": 3115 }, { "epoch": 1.7495788882650196, "grad_norm": 0.42552053928375244, "learning_rate": 4.423020895862138e-06, "loss": 0.3839, "step": 3116 }, { "epoch": 1.7501403705783267, "grad_norm": 0.44868117570877075, "learning_rate": 4.419775797213947e-06, "loss": 0.3937, "step": 3117 }, { "epoch": 1.750701852891634, "grad_norm": 0.4597080647945404, "learning_rate": 4.416530946288938e-06, "loss": 0.4164, "step": 3118 }, { "epoch": 1.751263335204941, "grad_norm": 0.4604181945323944, "learning_rate": 4.413286344472483e-06, "loss": 0.3938, "step": 3119 }, { "epoch": 1.7518248175182483, "grad_norm": 0.43568065762519836, "learning_rate": 4.410041993149843e-06, "loss": 0.3784, "step": 3120 }, { "epoch": 1.7523862998315554, "grad_norm": 0.474798321723938, "learning_rate": 4.406797893706174e-06, "loss": 0.4146, "step": 3121 }, { "epoch": 1.7529477821448625, "grad_norm": 0.5003825426101685, "learning_rate": 4.403554047526525e-06, "loss": 0.4569, "step": 3122 }, { "epoch": 1.7535092644581696, "grad_norm": 0.43845710158348083, "learning_rate": 4.400310455995838e-06, "loss": 0.3857, "step": 3123 }, { "epoch": 1.7540707467714767, "grad_norm": 0.43301236629486084, "learning_rate": 4.397067120498943e-06, "loss": 0.412, "step": 3124 }, { "epoch": 1.7546322290847838, "grad_norm": 0.4615459442138672, "learning_rate": 4.393824042420562e-06, "loss": 0.4224, "step": 3125 }, { "epoch": 1.7551937113980909, "grad_norm": 0.4668946862220764, "learning_rate": 4.390581223145307e-06, "loss": 0.4209, "step": 3126 }, { "epoch": 1.7557551937113982, "grad_norm": 0.44276493787765503, "learning_rate": 4.387338664057681e-06, "loss": 0.4305, "step": 3127 }, { "epoch": 1.756316676024705, "grad_norm": 0.47142264246940613, "learning_rate": 4.384096366542076e-06, "loss": 0.4215, "step": 3128 }, { "epoch": 1.7568781583380124, "grad_norm": 0.45279237627983093, "learning_rate": 4.380854331982769e-06, "loss": 0.4292, "step": 3129 }, { "epoch": 1.7574396406513195, "grad_norm": 0.49920654296875, "learning_rate": 4.3776125617639275e-06, "loss": 0.4122, "step": 3130 }, { "epoch": 1.7580011229646266, "grad_norm": 0.443268746137619, "learning_rate": 4.374371057269605e-06, "loss": 0.4006, "step": 3131 }, { "epoch": 1.7585626052779337, "grad_norm": 0.46659231185913086, "learning_rate": 4.371129819883743e-06, "loss": 0.4168, "step": 3132 }, { "epoch": 1.7591240875912408, "grad_norm": 0.4104505777359009, "learning_rate": 4.367888850990168e-06, "loss": 0.362, "step": 3133 }, { "epoch": 1.7596855699045482, "grad_norm": 0.4615892767906189, "learning_rate": 4.3646481519725916e-06, "loss": 0.4003, "step": 3134 }, { "epoch": 1.760247052217855, "grad_norm": 0.46948516368865967, "learning_rate": 4.36140772421461e-06, "loss": 0.3742, "step": 3135 }, { "epoch": 1.7608085345311624, "grad_norm": 0.46246346831321716, "learning_rate": 4.3581675690997056e-06, "loss": 0.4151, "step": 3136 }, { "epoch": 1.7613700168444693, "grad_norm": 0.4796067476272583, "learning_rate": 4.35492768801124e-06, "loss": 0.4437, "step": 3137 }, { "epoch": 1.7619314991577766, "grad_norm": 0.5360813140869141, "learning_rate": 4.351688082332464e-06, "loss": 0.4316, "step": 3138 }, { "epoch": 1.7624929814710837, "grad_norm": 0.47699713706970215, "learning_rate": 4.348448753446506e-06, "loss": 0.4091, "step": 3139 }, { "epoch": 1.7630544637843908, "grad_norm": 0.40471088886260986, "learning_rate": 4.345209702736377e-06, "loss": 0.3668, "step": 3140 }, { "epoch": 1.763615946097698, "grad_norm": 0.42313146591186523, "learning_rate": 4.34197093158497e-06, "loss": 0.4029, "step": 3141 }, { "epoch": 1.764177428411005, "grad_norm": 0.45309317111968994, "learning_rate": 4.338732441375059e-06, "loss": 0.4192, "step": 3142 }, { "epoch": 1.7647389107243123, "grad_norm": 0.4673651158809662, "learning_rate": 4.335494233489299e-06, "loss": 0.4284, "step": 3143 }, { "epoch": 1.7653003930376192, "grad_norm": 0.45665159821510315, "learning_rate": 4.332256309310221e-06, "loss": 0.4018, "step": 3144 }, { "epoch": 1.7658618753509265, "grad_norm": 0.4327390789985657, "learning_rate": 4.329018670220239e-06, "loss": 0.4078, "step": 3145 }, { "epoch": 1.7664233576642334, "grad_norm": 0.4783262610435486, "learning_rate": 4.325781317601642e-06, "loss": 0.4105, "step": 3146 }, { "epoch": 1.7669848399775407, "grad_norm": 0.396791011095047, "learning_rate": 4.322544252836596e-06, "loss": 0.4154, "step": 3147 }, { "epoch": 1.7675463222908478, "grad_norm": 0.46817252039909363, "learning_rate": 4.3193074773071505e-06, "loss": 0.3879, "step": 3148 }, { "epoch": 1.768107804604155, "grad_norm": 0.436717689037323, "learning_rate": 4.316070992395224e-06, "loss": 0.397, "step": 3149 }, { "epoch": 1.768669286917462, "grad_norm": 0.42866429686546326, "learning_rate": 4.312834799482615e-06, "loss": 0.3791, "step": 3150 }, { "epoch": 1.7692307692307692, "grad_norm": 0.4415270984172821, "learning_rate": 4.309598899950995e-06, "loss": 0.3868, "step": 3151 }, { "epoch": 1.7697922515440765, "grad_norm": 0.487941175699234, "learning_rate": 4.3063632951819125e-06, "loss": 0.4004, "step": 3152 }, { "epoch": 1.7703537338573834, "grad_norm": 0.5247289538383484, "learning_rate": 4.3031279865567895e-06, "loss": 0.414, "step": 3153 }, { "epoch": 1.7709152161706907, "grad_norm": 0.4936218559741974, "learning_rate": 4.2998929754569215e-06, "loss": 0.3881, "step": 3154 }, { "epoch": 1.7714766984839978, "grad_norm": 0.5052503347396851, "learning_rate": 4.296658263263477e-06, "loss": 0.3831, "step": 3155 }, { "epoch": 1.772038180797305, "grad_norm": 0.4372645616531372, "learning_rate": 4.293423851357496e-06, "loss": 0.3915, "step": 3156 }, { "epoch": 1.772599663110612, "grad_norm": 0.4577261507511139, "learning_rate": 4.290189741119888e-06, "loss": 0.3789, "step": 3157 }, { "epoch": 1.7731611454239191, "grad_norm": 0.4217069447040558, "learning_rate": 4.286955933931442e-06, "loss": 0.4003, "step": 3158 }, { "epoch": 1.7737226277372264, "grad_norm": 0.47230425477027893, "learning_rate": 4.28372243117281e-06, "loss": 0.4083, "step": 3159 }, { "epoch": 1.7742841100505333, "grad_norm": 0.4832988977432251, "learning_rate": 4.280489234224515e-06, "loss": 0.4003, "step": 3160 }, { "epoch": 1.7748455923638407, "grad_norm": 0.41674891114234924, "learning_rate": 4.277256344466952e-06, "loss": 0.3806, "step": 3161 }, { "epoch": 1.7754070746771475, "grad_norm": 0.47651407122612, "learning_rate": 4.274023763280383e-06, "loss": 0.4172, "step": 3162 }, { "epoch": 1.7759685569904549, "grad_norm": 0.4613437056541443, "learning_rate": 4.27079149204494e-06, "loss": 0.3857, "step": 3163 }, { "epoch": 1.776530039303762, "grad_norm": 0.455201119184494, "learning_rate": 4.267559532140621e-06, "loss": 0.418, "step": 3164 }, { "epoch": 1.777091521617069, "grad_norm": 0.46311062574386597, "learning_rate": 4.264327884947289e-06, "loss": 0.4146, "step": 3165 }, { "epoch": 1.7776530039303762, "grad_norm": 0.4356507956981659, "learning_rate": 4.26109655184468e-06, "loss": 0.4114, "step": 3166 }, { "epoch": 1.7782144862436833, "grad_norm": 0.48267170786857605, "learning_rate": 4.257865534212389e-06, "loss": 0.4127, "step": 3167 }, { "epoch": 1.7787759685569906, "grad_norm": 0.47439685463905334, "learning_rate": 4.2546348334298814e-06, "loss": 0.3883, "step": 3168 }, { "epoch": 1.7793374508702975, "grad_norm": 0.47730353474617004, "learning_rate": 4.251404450876485e-06, "loss": 0.3751, "step": 3169 }, { "epoch": 1.7798989331836048, "grad_norm": 0.42182135581970215, "learning_rate": 4.24817438793139e-06, "loss": 0.4203, "step": 3170 }, { "epoch": 1.7804604154969117, "grad_norm": 0.421324759721756, "learning_rate": 4.244944645973653e-06, "loss": 0.4067, "step": 3171 }, { "epoch": 1.781021897810219, "grad_norm": 0.502926766872406, "learning_rate": 4.241715226382193e-06, "loss": 0.3963, "step": 3172 }, { "epoch": 1.7815833801235261, "grad_norm": 0.4574959874153137, "learning_rate": 4.23848613053579e-06, "loss": 0.3839, "step": 3173 }, { "epoch": 1.7821448624368332, "grad_norm": 0.43314820528030396, "learning_rate": 4.23525735981309e-06, "loss": 0.4183, "step": 3174 }, { "epoch": 1.7827063447501403, "grad_norm": 0.5041125416755676, "learning_rate": 4.232028915592593e-06, "loss": 0.4298, "step": 3175 }, { "epoch": 1.7832678270634474, "grad_norm": 0.533092737197876, "learning_rate": 4.228800799252666e-06, "loss": 0.3903, "step": 3176 }, { "epoch": 1.7838293093767548, "grad_norm": 0.46816185116767883, "learning_rate": 4.225573012171531e-06, "loss": 0.3876, "step": 3177 }, { "epoch": 1.7843907916900617, "grad_norm": 0.45259976387023926, "learning_rate": 4.222345555727275e-06, "loss": 0.4098, "step": 3178 }, { "epoch": 1.784952274003369, "grad_norm": 0.45155125856399536, "learning_rate": 4.219118431297839e-06, "loss": 0.4059, "step": 3179 }, { "epoch": 1.7855137563166759, "grad_norm": 0.482977032661438, "learning_rate": 4.215891640261025e-06, "loss": 0.4087, "step": 3180 }, { "epoch": 1.7860752386299832, "grad_norm": 0.4945214092731476, "learning_rate": 4.212665183994492e-06, "loss": 0.3944, "step": 3181 }, { "epoch": 1.7866367209432903, "grad_norm": 0.4609074294567108, "learning_rate": 4.209439063875755e-06, "loss": 0.4083, "step": 3182 }, { "epoch": 1.7871982032565974, "grad_norm": 0.45935699343681335, "learning_rate": 4.206213281282185e-06, "loss": 0.4181, "step": 3183 }, { "epoch": 1.7877596855699045, "grad_norm": 0.4012492299079895, "learning_rate": 4.202987837591013e-06, "loss": 0.4126, "step": 3184 }, { "epoch": 1.7883211678832116, "grad_norm": 0.4582816958427429, "learning_rate": 4.199762734179321e-06, "loss": 0.3825, "step": 3185 }, { "epoch": 1.788882650196519, "grad_norm": 0.5402362942695618, "learning_rate": 4.196537972424047e-06, "loss": 0.4138, "step": 3186 }, { "epoch": 1.7894441325098258, "grad_norm": 0.4460982084274292, "learning_rate": 4.193313553701985e-06, "loss": 0.4361, "step": 3187 }, { "epoch": 1.7900056148231331, "grad_norm": 0.45970046520233154, "learning_rate": 4.190089479389778e-06, "loss": 0.4269, "step": 3188 }, { "epoch": 1.7905670971364402, "grad_norm": 0.4213634431362152, "learning_rate": 4.186865750863928e-06, "loss": 0.4152, "step": 3189 }, { "epoch": 1.7911285794497473, "grad_norm": 0.4623009264469147, "learning_rate": 4.183642369500784e-06, "loss": 0.3844, "step": 3190 }, { "epoch": 1.7916900617630545, "grad_norm": 0.5559455752372742, "learning_rate": 4.18041933667655e-06, "loss": 0.3907, "step": 3191 }, { "epoch": 1.7922515440763616, "grad_norm": 0.4833182394504547, "learning_rate": 4.177196653767281e-06, "loss": 0.4055, "step": 3192 }, { "epoch": 1.7928130263896689, "grad_norm": 0.4588322341442108, "learning_rate": 4.173974322148878e-06, "loss": 0.3898, "step": 3193 }, { "epoch": 1.7933745087029758, "grad_norm": 0.4977942407131195, "learning_rate": 4.170752343197101e-06, "loss": 0.4191, "step": 3194 }, { "epoch": 1.793935991016283, "grad_norm": 0.6174769401550293, "learning_rate": 4.167530718287551e-06, "loss": 0.424, "step": 3195 }, { "epoch": 1.79449747332959, "grad_norm": 0.5019794702529907, "learning_rate": 4.164309448795683e-06, "loss": 0.4091, "step": 3196 }, { "epoch": 1.7950589556428973, "grad_norm": 0.44798582792282104, "learning_rate": 4.161088536096796e-06, "loss": 0.4097, "step": 3197 }, { "epoch": 1.7956204379562044, "grad_norm": 0.5154345631599426, "learning_rate": 4.15786798156604e-06, "loss": 0.417, "step": 3198 }, { "epoch": 1.7961819202695115, "grad_norm": 0.45236992835998535, "learning_rate": 4.154647786578413e-06, "loss": 0.4031, "step": 3199 }, { "epoch": 1.7967434025828186, "grad_norm": 0.4516700804233551, "learning_rate": 4.1514279525087555e-06, "loss": 0.3926, "step": 3200 }, { "epoch": 1.7973048848961257, "grad_norm": 0.48955288529396057, "learning_rate": 4.148208480731757e-06, "loss": 0.4128, "step": 3201 }, { "epoch": 1.797866367209433, "grad_norm": 0.5700005292892456, "learning_rate": 4.14498937262195e-06, "loss": 0.4354, "step": 3202 }, { "epoch": 1.79842784952274, "grad_norm": 0.4302150011062622, "learning_rate": 4.141770629553714e-06, "loss": 0.3959, "step": 3203 }, { "epoch": 1.7989893318360473, "grad_norm": 0.4429859519004822, "learning_rate": 4.138552252901272e-06, "loss": 0.4219, "step": 3204 }, { "epoch": 1.7995508141493541, "grad_norm": 0.4751366674900055, "learning_rate": 4.13533424403869e-06, "loss": 0.418, "step": 3205 }, { "epoch": 1.8001122964626615, "grad_norm": 0.4729556739330292, "learning_rate": 4.132116604339878e-06, "loss": 0.4083, "step": 3206 }, { "epoch": 1.8006737787759686, "grad_norm": 0.4250647723674774, "learning_rate": 4.128899335178587e-06, "loss": 0.4108, "step": 3207 }, { "epoch": 1.8012352610892757, "grad_norm": 0.44673243165016174, "learning_rate": 4.125682437928408e-06, "loss": 0.4195, "step": 3208 }, { "epoch": 1.8017967434025828, "grad_norm": 0.4706653952598572, "learning_rate": 4.12246591396278e-06, "loss": 0.3862, "step": 3209 }, { "epoch": 1.8023582257158899, "grad_norm": 0.4358057677745819, "learning_rate": 4.119249764654976e-06, "loss": 0.4037, "step": 3210 }, { "epoch": 1.8029197080291972, "grad_norm": 0.42242351174354553, "learning_rate": 4.1160339913781115e-06, "loss": 0.4089, "step": 3211 }, { "epoch": 1.803481190342504, "grad_norm": 0.44335317611694336, "learning_rate": 4.11281859550514e-06, "loss": 0.4255, "step": 3212 }, { "epoch": 1.8040426726558114, "grad_norm": 0.46481767296791077, "learning_rate": 4.109603578408857e-06, "loss": 0.4299, "step": 3213 }, { "epoch": 1.8046041549691185, "grad_norm": 0.48185592889785767, "learning_rate": 4.106388941461895e-06, "loss": 0.3965, "step": 3214 }, { "epoch": 1.8051656372824256, "grad_norm": 0.3914335370063782, "learning_rate": 4.103174686036722e-06, "loss": 0.3794, "step": 3215 }, { "epoch": 1.8057271195957327, "grad_norm": 0.47966229915618896, "learning_rate": 4.099960813505645e-06, "loss": 0.4253, "step": 3216 }, { "epoch": 1.8062886019090398, "grad_norm": 0.48302021622657776, "learning_rate": 4.0967473252408095e-06, "loss": 0.4129, "step": 3217 }, { "epoch": 1.806850084222347, "grad_norm": 0.46595898270606995, "learning_rate": 4.09353422261419e-06, "loss": 0.4053, "step": 3218 }, { "epoch": 1.807411566535654, "grad_norm": 0.41154369711875916, "learning_rate": 4.090321506997605e-06, "loss": 0.3757, "step": 3219 }, { "epoch": 1.8079730488489614, "grad_norm": 0.44679829478263855, "learning_rate": 4.087109179762703e-06, "loss": 0.3988, "step": 3220 }, { "epoch": 1.8085345311622683, "grad_norm": 0.48355433344841003, "learning_rate": 4.0838972422809675e-06, "loss": 0.441, "step": 3221 }, { "epoch": 1.8090960134755756, "grad_norm": 0.505709171295166, "learning_rate": 4.080685695923715e-06, "loss": 0.4218, "step": 3222 }, { "epoch": 1.8096574957888827, "grad_norm": 0.4177892208099365, "learning_rate": 4.077474542062096e-06, "loss": 0.3972, "step": 3223 }, { "epoch": 1.8102189781021898, "grad_norm": 0.39786195755004883, "learning_rate": 4.074263782067092e-06, "loss": 0.3765, "step": 3224 }, { "epoch": 1.810780460415497, "grad_norm": 0.4498734772205353, "learning_rate": 4.071053417309519e-06, "loss": 0.4017, "step": 3225 }, { "epoch": 1.811341942728804, "grad_norm": 0.4460574984550476, "learning_rate": 4.067843449160022e-06, "loss": 0.4368, "step": 3226 }, { "epoch": 1.8119034250421113, "grad_norm": 0.46412011981010437, "learning_rate": 4.064633878989078e-06, "loss": 0.4352, "step": 3227 }, { "epoch": 1.8124649073554182, "grad_norm": 0.4966476559638977, "learning_rate": 4.061424708166993e-06, "loss": 0.4383, "step": 3228 }, { "epoch": 1.8130263896687255, "grad_norm": 0.47449102997779846, "learning_rate": 4.0582159380639006e-06, "loss": 0.3995, "step": 3229 }, { "epoch": 1.8135878719820324, "grad_norm": 0.4878791570663452, "learning_rate": 4.0550075700497686e-06, "loss": 0.4477, "step": 3230 }, { "epoch": 1.8141493542953397, "grad_norm": 0.444061279296875, "learning_rate": 4.0517996054943895e-06, "loss": 0.4088, "step": 3231 }, { "epoch": 1.8147108366086468, "grad_norm": 0.4748918414115906, "learning_rate": 4.048592045767384e-06, "loss": 0.3957, "step": 3232 }, { "epoch": 1.815272318921954, "grad_norm": 0.5038130879402161, "learning_rate": 4.0453848922382e-06, "loss": 0.4116, "step": 3233 }, { "epoch": 1.815833801235261, "grad_norm": 0.4895729422569275, "learning_rate": 4.0421781462761126e-06, "loss": 0.4044, "step": 3234 }, { "epoch": 1.8163952835485682, "grad_norm": 0.4673062264919281, "learning_rate": 4.0389718092502215e-06, "loss": 0.409, "step": 3235 }, { "epoch": 1.8169567658618755, "grad_norm": 0.4479764401912689, "learning_rate": 4.035765882529453e-06, "loss": 0.4073, "step": 3236 }, { "epoch": 1.8175182481751824, "grad_norm": 0.4525449872016907, "learning_rate": 4.032560367482559e-06, "loss": 0.3893, "step": 3237 }, { "epoch": 1.8180797304884897, "grad_norm": 0.4615972340106964, "learning_rate": 4.029355265478113e-06, "loss": 0.4085, "step": 3238 }, { "epoch": 1.8186412128017966, "grad_norm": 0.49775221943855286, "learning_rate": 4.0261505778845144e-06, "loss": 0.4211, "step": 3239 }, { "epoch": 1.819202695115104, "grad_norm": 0.4585128426551819, "learning_rate": 4.022946306069985e-06, "loss": 0.4398, "step": 3240 }, { "epoch": 1.819764177428411, "grad_norm": 0.4750231206417084, "learning_rate": 4.019742451402569e-06, "loss": 0.3864, "step": 3241 }, { "epoch": 1.8203256597417181, "grad_norm": 0.4663258194923401, "learning_rate": 4.016539015250134e-06, "loss": 0.3909, "step": 3242 }, { "epoch": 1.8208871420550252, "grad_norm": 0.5108314752578735, "learning_rate": 4.013335998980367e-06, "loss": 0.3975, "step": 3243 }, { "epoch": 1.8214486243683323, "grad_norm": 0.4918060302734375, "learning_rate": 4.010133403960775e-06, "loss": 0.3922, "step": 3244 }, { "epoch": 1.8220101066816397, "grad_norm": 0.438429594039917, "learning_rate": 4.006931231558687e-06, "loss": 0.4178, "step": 3245 }, { "epoch": 1.8225715889949465, "grad_norm": 0.4373646378517151, "learning_rate": 4.00372948314125e-06, "loss": 0.3935, "step": 3246 }, { "epoch": 1.8231330713082539, "grad_norm": 0.4690655469894409, "learning_rate": 4.000528160075433e-06, "loss": 0.4346, "step": 3247 }, { "epoch": 1.823694553621561, "grad_norm": 0.47370412945747375, "learning_rate": 3.99732726372802e-06, "loss": 0.3739, "step": 3248 }, { "epoch": 1.824256035934868, "grad_norm": 0.4803910553455353, "learning_rate": 3.994126795465615e-06, "loss": 0.4097, "step": 3249 }, { "epoch": 1.8248175182481752, "grad_norm": 0.4559785723686218, "learning_rate": 3.990926756654638e-06, "loss": 0.378, "step": 3250 }, { "epoch": 1.8253790005614823, "grad_norm": 0.45041465759277344, "learning_rate": 3.987727148661325e-06, "loss": 0.3971, "step": 3251 }, { "epoch": 1.8259404828747896, "grad_norm": 0.5000032186508179, "learning_rate": 3.9845279728517306e-06, "loss": 0.3989, "step": 3252 }, { "epoch": 1.8265019651880965, "grad_norm": 0.44151759147644043, "learning_rate": 3.9813292305917235e-06, "loss": 0.4181, "step": 3253 }, { "epoch": 1.8270634475014038, "grad_norm": 0.4438800513744354, "learning_rate": 3.978130923246985e-06, "loss": 0.4181, "step": 3254 }, { "epoch": 1.8276249298147107, "grad_norm": 0.45329853892326355, "learning_rate": 3.974933052183017e-06, "loss": 0.4244, "step": 3255 }, { "epoch": 1.828186412128018, "grad_norm": 0.4764217138290405, "learning_rate": 3.9717356187651256e-06, "loss": 0.3996, "step": 3256 }, { "epoch": 1.8287478944413251, "grad_norm": 0.42355576157569885, "learning_rate": 3.9685386243584395e-06, "loss": 0.3806, "step": 3257 }, { "epoch": 1.8293093767546322, "grad_norm": 0.3907976448535919, "learning_rate": 3.965342070327895e-06, "loss": 0.4072, "step": 3258 }, { "epoch": 1.8298708590679393, "grad_norm": 0.45588111877441406, "learning_rate": 3.962145958038242e-06, "loss": 0.42, "step": 3259 }, { "epoch": 1.8304323413812464, "grad_norm": 0.4288538694381714, "learning_rate": 3.958950288854039e-06, "loss": 0.4074, "step": 3260 }, { "epoch": 1.8309938236945538, "grad_norm": 0.4635668694972992, "learning_rate": 3.9557550641396596e-06, "loss": 0.3816, "step": 3261 }, { "epoch": 1.8315553060078607, "grad_norm": 0.5168713331222534, "learning_rate": 3.9525602852592835e-06, "loss": 0.3713, "step": 3262 }, { "epoch": 1.832116788321168, "grad_norm": 0.4192051887512207, "learning_rate": 3.949365953576904e-06, "loss": 0.4031, "step": 3263 }, { "epoch": 1.8326782706344749, "grad_norm": 0.44029688835144043, "learning_rate": 3.946172070456321e-06, "loss": 0.3884, "step": 3264 }, { "epoch": 1.8332397529477822, "grad_norm": 0.46374085545539856, "learning_rate": 3.942978637261143e-06, "loss": 0.4126, "step": 3265 }, { "epoch": 1.8338012352610893, "grad_norm": 0.46005722880363464, "learning_rate": 3.9397856553547875e-06, "loss": 0.3995, "step": 3266 }, { "epoch": 1.8343627175743964, "grad_norm": 0.4132698178291321, "learning_rate": 3.936593126100476e-06, "loss": 0.407, "step": 3267 }, { "epoch": 1.8349241998877035, "grad_norm": 0.41869956254959106, "learning_rate": 3.933401050861244e-06, "loss": 0.3975, "step": 3268 }, { "epoch": 1.8354856822010106, "grad_norm": 0.4406319260597229, "learning_rate": 3.930209430999925e-06, "loss": 0.3846, "step": 3269 }, { "epoch": 1.836047164514318, "grad_norm": 0.489885151386261, "learning_rate": 3.927018267879164e-06, "loss": 0.425, "step": 3270 }, { "epoch": 1.8366086468276248, "grad_norm": 0.4501068890094757, "learning_rate": 3.923827562861406e-06, "loss": 0.4105, "step": 3271 }, { "epoch": 1.8371701291409321, "grad_norm": 0.4570774734020233, "learning_rate": 3.9206373173089035e-06, "loss": 0.4068, "step": 3272 }, { "epoch": 1.8377316114542392, "grad_norm": 0.446127325296402, "learning_rate": 3.917447532583715e-06, "loss": 0.4049, "step": 3273 }, { "epoch": 1.8382930937675463, "grad_norm": 0.48020175099372864, "learning_rate": 3.914258210047697e-06, "loss": 0.4119, "step": 3274 }, { "epoch": 1.8388545760808535, "grad_norm": 0.45109161734580994, "learning_rate": 3.911069351062512e-06, "loss": 0.3823, "step": 3275 }, { "epoch": 1.8394160583941606, "grad_norm": 0.4497314393520355, "learning_rate": 3.9078809569896244e-06, "loss": 0.4258, "step": 3276 }, { "epoch": 1.8399775407074677, "grad_norm": 0.46976929903030396, "learning_rate": 3.904693029190299e-06, "loss": 0.4167, "step": 3277 }, { "epoch": 1.8405390230207748, "grad_norm": 0.45362916588783264, "learning_rate": 3.9015055690256024e-06, "loss": 0.4111, "step": 3278 }, { "epoch": 1.841100505334082, "grad_norm": 0.4989689886569977, "learning_rate": 3.8983185778564015e-06, "loss": 0.4449, "step": 3279 }, { "epoch": 1.841661987647389, "grad_norm": 0.47340458631515503, "learning_rate": 3.895132057043363e-06, "loss": 0.4053, "step": 3280 }, { "epoch": 1.8422234699606963, "grad_norm": 0.4555872082710266, "learning_rate": 3.891946007946951e-06, "loss": 0.4251, "step": 3281 }, { "epoch": 1.8427849522740034, "grad_norm": 0.5352565050125122, "learning_rate": 3.888760431927431e-06, "loss": 0.4081, "step": 3282 }, { "epoch": 1.8433464345873105, "grad_norm": 0.4573174715042114, "learning_rate": 3.885575330344865e-06, "loss": 0.3963, "step": 3283 }, { "epoch": 1.8439079169006176, "grad_norm": 0.4419270157814026, "learning_rate": 3.8823907045591125e-06, "loss": 0.4096, "step": 3284 }, { "epoch": 1.8444693992139247, "grad_norm": 0.505664587020874, "learning_rate": 3.879206555929831e-06, "loss": 0.4148, "step": 3285 }, { "epoch": 1.845030881527232, "grad_norm": 0.49636024236679077, "learning_rate": 3.8760228858164726e-06, "loss": 0.389, "step": 3286 }, { "epoch": 1.845592363840539, "grad_norm": 0.5145238041877747, "learning_rate": 3.872839695578284e-06, "loss": 0.407, "step": 3287 }, { "epoch": 1.8461538461538463, "grad_norm": 0.4645131826400757, "learning_rate": 3.869656986574312e-06, "loss": 0.4285, "step": 3288 }, { "epoch": 1.8467153284671531, "grad_norm": 0.5226326584815979, "learning_rate": 3.866474760163394e-06, "loss": 0.413, "step": 3289 }, { "epoch": 1.8472768107804605, "grad_norm": 0.44680166244506836, "learning_rate": 3.86329301770416e-06, "loss": 0.3666, "step": 3290 }, { "epoch": 1.8478382930937676, "grad_norm": 0.4948612451553345, "learning_rate": 3.860111760555038e-06, "loss": 0.4005, "step": 3291 }, { "epoch": 1.8483997754070747, "grad_norm": 0.4981977939605713, "learning_rate": 3.856930990074244e-06, "loss": 0.4138, "step": 3292 }, { "epoch": 1.8489612577203818, "grad_norm": 0.45464253425598145, "learning_rate": 3.853750707619792e-06, "loss": 0.394, "step": 3293 }, { "epoch": 1.8495227400336889, "grad_norm": 0.4448580741882324, "learning_rate": 3.8505709145494815e-06, "loss": 0.4103, "step": 3294 }, { "epoch": 1.8500842223469962, "grad_norm": 0.45789945125579834, "learning_rate": 3.8473916122209065e-06, "loss": 0.4083, "step": 3295 }, { "epoch": 1.850645704660303, "grad_norm": 0.508236289024353, "learning_rate": 3.844212801991451e-06, "loss": 0.3959, "step": 3296 }, { "epoch": 1.8512071869736104, "grad_norm": 0.5449656248092651, "learning_rate": 3.841034485218287e-06, "loss": 0.4091, "step": 3297 }, { "epoch": 1.8517686692869173, "grad_norm": 0.4968237280845642, "learning_rate": 3.83785666325838e-06, "loss": 0.3817, "step": 3298 }, { "epoch": 1.8523301516002246, "grad_norm": 0.42765840888023376, "learning_rate": 3.83467933746848e-06, "loss": 0.3863, "step": 3299 }, { "epoch": 1.8528916339135317, "grad_norm": 0.4602724015712738, "learning_rate": 3.831502509205127e-06, "loss": 0.3992, "step": 3300 }, { "epoch": 1.8534531162268388, "grad_norm": 0.4774307608604431, "learning_rate": 3.82832617982465e-06, "loss": 0.3859, "step": 3301 }, { "epoch": 1.854014598540146, "grad_norm": 0.45027807354927063, "learning_rate": 3.825150350683161e-06, "loss": 0.4236, "step": 3302 }, { "epoch": 1.854576080853453, "grad_norm": 0.4949328899383545, "learning_rate": 3.821975023136562e-06, "loss": 0.4168, "step": 3303 }, { "epoch": 1.8551375631667604, "grad_norm": 0.46345022320747375, "learning_rate": 3.81880019854054e-06, "loss": 0.4207, "step": 3304 }, { "epoch": 1.8556990454800673, "grad_norm": 0.4172219932079315, "learning_rate": 3.815625878250566e-06, "loss": 0.3789, "step": 3305 }, { "epoch": 1.8562605277933746, "grad_norm": 0.44457384943962097, "learning_rate": 3.8124520636219e-06, "loss": 0.4047, "step": 3306 }, { "epoch": 1.8568220101066817, "grad_norm": 0.4456699788570404, "learning_rate": 3.8092787560095784e-06, "loss": 0.4201, "step": 3307 }, { "epoch": 1.8573834924199888, "grad_norm": 0.4354194700717926, "learning_rate": 3.8061059567684265e-06, "loss": 0.3842, "step": 3308 }, { "epoch": 1.857944974733296, "grad_norm": 0.45248791575431824, "learning_rate": 3.8029336672530535e-06, "loss": 0.4154, "step": 3309 }, { "epoch": 1.858506457046603, "grad_norm": 0.4646375775337219, "learning_rate": 3.799761888817849e-06, "loss": 0.3983, "step": 3310 }, { "epoch": 1.8590679393599103, "grad_norm": 0.4446170926094055, "learning_rate": 3.7965906228169826e-06, "loss": 0.404, "step": 3311 }, { "epoch": 1.8596294216732172, "grad_norm": 0.47060325741767883, "learning_rate": 3.793419870604408e-06, "loss": 0.3914, "step": 3312 }, { "epoch": 1.8601909039865245, "grad_norm": 0.4562906324863434, "learning_rate": 3.7902496335338574e-06, "loss": 0.3836, "step": 3313 }, { "epoch": 1.8607523862998314, "grad_norm": 0.4801759421825409, "learning_rate": 3.7870799129588465e-06, "loss": 0.4414, "step": 3314 }, { "epoch": 1.8613138686131387, "grad_norm": 0.4323444962501526, "learning_rate": 3.783910710232666e-06, "loss": 0.41, "step": 3315 }, { "epoch": 1.8618753509264458, "grad_norm": 0.441994309425354, "learning_rate": 3.7807420267083887e-06, "loss": 0.4127, "step": 3316 }, { "epoch": 1.862436833239753, "grad_norm": 0.4802887439727783, "learning_rate": 3.777573863738865e-06, "loss": 0.4075, "step": 3317 }, { "epoch": 1.86299831555306, "grad_norm": 0.5222012996673584, "learning_rate": 3.774406222676721e-06, "loss": 0.4044, "step": 3318 }, { "epoch": 1.8635597978663672, "grad_norm": 0.4604558050632477, "learning_rate": 3.7712391048743637e-06, "loss": 0.4101, "step": 3319 }, { "epoch": 1.8641212801796745, "grad_norm": 0.41561147570610046, "learning_rate": 3.768072511683974e-06, "loss": 0.3867, "step": 3320 }, { "epoch": 1.8646827624929814, "grad_norm": 0.4186350703239441, "learning_rate": 3.7649064444575094e-06, "loss": 0.3859, "step": 3321 }, { "epoch": 1.8652442448062887, "grad_norm": 0.513102650642395, "learning_rate": 3.7617409045467035e-06, "loss": 0.4137, "step": 3322 }, { "epoch": 1.8658057271195956, "grad_norm": 0.4718990921974182, "learning_rate": 3.7585758933030625e-06, "loss": 0.4372, "step": 3323 }, { "epoch": 1.866367209432903, "grad_norm": 0.4563388228416443, "learning_rate": 3.755411412077871e-06, "loss": 0.419, "step": 3324 }, { "epoch": 1.86692869174621, "grad_norm": 0.4465227723121643, "learning_rate": 3.752247462222184e-06, "loss": 0.4009, "step": 3325 }, { "epoch": 1.8674901740595171, "grad_norm": 0.41009122133255005, "learning_rate": 3.7490840450868305e-06, "loss": 0.3932, "step": 3326 }, { "epoch": 1.8680516563728242, "grad_norm": 0.4651363492012024, "learning_rate": 3.7459211620224124e-06, "loss": 0.4539, "step": 3327 }, { "epoch": 1.8686131386861313, "grad_norm": 0.446151465177536, "learning_rate": 3.742758814379302e-06, "loss": 0.3869, "step": 3328 }, { "epoch": 1.8691746209994387, "grad_norm": 0.47435081005096436, "learning_rate": 3.7395970035076467e-06, "loss": 0.4026, "step": 3329 }, { "epoch": 1.8697361033127455, "grad_norm": 0.45936113595962524, "learning_rate": 3.7364357307573607e-06, "loss": 0.407, "step": 3330 }, { "epoch": 1.8702975856260529, "grad_norm": 0.47518736124038696, "learning_rate": 3.7332749974781303e-06, "loss": 0.4275, "step": 3331 }, { "epoch": 1.87085906793936, "grad_norm": 0.4374437630176544, "learning_rate": 3.7301148050194118e-06, "loss": 0.3911, "step": 3332 }, { "epoch": 1.871420550252667, "grad_norm": 0.43809640407562256, "learning_rate": 3.726955154730429e-06, "loss": 0.4483, "step": 3333 }, { "epoch": 1.8719820325659742, "grad_norm": 0.4627927243709564, "learning_rate": 3.7237960479601766e-06, "loss": 0.4095, "step": 3334 }, { "epoch": 1.8725435148792813, "grad_norm": 0.434384286403656, "learning_rate": 3.7206374860574155e-06, "loss": 0.4082, "step": 3335 }, { "epoch": 1.8731049971925884, "grad_norm": 0.4554755985736847, "learning_rate": 3.717479470370675e-06, "loss": 0.3962, "step": 3336 }, { "epoch": 1.8736664795058955, "grad_norm": 0.49124136567115784, "learning_rate": 3.7143220022482494e-06, "loss": 0.4192, "step": 3337 }, { "epoch": 1.8742279618192028, "grad_norm": 0.5097683072090149, "learning_rate": 3.7111650830382006e-06, "loss": 0.3872, "step": 3338 }, { "epoch": 1.8747894441325097, "grad_norm": 0.5293956398963928, "learning_rate": 3.708008714088357e-06, "loss": 0.418, "step": 3339 }, { "epoch": 1.875350926445817, "grad_norm": 0.4650017321109772, "learning_rate": 3.704852896746311e-06, "loss": 0.4108, "step": 3340 }, { "epoch": 1.8759124087591241, "grad_norm": 0.41514432430267334, "learning_rate": 3.701697632359419e-06, "loss": 0.3716, "step": 3341 }, { "epoch": 1.8764738910724312, "grad_norm": 0.4573768377304077, "learning_rate": 3.6985429222748015e-06, "loss": 0.4372, "step": 3342 }, { "epoch": 1.8770353733857383, "grad_norm": 0.5187644362449646, "learning_rate": 3.695388767839343e-06, "loss": 0.4134, "step": 3343 }, { "epoch": 1.8775968556990454, "grad_norm": 0.43547576665878296, "learning_rate": 3.692235170399691e-06, "loss": 0.3922, "step": 3344 }, { "epoch": 1.8781583380123528, "grad_norm": 0.4460247755050659, "learning_rate": 3.6890821313022553e-06, "loss": 0.3849, "step": 3345 }, { "epoch": 1.8787198203256597, "grad_norm": 0.45466282963752747, "learning_rate": 3.6859296518932045e-06, "loss": 0.4033, "step": 3346 }, { "epoch": 1.879281302638967, "grad_norm": 0.43328896164894104, "learning_rate": 3.6827777335184724e-06, "loss": 0.4036, "step": 3347 }, { "epoch": 1.8798427849522739, "grad_norm": 0.4629657566547394, "learning_rate": 3.6796263775237507e-06, "loss": 0.4277, "step": 3348 }, { "epoch": 1.8804042672655812, "grad_norm": 0.43310070037841797, "learning_rate": 3.6764755852544908e-06, "loss": 0.3788, "step": 3349 }, { "epoch": 1.8809657495788883, "grad_norm": 0.44028612971305847, "learning_rate": 3.673325358055906e-06, "loss": 0.4254, "step": 3350 }, { "epoch": 1.8815272318921954, "grad_norm": 0.5272777080535889, "learning_rate": 3.670175697272965e-06, "loss": 0.4162, "step": 3351 }, { "epoch": 1.8820887142055025, "grad_norm": 0.41266122460365295, "learning_rate": 3.667026604250397e-06, "loss": 0.3989, "step": 3352 }, { "epoch": 1.8826501965188096, "grad_norm": 0.45295557379722595, "learning_rate": 3.6638780803326886e-06, "loss": 0.4178, "step": 3353 }, { "epoch": 1.883211678832117, "grad_norm": 0.4874502420425415, "learning_rate": 3.6607301268640805e-06, "loss": 0.3982, "step": 3354 }, { "epoch": 1.8837731611454238, "grad_norm": 0.48068922758102417, "learning_rate": 3.657582745188575e-06, "loss": 0.3753, "step": 3355 }, { "epoch": 1.8843346434587311, "grad_norm": 0.4514552652835846, "learning_rate": 3.6544359366499267e-06, "loss": 0.3912, "step": 3356 }, { "epoch": 1.884896125772038, "grad_norm": 0.4600638151168823, "learning_rate": 3.651289702591646e-06, "loss": 0.407, "step": 3357 }, { "epoch": 1.8854576080853453, "grad_norm": 0.4348922371864319, "learning_rate": 3.648144044356998e-06, "loss": 0.3936, "step": 3358 }, { "epoch": 1.8860190903986525, "grad_norm": 0.4613727629184723, "learning_rate": 3.644998963289002e-06, "loss": 0.4198, "step": 3359 }, { "epoch": 1.8865805727119596, "grad_norm": 0.4390535354614258, "learning_rate": 3.6418544607304327e-06, "loss": 0.4228, "step": 3360 }, { "epoch": 1.8871420550252667, "grad_norm": 0.4592054486274719, "learning_rate": 3.638710538023815e-06, "loss": 0.4273, "step": 3361 }, { "epoch": 1.8877035373385738, "grad_norm": 0.41523754596710205, "learning_rate": 3.635567196511429e-06, "loss": 0.4064, "step": 3362 }, { "epoch": 1.888265019651881, "grad_norm": 0.4001336693763733, "learning_rate": 3.6324244375353034e-06, "loss": 0.385, "step": 3363 }, { "epoch": 1.888826501965188, "grad_norm": 0.4436281621456146, "learning_rate": 3.6292822624372193e-06, "loss": 0.4037, "step": 3364 }, { "epoch": 1.8893879842784953, "grad_norm": 0.49422118067741394, "learning_rate": 3.6261406725587123e-06, "loss": 0.399, "step": 3365 }, { "epoch": 1.8899494665918024, "grad_norm": 0.4853861331939697, "learning_rate": 3.622999669241063e-06, "loss": 0.4132, "step": 3366 }, { "epoch": 1.8905109489051095, "grad_norm": 0.41963139176368713, "learning_rate": 3.619859253825304e-06, "loss": 0.4073, "step": 3367 }, { "epoch": 1.8910724312184166, "grad_norm": 0.4264313280582428, "learning_rate": 3.6167194276522165e-06, "loss": 0.395, "step": 3368 }, { "epoch": 1.8916339135317237, "grad_norm": 0.4713347852230072, "learning_rate": 3.6135801920623287e-06, "loss": 0.3854, "step": 3369 }, { "epoch": 1.892195395845031, "grad_norm": 0.4478082060813904, "learning_rate": 3.610441548395921e-06, "loss": 0.4214, "step": 3370 }, { "epoch": 1.892756878158338, "grad_norm": 0.4502524435520172, "learning_rate": 3.6073034979930153e-06, "loss": 0.3974, "step": 3371 }, { "epoch": 1.8933183604716453, "grad_norm": 0.41214293241500854, "learning_rate": 3.6041660421933857e-06, "loss": 0.3949, "step": 3372 }, { "epoch": 1.8938798427849521, "grad_norm": 0.45568618178367615, "learning_rate": 3.6010291823365483e-06, "loss": 0.4152, "step": 3373 }, { "epoch": 1.8944413250982595, "grad_norm": 0.501645028591156, "learning_rate": 3.5978929197617653e-06, "loss": 0.3913, "step": 3374 }, { "epoch": 1.8950028074115666, "grad_norm": 0.42381852865219116, "learning_rate": 3.594757255808047e-06, "loss": 0.4177, "step": 3375 }, { "epoch": 1.8955642897248737, "grad_norm": 0.4182449281215668, "learning_rate": 3.5916221918141445e-06, "loss": 0.4139, "step": 3376 }, { "epoch": 1.8961257720381808, "grad_norm": 0.4278284013271332, "learning_rate": 3.588487729118555e-06, "loss": 0.4147, "step": 3377 }, { "epoch": 1.8966872543514879, "grad_norm": 0.4499664008617401, "learning_rate": 3.585353869059518e-06, "loss": 0.4089, "step": 3378 }, { "epoch": 1.8972487366647952, "grad_norm": 0.44738370180130005, "learning_rate": 3.5822206129750154e-06, "loss": 0.4394, "step": 3379 }, { "epoch": 1.897810218978102, "grad_norm": 0.38453784584999084, "learning_rate": 3.579087962202773e-06, "loss": 0.3857, "step": 3380 }, { "epoch": 1.8983717012914094, "grad_norm": 0.424225777387619, "learning_rate": 3.575955918080256e-06, "loss": 0.4058, "step": 3381 }, { "epoch": 1.8989331836047163, "grad_norm": 0.4277803301811218, "learning_rate": 3.5728244819446722e-06, "loss": 0.4289, "step": 3382 }, { "epoch": 1.8994946659180236, "grad_norm": 0.39621487259864807, "learning_rate": 3.5696936551329688e-06, "loss": 0.3735, "step": 3383 }, { "epoch": 1.9000561482313307, "grad_norm": 0.43138110637664795, "learning_rate": 3.5665634389818315e-06, "loss": 0.4208, "step": 3384 }, { "epoch": 1.9006176305446378, "grad_norm": 0.45332154631614685, "learning_rate": 3.56343383482769e-06, "loss": 0.3966, "step": 3385 }, { "epoch": 1.901179112857945, "grad_norm": 0.4544050991535187, "learning_rate": 3.560304844006708e-06, "loss": 0.3926, "step": 3386 }, { "epoch": 1.901740595171252, "grad_norm": 0.4387453496456146, "learning_rate": 3.5571764678547895e-06, "loss": 0.4109, "step": 3387 }, { "epoch": 1.9023020774845594, "grad_norm": 0.424968421459198, "learning_rate": 3.5540487077075753e-06, "loss": 0.3917, "step": 3388 }, { "epoch": 1.9028635597978663, "grad_norm": 0.3939818739891052, "learning_rate": 3.5509215649004435e-06, "loss": 0.3954, "step": 3389 }, { "epoch": 1.9034250421111736, "grad_norm": 0.45015743374824524, "learning_rate": 3.547795040768508e-06, "loss": 0.4173, "step": 3390 }, { "epoch": 1.9039865244244805, "grad_norm": 0.41337430477142334, "learning_rate": 3.544669136646621e-06, "loss": 0.4108, "step": 3391 }, { "epoch": 1.9045480067377878, "grad_norm": 0.4378931224346161, "learning_rate": 3.541543853869368e-06, "loss": 0.3949, "step": 3392 }, { "epoch": 1.905109489051095, "grad_norm": 0.42433491349220276, "learning_rate": 3.5384191937710676e-06, "loss": 0.4218, "step": 3393 }, { "epoch": 1.905670971364402, "grad_norm": 0.38945522904396057, "learning_rate": 3.5352951576857756e-06, "loss": 0.4071, "step": 3394 }, { "epoch": 1.906232453677709, "grad_norm": 0.5070671439170837, "learning_rate": 3.5321717469472793e-06, "loss": 0.3885, "step": 3395 }, { "epoch": 1.9067939359910162, "grad_norm": 0.4251120090484619, "learning_rate": 3.5290489628891012e-06, "loss": 0.3718, "step": 3396 }, { "epoch": 1.9073554183043235, "grad_norm": 0.4669714868068695, "learning_rate": 3.525926806844494e-06, "loss": 0.3979, "step": 3397 }, { "epoch": 1.9079169006176304, "grad_norm": 0.4193938076496124, "learning_rate": 3.522805280146444e-06, "loss": 0.4355, "step": 3398 }, { "epoch": 1.9084783829309377, "grad_norm": 0.48121097683906555, "learning_rate": 3.5196843841276674e-06, "loss": 0.4224, "step": 3399 }, { "epoch": 1.9090398652442448, "grad_norm": 0.43267908692359924, "learning_rate": 3.51656412012061e-06, "loss": 0.3864, "step": 3400 }, { "epoch": 1.909601347557552, "grad_norm": 0.4157465100288391, "learning_rate": 3.5134444894574525e-06, "loss": 0.3975, "step": 3401 }, { "epoch": 1.910162829870859, "grad_norm": 0.41156458854675293, "learning_rate": 3.5103254934701005e-06, "loss": 0.4063, "step": 3402 }, { "epoch": 1.9107243121841662, "grad_norm": 0.46612483263015747, "learning_rate": 3.5072071334901907e-06, "loss": 0.3891, "step": 3403 }, { "epoch": 1.9112857944974735, "grad_norm": 0.4462655186653137, "learning_rate": 3.5040894108490882e-06, "loss": 0.4254, "step": 3404 }, { "epoch": 1.9118472768107804, "grad_norm": 0.401134729385376, "learning_rate": 3.500972326877884e-06, "loss": 0.3877, "step": 3405 }, { "epoch": 1.9124087591240877, "grad_norm": 0.47348111867904663, "learning_rate": 3.4978558829074004e-06, "loss": 0.4351, "step": 3406 }, { "epoch": 1.9129702414373946, "grad_norm": 0.4208754301071167, "learning_rate": 3.4947400802681832e-06, "loss": 0.4012, "step": 3407 }, { "epoch": 1.913531723750702, "grad_norm": 0.4147956967353821, "learning_rate": 3.491624920290505e-06, "loss": 0.3789, "step": 3408 }, { "epoch": 1.914093206064009, "grad_norm": 0.4590725004673004, "learning_rate": 3.4885104043043645e-06, "loss": 0.388, "step": 3409 }, { "epoch": 1.9146546883773161, "grad_norm": 0.4345109462738037, "learning_rate": 3.4853965336394846e-06, "loss": 0.4164, "step": 3410 }, { "epoch": 1.9152161706906232, "grad_norm": 0.49112293124198914, "learning_rate": 3.482283309625315e-06, "loss": 0.3976, "step": 3411 }, { "epoch": 1.9157776530039303, "grad_norm": 0.4720171093940735, "learning_rate": 3.4791707335910262e-06, "loss": 0.4301, "step": 3412 }, { "epoch": 1.9163391353172377, "grad_norm": 0.4516614079475403, "learning_rate": 3.4760588068655155e-06, "loss": 0.403, "step": 3413 }, { "epoch": 1.9169006176305445, "grad_norm": 0.4301910400390625, "learning_rate": 3.4729475307773984e-06, "loss": 0.4059, "step": 3414 }, { "epoch": 1.9174620999438519, "grad_norm": 0.44174790382385254, "learning_rate": 3.4698369066550166e-06, "loss": 0.4161, "step": 3415 }, { "epoch": 1.9180235822571587, "grad_norm": 0.5546747446060181, "learning_rate": 3.466726935826433e-06, "loss": 0.4055, "step": 3416 }, { "epoch": 1.918585064570466, "grad_norm": 0.46147143840789795, "learning_rate": 3.4636176196194295e-06, "loss": 0.3928, "step": 3417 }, { "epoch": 1.9191465468837732, "grad_norm": 0.4180717170238495, "learning_rate": 3.460508959361509e-06, "loss": 0.4114, "step": 3418 }, { "epoch": 1.9197080291970803, "grad_norm": 0.46447524428367615, "learning_rate": 3.457400956379897e-06, "loss": 0.4228, "step": 3419 }, { "epoch": 1.9202695115103874, "grad_norm": 0.47442105412483215, "learning_rate": 3.454293612001534e-06, "loss": 0.4147, "step": 3420 }, { "epoch": 1.9208309938236945, "grad_norm": 0.4006524085998535, "learning_rate": 3.451186927553084e-06, "loss": 0.393, "step": 3421 }, { "epoch": 1.9213924761370018, "grad_norm": 0.45002472400665283, "learning_rate": 3.4480809043609265e-06, "loss": 0.3866, "step": 3422 }, { "epoch": 1.9219539584503087, "grad_norm": 0.462990403175354, "learning_rate": 3.4449755437511585e-06, "loss": 0.398, "step": 3423 }, { "epoch": 1.922515440763616, "grad_norm": 0.4813178777694702, "learning_rate": 3.441870847049595e-06, "loss": 0.4316, "step": 3424 }, { "epoch": 1.9230769230769231, "grad_norm": 0.4915928244590759, "learning_rate": 3.438766815581766e-06, "loss": 0.403, "step": 3425 }, { "epoch": 1.9236384053902302, "grad_norm": 0.4657153785228729, "learning_rate": 3.4356634506729213e-06, "loss": 0.4431, "step": 3426 }, { "epoch": 1.9241998877035373, "grad_norm": 0.4072481095790863, "learning_rate": 3.432560753648022e-06, "loss": 0.3925, "step": 3427 }, { "epoch": 1.9247613700168444, "grad_norm": 0.4609399139881134, "learning_rate": 3.429458725831747e-06, "loss": 0.3878, "step": 3428 }, { "epoch": 1.9253228523301515, "grad_norm": 0.5185809135437012, "learning_rate": 3.4263573685484863e-06, "loss": 0.4118, "step": 3429 }, { "epoch": 1.9258843346434587, "grad_norm": 0.45728835463523865, "learning_rate": 3.423256683122346e-06, "loss": 0.3941, "step": 3430 }, { "epoch": 1.926445816956766, "grad_norm": 0.48807430267333984, "learning_rate": 3.4201566708771454e-06, "loss": 0.4129, "step": 3431 }, { "epoch": 1.9270072992700729, "grad_norm": 0.4389239251613617, "learning_rate": 3.4170573331364163e-06, "loss": 0.4139, "step": 3432 }, { "epoch": 1.9275687815833802, "grad_norm": 0.47779029607772827, "learning_rate": 3.4139586712234015e-06, "loss": 0.4263, "step": 3433 }, { "epoch": 1.9281302638966873, "grad_norm": 0.4830620586872101, "learning_rate": 3.410860686461055e-06, "loss": 0.432, "step": 3434 }, { "epoch": 1.9286917462099944, "grad_norm": 0.43392956256866455, "learning_rate": 3.4077633801720433e-06, "loss": 0.3869, "step": 3435 }, { "epoch": 1.9292532285233015, "grad_norm": 0.4326777458190918, "learning_rate": 3.4046667536787412e-06, "loss": 0.4076, "step": 3436 }, { "epoch": 1.9298147108366086, "grad_norm": 0.48164668679237366, "learning_rate": 3.401570808303237e-06, "loss": 0.3864, "step": 3437 }, { "epoch": 1.930376193149916, "grad_norm": 0.4659072458744049, "learning_rate": 3.3984755453673246e-06, "loss": 0.4053, "step": 3438 }, { "epoch": 1.9309376754632228, "grad_norm": 0.46061208844184875, "learning_rate": 3.3953809661925068e-06, "loss": 0.4186, "step": 3439 }, { "epoch": 1.9314991577765301, "grad_norm": 0.48598089814186096, "learning_rate": 3.3922870720999954e-06, "loss": 0.3849, "step": 3440 }, { "epoch": 1.932060640089837, "grad_norm": 0.45155802369117737, "learning_rate": 3.3891938644107096e-06, "loss": 0.3988, "step": 3441 }, { "epoch": 1.9326221224031443, "grad_norm": 0.5129411816596985, "learning_rate": 3.3861013444452766e-06, "loss": 0.4191, "step": 3442 }, { "epoch": 1.9331836047164515, "grad_norm": 0.45781949162483215, "learning_rate": 3.383009513524028e-06, "loss": 0.3986, "step": 3443 }, { "epoch": 1.9337450870297586, "grad_norm": 0.4306277930736542, "learning_rate": 3.379918372967004e-06, "loss": 0.4183, "step": 3444 }, { "epoch": 1.9343065693430657, "grad_norm": 0.4260140359401703, "learning_rate": 3.376827924093945e-06, "loss": 0.4011, "step": 3445 }, { "epoch": 1.9348680516563728, "grad_norm": 0.4632387161254883, "learning_rate": 3.3737381682242998e-06, "loss": 0.4162, "step": 3446 }, { "epoch": 1.93542953396968, "grad_norm": 0.5040937662124634, "learning_rate": 3.370649106677224e-06, "loss": 0.434, "step": 3447 }, { "epoch": 1.935991016282987, "grad_norm": 0.4607296884059906, "learning_rate": 3.3675607407715704e-06, "loss": 0.4036, "step": 3448 }, { "epoch": 1.9365524985962943, "grad_norm": 0.44842663407325745, "learning_rate": 3.3644730718259e-06, "loss": 0.4255, "step": 3449 }, { "epoch": 1.9371139809096012, "grad_norm": 0.442037433385849, "learning_rate": 3.361386101158473e-06, "loss": 0.3861, "step": 3450 }, { "epoch": 1.9376754632229085, "grad_norm": 0.4813191294670105, "learning_rate": 3.3582998300872517e-06, "loss": 0.4025, "step": 3451 }, { "epoch": 1.9382369455362156, "grad_norm": 0.45244264602661133, "learning_rate": 3.3552142599299035e-06, "loss": 0.4258, "step": 3452 }, { "epoch": 1.9387984278495227, "grad_norm": 0.4275437295436859, "learning_rate": 3.352129392003792e-06, "loss": 0.3951, "step": 3453 }, { "epoch": 1.9393599101628298, "grad_norm": 0.4653974771499634, "learning_rate": 3.3490452276259835e-06, "loss": 0.4088, "step": 3454 }, { "epoch": 1.939921392476137, "grad_norm": 0.4508798122406006, "learning_rate": 3.3459617681132416e-06, "loss": 0.4259, "step": 3455 }, { "epoch": 1.9404828747894443, "grad_norm": 0.47929948568344116, "learning_rate": 3.3428790147820317e-06, "loss": 0.4026, "step": 3456 }, { "epoch": 1.9410443571027511, "grad_norm": 0.4541015326976776, "learning_rate": 3.3397969689485166e-06, "loss": 0.4386, "step": 3457 }, { "epoch": 1.9416058394160585, "grad_norm": 0.4769798219203949, "learning_rate": 3.3367156319285576e-06, "loss": 0.3877, "step": 3458 }, { "epoch": 1.9421673217293656, "grad_norm": 0.5018932819366455, "learning_rate": 3.3336350050377116e-06, "loss": 0.415, "step": 3459 }, { "epoch": 1.9427288040426727, "grad_norm": 0.43357565999031067, "learning_rate": 3.330555089591233e-06, "loss": 0.4096, "step": 3460 }, { "epoch": 1.9432902863559798, "grad_norm": 0.4659222960472107, "learning_rate": 3.3274758869040737e-06, "loss": 0.4219, "step": 3461 }, { "epoch": 1.9438517686692869, "grad_norm": 0.43707722425460815, "learning_rate": 3.3243973982908813e-06, "loss": 0.3875, "step": 3462 }, { "epoch": 1.9444132509825942, "grad_norm": 0.3979853093624115, "learning_rate": 3.321319625065997e-06, "loss": 0.3804, "step": 3463 }, { "epoch": 1.944974733295901, "grad_norm": 0.42946717143058777, "learning_rate": 3.3182425685434573e-06, "loss": 0.3743, "step": 3464 }, { "epoch": 1.9455362156092084, "grad_norm": 0.4378260672092438, "learning_rate": 3.3151662300369918e-06, "loss": 0.3812, "step": 3465 }, { "epoch": 1.9460976979225153, "grad_norm": 0.4276694357395172, "learning_rate": 3.3120906108600248e-06, "loss": 0.3946, "step": 3466 }, { "epoch": 1.9466591802358226, "grad_norm": 0.4812508225440979, "learning_rate": 3.309015712325675e-06, "loss": 0.4023, "step": 3467 }, { "epoch": 1.9472206625491297, "grad_norm": 0.40264222025871277, "learning_rate": 3.3059415357467493e-06, "loss": 0.4142, "step": 3468 }, { "epoch": 1.9477821448624368, "grad_norm": 0.45280301570892334, "learning_rate": 3.302868082435751e-06, "loss": 0.4075, "step": 3469 }, { "epoch": 1.948343627175744, "grad_norm": 0.4064820110797882, "learning_rate": 3.29979535370487e-06, "loss": 0.4152, "step": 3470 }, { "epoch": 1.948905109489051, "grad_norm": 0.45120275020599365, "learning_rate": 3.2967233508659886e-06, "loss": 0.4219, "step": 3471 }, { "epoch": 1.9494665918023584, "grad_norm": 0.4189216196537018, "learning_rate": 3.293652075230682e-06, "loss": 0.4327, "step": 3472 }, { "epoch": 1.9500280741156653, "grad_norm": 0.47780728340148926, "learning_rate": 3.2905815281102115e-06, "loss": 0.3761, "step": 3473 }, { "epoch": 1.9505895564289726, "grad_norm": 0.4785729646682739, "learning_rate": 3.2875117108155286e-06, "loss": 0.4149, "step": 3474 }, { "epoch": 1.9511510387422795, "grad_norm": 0.4167942404747009, "learning_rate": 3.2844426246572737e-06, "loss": 0.3873, "step": 3475 }, { "epoch": 1.9517125210555868, "grad_norm": 0.3997957110404968, "learning_rate": 3.281374270945773e-06, "loss": 0.4138, "step": 3476 }, { "epoch": 1.952274003368894, "grad_norm": 0.44869738817214966, "learning_rate": 3.278306650991041e-06, "loss": 0.4048, "step": 3477 }, { "epoch": 1.952835485682201, "grad_norm": 0.45535755157470703, "learning_rate": 3.275239766102781e-06, "loss": 0.4037, "step": 3478 }, { "epoch": 1.953396967995508, "grad_norm": 0.5209730863571167, "learning_rate": 3.272173617590381e-06, "loss": 0.4126, "step": 3479 }, { "epoch": 1.9539584503088152, "grad_norm": 0.4217243790626526, "learning_rate": 3.2691082067629133e-06, "loss": 0.4004, "step": 3480 }, { "epoch": 1.9545199326221225, "grad_norm": 0.449520081281662, "learning_rate": 3.2660435349291366e-06, "loss": 0.4275, "step": 3481 }, { "epoch": 1.9550814149354294, "grad_norm": 0.46309441328048706, "learning_rate": 3.2629796033974924e-06, "loss": 0.4023, "step": 3482 }, { "epoch": 1.9556428972487367, "grad_norm": 0.45420315861701965, "learning_rate": 3.259916413476111e-06, "loss": 0.4019, "step": 3483 }, { "epoch": 1.9562043795620438, "grad_norm": 0.44110238552093506, "learning_rate": 3.2568539664728e-06, "loss": 0.3989, "step": 3484 }, { "epoch": 1.956765861875351, "grad_norm": 0.4763919711112976, "learning_rate": 3.2537922636950537e-06, "loss": 0.4157, "step": 3485 }, { "epoch": 1.957327344188658, "grad_norm": 0.4287123680114746, "learning_rate": 3.2507313064500463e-06, "loss": 0.3915, "step": 3486 }, { "epoch": 1.9578888265019652, "grad_norm": 0.4794691503047943, "learning_rate": 3.2476710960446344e-06, "loss": 0.4287, "step": 3487 }, { "epoch": 1.9584503088152723, "grad_norm": 0.44588735699653625, "learning_rate": 3.2446116337853583e-06, "loss": 0.4262, "step": 3488 }, { "epoch": 1.9590117911285794, "grad_norm": 0.424045205116272, "learning_rate": 3.2415529209784356e-06, "loss": 0.3877, "step": 3489 }, { "epoch": 1.9595732734418867, "grad_norm": 0.42892369627952576, "learning_rate": 3.2384949589297657e-06, "loss": 0.4156, "step": 3490 }, { "epoch": 1.9601347557551936, "grad_norm": 0.48805662989616394, "learning_rate": 3.2354377489449263e-06, "loss": 0.4171, "step": 3491 }, { "epoch": 1.960696238068501, "grad_norm": 0.4383067488670349, "learning_rate": 3.2323812923291723e-06, "loss": 0.3991, "step": 3492 }, { "epoch": 1.961257720381808, "grad_norm": 0.4349946975708008, "learning_rate": 3.229325590387443e-06, "loss": 0.3966, "step": 3493 }, { "epoch": 1.9618192026951151, "grad_norm": 0.4386933743953705, "learning_rate": 3.226270644424351e-06, "loss": 0.3892, "step": 3494 }, { "epoch": 1.9623806850084222, "grad_norm": 0.45755618810653687, "learning_rate": 3.223216455744185e-06, "loss": 0.4086, "step": 3495 }, { "epoch": 1.9629421673217293, "grad_norm": 0.4451434910297394, "learning_rate": 3.220163025650913e-06, "loss": 0.4008, "step": 3496 }, { "epoch": 1.9635036496350367, "grad_norm": 0.3896694779396057, "learning_rate": 3.2171103554481775e-06, "loss": 0.4023, "step": 3497 }, { "epoch": 1.9640651319483435, "grad_norm": 0.4128664433956146, "learning_rate": 3.2140584464392998e-06, "loss": 0.4351, "step": 3498 }, { "epoch": 1.9646266142616509, "grad_norm": 0.39718759059906006, "learning_rate": 3.2110072999272725e-06, "loss": 0.3828, "step": 3499 }, { "epoch": 1.9651880965749577, "grad_norm": 0.4240165054798126, "learning_rate": 3.2079569172147634e-06, "loss": 0.4054, "step": 3500 }, { "epoch": 1.965749578888265, "grad_norm": 0.4660540521144867, "learning_rate": 3.2049072996041152e-06, "loss": 0.4223, "step": 3501 }, { "epoch": 1.9663110612015722, "grad_norm": 0.4093714654445648, "learning_rate": 3.2018584483973434e-06, "loss": 0.3993, "step": 3502 }, { "epoch": 1.9668725435148793, "grad_norm": 0.48055681586265564, "learning_rate": 3.1988103648961377e-06, "loss": 0.4248, "step": 3503 }, { "epoch": 1.9674340258281864, "grad_norm": 0.43435636162757874, "learning_rate": 3.1957630504018574e-06, "loss": 0.4295, "step": 3504 }, { "epoch": 1.9679955081414935, "grad_norm": 0.4465259313583374, "learning_rate": 3.192716506215535e-06, "loss": 0.3774, "step": 3505 }, { "epoch": 1.9685569904548008, "grad_norm": 0.48154416680336, "learning_rate": 3.189670733637874e-06, "loss": 0.423, "step": 3506 }, { "epoch": 1.9691184727681077, "grad_norm": 0.45654889941215515, "learning_rate": 3.1866257339692486e-06, "loss": 0.3985, "step": 3507 }, { "epoch": 1.969679955081415, "grad_norm": 0.42355912923812866, "learning_rate": 3.1835815085097027e-06, "loss": 0.3949, "step": 3508 }, { "epoch": 1.970241437394722, "grad_norm": 0.44731977581977844, "learning_rate": 3.1805380585589503e-06, "loss": 0.3943, "step": 3509 }, { "epoch": 1.9708029197080292, "grad_norm": 0.45125100016593933, "learning_rate": 3.177495385416373e-06, "loss": 0.3881, "step": 3510 }, { "epoch": 1.9713644020213363, "grad_norm": 0.4441142678260803, "learning_rate": 3.174453490381022e-06, "loss": 0.4078, "step": 3511 }, { "epoch": 1.9719258843346434, "grad_norm": 0.409376323223114, "learning_rate": 3.171412374751615e-06, "loss": 0.3647, "step": 3512 }, { "epoch": 1.9724873666479505, "grad_norm": 0.4942393898963928, "learning_rate": 3.1683720398265393e-06, "loss": 0.4224, "step": 3513 }, { "epoch": 1.9730488489612577, "grad_norm": 0.41223666071891785, "learning_rate": 3.165332486903847e-06, "loss": 0.3952, "step": 3514 }, { "epoch": 1.973610331274565, "grad_norm": 0.44383135437965393, "learning_rate": 3.162293717281255e-06, "loss": 0.3883, "step": 3515 }, { "epoch": 1.9741718135878719, "grad_norm": 0.454215943813324, "learning_rate": 3.1592557322561503e-06, "loss": 0.4142, "step": 3516 }, { "epoch": 1.9747332959011792, "grad_norm": 0.4596787393093109, "learning_rate": 3.156218533125578e-06, "loss": 0.3779, "step": 3517 }, { "epoch": 1.9752947782144863, "grad_norm": 0.4807995855808258, "learning_rate": 3.1531821211862558e-06, "loss": 0.4001, "step": 3518 }, { "epoch": 1.9758562605277934, "grad_norm": 0.39075830578804016, "learning_rate": 3.15014649773456e-06, "loss": 0.4126, "step": 3519 }, { "epoch": 1.9764177428411005, "grad_norm": 0.49202901124954224, "learning_rate": 3.14711166406653e-06, "loss": 0.397, "step": 3520 }, { "epoch": 1.9769792251544076, "grad_norm": 0.4398996829986572, "learning_rate": 3.144077621477871e-06, "loss": 0.4195, "step": 3521 }, { "epoch": 1.977540707467715, "grad_norm": 0.45503050088882446, "learning_rate": 3.1410443712639492e-06, "loss": 0.401, "step": 3522 }, { "epoch": 1.9781021897810218, "grad_norm": 0.48512589931488037, "learning_rate": 3.138011914719791e-06, "loss": 0.4531, "step": 3523 }, { "epoch": 1.9786636720943291, "grad_norm": 0.47008344531059265, "learning_rate": 3.134980253140086e-06, "loss": 0.4024, "step": 3524 }, { "epoch": 1.979225154407636, "grad_norm": 0.4593307077884674, "learning_rate": 3.131949387819183e-06, "loss": 0.3899, "step": 3525 }, { "epoch": 1.9797866367209433, "grad_norm": 0.452985554933548, "learning_rate": 3.1289193200510935e-06, "loss": 0.4043, "step": 3526 }, { "epoch": 1.9803481190342505, "grad_norm": 0.49140623211860657, "learning_rate": 3.1258900511294843e-06, "loss": 0.415, "step": 3527 }, { "epoch": 1.9809096013475576, "grad_norm": 0.46090349555015564, "learning_rate": 3.122861582347683e-06, "loss": 0.3755, "step": 3528 }, { "epoch": 1.9814710836608647, "grad_norm": 0.4062277376651764, "learning_rate": 3.1198339149986774e-06, "loss": 0.4212, "step": 3529 }, { "epoch": 1.9820325659741718, "grad_norm": 0.4679860770702362, "learning_rate": 3.1168070503751114e-06, "loss": 0.3894, "step": 3530 }, { "epoch": 1.982594048287479, "grad_norm": 0.4557066559791565, "learning_rate": 3.1137809897692856e-06, "loss": 0.4094, "step": 3531 }, { "epoch": 1.983155530600786, "grad_norm": 0.45125120878219604, "learning_rate": 3.1107557344731586e-06, "loss": 0.4244, "step": 3532 }, { "epoch": 1.9837170129140933, "grad_norm": 0.45938393473625183, "learning_rate": 3.107731285778344e-06, "loss": 0.3978, "step": 3533 }, { "epoch": 1.9842784952274002, "grad_norm": 0.4022979140281677, "learning_rate": 3.1047076449761126e-06, "loss": 0.4031, "step": 3534 }, { "epoch": 1.9848399775407075, "grad_norm": 0.4931841790676117, "learning_rate": 3.1016848133573896e-06, "loss": 0.3942, "step": 3535 }, { "epoch": 1.9854014598540146, "grad_norm": 0.4105769991874695, "learning_rate": 3.0986627922127537e-06, "loss": 0.4092, "step": 3536 }, { "epoch": 1.9859629421673217, "grad_norm": 0.45678046345710754, "learning_rate": 3.0956415828324393e-06, "loss": 0.3817, "step": 3537 }, { "epoch": 1.9865244244806288, "grad_norm": 0.4341873228549957, "learning_rate": 3.0926211865063303e-06, "loss": 0.431, "step": 3538 }, { "epoch": 1.987085906793936, "grad_norm": 0.3988542854785919, "learning_rate": 3.089601604523972e-06, "loss": 0.4008, "step": 3539 }, { "epoch": 1.9876473891072433, "grad_norm": 0.43084481358528137, "learning_rate": 3.0865828381745515e-06, "loss": 0.3961, "step": 3540 }, { "epoch": 1.9882088714205501, "grad_norm": 0.42090922594070435, "learning_rate": 3.0835648887469155e-06, "loss": 0.3907, "step": 3541 }, { "epoch": 1.9887703537338575, "grad_norm": 0.42818307876586914, "learning_rate": 3.0805477575295585e-06, "loss": 0.406, "step": 3542 }, { "epoch": 1.9893318360471643, "grad_norm": 0.4114232063293457, "learning_rate": 3.0775314458106243e-06, "loss": 0.3657, "step": 3543 }, { "epoch": 1.9898933183604717, "grad_norm": 0.39732006192207336, "learning_rate": 3.074515954877911e-06, "loss": 0.4339, "step": 3544 }, { "epoch": 1.9904548006737788, "grad_norm": 0.4386228024959564, "learning_rate": 3.0715012860188642e-06, "loss": 0.4323, "step": 3545 }, { "epoch": 1.9910162829870859, "grad_norm": 0.492316871881485, "learning_rate": 3.0684874405205777e-06, "loss": 0.4171, "step": 3546 }, { "epoch": 1.991577765300393, "grad_norm": 0.4798378050327301, "learning_rate": 3.0654744196697945e-06, "loss": 0.4065, "step": 3547 }, { "epoch": 1.9921392476137, "grad_norm": 0.4610265791416168, "learning_rate": 3.062462224752904e-06, "loss": 0.4083, "step": 3548 }, { "epoch": 1.9927007299270074, "grad_norm": 0.4484364688396454, "learning_rate": 3.0594508570559466e-06, "loss": 0.4018, "step": 3549 }, { "epoch": 1.9932622122403143, "grad_norm": 0.4298401176929474, "learning_rate": 3.0564403178646073e-06, "loss": 0.3922, "step": 3550 }, { "epoch": 1.9938236945536216, "grad_norm": 0.43555721640586853, "learning_rate": 3.0534306084642163e-06, "loss": 0.419, "step": 3551 }, { "epoch": 1.9943851768669287, "grad_norm": 0.46710869669914246, "learning_rate": 3.050421730139751e-06, "loss": 0.4065, "step": 3552 }, { "epoch": 1.9949466591802358, "grad_norm": 0.4217379689216614, "learning_rate": 3.047413684175833e-06, "loss": 0.4127, "step": 3553 }, { "epoch": 1.995508141493543, "grad_norm": 0.4880024790763855, "learning_rate": 3.0444064718567305e-06, "loss": 0.3971, "step": 3554 }, { "epoch": 1.99606962380685, "grad_norm": 0.47345927357673645, "learning_rate": 3.0414000944663547e-06, "loss": 0.4276, "step": 3555 }, { "epoch": 1.9966311061201574, "grad_norm": 0.44212794303894043, "learning_rate": 3.0383945532882585e-06, "loss": 0.4309, "step": 3556 }, { "epoch": 1.9971925884334643, "grad_norm": 0.4258018136024475, "learning_rate": 3.03538984960564e-06, "loss": 0.4089, "step": 3557 }, { "epoch": 1.9977540707467716, "grad_norm": 0.37450239062309265, "learning_rate": 3.0323859847013386e-06, "loss": 0.3759, "step": 3558 }, { "epoch": 1.9983155530600785, "grad_norm": 0.4758813679218292, "learning_rate": 3.029382959857837e-06, "loss": 0.4135, "step": 3559 }, { "epoch": 1.9988770353733858, "grad_norm": 0.3796449601650238, "learning_rate": 3.0263807763572595e-06, "loss": 0.3888, "step": 3560 }, { "epoch": 1.999438517686693, "grad_norm": 0.43269771337509155, "learning_rate": 3.0233794354813673e-06, "loss": 0.4283, "step": 3561 }, { "epoch": 2.0, "grad_norm": 0.46504849195480347, "learning_rate": 3.0203789385115668e-06, "loss": 0.4027, "step": 3562 }, { "epoch": 2.0005614823133073, "grad_norm": 0.44103336334228516, "learning_rate": 3.0173792867289004e-06, "loss": 0.3494, "step": 3563 }, { "epoch": 2.001122964626614, "grad_norm": 0.43591395020484924, "learning_rate": 3.014380481414051e-06, "loss": 0.3597, "step": 3564 }, { "epoch": 2.0016844469399215, "grad_norm": 0.4068385362625122, "learning_rate": 3.0113825238473416e-06, "loss": 0.3777, "step": 3565 }, { "epoch": 2.0022459292532284, "grad_norm": 0.40799638628959656, "learning_rate": 3.0083854153087312e-06, "loss": 0.3788, "step": 3566 }, { "epoch": 2.0028074115665357, "grad_norm": 0.4722590148448944, "learning_rate": 3.0053891570778165e-06, "loss": 0.3636, "step": 3567 }, { "epoch": 2.0033688938798426, "grad_norm": 0.4120185077190399, "learning_rate": 3.002393750433833e-06, "loss": 0.3891, "step": 3568 }, { "epoch": 2.00393037619315, "grad_norm": 0.43989086151123047, "learning_rate": 2.999399196655649e-06, "loss": 0.3874, "step": 3569 }, { "epoch": 2.004491858506457, "grad_norm": 0.4194948673248291, "learning_rate": 2.996405497021773e-06, "loss": 0.3681, "step": 3570 }, { "epoch": 2.005053340819764, "grad_norm": 0.4035292863845825, "learning_rate": 2.993412652810347e-06, "loss": 0.3496, "step": 3571 }, { "epoch": 2.0056148231330715, "grad_norm": 0.39177680015563965, "learning_rate": 2.9904206652991463e-06, "loss": 0.3995, "step": 3572 }, { "epoch": 2.0061763054463784, "grad_norm": 0.41422897577285767, "learning_rate": 2.9874295357655823e-06, "loss": 0.3764, "step": 3573 }, { "epoch": 2.0067377877596857, "grad_norm": 0.4162711203098297, "learning_rate": 2.984439265486699e-06, "loss": 0.3737, "step": 3574 }, { "epoch": 2.0072992700729926, "grad_norm": 0.4232403337955475, "learning_rate": 2.981449855739175e-06, "loss": 0.3691, "step": 3575 }, { "epoch": 2.0078607523863, "grad_norm": 0.4872440695762634, "learning_rate": 2.9784613077993195e-06, "loss": 0.3937, "step": 3576 }, { "epoch": 2.008422234699607, "grad_norm": 0.42339158058166504, "learning_rate": 2.975473622943076e-06, "loss": 0.3904, "step": 3577 }, { "epoch": 2.008983717012914, "grad_norm": 0.4425334930419922, "learning_rate": 2.9724868024460176e-06, "loss": 0.3852, "step": 3578 }, { "epoch": 2.0095451993262214, "grad_norm": 0.4375705122947693, "learning_rate": 2.969500847583348e-06, "loss": 0.3684, "step": 3579 }, { "epoch": 2.0101066816395283, "grad_norm": 0.4157335162162781, "learning_rate": 2.9665157596299045e-06, "loss": 0.3681, "step": 3580 }, { "epoch": 2.0106681639528357, "grad_norm": 0.45096564292907715, "learning_rate": 2.9635315398601516e-06, "loss": 0.3692, "step": 3581 }, { "epoch": 2.0112296462661425, "grad_norm": 0.4121969938278198, "learning_rate": 2.960548189548183e-06, "loss": 0.3908, "step": 3582 }, { "epoch": 2.01179112857945, "grad_norm": 0.474394291639328, "learning_rate": 2.9575657099677223e-06, "loss": 0.4009, "step": 3583 }, { "epoch": 2.0123526108927567, "grad_norm": 0.4425446391105652, "learning_rate": 2.9545841023921197e-06, "loss": 0.3489, "step": 3584 }, { "epoch": 2.012914093206064, "grad_norm": 0.4185887277126312, "learning_rate": 2.9516033680943567e-06, "loss": 0.3555, "step": 3585 }, { "epoch": 2.013475575519371, "grad_norm": 0.418953001499176, "learning_rate": 2.9486235083470393e-06, "loss": 0.3591, "step": 3586 }, { "epoch": 2.0140370578326783, "grad_norm": 0.45097872614860535, "learning_rate": 2.9456445244223988e-06, "loss": 0.3808, "step": 3587 }, { "epoch": 2.0145985401459856, "grad_norm": 0.40071573853492737, "learning_rate": 2.9426664175922953e-06, "loss": 0.3768, "step": 3588 }, { "epoch": 2.0151600224592925, "grad_norm": 0.41394349932670593, "learning_rate": 2.939689189128212e-06, "loss": 0.3577, "step": 3589 }, { "epoch": 2.0157215047726, "grad_norm": 0.4050072133541107, "learning_rate": 2.9367128403012614e-06, "loss": 0.3657, "step": 3590 }, { "epoch": 2.0162829870859067, "grad_norm": 0.4459613263607025, "learning_rate": 2.933737372382175e-06, "loss": 0.3712, "step": 3591 }, { "epoch": 2.016844469399214, "grad_norm": 0.41667911410331726, "learning_rate": 2.930762786641311e-06, "loss": 0.3888, "step": 3592 }, { "epoch": 2.017405951712521, "grad_norm": 0.39950335025787354, "learning_rate": 2.927789084348651e-06, "loss": 0.4046, "step": 3593 }, { "epoch": 2.0179674340258282, "grad_norm": 0.41156840324401855, "learning_rate": 2.924816266773798e-06, "loss": 0.3702, "step": 3594 }, { "epoch": 2.018528916339135, "grad_norm": 0.44587525725364685, "learning_rate": 2.9218443351859806e-06, "loss": 0.3421, "step": 3595 }, { "epoch": 2.0190903986524424, "grad_norm": 0.4177926778793335, "learning_rate": 2.9188732908540464e-06, "loss": 0.3668, "step": 3596 }, { "epoch": 2.0196518809657498, "grad_norm": 0.4064767360687256, "learning_rate": 2.915903135046462e-06, "loss": 0.3668, "step": 3597 }, { "epoch": 2.0202133632790567, "grad_norm": 0.42503485083580017, "learning_rate": 2.9129338690313187e-06, "loss": 0.3992, "step": 3598 }, { "epoch": 2.020774845592364, "grad_norm": 0.4131571650505066, "learning_rate": 2.9099654940763287e-06, "loss": 0.3889, "step": 3599 }, { "epoch": 2.021336327905671, "grad_norm": 0.42438486218452454, "learning_rate": 2.906998011448818e-06, "loss": 0.3874, "step": 3600 }, { "epoch": 2.021897810218978, "grad_norm": 0.398837149143219, "learning_rate": 2.90403142241574e-06, "loss": 0.3612, "step": 3601 }, { "epoch": 2.022459292532285, "grad_norm": 0.40505510568618774, "learning_rate": 2.901065728243656e-06, "loss": 0.3601, "step": 3602 }, { "epoch": 2.0230207748455924, "grad_norm": 0.43105289340019226, "learning_rate": 2.8981009301987538e-06, "loss": 0.3635, "step": 3603 }, { "epoch": 2.0235822571588993, "grad_norm": 0.4310489892959595, "learning_rate": 2.8951370295468385e-06, "loss": 0.3652, "step": 3604 }, { "epoch": 2.0241437394722066, "grad_norm": 0.4167082607746124, "learning_rate": 2.8921740275533238e-06, "loss": 0.3618, "step": 3605 }, { "epoch": 2.024705221785514, "grad_norm": 0.4256590008735657, "learning_rate": 2.8892119254832507e-06, "loss": 0.3452, "step": 3606 }, { "epoch": 2.025266704098821, "grad_norm": 0.35410818457603455, "learning_rate": 2.886250724601266e-06, "loss": 0.3289, "step": 3607 }, { "epoch": 2.025828186412128, "grad_norm": 0.49214109778404236, "learning_rate": 2.8832904261716387e-06, "loss": 0.3946, "step": 3608 }, { "epoch": 2.026389668725435, "grad_norm": 0.44466787576675415, "learning_rate": 2.8803310314582522e-06, "loss": 0.3309, "step": 3609 }, { "epoch": 2.0269511510387423, "grad_norm": 0.4295720160007477, "learning_rate": 2.8773725417245977e-06, "loss": 0.3871, "step": 3610 }, { "epoch": 2.0275126333520492, "grad_norm": 0.39730390906333923, "learning_rate": 2.874414958233789e-06, "loss": 0.4068, "step": 3611 }, { "epoch": 2.0280741156653566, "grad_norm": 0.37221595644950867, "learning_rate": 2.871458282248543e-06, "loss": 0.3948, "step": 3612 }, { "epoch": 2.028635597978664, "grad_norm": 0.4334031343460083, "learning_rate": 2.8685025150311976e-06, "loss": 0.3936, "step": 3613 }, { "epoch": 2.0291970802919708, "grad_norm": 0.41479188203811646, "learning_rate": 2.8655476578437015e-06, "loss": 0.3806, "step": 3614 }, { "epoch": 2.029758562605278, "grad_norm": 0.43695223331451416, "learning_rate": 2.8625937119476084e-06, "loss": 0.3804, "step": 3615 }, { "epoch": 2.030320044918585, "grad_norm": 0.4719362258911133, "learning_rate": 2.8596406786040913e-06, "loss": 0.3869, "step": 3616 }, { "epoch": 2.0308815272318923, "grad_norm": 0.41593649983406067, "learning_rate": 2.8566885590739267e-06, "loss": 0.368, "step": 3617 }, { "epoch": 2.031443009545199, "grad_norm": 0.4208119213581085, "learning_rate": 2.8537373546175057e-06, "loss": 0.3531, "step": 3618 }, { "epoch": 2.0320044918585065, "grad_norm": 0.4499245882034302, "learning_rate": 2.8507870664948288e-06, "loss": 0.4134, "step": 3619 }, { "epoch": 2.0325659741718134, "grad_norm": 0.3951369524002075, "learning_rate": 2.8478376959654995e-06, "loss": 0.4164, "step": 3620 }, { "epoch": 2.0331274564851207, "grad_norm": 0.4206237196922302, "learning_rate": 2.844889244288738e-06, "loss": 0.3853, "step": 3621 }, { "epoch": 2.033688938798428, "grad_norm": 0.42975106835365295, "learning_rate": 2.8419417127233627e-06, "loss": 0.3654, "step": 3622 }, { "epoch": 2.034250421111735, "grad_norm": 0.45321446657180786, "learning_rate": 2.838995102527808e-06, "loss": 0.3757, "step": 3623 }, { "epoch": 2.0348119034250423, "grad_norm": 0.4220580756664276, "learning_rate": 2.8360494149601113e-06, "loss": 0.3835, "step": 3624 }, { "epoch": 2.035373385738349, "grad_norm": 0.4149090051651001, "learning_rate": 2.8331046512779124e-06, "loss": 0.3876, "step": 3625 }, { "epoch": 2.0359348680516565, "grad_norm": 0.4370776116847992, "learning_rate": 2.8301608127384655e-06, "loss": 0.3487, "step": 3626 }, { "epoch": 2.0364963503649633, "grad_norm": 0.43256497383117676, "learning_rate": 2.827217900598619e-06, "loss": 0.3927, "step": 3627 }, { "epoch": 2.0370578326782707, "grad_norm": 0.46356141567230225, "learning_rate": 2.8242759161148336e-06, "loss": 0.3582, "step": 3628 }, { "epoch": 2.0376193149915776, "grad_norm": 0.3963897228240967, "learning_rate": 2.8213348605431745e-06, "loss": 0.3783, "step": 3629 }, { "epoch": 2.038180797304885, "grad_norm": 0.41204026341438293, "learning_rate": 2.8183947351393027e-06, "loss": 0.3573, "step": 3630 }, { "epoch": 2.038742279618192, "grad_norm": 0.4155179262161255, "learning_rate": 2.8154555411584904e-06, "loss": 0.3777, "step": 3631 }, { "epoch": 2.039303761931499, "grad_norm": 0.411766916513443, "learning_rate": 2.8125172798556048e-06, "loss": 0.3863, "step": 3632 }, { "epoch": 2.0398652442448064, "grad_norm": 0.38523682951927185, "learning_rate": 2.8095799524851215e-06, "loss": 0.3928, "step": 3633 }, { "epoch": 2.0404267265581133, "grad_norm": 0.3836558759212494, "learning_rate": 2.8066435603011155e-06, "loss": 0.3892, "step": 3634 }, { "epoch": 2.0409882088714206, "grad_norm": 0.4104357063770294, "learning_rate": 2.803708104557258e-06, "loss": 0.355, "step": 3635 }, { "epoch": 2.0415496911847275, "grad_norm": 0.4100287854671478, "learning_rate": 2.8007735865068277e-06, "loss": 0.363, "step": 3636 }, { "epoch": 2.042111173498035, "grad_norm": 0.4197886884212494, "learning_rate": 2.7978400074026944e-06, "loss": 0.3544, "step": 3637 }, { "epoch": 2.0426726558113417, "grad_norm": 0.3539346158504486, "learning_rate": 2.794907368497336e-06, "loss": 0.3584, "step": 3638 }, { "epoch": 2.043234138124649, "grad_norm": 0.41422179341316223, "learning_rate": 2.7919756710428248e-06, "loss": 0.3546, "step": 3639 }, { "epoch": 2.0437956204379564, "grad_norm": 0.40322476625442505, "learning_rate": 2.7890449162908285e-06, "loss": 0.3837, "step": 3640 }, { "epoch": 2.0443571027512633, "grad_norm": 0.442950576543808, "learning_rate": 2.7861151054926174e-06, "loss": 0.3911, "step": 3641 }, { "epoch": 2.0449185850645706, "grad_norm": 0.400951623916626, "learning_rate": 2.783186239899054e-06, "loss": 0.345, "step": 3642 }, { "epoch": 2.0454800673778775, "grad_norm": 0.39443856477737427, "learning_rate": 2.7802583207606e-06, "loss": 0.3899, "step": 3643 }, { "epoch": 2.046041549691185, "grad_norm": 0.40669161081314087, "learning_rate": 2.7773313493273157e-06, "loss": 0.3861, "step": 3644 }, { "epoch": 2.0466030320044917, "grad_norm": 0.3721672296524048, "learning_rate": 2.774405326848849e-06, "loss": 0.3825, "step": 3645 }, { "epoch": 2.047164514317799, "grad_norm": 0.4091375768184662, "learning_rate": 2.7714802545744525e-06, "loss": 0.3866, "step": 3646 }, { "epoch": 2.0477259966311063, "grad_norm": 0.43060705065727234, "learning_rate": 2.7685561337529632e-06, "loss": 0.3834, "step": 3647 }, { "epoch": 2.048287478944413, "grad_norm": 0.41025668382644653, "learning_rate": 2.7656329656328186e-06, "loss": 0.393, "step": 3648 }, { "epoch": 2.0488489612577205, "grad_norm": 0.3865824043750763, "learning_rate": 2.762710751462049e-06, "loss": 0.3272, "step": 3649 }, { "epoch": 2.0494104435710274, "grad_norm": 0.43237966299057007, "learning_rate": 2.759789492488273e-06, "loss": 0.3895, "step": 3650 }, { "epoch": 2.0499719258843347, "grad_norm": 0.41964367032051086, "learning_rate": 2.7568691899587076e-06, "loss": 0.3774, "step": 3651 }, { "epoch": 2.0505334081976416, "grad_norm": 0.4091278314590454, "learning_rate": 2.753949845120154e-06, "loss": 0.372, "step": 3652 }, { "epoch": 2.051094890510949, "grad_norm": 0.40040433406829834, "learning_rate": 2.7510314592190113e-06, "loss": 0.3919, "step": 3653 }, { "epoch": 2.051656372824256, "grad_norm": 0.4211637079715729, "learning_rate": 2.7481140335012677e-06, "loss": 0.3707, "step": 3654 }, { "epoch": 2.052217855137563, "grad_norm": 0.3700118362903595, "learning_rate": 2.7451975692124977e-06, "loss": 0.3771, "step": 3655 }, { "epoch": 2.0527793374508705, "grad_norm": 0.38852420449256897, "learning_rate": 2.7422820675978713e-06, "loss": 0.3826, "step": 3656 }, { "epoch": 2.0533408197641774, "grad_norm": 0.36223623156547546, "learning_rate": 2.7393675299021395e-06, "loss": 0.3549, "step": 3657 }, { "epoch": 2.0539023020774847, "grad_norm": 0.410321444272995, "learning_rate": 2.736453957369648e-06, "loss": 0.3815, "step": 3658 }, { "epoch": 2.0544637843907916, "grad_norm": 0.391575425863266, "learning_rate": 2.7335413512443325e-06, "loss": 0.3805, "step": 3659 }, { "epoch": 2.055025266704099, "grad_norm": 0.41137444972991943, "learning_rate": 2.730629712769708e-06, "loss": 0.3645, "step": 3660 }, { "epoch": 2.055586749017406, "grad_norm": 0.4215141832828522, "learning_rate": 2.7277190431888836e-06, "loss": 0.3797, "step": 3661 }, { "epoch": 2.056148231330713, "grad_norm": 0.4063601791858673, "learning_rate": 2.724809343744548e-06, "loss": 0.3899, "step": 3662 }, { "epoch": 2.05670971364402, "grad_norm": 0.3956460654735565, "learning_rate": 2.721900615678983e-06, "loss": 0.3827, "step": 3663 }, { "epoch": 2.0572711959573273, "grad_norm": 0.39367470145225525, "learning_rate": 2.718992860234054e-06, "loss": 0.3755, "step": 3664 }, { "epoch": 2.0578326782706347, "grad_norm": 0.443496435880661, "learning_rate": 2.7160860786512043e-06, "loss": 0.3806, "step": 3665 }, { "epoch": 2.0583941605839415, "grad_norm": 0.4136575162410736, "learning_rate": 2.713180272171472e-06, "loss": 0.3858, "step": 3666 }, { "epoch": 2.058955642897249, "grad_norm": 0.38204851746559143, "learning_rate": 2.7102754420354676e-06, "loss": 0.3499, "step": 3667 }, { "epoch": 2.0595171252105557, "grad_norm": 0.40547874569892883, "learning_rate": 2.707371589483394e-06, "loss": 0.3437, "step": 3668 }, { "epoch": 2.060078607523863, "grad_norm": 0.43316957354545593, "learning_rate": 2.704468715755036e-06, "loss": 0.3837, "step": 3669 }, { "epoch": 2.06064008983717, "grad_norm": 0.3966485261917114, "learning_rate": 2.701566822089752e-06, "loss": 0.3446, "step": 3670 }, { "epoch": 2.0612015721504773, "grad_norm": 0.3940685987472534, "learning_rate": 2.6986659097264934e-06, "loss": 0.375, "step": 3671 }, { "epoch": 2.0617630544637846, "grad_norm": 0.46701982617378235, "learning_rate": 2.6957659799037816e-06, "loss": 0.4109, "step": 3672 }, { "epoch": 2.0623245367770915, "grad_norm": 0.4162931442260742, "learning_rate": 2.6928670338597285e-06, "loss": 0.3823, "step": 3673 }, { "epoch": 2.062886019090399, "grad_norm": 0.44016769528388977, "learning_rate": 2.689969072832018e-06, "loss": 0.393, "step": 3674 }, { "epoch": 2.0634475014037057, "grad_norm": 0.43184611201286316, "learning_rate": 2.687072098057919e-06, "loss": 0.3883, "step": 3675 }, { "epoch": 2.064008983717013, "grad_norm": 0.45252731442451477, "learning_rate": 2.684176110774278e-06, "loss": 0.3653, "step": 3676 }, { "epoch": 2.06457046603032, "grad_norm": 0.4017550051212311, "learning_rate": 2.6812811122175157e-06, "loss": 0.3862, "step": 3677 }, { "epoch": 2.0651319483436272, "grad_norm": 0.42190730571746826, "learning_rate": 2.6783871036236386e-06, "loss": 0.3606, "step": 3678 }, { "epoch": 2.065693430656934, "grad_norm": 0.43626898527145386, "learning_rate": 2.6754940862282216e-06, "loss": 0.3427, "step": 3679 }, { "epoch": 2.0662549129702414, "grad_norm": 0.4392480254173279, "learning_rate": 2.6726020612664226e-06, "loss": 0.3712, "step": 3680 }, { "epoch": 2.0668163952835488, "grad_norm": 0.4098392128944397, "learning_rate": 2.6697110299729754e-06, "loss": 0.3608, "step": 3681 }, { "epoch": 2.0673778775968557, "grad_norm": 0.44043686985969543, "learning_rate": 2.6668209935821863e-06, "loss": 0.384, "step": 3682 }, { "epoch": 2.067939359910163, "grad_norm": 0.4281127452850342, "learning_rate": 2.6639319533279404e-06, "loss": 0.3631, "step": 3683 }, { "epoch": 2.06850084222347, "grad_norm": 0.41182994842529297, "learning_rate": 2.6610439104436935e-06, "loss": 0.3527, "step": 3684 }, { "epoch": 2.069062324536777, "grad_norm": 0.4331268072128296, "learning_rate": 2.658156866162479e-06, "loss": 0.3808, "step": 3685 }, { "epoch": 2.069623806850084, "grad_norm": 0.3852330446243286, "learning_rate": 2.6552708217169043e-06, "loss": 0.3595, "step": 3686 }, { "epoch": 2.0701852891633914, "grad_norm": 0.3783239424228668, "learning_rate": 2.652385778339146e-06, "loss": 0.3735, "step": 3687 }, { "epoch": 2.0707467714766983, "grad_norm": 0.445514440536499, "learning_rate": 2.6495017372609586e-06, "loss": 0.338, "step": 3688 }, { "epoch": 2.0713082537900056, "grad_norm": 0.4322548508644104, "learning_rate": 2.646618699713662e-06, "loss": 0.3792, "step": 3689 }, { "epoch": 2.071869736103313, "grad_norm": 0.3616850972175598, "learning_rate": 2.643736666928155e-06, "loss": 0.3802, "step": 3690 }, { "epoch": 2.07243121841662, "grad_norm": 0.39817163348197937, "learning_rate": 2.6408556401349036e-06, "loss": 0.3922, "step": 3691 }, { "epoch": 2.072992700729927, "grad_norm": 0.3996894061565399, "learning_rate": 2.637975620563942e-06, "loss": 0.3477, "step": 3692 }, { "epoch": 2.073554183043234, "grad_norm": 0.42300358414649963, "learning_rate": 2.6350966094448793e-06, "loss": 0.3693, "step": 3693 }, { "epoch": 2.0741156653565413, "grad_norm": 0.40261828899383545, "learning_rate": 2.63221860800689e-06, "loss": 0.3941, "step": 3694 }, { "epoch": 2.0746771476698482, "grad_norm": 0.39712682366371155, "learning_rate": 2.6293416174787206e-06, "loss": 0.3763, "step": 3695 }, { "epoch": 2.0752386299831556, "grad_norm": 0.42990943789482117, "learning_rate": 2.626465639088686e-06, "loss": 0.3773, "step": 3696 }, { "epoch": 2.075800112296463, "grad_norm": 0.40972140431404114, "learning_rate": 2.623590674064664e-06, "loss": 0.3703, "step": 3697 }, { "epoch": 2.0763615946097698, "grad_norm": 0.4952843189239502, "learning_rate": 2.6207167236341074e-06, "loss": 0.377, "step": 3698 }, { "epoch": 2.076923076923077, "grad_norm": 0.4087299108505249, "learning_rate": 2.617843789024028e-06, "loss": 0.3215, "step": 3699 }, { "epoch": 2.077484559236384, "grad_norm": 0.4619285762310028, "learning_rate": 2.6149718714610088e-06, "loss": 0.3468, "step": 3700 }, { "epoch": 2.0780460415496913, "grad_norm": 0.46278154850006104, "learning_rate": 2.612100972171201e-06, "loss": 0.3839, "step": 3701 }, { "epoch": 2.078607523862998, "grad_norm": 0.41581353545188904, "learning_rate": 2.609231092380312e-06, "loss": 0.3683, "step": 3702 }, { "epoch": 2.0791690061763055, "grad_norm": 0.38009214401245117, "learning_rate": 2.606362233313624e-06, "loss": 0.3651, "step": 3703 }, { "epoch": 2.0797304884896124, "grad_norm": 0.3621695935726166, "learning_rate": 2.6034943961959756e-06, "loss": 0.3732, "step": 3704 }, { "epoch": 2.0802919708029197, "grad_norm": 0.478930801153183, "learning_rate": 2.6006275822517734e-06, "loss": 0.3856, "step": 3705 }, { "epoch": 2.080853453116227, "grad_norm": 0.4106801450252533, "learning_rate": 2.5977617927049887e-06, "loss": 0.3732, "step": 3706 }, { "epoch": 2.081414935429534, "grad_norm": 0.4035387635231018, "learning_rate": 2.5948970287791485e-06, "loss": 0.3877, "step": 3707 }, { "epoch": 2.0819764177428413, "grad_norm": 0.39163458347320557, "learning_rate": 2.5920332916973505e-06, "loss": 0.3698, "step": 3708 }, { "epoch": 2.082537900056148, "grad_norm": 0.40635621547698975, "learning_rate": 2.5891705826822456e-06, "loss": 0.345, "step": 3709 }, { "epoch": 2.0830993823694555, "grad_norm": 0.4176384210586548, "learning_rate": 2.586308902956053e-06, "loss": 0.3498, "step": 3710 }, { "epoch": 2.0836608646827623, "grad_norm": 0.39087727665901184, "learning_rate": 2.583448253740549e-06, "loss": 0.3637, "step": 3711 }, { "epoch": 2.0842223469960697, "grad_norm": 0.46871140599250793, "learning_rate": 2.5805886362570688e-06, "loss": 0.3883, "step": 3712 }, { "epoch": 2.0847838293093766, "grad_norm": 0.3695259392261505, "learning_rate": 2.5777300517265113e-06, "loss": 0.3361, "step": 3713 }, { "epoch": 2.085345311622684, "grad_norm": 0.39155060052871704, "learning_rate": 2.574872501369328e-06, "loss": 0.3627, "step": 3714 }, { "epoch": 2.085906793935991, "grad_norm": 0.43659740686416626, "learning_rate": 2.572015986405535e-06, "loss": 0.3594, "step": 3715 }, { "epoch": 2.086468276249298, "grad_norm": 0.39247798919677734, "learning_rate": 2.5691605080547054e-06, "loss": 0.3712, "step": 3716 }, { "epoch": 2.0870297585626054, "grad_norm": 0.42447173595428467, "learning_rate": 2.5663060675359643e-06, "loss": 0.3731, "step": 3717 }, { "epoch": 2.0875912408759123, "grad_norm": 0.42474666237831116, "learning_rate": 2.563452666068002e-06, "loss": 0.3871, "step": 3718 }, { "epoch": 2.0881527231892196, "grad_norm": 0.40810132026672363, "learning_rate": 2.5606003048690566e-06, "loss": 0.3648, "step": 3719 }, { "epoch": 2.0887142055025265, "grad_norm": 0.3978343904018402, "learning_rate": 2.5577489851569283e-06, "loss": 0.3859, "step": 3720 }, { "epoch": 2.089275687815834, "grad_norm": 0.4352272152900696, "learning_rate": 2.554898708148973e-06, "loss": 0.3466, "step": 3721 }, { "epoch": 2.0898371701291407, "grad_norm": 0.45456236600875854, "learning_rate": 2.5520494750620946e-06, "loss": 0.3701, "step": 3722 }, { "epoch": 2.090398652442448, "grad_norm": 0.4466168284416199, "learning_rate": 2.54920128711276e-06, "loss": 0.3913, "step": 3723 }, { "epoch": 2.0909601347557554, "grad_norm": 0.4057765305042267, "learning_rate": 2.5463541455169818e-06, "loss": 0.3545, "step": 3724 }, { "epoch": 2.0915216170690623, "grad_norm": 0.40849098563194275, "learning_rate": 2.543508051490331e-06, "loss": 0.3496, "step": 3725 }, { "epoch": 2.0920830993823696, "grad_norm": 0.41849127411842346, "learning_rate": 2.5406630062479333e-06, "loss": 0.3634, "step": 3726 }, { "epoch": 2.0926445816956765, "grad_norm": 0.39990442991256714, "learning_rate": 2.537819011004459e-06, "loss": 0.3454, "step": 3727 }, { "epoch": 2.093206064008984, "grad_norm": 0.4914538562297821, "learning_rate": 2.5349760669741375e-06, "loss": 0.3607, "step": 3728 }, { "epoch": 2.0937675463222907, "grad_norm": 0.45015382766723633, "learning_rate": 2.532134175370744e-06, "loss": 0.3676, "step": 3729 }, { "epoch": 2.094329028635598, "grad_norm": 0.4117872416973114, "learning_rate": 2.529293337407608e-06, "loss": 0.3878, "step": 3730 }, { "epoch": 2.094890510948905, "grad_norm": 0.3906334936618805, "learning_rate": 2.5264535542976094e-06, "loss": 0.3708, "step": 3731 }, { "epoch": 2.095451993262212, "grad_norm": 0.440542072057724, "learning_rate": 2.5236148272531726e-06, "loss": 0.3492, "step": 3732 }, { "epoch": 2.0960134755755195, "grad_norm": 0.4239747226238251, "learning_rate": 2.520777157486279e-06, "loss": 0.3745, "step": 3733 }, { "epoch": 2.0965749578888264, "grad_norm": 0.42989203333854675, "learning_rate": 2.517940546208452e-06, "loss": 0.3499, "step": 3734 }, { "epoch": 2.0971364402021337, "grad_norm": 0.43307968974113464, "learning_rate": 2.515104994630764e-06, "loss": 0.3739, "step": 3735 }, { "epoch": 2.0976979225154406, "grad_norm": 0.4566024839878082, "learning_rate": 2.5122705039638412e-06, "loss": 0.3958, "step": 3736 }, { "epoch": 2.098259404828748, "grad_norm": 0.43872055411338806, "learning_rate": 2.5094370754178464e-06, "loss": 0.3894, "step": 3737 }, { "epoch": 2.098820887142055, "grad_norm": 0.4284649193286896, "learning_rate": 2.5066047102025004e-06, "loss": 0.3526, "step": 3738 }, { "epoch": 2.099382369455362, "grad_norm": 0.4127212464809418, "learning_rate": 2.5037734095270595e-06, "loss": 0.3664, "step": 3739 }, { "epoch": 2.0999438517686695, "grad_norm": 0.4290589392185211, "learning_rate": 2.500943174600331e-06, "loss": 0.3717, "step": 3740 }, { "epoch": 2.1005053340819764, "grad_norm": 0.41859304904937744, "learning_rate": 2.4981140066306703e-06, "loss": 0.3629, "step": 3741 }, { "epoch": 2.1010668163952837, "grad_norm": 0.3949582874774933, "learning_rate": 2.4952859068259685e-06, "loss": 0.3774, "step": 3742 }, { "epoch": 2.1016282987085906, "grad_norm": 0.4436594545841217, "learning_rate": 2.4924588763936693e-06, "loss": 0.3605, "step": 3743 }, { "epoch": 2.102189781021898, "grad_norm": 0.4634290933609009, "learning_rate": 2.4896329165407524e-06, "loss": 0.3644, "step": 3744 }, { "epoch": 2.102751263335205, "grad_norm": 0.4018929600715637, "learning_rate": 2.4868080284737463e-06, "loss": 0.3647, "step": 3745 }, { "epoch": 2.103312745648512, "grad_norm": 0.4144563674926758, "learning_rate": 2.4839842133987214e-06, "loss": 0.3739, "step": 3746 }, { "epoch": 2.103874227961819, "grad_norm": 0.43237999081611633, "learning_rate": 2.4811614725212845e-06, "loss": 0.3636, "step": 3747 }, { "epoch": 2.1044357102751263, "grad_norm": 0.42221900820732117, "learning_rate": 2.478339807046592e-06, "loss": 0.4122, "step": 3748 }, { "epoch": 2.1049971925884337, "grad_norm": 0.42510274052619934, "learning_rate": 2.475519218179332e-06, "loss": 0.3811, "step": 3749 }, { "epoch": 2.1055586749017405, "grad_norm": 0.44211089611053467, "learning_rate": 2.47269970712374e-06, "loss": 0.3724, "step": 3750 }, { "epoch": 2.106120157215048, "grad_norm": 0.3983185291290283, "learning_rate": 2.4698812750835914e-06, "loss": 0.3584, "step": 3751 }, { "epoch": 2.1066816395283547, "grad_norm": 0.36625826358795166, "learning_rate": 2.4670639232621946e-06, "loss": 0.353, "step": 3752 }, { "epoch": 2.107243121841662, "grad_norm": 0.4182247817516327, "learning_rate": 2.4642476528624037e-06, "loss": 0.3809, "step": 3753 }, { "epoch": 2.107804604154969, "grad_norm": 0.46602779626846313, "learning_rate": 2.4614324650866055e-06, "loss": 0.365, "step": 3754 }, { "epoch": 2.1083660864682763, "grad_norm": 0.43705129623413086, "learning_rate": 2.458618361136728e-06, "loss": 0.357, "step": 3755 }, { "epoch": 2.108927568781583, "grad_norm": 0.438043475151062, "learning_rate": 2.455805342214239e-06, "loss": 0.3742, "step": 3756 }, { "epoch": 2.1094890510948905, "grad_norm": 0.4230178892612457, "learning_rate": 2.4529934095201347e-06, "loss": 0.3864, "step": 3757 }, { "epoch": 2.110050533408198, "grad_norm": 0.42210978269577026, "learning_rate": 2.4501825642549575e-06, "loss": 0.3755, "step": 3758 }, { "epoch": 2.1106120157215047, "grad_norm": 0.4298780858516693, "learning_rate": 2.447372807618776e-06, "loss": 0.3713, "step": 3759 }, { "epoch": 2.111173498034812, "grad_norm": 0.4106406569480896, "learning_rate": 2.4445641408112024e-06, "loss": 0.3716, "step": 3760 }, { "epoch": 2.111734980348119, "grad_norm": 0.40662428736686707, "learning_rate": 2.441756565031377e-06, "loss": 0.3846, "step": 3761 }, { "epoch": 2.1122964626614262, "grad_norm": 0.4278675317764282, "learning_rate": 2.438950081477978e-06, "loss": 0.3755, "step": 3762 }, { "epoch": 2.112857944974733, "grad_norm": 0.4005792737007141, "learning_rate": 2.436144691349219e-06, "loss": 0.3858, "step": 3763 }, { "epoch": 2.1134194272880404, "grad_norm": 0.3821815550327301, "learning_rate": 2.43334039584284e-06, "loss": 0.3867, "step": 3764 }, { "epoch": 2.1139809096013478, "grad_norm": 0.4246648848056793, "learning_rate": 2.430537196156122e-06, "loss": 0.3795, "step": 3765 }, { "epoch": 2.1145423919146547, "grad_norm": 0.3983655869960785, "learning_rate": 2.4277350934858697e-06, "loss": 0.3431, "step": 3766 }, { "epoch": 2.115103874227962, "grad_norm": 0.45658615231513977, "learning_rate": 2.424934089028426e-06, "loss": 0.3907, "step": 3767 }, { "epoch": 2.115665356541269, "grad_norm": 0.42832663655281067, "learning_rate": 2.422134183979665e-06, "loss": 0.3574, "step": 3768 }, { "epoch": 2.116226838854576, "grad_norm": 0.3976607322692871, "learning_rate": 2.4193353795349843e-06, "loss": 0.3989, "step": 3769 }, { "epoch": 2.116788321167883, "grad_norm": 0.4291619658470154, "learning_rate": 2.416537676889321e-06, "loss": 0.3722, "step": 3770 }, { "epoch": 2.1173498034811904, "grad_norm": 0.4532161056995392, "learning_rate": 2.4137410772371333e-06, "loss": 0.4115, "step": 3771 }, { "epoch": 2.1179112857944973, "grad_norm": 0.41942504048347473, "learning_rate": 2.4109455817724137e-06, "loss": 0.3798, "step": 3772 }, { "epoch": 2.1184727681078046, "grad_norm": 0.44134825468063354, "learning_rate": 2.408151191688684e-06, "loss": 0.3872, "step": 3773 }, { "epoch": 2.119034250421112, "grad_norm": 0.41499921679496765, "learning_rate": 2.4053579081789883e-06, "loss": 0.3522, "step": 3774 }, { "epoch": 2.119595732734419, "grad_norm": 0.39995405077934265, "learning_rate": 2.4025657324359063e-06, "loss": 0.3836, "step": 3775 }, { "epoch": 2.120157215047726, "grad_norm": 0.4009915292263031, "learning_rate": 2.3997746656515362e-06, "loss": 0.3915, "step": 3776 }, { "epoch": 2.120718697361033, "grad_norm": 0.4408192038536072, "learning_rate": 2.3969847090175086e-06, "loss": 0.3731, "step": 3777 }, { "epoch": 2.1212801796743403, "grad_norm": 0.42135584354400635, "learning_rate": 2.39419586372498e-06, "loss": 0.396, "step": 3778 }, { "epoch": 2.1218416619876472, "grad_norm": 0.3835914731025696, "learning_rate": 2.391408130964628e-06, "loss": 0.3494, "step": 3779 }, { "epoch": 2.1224031443009546, "grad_norm": 0.4496828615665436, "learning_rate": 2.388621511926662e-06, "loss": 0.3929, "step": 3780 }, { "epoch": 2.1229646266142614, "grad_norm": 0.4406295418739319, "learning_rate": 2.385836007800807e-06, "loss": 0.3875, "step": 3781 }, { "epoch": 2.1235261089275688, "grad_norm": 0.41717806458473206, "learning_rate": 2.3830516197763197e-06, "loss": 0.3758, "step": 3782 }, { "epoch": 2.124087591240876, "grad_norm": 0.5002897381782532, "learning_rate": 2.380268349041979e-06, "loss": 0.3805, "step": 3783 }, { "epoch": 2.124649073554183, "grad_norm": 0.4325953722000122, "learning_rate": 2.377486196786083e-06, "loss": 0.3832, "step": 3784 }, { "epoch": 2.1252105558674903, "grad_norm": 0.40455830097198486, "learning_rate": 2.374705164196456e-06, "loss": 0.3928, "step": 3785 }, { "epoch": 2.125772038180797, "grad_norm": 0.5154212713241577, "learning_rate": 2.371925252460442e-06, "loss": 0.3892, "step": 3786 }, { "epoch": 2.1263335204941045, "grad_norm": 0.4491945505142212, "learning_rate": 2.3691464627649064e-06, "loss": 0.3906, "step": 3787 }, { "epoch": 2.1268950028074114, "grad_norm": 0.3975179195404053, "learning_rate": 2.3663687962962403e-06, "loss": 0.3432, "step": 3788 }, { "epoch": 2.1274564851207187, "grad_norm": 0.4095819294452667, "learning_rate": 2.363592254240347e-06, "loss": 0.3593, "step": 3789 }, { "epoch": 2.128017967434026, "grad_norm": 0.4463280737400055, "learning_rate": 2.360816837782658e-06, "loss": 0.3671, "step": 3790 }, { "epoch": 2.128579449747333, "grad_norm": 0.39784303307533264, "learning_rate": 2.3580425481081164e-06, "loss": 0.3726, "step": 3791 }, { "epoch": 2.1291409320606403, "grad_norm": 0.39655786752700806, "learning_rate": 2.355269386401191e-06, "loss": 0.3535, "step": 3792 }, { "epoch": 2.129702414373947, "grad_norm": 0.4319501221179962, "learning_rate": 2.3524973538458674e-06, "loss": 0.3618, "step": 3793 }, { "epoch": 2.1302638966872545, "grad_norm": 0.4359988570213318, "learning_rate": 2.349726451625644e-06, "loss": 0.4043, "step": 3794 }, { "epoch": 2.1308253790005613, "grad_norm": 0.4050378203392029, "learning_rate": 2.346956680923545e-06, "loss": 0.4056, "step": 3795 }, { "epoch": 2.1313868613138687, "grad_norm": 0.40767160058021545, "learning_rate": 2.3441880429221038e-06, "loss": 0.3749, "step": 3796 }, { "epoch": 2.1319483436271756, "grad_norm": 0.44899752736091614, "learning_rate": 2.341420538803374e-06, "loss": 0.3821, "step": 3797 }, { "epoch": 2.132509825940483, "grad_norm": 0.47530439496040344, "learning_rate": 2.338654169748927e-06, "loss": 0.3611, "step": 3798 }, { "epoch": 2.13307130825379, "grad_norm": 0.40709954500198364, "learning_rate": 2.335888936939844e-06, "loss": 0.3743, "step": 3799 }, { "epoch": 2.133632790567097, "grad_norm": 0.42185601592063904, "learning_rate": 2.3331248415567276e-06, "loss": 0.3869, "step": 3800 }, { "epoch": 2.1341942728804044, "grad_norm": 0.4248482882976532, "learning_rate": 2.330361884779688e-06, "loss": 0.3397, "step": 3801 }, { "epoch": 2.1347557551937113, "grad_norm": 0.39179596304893494, "learning_rate": 2.3276000677883538e-06, "loss": 0.3637, "step": 3802 }, { "epoch": 2.1353172375070186, "grad_norm": 0.3852061927318573, "learning_rate": 2.3248393917618685e-06, "loss": 0.3595, "step": 3803 }, { "epoch": 2.1358787198203255, "grad_norm": 0.49353712797164917, "learning_rate": 2.3220798578788817e-06, "loss": 0.4132, "step": 3804 }, { "epoch": 2.136440202133633, "grad_norm": 0.4294200539588928, "learning_rate": 2.319321467317564e-06, "loss": 0.3593, "step": 3805 }, { "epoch": 2.1370016844469397, "grad_norm": 0.38786447048187256, "learning_rate": 2.3165642212555893e-06, "loss": 0.3744, "step": 3806 }, { "epoch": 2.137563166760247, "grad_norm": 0.4384607672691345, "learning_rate": 2.3138081208701485e-06, "loss": 0.3595, "step": 3807 }, { "epoch": 2.1381246490735544, "grad_norm": 0.4192083179950714, "learning_rate": 2.3110531673379444e-06, "loss": 0.3671, "step": 3808 }, { "epoch": 2.1386861313868613, "grad_norm": 0.45315709710121155, "learning_rate": 2.3082993618351835e-06, "loss": 0.3808, "step": 3809 }, { "epoch": 2.1392476137001686, "grad_norm": 0.4155977666378021, "learning_rate": 2.3055467055375895e-06, "loss": 0.3651, "step": 3810 }, { "epoch": 2.1398090960134755, "grad_norm": 0.39477649331092834, "learning_rate": 2.3027951996203896e-06, "loss": 0.3653, "step": 3811 }, { "epoch": 2.140370578326783, "grad_norm": 0.4242783486843109, "learning_rate": 2.3000448452583234e-06, "loss": 0.3761, "step": 3812 }, { "epoch": 2.1409320606400897, "grad_norm": 0.4134542942047119, "learning_rate": 2.2972956436256405e-06, "loss": 0.384, "step": 3813 }, { "epoch": 2.141493542953397, "grad_norm": 0.4331666827201843, "learning_rate": 2.294547595896091e-06, "loss": 0.3565, "step": 3814 }, { "epoch": 2.1420550252667043, "grad_norm": 0.4036306142807007, "learning_rate": 2.2918007032429417e-06, "loss": 0.3578, "step": 3815 }, { "epoch": 2.142616507580011, "grad_norm": 0.4010465145111084, "learning_rate": 2.289054966838958e-06, "loss": 0.3931, "step": 3816 }, { "epoch": 2.1431779898933185, "grad_norm": 0.4339010417461395, "learning_rate": 2.2863103878564165e-06, "loss": 0.3725, "step": 3817 }, { "epoch": 2.1437394722066254, "grad_norm": 0.4226911664009094, "learning_rate": 2.2835669674671007e-06, "loss": 0.3718, "step": 3818 }, { "epoch": 2.1443009545199327, "grad_norm": 0.40320491790771484, "learning_rate": 2.2808247068422933e-06, "loss": 0.374, "step": 3819 }, { "epoch": 2.1448624368332396, "grad_norm": 0.41361698508262634, "learning_rate": 2.27808360715279e-06, "loss": 0.3743, "step": 3820 }, { "epoch": 2.145423919146547, "grad_norm": 0.4215828776359558, "learning_rate": 2.2753436695688825e-06, "loss": 0.359, "step": 3821 }, { "epoch": 2.145985401459854, "grad_norm": 0.4517790973186493, "learning_rate": 2.2726048952603717e-06, "loss": 0.4158, "step": 3822 }, { "epoch": 2.146546883773161, "grad_norm": 0.4286952018737793, "learning_rate": 2.2698672853965638e-06, "loss": 0.3838, "step": 3823 }, { "epoch": 2.147108366086468, "grad_norm": 0.43176835775375366, "learning_rate": 2.26713084114626e-06, "loss": 0.3509, "step": 3824 }, { "epoch": 2.1476698483997754, "grad_norm": 0.39520078897476196, "learning_rate": 2.264395563677772e-06, "loss": 0.3839, "step": 3825 }, { "epoch": 2.1482313307130827, "grad_norm": 0.4577789902687073, "learning_rate": 2.261661454158907e-06, "loss": 0.3737, "step": 3826 }, { "epoch": 2.1487928130263896, "grad_norm": 0.4080440402030945, "learning_rate": 2.2589285137569765e-06, "loss": 0.3964, "step": 3827 }, { "epoch": 2.149354295339697, "grad_norm": 0.3915026783943176, "learning_rate": 2.2561967436387965e-06, "loss": 0.3548, "step": 3828 }, { "epoch": 2.149915777653004, "grad_norm": 0.41945862770080566, "learning_rate": 2.2534661449706753e-06, "loss": 0.3474, "step": 3829 }, { "epoch": 2.150477259966311, "grad_norm": 0.42879432439804077, "learning_rate": 2.250736718918429e-06, "loss": 0.3755, "step": 3830 }, { "epoch": 2.151038742279618, "grad_norm": 0.4065026342868805, "learning_rate": 2.248008466647365e-06, "loss": 0.3556, "step": 3831 }, { "epoch": 2.1516002245929253, "grad_norm": 0.3798445761203766, "learning_rate": 2.245281389322297e-06, "loss": 0.3718, "step": 3832 }, { "epoch": 2.1521617069062327, "grad_norm": 0.41787001490592957, "learning_rate": 2.2425554881075345e-06, "loss": 0.3982, "step": 3833 }, { "epoch": 2.1527231892195395, "grad_norm": 0.4191994369029999, "learning_rate": 2.239830764166882e-06, "loss": 0.3847, "step": 3834 }, { "epoch": 2.153284671532847, "grad_norm": 0.3964087963104248, "learning_rate": 2.237107218663647e-06, "loss": 0.3691, "step": 3835 }, { "epoch": 2.1538461538461537, "grad_norm": 0.39545735716819763, "learning_rate": 2.2343848527606265e-06, "loss": 0.3658, "step": 3836 }, { "epoch": 2.154407636159461, "grad_norm": 0.3986629247665405, "learning_rate": 2.231663667620121e-06, "loss": 0.3772, "step": 3837 }, { "epoch": 2.154969118472768, "grad_norm": 0.3934970796108246, "learning_rate": 2.228943664403924e-06, "loss": 0.3633, "step": 3838 }, { "epoch": 2.1555306007860753, "grad_norm": 0.4212356209754944, "learning_rate": 2.226224844273322e-06, "loss": 0.3936, "step": 3839 }, { "epoch": 2.156092083099382, "grad_norm": 0.40780508518218994, "learning_rate": 2.223507208389102e-06, "loss": 0.349, "step": 3840 }, { "epoch": 2.1566535654126895, "grad_norm": 0.3696664571762085, "learning_rate": 2.220790757911536e-06, "loss": 0.3319, "step": 3841 }, { "epoch": 2.157215047725997, "grad_norm": 0.38568857312202454, "learning_rate": 2.2180754940004047e-06, "loss": 0.39, "step": 3842 }, { "epoch": 2.1577765300393037, "grad_norm": 0.408987820148468, "learning_rate": 2.215361417814966e-06, "loss": 0.3702, "step": 3843 }, { "epoch": 2.158338012352611, "grad_norm": 0.3611666262149811, "learning_rate": 2.2126485305139837e-06, "loss": 0.3706, "step": 3844 }, { "epoch": 2.158899494665918, "grad_norm": 0.4560571610927582, "learning_rate": 2.2099368332557035e-06, "loss": 0.3756, "step": 3845 }, { "epoch": 2.1594609769792252, "grad_norm": 0.37295109033584595, "learning_rate": 2.2072263271978704e-06, "loss": 0.3703, "step": 3846 }, { "epoch": 2.160022459292532, "grad_norm": 0.3938451111316681, "learning_rate": 2.20451701349772e-06, "loss": 0.3691, "step": 3847 }, { "epoch": 2.1605839416058394, "grad_norm": 0.3735730051994324, "learning_rate": 2.2018088933119743e-06, "loss": 0.3794, "step": 3848 }, { "epoch": 2.1611454239191463, "grad_norm": 0.3772662878036499, "learning_rate": 2.199101967796851e-06, "loss": 0.3665, "step": 3849 }, { "epoch": 2.1617069062324537, "grad_norm": 0.45742130279541016, "learning_rate": 2.1963962381080534e-06, "loss": 0.4002, "step": 3850 }, { "epoch": 2.162268388545761, "grad_norm": 0.38414812088012695, "learning_rate": 2.1936917054007768e-06, "loss": 0.3701, "step": 3851 }, { "epoch": 2.162829870859068, "grad_norm": 0.39677008986473083, "learning_rate": 2.190988370829707e-06, "loss": 0.3521, "step": 3852 }, { "epoch": 2.163391353172375, "grad_norm": 0.4098846912384033, "learning_rate": 2.188286235549013e-06, "loss": 0.3881, "step": 3853 }, { "epoch": 2.163952835485682, "grad_norm": 0.44234219193458557, "learning_rate": 2.185585300712359e-06, "loss": 0.3652, "step": 3854 }, { "epoch": 2.1645143177989894, "grad_norm": 0.40787044167518616, "learning_rate": 2.1828855674728876e-06, "loss": 0.3807, "step": 3855 }, { "epoch": 2.1650758001122963, "grad_norm": 0.45785820484161377, "learning_rate": 2.1801870369832366e-06, "loss": 0.3588, "step": 3856 }, { "epoch": 2.1656372824256036, "grad_norm": 0.43020787835121155, "learning_rate": 2.1774897103955284e-06, "loss": 0.3504, "step": 3857 }, { "epoch": 2.166198764738911, "grad_norm": 0.418121337890625, "learning_rate": 2.1747935888613664e-06, "loss": 0.3779, "step": 3858 }, { "epoch": 2.166760247052218, "grad_norm": 0.4078671932220459, "learning_rate": 2.1720986735318474e-06, "loss": 0.3407, "step": 3859 }, { "epoch": 2.167321729365525, "grad_norm": 0.4019639790058136, "learning_rate": 2.1694049655575442e-06, "loss": 0.362, "step": 3860 }, { "epoch": 2.167883211678832, "grad_norm": 0.4657253623008728, "learning_rate": 2.1667124660885218e-06, "loss": 0.3728, "step": 3861 }, { "epoch": 2.1684446939921393, "grad_norm": 0.4044835865497589, "learning_rate": 2.164021176274329e-06, "loss": 0.3697, "step": 3862 }, { "epoch": 2.1690061763054462, "grad_norm": 0.45186087489128113, "learning_rate": 2.16133109726399e-06, "loss": 0.3693, "step": 3863 }, { "epoch": 2.1695676586187536, "grad_norm": 0.38016849756240845, "learning_rate": 2.1586422302060226e-06, "loss": 0.4127, "step": 3864 }, { "epoch": 2.1701291409320604, "grad_norm": 0.388454794883728, "learning_rate": 2.1559545762484184e-06, "loss": 0.3677, "step": 3865 }, { "epoch": 2.1706906232453678, "grad_norm": 0.3718401789665222, "learning_rate": 2.1532681365386564e-06, "loss": 0.4073, "step": 3866 }, { "epoch": 2.171252105558675, "grad_norm": 0.4030757546424866, "learning_rate": 2.1505829122236976e-06, "loss": 0.3688, "step": 3867 }, { "epoch": 2.171813587871982, "grad_norm": 0.4289921522140503, "learning_rate": 2.1478989044499783e-06, "loss": 0.3669, "step": 3868 }, { "epoch": 2.1723750701852893, "grad_norm": 0.42608267068862915, "learning_rate": 2.1452161143634233e-06, "loss": 0.3715, "step": 3869 }, { "epoch": 2.172936552498596, "grad_norm": 0.4214073717594147, "learning_rate": 2.142534543109429e-06, "loss": 0.3681, "step": 3870 }, { "epoch": 2.1734980348119035, "grad_norm": 0.41704806685447693, "learning_rate": 2.1398541918328786e-06, "loss": 0.3833, "step": 3871 }, { "epoch": 2.1740595171252104, "grad_norm": 0.414375364780426, "learning_rate": 2.1371750616781334e-06, "loss": 0.3842, "step": 3872 }, { "epoch": 2.1746209994385177, "grad_norm": 0.4251266121864319, "learning_rate": 2.1344971537890275e-06, "loss": 0.3807, "step": 3873 }, { "epoch": 2.1751824817518246, "grad_norm": 0.41887015104293823, "learning_rate": 2.1318204693088817e-06, "loss": 0.3755, "step": 3874 }, { "epoch": 2.175743964065132, "grad_norm": 0.45986825227737427, "learning_rate": 2.1291450093804857e-06, "loss": 0.3771, "step": 3875 }, { "epoch": 2.1763054463784393, "grad_norm": 0.44519853591918945, "learning_rate": 2.1264707751461135e-06, "loss": 0.4019, "step": 3876 }, { "epoch": 2.176866928691746, "grad_norm": 0.4152379035949707, "learning_rate": 2.1237977677475137e-06, "loss": 0.3676, "step": 3877 }, { "epoch": 2.1774284110050535, "grad_norm": 0.425370991230011, "learning_rate": 2.121125988325908e-06, "loss": 0.4098, "step": 3878 }, { "epoch": 2.1779898933183603, "grad_norm": 0.3773520588874817, "learning_rate": 2.1184554380219996e-06, "loss": 0.3873, "step": 3879 }, { "epoch": 2.1785513756316677, "grad_norm": 0.3853224515914917, "learning_rate": 2.11578611797596e-06, "loss": 0.3633, "step": 3880 }, { "epoch": 2.1791128579449746, "grad_norm": 0.40829998254776, "learning_rate": 2.1131180293274407e-06, "loss": 0.3805, "step": 3881 }, { "epoch": 2.179674340258282, "grad_norm": 0.38391613960266113, "learning_rate": 2.1104511732155675e-06, "loss": 0.3768, "step": 3882 }, { "epoch": 2.180235822571589, "grad_norm": 0.41371655464172363, "learning_rate": 2.107785550778935e-06, "loss": 0.4004, "step": 3883 }, { "epoch": 2.180797304884896, "grad_norm": 0.40763455629348755, "learning_rate": 2.1051211631556188e-06, "loss": 0.3556, "step": 3884 }, { "epoch": 2.1813587871982034, "grad_norm": 0.4134567379951477, "learning_rate": 2.102458011483159e-06, "loss": 0.3503, "step": 3885 }, { "epoch": 2.1819202695115103, "grad_norm": 0.39921844005584717, "learning_rate": 2.0997960968985724e-06, "loss": 0.3658, "step": 3886 }, { "epoch": 2.1824817518248176, "grad_norm": 0.3986179530620575, "learning_rate": 2.097135420538351e-06, "loss": 0.3645, "step": 3887 }, { "epoch": 2.1830432341381245, "grad_norm": 0.4065936803817749, "learning_rate": 2.0944759835384494e-06, "loss": 0.3793, "step": 3888 }, { "epoch": 2.183604716451432, "grad_norm": 0.3908589482307434, "learning_rate": 2.0918177870343013e-06, "loss": 0.365, "step": 3889 }, { "epoch": 2.1841661987647387, "grad_norm": 0.4189639687538147, "learning_rate": 2.0891608321608054e-06, "loss": 0.3755, "step": 3890 }, { "epoch": 2.184727681078046, "grad_norm": 0.41499781608581543, "learning_rate": 2.086505120052333e-06, "loss": 0.3922, "step": 3891 }, { "epoch": 2.1852891633913534, "grad_norm": 0.41979122161865234, "learning_rate": 2.0838506518427265e-06, "loss": 0.3774, "step": 3892 }, { "epoch": 2.1858506457046603, "grad_norm": 0.4487759470939636, "learning_rate": 2.0811974286652907e-06, "loss": 0.376, "step": 3893 }, { "epoch": 2.1864121280179676, "grad_norm": 0.40442222356796265, "learning_rate": 2.0785454516528073e-06, "loss": 0.365, "step": 3894 }, { "epoch": 2.1869736103312745, "grad_norm": 0.44954559206962585, "learning_rate": 2.075894721937518e-06, "loss": 0.3682, "step": 3895 }, { "epoch": 2.187535092644582, "grad_norm": 0.42590683698654175, "learning_rate": 2.073245240651137e-06, "loss": 0.3636, "step": 3896 }, { "epoch": 2.1880965749578887, "grad_norm": 0.4644666314125061, "learning_rate": 2.0705970089248468e-06, "loss": 0.3641, "step": 3897 }, { "epoch": 2.188658057271196, "grad_norm": 0.4144204258918762, "learning_rate": 2.0679500278892893e-06, "loss": 0.3817, "step": 3898 }, { "epoch": 2.189219539584503, "grad_norm": 0.4200987219810486, "learning_rate": 2.0653042986745814e-06, "loss": 0.3635, "step": 3899 }, { "epoch": 2.18978102189781, "grad_norm": 0.42575129866600037, "learning_rate": 2.0626598224102963e-06, "loss": 0.395, "step": 3900 }, { "epoch": 2.1903425042111175, "grad_norm": 0.4583381414413452, "learning_rate": 2.06001660022548e-06, "loss": 0.3706, "step": 3901 }, { "epoch": 2.1909039865244244, "grad_norm": 0.3946093022823334, "learning_rate": 2.057374633248641e-06, "loss": 0.3586, "step": 3902 }, { "epoch": 2.1914654688377317, "grad_norm": 0.4097551703453064, "learning_rate": 2.054733922607748e-06, "loss": 0.3596, "step": 3903 }, { "epoch": 2.1920269511510386, "grad_norm": 0.419822633266449, "learning_rate": 2.0520944694302393e-06, "loss": 0.3828, "step": 3904 }, { "epoch": 2.192588433464346, "grad_norm": 0.43669959902763367, "learning_rate": 2.0494562748430104e-06, "loss": 0.3913, "step": 3905 }, { "epoch": 2.193149915777653, "grad_norm": 0.43360352516174316, "learning_rate": 2.0468193399724235e-06, "loss": 0.3672, "step": 3906 }, { "epoch": 2.19371139809096, "grad_norm": 0.4065428078174591, "learning_rate": 2.044183665944304e-06, "loss": 0.3822, "step": 3907 }, { "epoch": 2.1942728804042675, "grad_norm": 0.4031980633735657, "learning_rate": 2.041549253883933e-06, "loss": 0.3905, "step": 3908 }, { "epoch": 2.1948343627175744, "grad_norm": 0.36918118596076965, "learning_rate": 2.038916104916061e-06, "loss": 0.3933, "step": 3909 }, { "epoch": 2.1953958450308817, "grad_norm": 0.40407389402389526, "learning_rate": 2.0362842201648904e-06, "loss": 0.3811, "step": 3910 }, { "epoch": 2.1959573273441886, "grad_norm": 0.3857789635658264, "learning_rate": 2.033653600754091e-06, "loss": 0.3704, "step": 3911 }, { "epoch": 2.196518809657496, "grad_norm": 0.4340645968914032, "learning_rate": 2.0310242478067905e-06, "loss": 0.392, "step": 3912 }, { "epoch": 2.197080291970803, "grad_norm": 0.4336501657962799, "learning_rate": 2.028396162445572e-06, "loss": 0.3703, "step": 3913 }, { "epoch": 2.19764177428411, "grad_norm": 0.4039976894855499, "learning_rate": 2.0257693457924847e-06, "loss": 0.3939, "step": 3914 }, { "epoch": 2.198203256597417, "grad_norm": 0.41305845975875854, "learning_rate": 2.0231437989690273e-06, "loss": 0.3568, "step": 3915 }, { "epoch": 2.1987647389107243, "grad_norm": 0.4200856685638428, "learning_rate": 2.020519523096164e-06, "loss": 0.3708, "step": 3916 }, { "epoch": 2.199326221224031, "grad_norm": 0.40082845091819763, "learning_rate": 2.0178965192943134e-06, "loss": 0.3524, "step": 3917 }, { "epoch": 2.1998877035373385, "grad_norm": 0.36408403515815735, "learning_rate": 2.015274788683349e-06, "loss": 0.3337, "step": 3918 }, { "epoch": 2.200449185850646, "grad_norm": 0.4306933283805847, "learning_rate": 2.0126543323826054e-06, "loss": 0.4122, "step": 3919 }, { "epoch": 2.2010106681639527, "grad_norm": 0.456891804933548, "learning_rate": 2.0100351515108675e-06, "loss": 0.3609, "step": 3920 }, { "epoch": 2.20157215047726, "grad_norm": 0.46365076303482056, "learning_rate": 2.00741724718638e-06, "loss": 0.3832, "step": 3921 }, { "epoch": 2.202133632790567, "grad_norm": 0.3865043520927429, "learning_rate": 2.0048006205268426e-06, "loss": 0.3489, "step": 3922 }, { "epoch": 2.2026951151038743, "grad_norm": 0.41111260652542114, "learning_rate": 2.0021852726494058e-06, "loss": 0.3918, "step": 3923 }, { "epoch": 2.203256597417181, "grad_norm": 0.4015122056007385, "learning_rate": 1.999571204670678e-06, "loss": 0.362, "step": 3924 }, { "epoch": 2.2038180797304885, "grad_norm": 0.3993726372718811, "learning_rate": 1.9969584177067176e-06, "loss": 0.334, "step": 3925 }, { "epoch": 2.204379562043796, "grad_norm": 0.4358080327510834, "learning_rate": 1.9943469128730416e-06, "loss": 0.379, "step": 3926 }, { "epoch": 2.2049410443571027, "grad_norm": 0.43146243691444397, "learning_rate": 1.9917366912846113e-06, "loss": 0.3523, "step": 3927 }, { "epoch": 2.20550252667041, "grad_norm": 0.40033748745918274, "learning_rate": 1.989127754055848e-06, "loss": 0.3623, "step": 3928 }, { "epoch": 2.206064008983717, "grad_norm": 0.43824058771133423, "learning_rate": 1.986520102300622e-06, "loss": 0.3855, "step": 3929 }, { "epoch": 2.2066254912970242, "grad_norm": 0.40925702452659607, "learning_rate": 1.983913737132252e-06, "loss": 0.3599, "step": 3930 }, { "epoch": 2.207186973610331, "grad_norm": 0.40451398491859436, "learning_rate": 1.9813086596635124e-06, "loss": 0.3638, "step": 3931 }, { "epoch": 2.2077484559236384, "grad_norm": 0.3817262351512909, "learning_rate": 1.978704871006622e-06, "loss": 0.3674, "step": 3932 }, { "epoch": 2.2083099382369458, "grad_norm": 0.3877817690372467, "learning_rate": 1.9761023722732543e-06, "loss": 0.3617, "step": 3933 }, { "epoch": 2.2088714205502527, "grad_norm": 0.4256679117679596, "learning_rate": 1.9735011645745317e-06, "loss": 0.3958, "step": 3934 }, { "epoch": 2.20943290286356, "grad_norm": 0.42751333117485046, "learning_rate": 1.9709012490210206e-06, "loss": 0.3392, "step": 3935 }, { "epoch": 2.209994385176867, "grad_norm": 0.40624648332595825, "learning_rate": 1.968302626722743e-06, "loss": 0.3822, "step": 3936 }, { "epoch": 2.210555867490174, "grad_norm": 0.42358165979385376, "learning_rate": 1.96570529878916e-06, "loss": 0.4001, "step": 3937 }, { "epoch": 2.211117349803481, "grad_norm": 0.42347338795661926, "learning_rate": 1.9631092663291886e-06, "loss": 0.3838, "step": 3938 }, { "epoch": 2.2116788321167884, "grad_norm": 0.39355942606925964, "learning_rate": 1.960514530451189e-06, "loss": 0.3825, "step": 3939 }, { "epoch": 2.2122403144300953, "grad_norm": 0.44177594780921936, "learning_rate": 1.9579210922629654e-06, "loss": 0.3533, "step": 3940 }, { "epoch": 2.2128017967434026, "grad_norm": 0.42369651794433594, "learning_rate": 1.955328952871774e-06, "loss": 0.3634, "step": 3941 }, { "epoch": 2.2133632790567095, "grad_norm": 0.4328472912311554, "learning_rate": 1.9527381133843086e-06, "loss": 0.3886, "step": 3942 }, { "epoch": 2.213924761370017, "grad_norm": 0.44137340784072876, "learning_rate": 1.9501485749067145e-06, "loss": 0.3772, "step": 3943 }, { "epoch": 2.214486243683324, "grad_norm": 0.40810632705688477, "learning_rate": 1.9475603385445813e-06, "loss": 0.3773, "step": 3944 }, { "epoch": 2.215047725996631, "grad_norm": 0.402252197265625, "learning_rate": 1.9449734054029374e-06, "loss": 0.3446, "step": 3945 }, { "epoch": 2.2156092083099383, "grad_norm": 0.39889445900917053, "learning_rate": 1.9423877765862602e-06, "loss": 0.4041, "step": 3946 }, { "epoch": 2.2161706906232452, "grad_norm": 0.4484409689903259, "learning_rate": 1.9398034531984668e-06, "loss": 0.3842, "step": 3947 }, { "epoch": 2.2167321729365526, "grad_norm": 0.3822300136089325, "learning_rate": 1.937220436342918e-06, "loss": 0.3688, "step": 3948 }, { "epoch": 2.2172936552498594, "grad_norm": 0.41135579347610474, "learning_rate": 1.9346387271224205e-06, "loss": 0.3676, "step": 3949 }, { "epoch": 2.2178551375631668, "grad_norm": 0.3734052777290344, "learning_rate": 1.932058326639215e-06, "loss": 0.3632, "step": 3950 }, { "epoch": 2.218416619876474, "grad_norm": 0.38998353481292725, "learning_rate": 1.929479235994991e-06, "loss": 0.3822, "step": 3951 }, { "epoch": 2.218978102189781, "grad_norm": 0.4077869951725006, "learning_rate": 1.9269014562908723e-06, "loss": 0.3533, "step": 3952 }, { "epoch": 2.2195395845030883, "grad_norm": 0.39747166633605957, "learning_rate": 1.9243249886274286e-06, "loss": 0.3601, "step": 3953 }, { "epoch": 2.220101066816395, "grad_norm": 0.39216986298561096, "learning_rate": 1.9217498341046673e-06, "loss": 0.3722, "step": 3954 }, { "epoch": 2.2206625491297025, "grad_norm": 0.4634299874305725, "learning_rate": 1.9191759938220334e-06, "loss": 0.3663, "step": 3955 }, { "epoch": 2.2212240314430094, "grad_norm": 0.3813633322715759, "learning_rate": 1.916603468878414e-06, "loss": 0.3913, "step": 3956 }, { "epoch": 2.2217855137563167, "grad_norm": 0.40096795558929443, "learning_rate": 1.91403226037213e-06, "loss": 0.3784, "step": 3957 }, { "epoch": 2.2223469960696236, "grad_norm": 0.3928035497665405, "learning_rate": 1.9114623694009456e-06, "loss": 0.3546, "step": 3958 }, { "epoch": 2.222908478382931, "grad_norm": 0.38430699706077576, "learning_rate": 1.908893797062061e-06, "loss": 0.3484, "step": 3959 }, { "epoch": 2.2234699606962383, "grad_norm": 0.40546920895576477, "learning_rate": 1.9063265444521096e-06, "loss": 0.3453, "step": 3960 }, { "epoch": 2.224031443009545, "grad_norm": 0.41043519973754883, "learning_rate": 1.9037606126671676e-06, "loss": 0.3656, "step": 3961 }, { "epoch": 2.2245929253228525, "grad_norm": 0.3749231696128845, "learning_rate": 1.90119600280274e-06, "loss": 0.3715, "step": 3962 }, { "epoch": 2.2251544076361593, "grad_norm": 0.4119007885456085, "learning_rate": 1.8986327159537732e-06, "loss": 0.3746, "step": 3963 }, { "epoch": 2.2257158899494667, "grad_norm": 0.41415277123451233, "learning_rate": 1.8960707532146488e-06, "loss": 0.3828, "step": 3964 }, { "epoch": 2.2262773722627736, "grad_norm": 0.4060900807380676, "learning_rate": 1.8935101156791774e-06, "loss": 0.3653, "step": 3965 }, { "epoch": 2.226838854576081, "grad_norm": 0.38441288471221924, "learning_rate": 1.8909508044406122e-06, "loss": 0.3473, "step": 3966 }, { "epoch": 2.2274003368893878, "grad_norm": 0.39315760135650635, "learning_rate": 1.8883928205916303e-06, "loss": 0.3568, "step": 3967 }, { "epoch": 2.227961819202695, "grad_norm": 0.38364508748054504, "learning_rate": 1.88583616522435e-06, "loss": 0.3734, "step": 3968 }, { "epoch": 2.2285233015160024, "grad_norm": 0.3765764832496643, "learning_rate": 1.8832808394303214e-06, "loss": 0.3724, "step": 3969 }, { "epoch": 2.2290847838293093, "grad_norm": 0.4118582010269165, "learning_rate": 1.8807268443005222e-06, "loss": 0.3705, "step": 3970 }, { "epoch": 2.2296462661426166, "grad_norm": 0.3722091615200043, "learning_rate": 1.8781741809253678e-06, "loss": 0.3808, "step": 3971 }, { "epoch": 2.2302077484559235, "grad_norm": 0.37003788352012634, "learning_rate": 1.875622850394699e-06, "loss": 0.3349, "step": 3972 }, { "epoch": 2.230769230769231, "grad_norm": 0.44157862663269043, "learning_rate": 1.8730728537977922e-06, "loss": 0.3437, "step": 3973 }, { "epoch": 2.2313307130825377, "grad_norm": 0.41399943828582764, "learning_rate": 1.8705241922233552e-06, "loss": 0.3701, "step": 3974 }, { "epoch": 2.231892195395845, "grad_norm": 0.37345439195632935, "learning_rate": 1.86797686675952e-06, "loss": 0.3575, "step": 3975 }, { "epoch": 2.2324536777091524, "grad_norm": 0.400236040353775, "learning_rate": 1.8654308784938547e-06, "loss": 0.3591, "step": 3976 }, { "epoch": 2.2330151600224593, "grad_norm": 0.40649959444999695, "learning_rate": 1.8628862285133502e-06, "loss": 0.36, "step": 3977 }, { "epoch": 2.2335766423357666, "grad_norm": 0.3761659860610962, "learning_rate": 1.8603429179044314e-06, "loss": 0.3569, "step": 3978 }, { "epoch": 2.2341381246490735, "grad_norm": 0.4074244797229767, "learning_rate": 1.8578009477529502e-06, "loss": 0.3677, "step": 3979 }, { "epoch": 2.234699606962381, "grad_norm": 0.40121424198150635, "learning_rate": 1.8552603191441826e-06, "loss": 0.3894, "step": 3980 }, { "epoch": 2.2352610892756877, "grad_norm": 0.41565680503845215, "learning_rate": 1.852721033162837e-06, "loss": 0.3865, "step": 3981 }, { "epoch": 2.235822571588995, "grad_norm": 0.3841438889503479, "learning_rate": 1.8501830908930435e-06, "loss": 0.3946, "step": 3982 }, { "epoch": 2.236384053902302, "grad_norm": 0.39984527230262756, "learning_rate": 1.847646493418363e-06, "loss": 0.3615, "step": 3983 }, { "epoch": 2.236945536215609, "grad_norm": 0.4810103476047516, "learning_rate": 1.845111241821781e-06, "loss": 0.3999, "step": 3984 }, { "epoch": 2.2375070185289165, "grad_norm": 0.3849307894706726, "learning_rate": 1.8425773371857048e-06, "loss": 0.3663, "step": 3985 }, { "epoch": 2.2380685008422234, "grad_norm": 0.3961506187915802, "learning_rate": 1.840044780591973e-06, "loss": 0.3552, "step": 3986 }, { "epoch": 2.2386299831555307, "grad_norm": 0.3873623013496399, "learning_rate": 1.8375135731218412e-06, "loss": 0.3787, "step": 3987 }, { "epoch": 2.2391914654688376, "grad_norm": 0.4114799201488495, "learning_rate": 1.8349837158559957e-06, "loss": 0.3634, "step": 3988 }, { "epoch": 2.239752947782145, "grad_norm": 0.4284598231315613, "learning_rate": 1.8324552098745447e-06, "loss": 0.3978, "step": 3989 }, { "epoch": 2.240314430095452, "grad_norm": 0.40930745005607605, "learning_rate": 1.829928056257015e-06, "loss": 0.3659, "step": 3990 }, { "epoch": 2.240875912408759, "grad_norm": 0.4014183580875397, "learning_rate": 1.827402256082363e-06, "loss": 0.3647, "step": 3991 }, { "epoch": 2.241437394722066, "grad_norm": 0.4601297080516815, "learning_rate": 1.8248778104289604e-06, "loss": 0.3911, "step": 3992 }, { "epoch": 2.2419988770353734, "grad_norm": 0.3704639971256256, "learning_rate": 1.822354720374605e-06, "loss": 0.3825, "step": 3993 }, { "epoch": 2.2425603593486807, "grad_norm": 0.39615121483802795, "learning_rate": 1.819832986996517e-06, "loss": 0.3924, "step": 3994 }, { "epoch": 2.2431218416619876, "grad_norm": 0.4063425362110138, "learning_rate": 1.8173126113713313e-06, "loss": 0.3383, "step": 3995 }, { "epoch": 2.243683323975295, "grad_norm": 0.3922058641910553, "learning_rate": 1.814793594575111e-06, "loss": 0.3277, "step": 3996 }, { "epoch": 2.244244806288602, "grad_norm": 0.39006292819976807, "learning_rate": 1.8122759376833303e-06, "loss": 0.344, "step": 3997 }, { "epoch": 2.244806288601909, "grad_norm": 0.42032480239868164, "learning_rate": 1.8097596417708902e-06, "loss": 0.3697, "step": 3998 }, { "epoch": 2.245367770915216, "grad_norm": 0.38420626521110535, "learning_rate": 1.8072447079121091e-06, "loss": 0.378, "step": 3999 }, { "epoch": 2.2459292532285233, "grad_norm": 0.40934881567955017, "learning_rate": 1.8047311371807196e-06, "loss": 0.3942, "step": 4000 }, { "epoch": 2.2464907355418307, "grad_norm": 0.41350114345550537, "learning_rate": 1.8022189306498788e-06, "loss": 0.3712, "step": 4001 }, { "epoch": 2.2470522178551375, "grad_norm": 0.4309745132923126, "learning_rate": 1.7997080893921537e-06, "loss": 0.3771, "step": 4002 }, { "epoch": 2.247613700168445, "grad_norm": 0.4252851605415344, "learning_rate": 1.7971986144795356e-06, "loss": 0.3835, "step": 4003 }, { "epoch": 2.2481751824817517, "grad_norm": 0.4331389367580414, "learning_rate": 1.7946905069834303e-06, "loss": 0.3905, "step": 4004 }, { "epoch": 2.248736664795059, "grad_norm": 0.4249155819416046, "learning_rate": 1.7921837679746563e-06, "loss": 0.3583, "step": 4005 }, { "epoch": 2.249298147108366, "grad_norm": 0.3749099671840668, "learning_rate": 1.7896783985234528e-06, "loss": 0.3769, "step": 4006 }, { "epoch": 2.2498596294216733, "grad_norm": 0.38840413093566895, "learning_rate": 1.78717439969947e-06, "loss": 0.3431, "step": 4007 }, { "epoch": 2.25042111173498, "grad_norm": 0.41430407762527466, "learning_rate": 1.7846717725717755e-06, "loss": 0.3808, "step": 4008 }, { "epoch": 2.2509825940482875, "grad_norm": 0.4123918414115906, "learning_rate": 1.7821705182088534e-06, "loss": 0.367, "step": 4009 }, { "epoch": 2.2515440763615944, "grad_norm": 0.4341263175010681, "learning_rate": 1.7796706376785943e-06, "loss": 0.3678, "step": 4010 }, { "epoch": 2.2521055586749017, "grad_norm": 0.410606324672699, "learning_rate": 1.7771721320483116e-06, "loss": 0.371, "step": 4011 }, { "epoch": 2.252667040988209, "grad_norm": 0.4215277433395386, "learning_rate": 1.774675002384723e-06, "loss": 0.3798, "step": 4012 }, { "epoch": 2.253228523301516, "grad_norm": 0.36557039618492126, "learning_rate": 1.7721792497539664e-06, "loss": 0.3677, "step": 4013 }, { "epoch": 2.2537900056148232, "grad_norm": 0.40857207775115967, "learning_rate": 1.7696848752215845e-06, "loss": 0.3883, "step": 4014 }, { "epoch": 2.25435148792813, "grad_norm": 0.38743147253990173, "learning_rate": 1.7671918798525373e-06, "loss": 0.3679, "step": 4015 }, { "epoch": 2.2549129702414374, "grad_norm": 0.42269188165664673, "learning_rate": 1.7647002647111948e-06, "loss": 0.3603, "step": 4016 }, { "epoch": 2.2554744525547443, "grad_norm": 0.4551185071468353, "learning_rate": 1.7622100308613349e-06, "loss": 0.3952, "step": 4017 }, { "epoch": 2.2560359348680517, "grad_norm": 0.40716949105262756, "learning_rate": 1.75972117936615e-06, "loss": 0.3902, "step": 4018 }, { "epoch": 2.256597417181359, "grad_norm": 0.3870183527469635, "learning_rate": 1.7572337112882376e-06, "loss": 0.4109, "step": 4019 }, { "epoch": 2.257158899494666, "grad_norm": 0.39178207516670227, "learning_rate": 1.754747627689608e-06, "loss": 0.3737, "step": 4020 }, { "epoch": 2.257720381807973, "grad_norm": 0.43436795473098755, "learning_rate": 1.7522629296316812e-06, "loss": 0.4163, "step": 4021 }, { "epoch": 2.25828186412128, "grad_norm": 0.40286368131637573, "learning_rate": 1.7497796181752807e-06, "loss": 0.3588, "step": 4022 }, { "epoch": 2.2588433464345874, "grad_norm": 0.3755340278148651, "learning_rate": 1.7472976943806442e-06, "loss": 0.3608, "step": 4023 }, { "epoch": 2.2594048287478943, "grad_norm": 0.41586896777153015, "learning_rate": 1.744817159307411e-06, "loss": 0.3776, "step": 4024 }, { "epoch": 2.2599663110612016, "grad_norm": 0.4127662479877472, "learning_rate": 1.7423380140146324e-06, "loss": 0.3952, "step": 4025 }, { "epoch": 2.260527793374509, "grad_norm": 0.40173107385635376, "learning_rate": 1.739860259560765e-06, "loss": 0.3852, "step": 4026 }, { "epoch": 2.261089275687816, "grad_norm": 0.44462013244628906, "learning_rate": 1.7373838970036683e-06, "loss": 0.3578, "step": 4027 }, { "epoch": 2.261650758001123, "grad_norm": 0.4231359660625458, "learning_rate": 1.7349089274006131e-06, "loss": 0.3771, "step": 4028 }, { "epoch": 2.26221224031443, "grad_norm": 0.43324825167655945, "learning_rate": 1.73243535180827e-06, "loss": 0.3613, "step": 4029 }, { "epoch": 2.2627737226277373, "grad_norm": 0.3920248746871948, "learning_rate": 1.729963171282718e-06, "loss": 0.3841, "step": 4030 }, { "epoch": 2.2633352049410442, "grad_norm": 0.4618297517299652, "learning_rate": 1.7274923868794414e-06, "loss": 0.3812, "step": 4031 }, { "epoch": 2.2638966872543516, "grad_norm": 0.4142466187477112, "learning_rate": 1.725022999653323e-06, "loss": 0.3671, "step": 4032 }, { "epoch": 2.2644581695676584, "grad_norm": 0.41090917587280273, "learning_rate": 1.7225550106586559e-06, "loss": 0.3631, "step": 4033 }, { "epoch": 2.2650196518809658, "grad_norm": 0.449215292930603, "learning_rate": 1.7200884209491303e-06, "loss": 0.4121, "step": 4034 }, { "epoch": 2.2655811341942727, "grad_norm": 0.393142968416214, "learning_rate": 1.7176232315778425e-06, "loss": 0.3641, "step": 4035 }, { "epoch": 2.26614261650758, "grad_norm": 0.4021137058734894, "learning_rate": 1.715159443597292e-06, "loss": 0.3901, "step": 4036 }, { "epoch": 2.2667040988208873, "grad_norm": 0.41598859429359436, "learning_rate": 1.712697058059375e-06, "loss": 0.4065, "step": 4037 }, { "epoch": 2.267265581134194, "grad_norm": 0.4831911325454712, "learning_rate": 1.7102360760153946e-06, "loss": 0.3867, "step": 4038 }, { "epoch": 2.2678270634475015, "grad_norm": 0.37714678049087524, "learning_rate": 1.7077764985160494e-06, "loss": 0.3469, "step": 4039 }, { "epoch": 2.2683885457608084, "grad_norm": 0.3736817240715027, "learning_rate": 1.7053183266114426e-06, "loss": 0.3607, "step": 4040 }, { "epoch": 2.2689500280741157, "grad_norm": 0.4086097180843353, "learning_rate": 1.7028615613510774e-06, "loss": 0.3825, "step": 4041 }, { "epoch": 2.2695115103874226, "grad_norm": 0.4217855632305145, "learning_rate": 1.7004062037838509e-06, "loss": 0.3772, "step": 4042 }, { "epoch": 2.27007299270073, "grad_norm": 0.37547221779823303, "learning_rate": 1.6979522549580668e-06, "loss": 0.3731, "step": 4043 }, { "epoch": 2.2706344750140373, "grad_norm": 0.4054681062698364, "learning_rate": 1.6954997159214205e-06, "loss": 0.3546, "step": 4044 }, { "epoch": 2.271195957327344, "grad_norm": 0.4064677953720093, "learning_rate": 1.693048587721009e-06, "loss": 0.3767, "step": 4045 }, { "epoch": 2.2717574396406515, "grad_norm": 0.39790475368499756, "learning_rate": 1.69059887140333e-06, "loss": 0.3685, "step": 4046 }, { "epoch": 2.2723189219539583, "grad_norm": 0.41290637850761414, "learning_rate": 1.6881505680142702e-06, "loss": 0.3777, "step": 4047 }, { "epoch": 2.2728804042672657, "grad_norm": 0.3956133723258972, "learning_rate": 1.6857036785991215e-06, "loss": 0.3791, "step": 4048 }, { "epoch": 2.2734418865805726, "grad_norm": 0.4238951504230499, "learning_rate": 1.6832582042025659e-06, "loss": 0.3518, "step": 4049 }, { "epoch": 2.27400336889388, "grad_norm": 0.40449199080467224, "learning_rate": 1.6808141458686838e-06, "loss": 0.3942, "step": 4050 }, { "epoch": 2.274564851207187, "grad_norm": 0.4223158359527588, "learning_rate": 1.6783715046409543e-06, "loss": 0.3717, "step": 4051 }, { "epoch": 2.275126333520494, "grad_norm": 0.3990144729614258, "learning_rate": 1.6759302815622435e-06, "loss": 0.3735, "step": 4052 }, { "epoch": 2.2756878158338014, "grad_norm": 0.39125847816467285, "learning_rate": 1.6734904776748212e-06, "loss": 0.3588, "step": 4053 }, { "epoch": 2.2762492981471083, "grad_norm": 0.3848765790462494, "learning_rate": 1.6710520940203428e-06, "loss": 0.3698, "step": 4054 }, { "epoch": 2.2768107804604156, "grad_norm": 0.4098925292491913, "learning_rate": 1.668615131639863e-06, "loss": 0.352, "step": 4055 }, { "epoch": 2.2773722627737225, "grad_norm": 0.4036547541618347, "learning_rate": 1.6661795915738299e-06, "loss": 0.3652, "step": 4056 }, { "epoch": 2.27793374508703, "grad_norm": 0.42741212248802185, "learning_rate": 1.663745474862079e-06, "loss": 0.3644, "step": 4057 }, { "epoch": 2.2784952274003367, "grad_norm": 0.39835381507873535, "learning_rate": 1.6613127825438452e-06, "loss": 0.3974, "step": 4058 }, { "epoch": 2.279056709713644, "grad_norm": 0.4086603820323944, "learning_rate": 1.6588815156577476e-06, "loss": 0.3805, "step": 4059 }, { "epoch": 2.279618192026951, "grad_norm": 0.40900343656539917, "learning_rate": 1.6564516752418036e-06, "loss": 0.3732, "step": 4060 }, { "epoch": 2.2801796743402583, "grad_norm": 0.3963452875614166, "learning_rate": 1.6540232623334191e-06, "loss": 0.3513, "step": 4061 }, { "epoch": 2.2807411566535656, "grad_norm": 0.42801156640052795, "learning_rate": 1.6515962779693872e-06, "loss": 0.3811, "step": 4062 }, { "epoch": 2.2813026389668725, "grad_norm": 0.42403122782707214, "learning_rate": 1.6491707231858983e-06, "loss": 0.4036, "step": 4063 }, { "epoch": 2.28186412128018, "grad_norm": 0.43091055750846863, "learning_rate": 1.6467465990185238e-06, "loss": 0.4042, "step": 4064 }, { "epoch": 2.2824256035934867, "grad_norm": 0.409035861492157, "learning_rate": 1.644323906502231e-06, "loss": 0.4074, "step": 4065 }, { "epoch": 2.282987085906794, "grad_norm": 0.38261500000953674, "learning_rate": 1.641902646671375e-06, "loss": 0.3873, "step": 4066 }, { "epoch": 2.283548568220101, "grad_norm": 0.40656793117523193, "learning_rate": 1.6394828205596953e-06, "loss": 0.37, "step": 4067 }, { "epoch": 2.284110050533408, "grad_norm": 0.3800978660583496, "learning_rate": 1.6370644292003241e-06, "loss": 0.3706, "step": 4068 }, { "epoch": 2.2846715328467155, "grad_norm": 0.4056176543235779, "learning_rate": 1.6346474736257762e-06, "loss": 0.3809, "step": 4069 }, { "epoch": 2.2852330151600224, "grad_norm": 0.40927958488464355, "learning_rate": 1.632231954867957e-06, "loss": 0.4185, "step": 4070 }, { "epoch": 2.2857944974733297, "grad_norm": 0.3854992985725403, "learning_rate": 1.6298178739581605e-06, "loss": 0.3687, "step": 4071 }, { "epoch": 2.2863559797866366, "grad_norm": 0.4141371250152588, "learning_rate": 1.62740523192706e-06, "loss": 0.3663, "step": 4072 }, { "epoch": 2.286917462099944, "grad_norm": 0.3834075629711151, "learning_rate": 1.6249940298047206e-06, "loss": 0.3974, "step": 4073 }, { "epoch": 2.287478944413251, "grad_norm": 0.4250081479549408, "learning_rate": 1.6225842686205885e-06, "loss": 0.3795, "step": 4074 }, { "epoch": 2.288040426726558, "grad_norm": 0.4030463993549347, "learning_rate": 1.620175949403497e-06, "loss": 0.3794, "step": 4075 }, { "epoch": 2.2886019090398655, "grad_norm": 0.40089496970176697, "learning_rate": 1.6177690731816654e-06, "loss": 0.3626, "step": 4076 }, { "epoch": 2.2891633913531724, "grad_norm": 0.40350109338760376, "learning_rate": 1.6153636409826917e-06, "loss": 0.3805, "step": 4077 }, { "epoch": 2.2897248736664797, "grad_norm": 0.42160722613334656, "learning_rate": 1.6129596538335636e-06, "loss": 0.3976, "step": 4078 }, { "epoch": 2.2902863559797866, "grad_norm": 0.39743688702583313, "learning_rate": 1.610557112760645e-06, "loss": 0.3722, "step": 4079 }, { "epoch": 2.290847838293094, "grad_norm": 0.408740758895874, "learning_rate": 1.6081560187896888e-06, "loss": 0.3546, "step": 4080 }, { "epoch": 2.291409320606401, "grad_norm": 0.3791080713272095, "learning_rate": 1.6057563729458286e-06, "loss": 0.4046, "step": 4081 }, { "epoch": 2.291970802919708, "grad_norm": 0.40856534242630005, "learning_rate": 1.6033581762535749e-06, "loss": 0.3998, "step": 4082 }, { "epoch": 2.292532285233015, "grad_norm": 0.39558514952659607, "learning_rate": 1.600961429736827e-06, "loss": 0.3599, "step": 4083 }, { "epoch": 2.2930937675463223, "grad_norm": 0.4335796535015106, "learning_rate": 1.5985661344188574e-06, "loss": 0.3906, "step": 4084 }, { "epoch": 2.293655249859629, "grad_norm": 0.38653647899627686, "learning_rate": 1.5961722913223249e-06, "loss": 0.3713, "step": 4085 }, { "epoch": 2.2942167321729365, "grad_norm": 0.4512888789176941, "learning_rate": 1.5937799014692678e-06, "loss": 0.3885, "step": 4086 }, { "epoch": 2.294778214486244, "grad_norm": 0.4125877916812897, "learning_rate": 1.5913889658810987e-06, "loss": 0.41, "step": 4087 }, { "epoch": 2.2953396967995507, "grad_norm": 0.3885774612426758, "learning_rate": 1.5889994855786168e-06, "loss": 0.384, "step": 4088 }, { "epoch": 2.295901179112858, "grad_norm": 0.35610300302505493, "learning_rate": 1.5866114615819923e-06, "loss": 0.3612, "step": 4089 }, { "epoch": 2.296462661426165, "grad_norm": 0.36096054315567017, "learning_rate": 1.584224894910779e-06, "loss": 0.3732, "step": 4090 }, { "epoch": 2.2970241437394723, "grad_norm": 0.3658565878868103, "learning_rate": 1.5818397865839097e-06, "loss": 0.3589, "step": 4091 }, { "epoch": 2.297585626052779, "grad_norm": 0.4127848446369171, "learning_rate": 1.5794561376196877e-06, "loss": 0.3735, "step": 4092 }, { "epoch": 2.2981471083660865, "grad_norm": 0.44730737805366516, "learning_rate": 1.5770739490358006e-06, "loss": 0.3906, "step": 4093 }, { "epoch": 2.298708590679394, "grad_norm": 0.40649983286857605, "learning_rate": 1.574693221849306e-06, "loss": 0.3531, "step": 4094 }, { "epoch": 2.2992700729927007, "grad_norm": 0.4202244281768799, "learning_rate": 1.5723139570766432e-06, "loss": 0.3883, "step": 4095 }, { "epoch": 2.299831555306008, "grad_norm": 0.3687955141067505, "learning_rate": 1.5699361557336256e-06, "loss": 0.3895, "step": 4096 }, { "epoch": 2.300393037619315, "grad_norm": 0.4033234417438507, "learning_rate": 1.5675598188354379e-06, "loss": 0.4081, "step": 4097 }, { "epoch": 2.3009545199326222, "grad_norm": 0.3893221616744995, "learning_rate": 1.5651849473966462e-06, "loss": 0.3658, "step": 4098 }, { "epoch": 2.301516002245929, "grad_norm": 0.40908294916152954, "learning_rate": 1.562811542431184e-06, "loss": 0.3628, "step": 4099 }, { "epoch": 2.3020774845592364, "grad_norm": 0.40818578004837036, "learning_rate": 1.5604396049523657e-06, "loss": 0.3586, "step": 4100 }, { "epoch": 2.3026389668725433, "grad_norm": 0.43010860681533813, "learning_rate": 1.558069135972871e-06, "loss": 0.408, "step": 4101 }, { "epoch": 2.3032004491858507, "grad_norm": 0.421297550201416, "learning_rate": 1.5557001365047603e-06, "loss": 0.3813, "step": 4102 }, { "epoch": 2.3037619314991575, "grad_norm": 0.3767099380493164, "learning_rate": 1.553332607559464e-06, "loss": 0.3974, "step": 4103 }, { "epoch": 2.304323413812465, "grad_norm": 0.4145900011062622, "learning_rate": 1.5509665501477817e-06, "loss": 0.3842, "step": 4104 }, { "epoch": 2.304884896125772, "grad_norm": 0.4567049741744995, "learning_rate": 1.54860196527989e-06, "loss": 0.3835, "step": 4105 }, { "epoch": 2.305446378439079, "grad_norm": 0.405751496553421, "learning_rate": 1.5462388539653306e-06, "loss": 0.3521, "step": 4106 }, { "epoch": 2.3060078607523864, "grad_norm": 0.36296218633651733, "learning_rate": 1.5438772172130212e-06, "loss": 0.3534, "step": 4107 }, { "epoch": 2.3065693430656933, "grad_norm": 0.3647633492946625, "learning_rate": 1.5415170560312498e-06, "loss": 0.3803, "step": 4108 }, { "epoch": 2.3071308253790006, "grad_norm": 0.39942029118537903, "learning_rate": 1.5391583714276704e-06, "loss": 0.3841, "step": 4109 }, { "epoch": 2.3076923076923075, "grad_norm": 0.40520766377449036, "learning_rate": 1.5368011644093105e-06, "loss": 0.3693, "step": 4110 }, { "epoch": 2.308253790005615, "grad_norm": 0.392047256231308, "learning_rate": 1.534445435982564e-06, "loss": 0.3943, "step": 4111 }, { "epoch": 2.308815272318922, "grad_norm": 0.42951375246047974, "learning_rate": 1.5320911871531946e-06, "loss": 0.3849, "step": 4112 }, { "epoch": 2.309376754632229, "grad_norm": 0.39205923676490784, "learning_rate": 1.5297384189263376e-06, "loss": 0.3668, "step": 4113 }, { "epoch": 2.3099382369455363, "grad_norm": 0.4255583584308624, "learning_rate": 1.5273871323064888e-06, "loss": 0.374, "step": 4114 }, { "epoch": 2.3104997192588432, "grad_norm": 0.40122562646865845, "learning_rate": 1.525037328297519e-06, "loss": 0.3524, "step": 4115 }, { "epoch": 2.3110612015721506, "grad_norm": 0.4501184821128845, "learning_rate": 1.5226890079026602e-06, "loss": 0.3605, "step": 4116 }, { "epoch": 2.3116226838854574, "grad_norm": 0.40584298968315125, "learning_rate": 1.5203421721245142e-06, "loss": 0.3674, "step": 4117 }, { "epoch": 2.3121841661987648, "grad_norm": 0.4024868607521057, "learning_rate": 1.5179968219650504e-06, "loss": 0.3869, "step": 4118 }, { "epoch": 2.312745648512072, "grad_norm": 0.4024317264556885, "learning_rate": 1.515652958425598e-06, "loss": 0.4048, "step": 4119 }, { "epoch": 2.313307130825379, "grad_norm": 0.41787323355674744, "learning_rate": 1.5133105825068588e-06, "loss": 0.3744, "step": 4120 }, { "epoch": 2.3138686131386863, "grad_norm": 0.4343962073326111, "learning_rate": 1.5109696952088932e-06, "loss": 0.3782, "step": 4121 }, { "epoch": 2.314430095451993, "grad_norm": 0.44399240612983704, "learning_rate": 1.5086302975311296e-06, "loss": 0.3461, "step": 4122 }, { "epoch": 2.3149915777653005, "grad_norm": 0.4099523723125458, "learning_rate": 1.5062923904723609e-06, "loss": 0.3488, "step": 4123 }, { "epoch": 2.3155530600786074, "grad_norm": 0.4051162898540497, "learning_rate": 1.5039559750307403e-06, "loss": 0.3843, "step": 4124 }, { "epoch": 2.3161145423919147, "grad_norm": 0.39383968710899353, "learning_rate": 1.5016210522037884e-06, "loss": 0.3615, "step": 4125 }, { "epoch": 2.3166760247052216, "grad_norm": 0.3968193233013153, "learning_rate": 1.499287622988383e-06, "loss": 0.3385, "step": 4126 }, { "epoch": 2.317237507018529, "grad_norm": 0.3720206618309021, "learning_rate": 1.496955688380769e-06, "loss": 0.3562, "step": 4127 }, { "epoch": 2.317798989331836, "grad_norm": 0.40108469128608704, "learning_rate": 1.4946252493765539e-06, "loss": 0.3746, "step": 4128 }, { "epoch": 2.318360471645143, "grad_norm": 0.37054187059402466, "learning_rate": 1.4922963069707009e-06, "loss": 0.3654, "step": 4129 }, { "epoch": 2.3189219539584505, "grad_norm": 0.4000416100025177, "learning_rate": 1.489968862157541e-06, "loss": 0.3501, "step": 4130 }, { "epoch": 2.3194834362717573, "grad_norm": 0.4152678847312927, "learning_rate": 1.4876429159307586e-06, "loss": 0.3758, "step": 4131 }, { "epoch": 2.3200449185850647, "grad_norm": 0.38556063175201416, "learning_rate": 1.4853184692834055e-06, "loss": 0.3727, "step": 4132 }, { "epoch": 2.3206064008983716, "grad_norm": 0.4223717153072357, "learning_rate": 1.4829955232078903e-06, "loss": 0.366, "step": 4133 }, { "epoch": 2.321167883211679, "grad_norm": 0.37474262714385986, "learning_rate": 1.4806740786959772e-06, "loss": 0.3776, "step": 4134 }, { "epoch": 2.3217293655249858, "grad_norm": 0.41073641180992126, "learning_rate": 1.4783541367387971e-06, "loss": 0.3761, "step": 4135 }, { "epoch": 2.322290847838293, "grad_norm": 0.4126848876476288, "learning_rate": 1.4760356983268315e-06, "loss": 0.3897, "step": 4136 }, { "epoch": 2.3228523301516004, "grad_norm": 0.3976772725582123, "learning_rate": 1.4737187644499245e-06, "loss": 0.3943, "step": 4137 }, { "epoch": 2.3234138124649073, "grad_norm": 0.3915645480155945, "learning_rate": 1.471403336097279e-06, "loss": 0.3821, "step": 4138 }, { "epoch": 2.3239752947782146, "grad_norm": 0.40332043170928955, "learning_rate": 1.4690894142574501e-06, "loss": 0.3593, "step": 4139 }, { "epoch": 2.3245367770915215, "grad_norm": 0.4143243134021759, "learning_rate": 1.466776999918355e-06, "loss": 0.3996, "step": 4140 }, { "epoch": 2.325098259404829, "grad_norm": 0.38621556758880615, "learning_rate": 1.4644660940672628e-06, "loss": 0.377, "step": 4141 }, { "epoch": 2.3256597417181357, "grad_norm": 0.378163605928421, "learning_rate": 1.4621566976908014e-06, "loss": 0.395, "step": 4142 }, { "epoch": 2.326221224031443, "grad_norm": 0.3809984624385834, "learning_rate": 1.4598488117749548e-06, "loss": 0.3659, "step": 4143 }, { "epoch": 2.3267827063447504, "grad_norm": 0.41323933005332947, "learning_rate": 1.457542437305059e-06, "loss": 0.3819, "step": 4144 }, { "epoch": 2.3273441886580573, "grad_norm": 0.35510969161987305, "learning_rate": 1.4552375752658081e-06, "loss": 0.3685, "step": 4145 }, { "epoch": 2.3279056709713646, "grad_norm": 0.39571723341941833, "learning_rate": 1.4529342266412467e-06, "loss": 0.3514, "step": 4146 }, { "epoch": 2.3284671532846715, "grad_norm": 0.40493977069854736, "learning_rate": 1.450632392414777e-06, "loss": 0.3984, "step": 4147 }, { "epoch": 2.329028635597979, "grad_norm": 0.39647167921066284, "learning_rate": 1.4483320735691542e-06, "loss": 0.3479, "step": 4148 }, { "epoch": 2.3295901179112857, "grad_norm": 0.39864835143089294, "learning_rate": 1.4460332710864822e-06, "loss": 0.391, "step": 4149 }, { "epoch": 2.330151600224593, "grad_norm": 0.3891521990299225, "learning_rate": 1.4437359859482242e-06, "loss": 0.3635, "step": 4150 }, { "epoch": 2.3307130825379, "grad_norm": 0.3845609426498413, "learning_rate": 1.441440219135189e-06, "loss": 0.3706, "step": 4151 }, { "epoch": 2.331274564851207, "grad_norm": 0.38694190979003906, "learning_rate": 1.4391459716275418e-06, "loss": 0.3342, "step": 4152 }, { "epoch": 2.331836047164514, "grad_norm": 0.39203423261642456, "learning_rate": 1.436853244404799e-06, "loss": 0.3827, "step": 4153 }, { "epoch": 2.3323975294778214, "grad_norm": 0.3999992907047272, "learning_rate": 1.4345620384458226e-06, "loss": 0.3562, "step": 4154 }, { "epoch": 2.3329590117911287, "grad_norm": 0.39568591117858887, "learning_rate": 1.4322723547288336e-06, "loss": 0.3839, "step": 4155 }, { "epoch": 2.3335204941044356, "grad_norm": 0.40248745679855347, "learning_rate": 1.4299841942313947e-06, "loss": 0.3743, "step": 4156 }, { "epoch": 2.334081976417743, "grad_norm": 0.41134217381477356, "learning_rate": 1.4276975579304232e-06, "loss": 0.3478, "step": 4157 }, { "epoch": 2.33464345873105, "grad_norm": 0.3885422945022583, "learning_rate": 1.4254124468021868e-06, "loss": 0.3776, "step": 4158 }, { "epoch": 2.335204941044357, "grad_norm": 0.4056529104709625, "learning_rate": 1.423128861822296e-06, "loss": 0.3711, "step": 4159 }, { "epoch": 2.335766423357664, "grad_norm": 0.42392250895500183, "learning_rate": 1.420846803965717e-06, "loss": 0.3597, "step": 4160 }, { "epoch": 2.3363279056709714, "grad_norm": 0.37522801756858826, "learning_rate": 1.4185662742067575e-06, "loss": 0.3766, "step": 4161 }, { "epoch": 2.3368893879842787, "grad_norm": 0.4491144120693207, "learning_rate": 1.4162872735190764e-06, "loss": 0.4035, "step": 4162 }, { "epoch": 2.3374508702975856, "grad_norm": 0.3843354284763336, "learning_rate": 1.414009802875682e-06, "loss": 0.3801, "step": 4163 }, { "epoch": 2.338012352610893, "grad_norm": 0.4536619782447815, "learning_rate": 1.4117338632489225e-06, "loss": 0.3703, "step": 4164 }, { "epoch": 2.3385738349242, "grad_norm": 0.4315115213394165, "learning_rate": 1.4094594556104996e-06, "loss": 0.3343, "step": 4165 }, { "epoch": 2.339135317237507, "grad_norm": 0.45106279850006104, "learning_rate": 1.4071865809314555e-06, "loss": 0.3832, "step": 4166 }, { "epoch": 2.339696799550814, "grad_norm": 0.4212765693664551, "learning_rate": 1.4049152401821804e-06, "loss": 0.3704, "step": 4167 }, { "epoch": 2.3402582818641213, "grad_norm": 0.39156702160835266, "learning_rate": 1.402645434332412e-06, "loss": 0.3722, "step": 4168 }, { "epoch": 2.3408197641774287, "grad_norm": 0.4094436764717102, "learning_rate": 1.4003771643512266e-06, "loss": 0.3881, "step": 4169 }, { "epoch": 2.3413812464907355, "grad_norm": 0.4203847050666809, "learning_rate": 1.398110431207051e-06, "loss": 0.4074, "step": 4170 }, { "epoch": 2.341942728804043, "grad_norm": 0.3995791971683502, "learning_rate": 1.3958452358676505e-06, "loss": 0.3706, "step": 4171 }, { "epoch": 2.3425042111173497, "grad_norm": 0.41594812273979187, "learning_rate": 1.3935815793001377e-06, "loss": 0.368, "step": 4172 }, { "epoch": 2.343065693430657, "grad_norm": 0.3918347954750061, "learning_rate": 1.3913194624709692e-06, "loss": 0.4166, "step": 4173 }, { "epoch": 2.343627175743964, "grad_norm": 0.450300008058548, "learning_rate": 1.3890588863459375e-06, "loss": 0.3814, "step": 4174 }, { "epoch": 2.3441886580572713, "grad_norm": 0.42959561944007874, "learning_rate": 1.3867998518901859e-06, "loss": 0.349, "step": 4175 }, { "epoch": 2.344750140370578, "grad_norm": 0.373519629240036, "learning_rate": 1.3845423600681918e-06, "loss": 0.3694, "step": 4176 }, { "epoch": 2.3453116226838855, "grad_norm": 0.4629288613796234, "learning_rate": 1.3822864118437795e-06, "loss": 0.3594, "step": 4177 }, { "epoch": 2.3458731049971924, "grad_norm": 0.4293800890445709, "learning_rate": 1.380032008180114e-06, "loss": 0.3827, "step": 4178 }, { "epoch": 2.3464345873104997, "grad_norm": 0.40191036462783813, "learning_rate": 1.3777791500396953e-06, "loss": 0.3935, "step": 4179 }, { "epoch": 2.346996069623807, "grad_norm": 0.4172593057155609, "learning_rate": 1.3755278383843717e-06, "loss": 0.34, "step": 4180 }, { "epoch": 2.347557551937114, "grad_norm": 0.34784191846847534, "learning_rate": 1.3732780741753226e-06, "loss": 0.3659, "step": 4181 }, { "epoch": 2.3481190342504212, "grad_norm": 0.41125717759132385, "learning_rate": 1.3710298583730735e-06, "loss": 0.3815, "step": 4182 }, { "epoch": 2.348680516563728, "grad_norm": 0.39060285687446594, "learning_rate": 1.3687831919374878e-06, "loss": 0.3809, "step": 4183 }, { "epoch": 2.3492419988770354, "grad_norm": 0.39958250522613525, "learning_rate": 1.366538075827763e-06, "loss": 0.3956, "step": 4184 }, { "epoch": 2.3498034811903423, "grad_norm": 0.41959714889526367, "learning_rate": 1.3642945110024402e-06, "loss": 0.3722, "step": 4185 }, { "epoch": 2.3503649635036497, "grad_norm": 0.4088451862335205, "learning_rate": 1.3620524984193939e-06, "loss": 0.3679, "step": 4186 }, { "epoch": 2.350926445816957, "grad_norm": 0.39949530363082886, "learning_rate": 1.3598120390358393e-06, "loss": 0.3692, "step": 4187 }, { "epoch": 2.351487928130264, "grad_norm": 0.4165663421154022, "learning_rate": 1.3575731338083248e-06, "loss": 0.3665, "step": 4188 }, { "epoch": 2.352049410443571, "grad_norm": 0.4198518693447113, "learning_rate": 1.3553357836927383e-06, "loss": 0.3914, "step": 4189 }, { "epoch": 2.352610892756878, "grad_norm": 0.378234326839447, "learning_rate": 1.3530999896443048e-06, "loss": 0.3762, "step": 4190 }, { "epoch": 2.3531723750701854, "grad_norm": 0.42904001474380493, "learning_rate": 1.3508657526175801e-06, "loss": 0.3874, "step": 4191 }, { "epoch": 2.3537338573834923, "grad_norm": 0.39918938279151917, "learning_rate": 1.34863307356646e-06, "loss": 0.4177, "step": 4192 }, { "epoch": 2.3542953396967996, "grad_norm": 0.39396610856056213, "learning_rate": 1.3464019534441709e-06, "loss": 0.3437, "step": 4193 }, { "epoch": 2.3548568220101065, "grad_norm": 0.41359570622444153, "learning_rate": 1.3441723932032774e-06, "loss": 0.3535, "step": 4194 }, { "epoch": 2.355418304323414, "grad_norm": 0.46123090386390686, "learning_rate": 1.3419443937956784e-06, "loss": 0.3595, "step": 4195 }, { "epoch": 2.3559797866367207, "grad_norm": 0.4209584891796112, "learning_rate": 1.339717956172601e-06, "loss": 0.3513, "step": 4196 }, { "epoch": 2.356541268950028, "grad_norm": 0.4059302806854248, "learning_rate": 1.3374930812846127e-06, "loss": 0.3818, "step": 4197 }, { "epoch": 2.3571027512633353, "grad_norm": 0.40826281905174255, "learning_rate": 1.3352697700816063e-06, "loss": 0.375, "step": 4198 }, { "epoch": 2.3576642335766422, "grad_norm": 0.42406490445137024, "learning_rate": 1.333048023512814e-06, "loss": 0.364, "step": 4199 }, { "epoch": 2.3582257158899496, "grad_norm": 0.41022953391075134, "learning_rate": 1.3308278425267978e-06, "loss": 0.3729, "step": 4200 }, { "epoch": 2.3587871982032564, "grad_norm": 0.42460453510284424, "learning_rate": 1.328609228071447e-06, "loss": 0.3836, "step": 4201 }, { "epoch": 2.3593486805165638, "grad_norm": 0.3844737708568573, "learning_rate": 1.326392181093989e-06, "loss": 0.3663, "step": 4202 }, { "epoch": 2.3599101628298707, "grad_norm": 0.3924168646335602, "learning_rate": 1.324176702540975e-06, "loss": 0.3878, "step": 4203 }, { "epoch": 2.360471645143178, "grad_norm": 0.3679245412349701, "learning_rate": 1.3219627933582918e-06, "loss": 0.3618, "step": 4204 }, { "epoch": 2.3610331274564853, "grad_norm": 0.42269426584243774, "learning_rate": 1.319750454491156e-06, "loss": 0.3716, "step": 4205 }, { "epoch": 2.361594609769792, "grad_norm": 0.4026450216770172, "learning_rate": 1.3175396868841095e-06, "loss": 0.3856, "step": 4206 }, { "epoch": 2.3621560920830995, "grad_norm": 0.39115017652511597, "learning_rate": 1.3153304914810284e-06, "loss": 0.369, "step": 4207 }, { "epoch": 2.3627175743964064, "grad_norm": 0.3597429692745209, "learning_rate": 1.3131228692251124e-06, "loss": 0.3654, "step": 4208 }, { "epoch": 2.3632790567097137, "grad_norm": 0.3809575140476227, "learning_rate": 1.3109168210588941e-06, "loss": 0.368, "step": 4209 }, { "epoch": 2.3638405390230206, "grad_norm": 0.36132562160491943, "learning_rate": 1.3087123479242337e-06, "loss": 0.3607, "step": 4210 }, { "epoch": 2.364402021336328, "grad_norm": 0.3837547302246094, "learning_rate": 1.3065094507623145e-06, "loss": 0.3511, "step": 4211 }, { "epoch": 2.3649635036496353, "grad_norm": 0.4088019132614136, "learning_rate": 1.3043081305136524e-06, "loss": 0.3817, "step": 4212 }, { "epoch": 2.365524985962942, "grad_norm": 0.38380083441734314, "learning_rate": 1.3021083881180856e-06, "loss": 0.3936, "step": 4213 }, { "epoch": 2.3660864682762495, "grad_norm": 0.38897430896759033, "learning_rate": 1.299910224514782e-06, "loss": 0.371, "step": 4214 }, { "epoch": 2.3666479505895563, "grad_norm": 0.37530654668807983, "learning_rate": 1.2977136406422352e-06, "loss": 0.3846, "step": 4215 }, { "epoch": 2.3672094329028637, "grad_norm": 0.37340274453163147, "learning_rate": 1.295518637438261e-06, "loss": 0.3663, "step": 4216 }, { "epoch": 2.3677709152161706, "grad_norm": 0.3668399453163147, "learning_rate": 1.293325215840005e-06, "loss": 0.3876, "step": 4217 }, { "epoch": 2.368332397529478, "grad_norm": 0.38251975178718567, "learning_rate": 1.2911333767839323e-06, "loss": 0.3326, "step": 4218 }, { "epoch": 2.3688938798427848, "grad_norm": 0.38054126501083374, "learning_rate": 1.2889431212058367e-06, "loss": 0.3389, "step": 4219 }, { "epoch": 2.369455362156092, "grad_norm": 0.41506972908973694, "learning_rate": 1.2867544500408357e-06, "loss": 0.3859, "step": 4220 }, { "epoch": 2.370016844469399, "grad_norm": 0.6317151784896851, "learning_rate": 1.284567364223367e-06, "loss": 0.3559, "step": 4221 }, { "epoch": 2.3705783267827063, "grad_norm": 0.3876466453075409, "learning_rate": 1.282381864687196e-06, "loss": 0.3776, "step": 4222 }, { "epoch": 2.3711398090960136, "grad_norm": 0.3696313798427582, "learning_rate": 1.2801979523654052e-06, "loss": 0.3791, "step": 4223 }, { "epoch": 2.3717012914093205, "grad_norm": 0.4045000970363617, "learning_rate": 1.2780156281904044e-06, "loss": 0.3909, "step": 4224 }, { "epoch": 2.372262773722628, "grad_norm": 0.41395437717437744, "learning_rate": 1.2758348930939247e-06, "loss": 0.385, "step": 4225 }, { "epoch": 2.3728242560359347, "grad_norm": 0.3744024932384491, "learning_rate": 1.2736557480070149e-06, "loss": 0.357, "step": 4226 }, { "epoch": 2.373385738349242, "grad_norm": 0.35520532727241516, "learning_rate": 1.2714781938600506e-06, "loss": 0.3452, "step": 4227 }, { "epoch": 2.373947220662549, "grad_norm": 0.3965369462966919, "learning_rate": 1.269302231582723e-06, "loss": 0.3485, "step": 4228 }, { "epoch": 2.3745087029758563, "grad_norm": 0.3931077718734741, "learning_rate": 1.2671278621040467e-06, "loss": 0.3801, "step": 4229 }, { "epoch": 2.3750701852891636, "grad_norm": 0.41728147864341736, "learning_rate": 1.264955086352357e-06, "loss": 0.3884, "step": 4230 }, { "epoch": 2.3756316676024705, "grad_norm": 0.3680240511894226, "learning_rate": 1.2627839052553042e-06, "loss": 0.3643, "step": 4231 }, { "epoch": 2.376193149915778, "grad_norm": 0.38711977005004883, "learning_rate": 1.260614319739864e-06, "loss": 0.3718, "step": 4232 }, { "epoch": 2.3767546322290847, "grad_norm": 0.40491318702697754, "learning_rate": 1.2584463307323246e-06, "loss": 0.3563, "step": 4233 }, { "epoch": 2.377316114542392, "grad_norm": 0.43034717440605164, "learning_rate": 1.2562799391582974e-06, "loss": 0.3858, "step": 4234 }, { "epoch": 2.377877596855699, "grad_norm": 0.3783480226993561, "learning_rate": 1.254115145942712e-06, "loss": 0.3711, "step": 4235 }, { "epoch": 2.378439079169006, "grad_norm": 0.4492305517196655, "learning_rate": 1.2519519520098095e-06, "loss": 0.3927, "step": 4236 }, { "epoch": 2.3790005614823135, "grad_norm": 0.4193274974822998, "learning_rate": 1.2497903582831555e-06, "loss": 0.3613, "step": 4237 }, { "epoch": 2.3795620437956204, "grad_norm": 0.423643559217453, "learning_rate": 1.2476303656856275e-06, "loss": 0.3865, "step": 4238 }, { "epoch": 2.3801235261089277, "grad_norm": 0.38040387630462646, "learning_rate": 1.2454719751394213e-06, "loss": 0.3888, "step": 4239 }, { "epoch": 2.3806850084222346, "grad_norm": 0.40842294692993164, "learning_rate": 1.2433151875660509e-06, "loss": 0.3963, "step": 4240 }, { "epoch": 2.381246490735542, "grad_norm": 0.4338062107563019, "learning_rate": 1.2411600038863403e-06, "loss": 0.3812, "step": 4241 }, { "epoch": 2.381807973048849, "grad_norm": 0.415481835603714, "learning_rate": 1.2390064250204347e-06, "loss": 0.3827, "step": 4242 }, { "epoch": 2.382369455362156, "grad_norm": 0.38777416944503784, "learning_rate": 1.2368544518877894e-06, "loss": 0.38, "step": 4243 }, { "epoch": 2.382930937675463, "grad_norm": 0.4464210569858551, "learning_rate": 1.2347040854071767e-06, "loss": 0.3897, "step": 4244 }, { "epoch": 2.3834924199887704, "grad_norm": 0.38432979583740234, "learning_rate": 1.2325553264966845e-06, "loss": 0.3978, "step": 4245 }, { "epoch": 2.3840539023020773, "grad_norm": 0.4089092016220093, "learning_rate": 1.2304081760737091e-06, "loss": 0.3541, "step": 4246 }, { "epoch": 2.3846153846153846, "grad_norm": 0.37308019399642944, "learning_rate": 1.228262635054966e-06, "loss": 0.3676, "step": 4247 }, { "epoch": 2.385176866928692, "grad_norm": 0.4043823480606079, "learning_rate": 1.2261187043564788e-06, "loss": 0.3676, "step": 4248 }, { "epoch": 2.385738349241999, "grad_norm": 0.40828070044517517, "learning_rate": 1.223976384893586e-06, "loss": 0.3506, "step": 4249 }, { "epoch": 2.386299831555306, "grad_norm": 0.41530805826187134, "learning_rate": 1.2218356775809397e-06, "loss": 0.3533, "step": 4250 }, { "epoch": 2.386861313868613, "grad_norm": 0.41935572028160095, "learning_rate": 1.2196965833324998e-06, "loss": 0.3694, "step": 4251 }, { "epoch": 2.3874227961819203, "grad_norm": 0.40046921372413635, "learning_rate": 1.2175591030615407e-06, "loss": 0.3764, "step": 4252 }, { "epoch": 2.387984278495227, "grad_norm": 0.39889952540397644, "learning_rate": 1.2154232376806451e-06, "loss": 0.3879, "step": 4253 }, { "epoch": 2.3885457608085345, "grad_norm": 0.3747151792049408, "learning_rate": 1.213288988101709e-06, "loss": 0.3891, "step": 4254 }, { "epoch": 2.389107243121842, "grad_norm": 0.3984718918800354, "learning_rate": 1.211156355235938e-06, "loss": 0.3994, "step": 4255 }, { "epoch": 2.3896687254351487, "grad_norm": 0.3708553612232208, "learning_rate": 1.209025339993844e-06, "loss": 0.383, "step": 4256 }, { "epoch": 2.390230207748456, "grad_norm": 0.39878299832344055, "learning_rate": 1.2068959432852538e-06, "loss": 0.3625, "step": 4257 }, { "epoch": 2.390791690061763, "grad_norm": 0.3985726833343506, "learning_rate": 1.2047681660192984e-06, "loss": 0.4041, "step": 4258 }, { "epoch": 2.3913531723750703, "grad_norm": 0.41522905230522156, "learning_rate": 1.2026420091044194e-06, "loss": 0.3343, "step": 4259 }, { "epoch": 2.391914654688377, "grad_norm": 0.340585857629776, "learning_rate": 1.2005174734483688e-06, "loss": 0.3932, "step": 4260 }, { "epoch": 2.3924761370016845, "grad_norm": 0.37777262926101685, "learning_rate": 1.1983945599582008e-06, "loss": 0.371, "step": 4261 }, { "epoch": 2.393037619314992, "grad_norm": 0.38323235511779785, "learning_rate": 1.1962732695402834e-06, "loss": 0.3672, "step": 4262 }, { "epoch": 2.3935991016282987, "grad_norm": 0.42499420046806335, "learning_rate": 1.194153603100286e-06, "loss": 0.3582, "step": 4263 }, { "epoch": 2.394160583941606, "grad_norm": 0.459640771150589, "learning_rate": 1.1920355615431884e-06, "loss": 0.3891, "step": 4264 }, { "epoch": 2.394722066254913, "grad_norm": 0.38134610652923584, "learning_rate": 1.1899191457732766e-06, "loss": 0.3591, "step": 4265 }, { "epoch": 2.3952835485682202, "grad_norm": 0.4434778094291687, "learning_rate": 1.1878043566941395e-06, "loss": 0.3968, "step": 4266 }, { "epoch": 2.395845030881527, "grad_norm": 0.3865840435028076, "learning_rate": 1.1856911952086757e-06, "loss": 0.3897, "step": 4267 }, { "epoch": 2.3964065131948344, "grad_norm": 0.4351981282234192, "learning_rate": 1.1835796622190837e-06, "loss": 0.3528, "step": 4268 }, { "epoch": 2.3969679955081413, "grad_norm": 0.3905952572822571, "learning_rate": 1.181469758626871e-06, "loss": 0.3782, "step": 4269 }, { "epoch": 2.3975294778214487, "grad_norm": 0.39893272519111633, "learning_rate": 1.17936148533285e-06, "loss": 0.3847, "step": 4270 }, { "epoch": 2.3980909601347555, "grad_norm": 0.40060099959373474, "learning_rate": 1.177254843237131e-06, "loss": 0.3594, "step": 4271 }, { "epoch": 2.398652442448063, "grad_norm": 0.4140298664569855, "learning_rate": 1.1751498332391365e-06, "loss": 0.3941, "step": 4272 }, { "epoch": 2.39921392476137, "grad_norm": 0.37097403407096863, "learning_rate": 1.1730464562375843e-06, "loss": 0.3572, "step": 4273 }, { "epoch": 2.399775407074677, "grad_norm": 0.4114643335342407, "learning_rate": 1.1709447131305002e-06, "loss": 0.3582, "step": 4274 }, { "epoch": 2.4003368893879844, "grad_norm": 0.389835923910141, "learning_rate": 1.168844604815209e-06, "loss": 0.3704, "step": 4275 }, { "epoch": 2.4008983717012913, "grad_norm": 0.4099624454975128, "learning_rate": 1.16674613218834e-06, "loss": 0.3861, "step": 4276 }, { "epoch": 2.4014598540145986, "grad_norm": 0.44166675209999084, "learning_rate": 1.1646492961458245e-06, "loss": 0.3739, "step": 4277 }, { "epoch": 2.4020213363279055, "grad_norm": 0.39087578654289246, "learning_rate": 1.1625540975828914e-06, "loss": 0.3601, "step": 4278 }, { "epoch": 2.402582818641213, "grad_norm": 0.37087711691856384, "learning_rate": 1.1604605373940758e-06, "loss": 0.3632, "step": 4279 }, { "epoch": 2.40314430095452, "grad_norm": 0.40792563557624817, "learning_rate": 1.1583686164732077e-06, "loss": 0.3838, "step": 4280 }, { "epoch": 2.403705783267827, "grad_norm": 0.37769973278045654, "learning_rate": 1.156278335713421e-06, "loss": 0.3707, "step": 4281 }, { "epoch": 2.4042672655811343, "grad_norm": 0.38248899579048157, "learning_rate": 1.1541896960071502e-06, "loss": 0.3933, "step": 4282 }, { "epoch": 2.4048287478944412, "grad_norm": 0.4236691892147064, "learning_rate": 1.1521026982461236e-06, "loss": 0.3881, "step": 4283 }, { "epoch": 2.4053902302077486, "grad_norm": 0.3821027874946594, "learning_rate": 1.1500173433213756e-06, "loss": 0.3738, "step": 4284 }, { "epoch": 2.4059517125210554, "grad_norm": 0.39464524388313293, "learning_rate": 1.1479336321232336e-06, "loss": 0.3781, "step": 4285 }, { "epoch": 2.4065131948343628, "grad_norm": 0.4357953667640686, "learning_rate": 1.1458515655413255e-06, "loss": 0.3507, "step": 4286 }, { "epoch": 2.40707467714767, "grad_norm": 0.4173157215118408, "learning_rate": 1.1437711444645795e-06, "loss": 0.3782, "step": 4287 }, { "epoch": 2.407636159460977, "grad_norm": 0.37715113162994385, "learning_rate": 1.1416923697812149e-06, "loss": 0.3614, "step": 4288 }, { "epoch": 2.4081976417742843, "grad_norm": 0.37815824151039124, "learning_rate": 1.1396152423787549e-06, "loss": 0.3795, "step": 4289 }, { "epoch": 2.408759124087591, "grad_norm": 0.3810286521911621, "learning_rate": 1.1375397631440139e-06, "loss": 0.3561, "step": 4290 }, { "epoch": 2.4093206064008985, "grad_norm": 0.41924139857292175, "learning_rate": 1.1354659329631062e-06, "loss": 0.3806, "step": 4291 }, { "epoch": 2.4098820887142054, "grad_norm": 0.36761045455932617, "learning_rate": 1.1333937527214428e-06, "loss": 0.3792, "step": 4292 }, { "epoch": 2.4104435710275127, "grad_norm": 0.44044676423072815, "learning_rate": 1.1313232233037252e-06, "loss": 0.409, "step": 4293 }, { "epoch": 2.4110050533408196, "grad_norm": 0.3904513418674469, "learning_rate": 1.1292543455939558e-06, "loss": 0.3982, "step": 4294 }, { "epoch": 2.411566535654127, "grad_norm": 0.3791710138320923, "learning_rate": 1.1271871204754276e-06, "loss": 0.3816, "step": 4295 }, { "epoch": 2.412128017967434, "grad_norm": 0.3843848705291748, "learning_rate": 1.12512154883073e-06, "loss": 0.3738, "step": 4296 }, { "epoch": 2.412689500280741, "grad_norm": 0.37030884623527527, "learning_rate": 1.1230576315417485e-06, "loss": 0.3552, "step": 4297 }, { "epoch": 2.4132509825940485, "grad_norm": 0.3885851502418518, "learning_rate": 1.1209953694896564e-06, "loss": 0.3593, "step": 4298 }, { "epoch": 2.4138124649073553, "grad_norm": 0.36976882815361023, "learning_rate": 1.1189347635549269e-06, "loss": 0.3513, "step": 4299 }, { "epoch": 2.4143739472206627, "grad_norm": 0.36055731773376465, "learning_rate": 1.1168758146173208e-06, "loss": 0.379, "step": 4300 }, { "epoch": 2.4149354295339696, "grad_norm": 0.38419562578201294, "learning_rate": 1.114818523555895e-06, "loss": 0.3741, "step": 4301 }, { "epoch": 2.415496911847277, "grad_norm": 0.3632352948188782, "learning_rate": 1.1127628912489986e-06, "loss": 0.3678, "step": 4302 }, { "epoch": 2.4160583941605838, "grad_norm": 0.4311046004295349, "learning_rate": 1.1107089185742686e-06, "loss": 0.3686, "step": 4303 }, { "epoch": 2.416619876473891, "grad_norm": 0.39566174149513245, "learning_rate": 1.1086566064086386e-06, "loss": 0.3805, "step": 4304 }, { "epoch": 2.4171813587871984, "grad_norm": 0.4176063537597656, "learning_rate": 1.1066059556283282e-06, "loss": 0.3929, "step": 4305 }, { "epoch": 2.4177428411005053, "grad_norm": 0.3633875846862793, "learning_rate": 1.1045569671088518e-06, "loss": 0.3724, "step": 4306 }, { "epoch": 2.4183043234138126, "grad_norm": 0.3754139542579651, "learning_rate": 1.1025096417250142e-06, "loss": 0.3951, "step": 4307 }, { "epoch": 2.4188658057271195, "grad_norm": 0.3720216155052185, "learning_rate": 1.1004639803509049e-06, "loss": 0.3906, "step": 4308 }, { "epoch": 2.419427288040427, "grad_norm": 0.4033909738063812, "learning_rate": 1.098419983859909e-06, "loss": 0.3619, "step": 4309 }, { "epoch": 2.4199887703537337, "grad_norm": 0.3870863616466522, "learning_rate": 1.0963776531246972e-06, "loss": 0.3744, "step": 4310 }, { "epoch": 2.420550252667041, "grad_norm": 0.3986334800720215, "learning_rate": 1.09433698901723e-06, "loss": 0.3682, "step": 4311 }, { "epoch": 2.421111734980348, "grad_norm": 0.4152964651584625, "learning_rate": 1.092297992408759e-06, "loss": 0.3487, "step": 4312 }, { "epoch": 2.4216732172936553, "grad_norm": 0.3967514634132385, "learning_rate": 1.0902606641698176e-06, "loss": 0.3868, "step": 4313 }, { "epoch": 2.422234699606962, "grad_norm": 0.4528026878833771, "learning_rate": 1.0882250051702341e-06, "loss": 0.3436, "step": 4314 }, { "epoch": 2.4227961819202695, "grad_norm": 0.37583211064338684, "learning_rate": 1.0861910162791178e-06, "loss": 0.3624, "step": 4315 }, { "epoch": 2.423357664233577, "grad_norm": 0.42273232340812683, "learning_rate": 1.0841586983648694e-06, "loss": 0.371, "step": 4316 }, { "epoch": 2.4239191465468837, "grad_norm": 0.37922635674476624, "learning_rate": 1.082128052295175e-06, "loss": 0.361, "step": 4317 }, { "epoch": 2.424480628860191, "grad_norm": 0.38448190689086914, "learning_rate": 1.0800990789370053e-06, "loss": 0.3913, "step": 4318 }, { "epoch": 2.425042111173498, "grad_norm": 0.39564675092697144, "learning_rate": 1.0780717791566197e-06, "loss": 0.3427, "step": 4319 }, { "epoch": 2.425603593486805, "grad_norm": 0.39494842290878296, "learning_rate": 1.0760461538195588e-06, "loss": 0.4052, "step": 4320 }, { "epoch": 2.426165075800112, "grad_norm": 0.3644457161426544, "learning_rate": 1.0740222037906534e-06, "loss": 0.3778, "step": 4321 }, { "epoch": 2.4267265581134194, "grad_norm": 0.41098615527153015, "learning_rate": 1.0719999299340168e-06, "loss": 0.3769, "step": 4322 }, { "epoch": 2.4272880404267267, "grad_norm": 0.37284594774246216, "learning_rate": 1.0699793331130442e-06, "loss": 0.3885, "step": 4323 }, { "epoch": 2.4278495227400336, "grad_norm": 0.4106243848800659, "learning_rate": 1.06796041419042e-06, "loss": 0.3689, "step": 4324 }, { "epoch": 2.428411005053341, "grad_norm": 0.3814992904663086, "learning_rate": 1.0659431740281062e-06, "loss": 0.3513, "step": 4325 }, { "epoch": 2.428972487366648, "grad_norm": 0.3968982398509979, "learning_rate": 1.0639276134873533e-06, "loss": 0.367, "step": 4326 }, { "epoch": 2.429533969679955, "grad_norm": 0.38571634888648987, "learning_rate": 1.0619137334286928e-06, "loss": 0.3805, "step": 4327 }, { "epoch": 2.430095451993262, "grad_norm": 0.3552822768688202, "learning_rate": 1.0599015347119368e-06, "loss": 0.3909, "step": 4328 }, { "epoch": 2.4306569343065694, "grad_norm": 0.3983383774757385, "learning_rate": 1.0578910181961833e-06, "loss": 0.3926, "step": 4329 }, { "epoch": 2.4312184166198767, "grad_norm": 0.3857637941837311, "learning_rate": 1.0558821847398076e-06, "loss": 0.3626, "step": 4330 }, { "epoch": 2.4317798989331836, "grad_norm": 0.40377140045166016, "learning_rate": 1.05387503520047e-06, "loss": 0.3655, "step": 4331 }, { "epoch": 2.432341381246491, "grad_norm": 0.37225550413131714, "learning_rate": 1.0518695704351128e-06, "loss": 0.3551, "step": 4332 }, { "epoch": 2.432902863559798, "grad_norm": 0.3718332350254059, "learning_rate": 1.0498657912999538e-06, "loss": 0.3715, "step": 4333 }, { "epoch": 2.433464345873105, "grad_norm": 0.42261528968811035, "learning_rate": 1.047863698650497e-06, "loss": 0.3623, "step": 4334 }, { "epoch": 2.434025828186412, "grad_norm": 0.394039124250412, "learning_rate": 1.0458632933415209e-06, "loss": 0.3696, "step": 4335 }, { "epoch": 2.4345873104997193, "grad_norm": 0.39822918176651, "learning_rate": 1.043864576227087e-06, "loss": 0.3749, "step": 4336 }, { "epoch": 2.435148792813026, "grad_norm": 0.440110981464386, "learning_rate": 1.0418675481605383e-06, "loss": 0.3842, "step": 4337 }, { "epoch": 2.4357102751263335, "grad_norm": 0.3853782117366791, "learning_rate": 1.0398722099944902e-06, "loss": 0.3769, "step": 4338 }, { "epoch": 2.4362717574396404, "grad_norm": 0.40641871094703674, "learning_rate": 1.0378785625808425e-06, "loss": 0.369, "step": 4339 }, { "epoch": 2.4368332397529477, "grad_norm": 0.3789682388305664, "learning_rate": 1.0358866067707684e-06, "loss": 0.3634, "step": 4340 }, { "epoch": 2.437394722066255, "grad_norm": 0.38717466592788696, "learning_rate": 1.0338963434147231e-06, "loss": 0.3841, "step": 4341 }, { "epoch": 2.437956204379562, "grad_norm": 0.35422930121421814, "learning_rate": 1.0319077733624377e-06, "loss": 0.4157, "step": 4342 }, { "epoch": 2.4385176866928693, "grad_norm": 0.39854419231414795, "learning_rate": 1.0299208974629183e-06, "loss": 0.3792, "step": 4343 }, { "epoch": 2.439079169006176, "grad_norm": 0.4129280149936676, "learning_rate": 1.027935716564452e-06, "loss": 0.3565, "step": 4344 }, { "epoch": 2.4396406513194835, "grad_norm": 0.37851381301879883, "learning_rate": 1.0259522315145965e-06, "loss": 0.387, "step": 4345 }, { "epoch": 2.4402021336327904, "grad_norm": 0.38330772519111633, "learning_rate": 1.0239704431601905e-06, "loss": 0.3919, "step": 4346 }, { "epoch": 2.4407636159460977, "grad_norm": 0.3813129961490631, "learning_rate": 1.0219903523473473e-06, "loss": 0.3458, "step": 4347 }, { "epoch": 2.441325098259405, "grad_norm": 0.38117262721061707, "learning_rate": 1.0200119599214525e-06, "loss": 0.3446, "step": 4348 }, { "epoch": 2.441886580572712, "grad_norm": 0.39580613374710083, "learning_rate": 1.0180352667271709e-06, "loss": 0.3672, "step": 4349 }, { "epoch": 2.4424480628860192, "grad_norm": 0.3771534562110901, "learning_rate": 1.0160602736084368e-06, "loss": 0.3727, "step": 4350 }, { "epoch": 2.443009545199326, "grad_norm": 0.4098842144012451, "learning_rate": 1.0140869814084637e-06, "loss": 0.355, "step": 4351 }, { "epoch": 2.4435710275126334, "grad_norm": 0.391140341758728, "learning_rate": 1.0121153909697368e-06, "loss": 0.347, "step": 4352 }, { "epoch": 2.4441325098259403, "grad_norm": 0.40927040576934814, "learning_rate": 1.010145503134013e-06, "loss": 0.3646, "step": 4353 }, { "epoch": 2.4446939921392477, "grad_norm": 0.3802870213985443, "learning_rate": 1.0081773187423255e-06, "loss": 0.3415, "step": 4354 }, { "epoch": 2.445255474452555, "grad_norm": 0.38650524616241455, "learning_rate": 1.0062108386349767e-06, "loss": 0.396, "step": 4355 }, { "epoch": 2.445816956765862, "grad_norm": 0.37233105301856995, "learning_rate": 1.0042460636515445e-06, "loss": 0.355, "step": 4356 }, { "epoch": 2.446378439079169, "grad_norm": 0.4018929600715637, "learning_rate": 1.0022829946308787e-06, "loss": 0.3666, "step": 4357 }, { "epoch": 2.446939921392476, "grad_norm": 0.4329030215740204, "learning_rate": 1.0003216324110976e-06, "loss": 0.3339, "step": 4358 }, { "epoch": 2.4475014037057834, "grad_norm": 0.4034518599510193, "learning_rate": 9.983619778295945e-07, "loss": 0.3884, "step": 4359 }, { "epoch": 2.4480628860190903, "grad_norm": 0.4327312111854553, "learning_rate": 9.9640403172303e-07, "loss": 0.3649, "step": 4360 }, { "epoch": 2.4486243683323976, "grad_norm": 0.3782729506492615, "learning_rate": 9.9444779492734e-07, "loss": 0.3914, "step": 4361 }, { "epoch": 2.4491858506457045, "grad_norm": 0.38181883096694946, "learning_rate": 9.924932682777245e-07, "loss": 0.3656, "step": 4362 }, { "epoch": 2.449747332959012, "grad_norm": 0.40793493390083313, "learning_rate": 9.905404526086587e-07, "loss": 0.3624, "step": 4363 }, { "epoch": 2.4503088152723187, "grad_norm": 0.4089382588863373, "learning_rate": 9.885893487538867e-07, "loss": 0.3691, "step": 4364 }, { "epoch": 2.450870297585626, "grad_norm": 0.3877773582935333, "learning_rate": 9.866399575464176e-07, "loss": 0.3907, "step": 4365 }, { "epoch": 2.4514317798989333, "grad_norm": 0.43151429295539856, "learning_rate": 9.846922798185342e-07, "loss": 0.3551, "step": 4366 }, { "epoch": 2.4519932622122402, "grad_norm": 0.4066796898841858, "learning_rate": 9.827463164017837e-07, "loss": 0.3921, "step": 4367 }, { "epoch": 2.4525547445255476, "grad_norm": 0.41231122612953186, "learning_rate": 9.808020681269843e-07, "loss": 0.3636, "step": 4368 }, { "epoch": 2.4531162268388544, "grad_norm": 0.3899294435977936, "learning_rate": 9.78859535824222e-07, "loss": 0.3883, "step": 4369 }, { "epoch": 2.4536777091521618, "grad_norm": 0.4315827488899231, "learning_rate": 9.769187203228469e-07, "loss": 0.3697, "step": 4370 }, { "epoch": 2.4542391914654687, "grad_norm": 0.3908140957355499, "learning_rate": 9.749796224514806e-07, "loss": 0.3605, "step": 4371 }, { "epoch": 2.454800673778776, "grad_norm": 0.37892666459083557, "learning_rate": 9.730422430380077e-07, "loss": 0.3731, "step": 4372 }, { "epoch": 2.4553621560920833, "grad_norm": 0.4328417181968689, "learning_rate": 9.7110658290958e-07, "loss": 0.3633, "step": 4373 }, { "epoch": 2.45592363840539, "grad_norm": 0.39924418926239014, "learning_rate": 9.69172642892618e-07, "loss": 0.3789, "step": 4374 }, { "epoch": 2.4564851207186975, "grad_norm": 0.42460519075393677, "learning_rate": 9.67240423812803e-07, "loss": 0.3949, "step": 4375 }, { "epoch": 2.4570466030320044, "grad_norm": 0.4033779203891754, "learning_rate": 9.65309926495087e-07, "loss": 0.3426, "step": 4376 }, { "epoch": 2.4576080853453117, "grad_norm": 0.40788668394088745, "learning_rate": 9.63381151763681e-07, "loss": 0.3909, "step": 4377 }, { "epoch": 2.4581695676586186, "grad_norm": 0.377837210893631, "learning_rate": 9.614541004420652e-07, "loss": 0.3486, "step": 4378 }, { "epoch": 2.458731049971926, "grad_norm": 0.4107156991958618, "learning_rate": 9.59528773352983e-07, "loss": 0.3352, "step": 4379 }, { "epoch": 2.4592925322852333, "grad_norm": 0.3770356774330139, "learning_rate": 9.576051713184392e-07, "loss": 0.4024, "step": 4380 }, { "epoch": 2.45985401459854, "grad_norm": 0.38936299085617065, "learning_rate": 9.556832951597067e-07, "loss": 0.3763, "step": 4381 }, { "epoch": 2.4604154969118475, "grad_norm": 0.40179190039634705, "learning_rate": 9.537631456973158e-07, "loss": 0.3949, "step": 4382 }, { "epoch": 2.4609769792251543, "grad_norm": 0.37553277611732483, "learning_rate": 9.518447237510636e-07, "loss": 0.3706, "step": 4383 }, { "epoch": 2.4615384615384617, "grad_norm": 0.3924628496170044, "learning_rate": 9.499280301400105e-07, "loss": 0.396, "step": 4384 }, { "epoch": 2.4620999438517686, "grad_norm": 0.3855682611465454, "learning_rate": 9.480130656824743e-07, "loss": 0.3676, "step": 4385 }, { "epoch": 2.462661426165076, "grad_norm": 0.37093299627304077, "learning_rate": 9.460998311960401e-07, "loss": 0.4078, "step": 4386 }, { "epoch": 2.4632229084783828, "grad_norm": 0.3832871615886688, "learning_rate": 9.441883274975489e-07, "loss": 0.3999, "step": 4387 }, { "epoch": 2.46378439079169, "grad_norm": 0.3773365020751953, "learning_rate": 9.422785554031072e-07, "loss": 0.3555, "step": 4388 }, { "epoch": 2.464345873104997, "grad_norm": 0.42297250032424927, "learning_rate": 9.403705157280813e-07, "loss": 0.3495, "step": 4389 }, { "epoch": 2.4649073554183043, "grad_norm": 0.3750593662261963, "learning_rate": 9.384642092870949e-07, "loss": 0.376, "step": 4390 }, { "epoch": 2.4654688377316116, "grad_norm": 0.39663445949554443, "learning_rate": 9.365596368940366e-07, "loss": 0.3614, "step": 4391 }, { "epoch": 2.4660303200449185, "grad_norm": 0.42258360981941223, "learning_rate": 9.346567993620492e-07, "loss": 0.3473, "step": 4392 }, { "epoch": 2.466591802358226, "grad_norm": 0.3783074915409088, "learning_rate": 9.327556975035384e-07, "loss": 0.3784, "step": 4393 }, { "epoch": 2.4671532846715327, "grad_norm": 0.4186030626296997, "learning_rate": 9.308563321301701e-07, "loss": 0.3564, "step": 4394 }, { "epoch": 2.46771476698484, "grad_norm": 0.4015771746635437, "learning_rate": 9.289587040528641e-07, "loss": 0.4048, "step": 4395 }, { "epoch": 2.468276249298147, "grad_norm": 0.37606942653656006, "learning_rate": 9.270628140818034e-07, "loss": 0.3666, "step": 4396 }, { "epoch": 2.4688377316114543, "grad_norm": 0.41528013348579407, "learning_rate": 9.251686630264239e-07, "loss": 0.3872, "step": 4397 }, { "epoch": 2.4693992139247616, "grad_norm": 0.39047276973724365, "learning_rate": 9.23276251695423e-07, "loss": 0.3627, "step": 4398 }, { "epoch": 2.4699606962380685, "grad_norm": 0.41247954964637756, "learning_rate": 9.213855808967564e-07, "loss": 0.4186, "step": 4399 }, { "epoch": 2.470522178551376, "grad_norm": 0.3712351620197296, "learning_rate": 9.194966514376313e-07, "loss": 0.3392, "step": 4400 }, { "epoch": 2.4710836608646827, "grad_norm": 0.3888494074344635, "learning_rate": 9.176094641245171e-07, "loss": 0.3601, "step": 4401 }, { "epoch": 2.47164514317799, "grad_norm": 0.409347802400589, "learning_rate": 9.157240197631345e-07, "loss": 0.3914, "step": 4402 }, { "epoch": 2.472206625491297, "grad_norm": 0.4149004817008972, "learning_rate": 9.138403191584639e-07, "loss": 0.3668, "step": 4403 }, { "epoch": 2.472768107804604, "grad_norm": 0.3871806561946869, "learning_rate": 9.119583631147405e-07, "loss": 0.3881, "step": 4404 }, { "epoch": 2.473329590117911, "grad_norm": 0.4064255952835083, "learning_rate": 9.100781524354518e-07, "loss": 0.4112, "step": 4405 }, { "epoch": 2.4738910724312184, "grad_norm": 0.41569188237190247, "learning_rate": 9.081996879233446e-07, "loss": 0.3907, "step": 4406 }, { "epoch": 2.4744525547445253, "grad_norm": 0.4252353012561798, "learning_rate": 9.063229703804155e-07, "loss": 0.399, "step": 4407 }, { "epoch": 2.4750140370578326, "grad_norm": 0.40315547585487366, "learning_rate": 9.044480006079193e-07, "loss": 0.3776, "step": 4408 }, { "epoch": 2.47557551937114, "grad_norm": 0.3668069839477539, "learning_rate": 9.025747794063639e-07, "loss": 0.3581, "step": 4409 }, { "epoch": 2.476137001684447, "grad_norm": 0.43349185585975647, "learning_rate": 9.007033075755062e-07, "loss": 0.3691, "step": 4410 }, { "epoch": 2.476698483997754, "grad_norm": 0.3610156774520874, "learning_rate": 8.988335859143632e-07, "loss": 0.3788, "step": 4411 }, { "epoch": 2.477259966311061, "grad_norm": 0.37054839730262756, "learning_rate": 8.969656152211987e-07, "loss": 0.3834, "step": 4412 }, { "epoch": 2.4778214486243684, "grad_norm": 0.3940719664096832, "learning_rate": 8.950993962935322e-07, "loss": 0.3901, "step": 4413 }, { "epoch": 2.4783829309376753, "grad_norm": 0.3917013108730316, "learning_rate": 8.932349299281362e-07, "loss": 0.3839, "step": 4414 }, { "epoch": 2.4789444132509826, "grad_norm": 0.4016041159629822, "learning_rate": 8.913722169210303e-07, "loss": 0.3708, "step": 4415 }, { "epoch": 2.47950589556429, "grad_norm": 0.37346965074539185, "learning_rate": 8.895112580674908e-07, "loss": 0.377, "step": 4416 }, { "epoch": 2.480067377877597, "grad_norm": 0.3810291588306427, "learning_rate": 8.876520541620409e-07, "loss": 0.3803, "step": 4417 }, { "epoch": 2.480628860190904, "grad_norm": 0.36459437012672424, "learning_rate": 8.857946059984573e-07, "loss": 0.3826, "step": 4418 }, { "epoch": 2.481190342504211, "grad_norm": 0.47706273198127747, "learning_rate": 8.839389143697669e-07, "loss": 0.3876, "step": 4419 }, { "epoch": 2.4817518248175183, "grad_norm": 0.4422367215156555, "learning_rate": 8.820849800682441e-07, "loss": 0.347, "step": 4420 }, { "epoch": 2.482313307130825, "grad_norm": 0.4127006232738495, "learning_rate": 8.802328038854175e-07, "loss": 0.3665, "step": 4421 }, { "epoch": 2.4828747894441325, "grad_norm": 0.38690876960754395, "learning_rate": 8.783823866120589e-07, "loss": 0.3814, "step": 4422 }, { "epoch": 2.48343627175744, "grad_norm": 0.41556623578071594, "learning_rate": 8.765337290381948e-07, "loss": 0.3506, "step": 4423 }, { "epoch": 2.4839977540707467, "grad_norm": 0.44181984663009644, "learning_rate": 8.746868319530993e-07, "loss": 0.3595, "step": 4424 }, { "epoch": 2.484559236384054, "grad_norm": 0.397203654050827, "learning_rate": 8.728416961452912e-07, "loss": 0.3799, "step": 4425 }, { "epoch": 2.485120718697361, "grad_norm": 0.42826443910598755, "learning_rate": 8.709983224025431e-07, "loss": 0.3889, "step": 4426 }, { "epoch": 2.4856822010106683, "grad_norm": 0.39540359377861023, "learning_rate": 8.691567115118687e-07, "loss": 0.3488, "step": 4427 }, { "epoch": 2.486243683323975, "grad_norm": 0.3892425298690796, "learning_rate": 8.673168642595342e-07, "loss": 0.3693, "step": 4428 }, { "epoch": 2.4868051656372825, "grad_norm": 0.3708018362522125, "learning_rate": 8.654787814310522e-07, "loss": 0.3955, "step": 4429 }, { "epoch": 2.4873666479505894, "grad_norm": 0.43452131748199463, "learning_rate": 8.636424638111796e-07, "loss": 0.332, "step": 4430 }, { "epoch": 2.4879281302638967, "grad_norm": 0.38027796149253845, "learning_rate": 8.618079121839218e-07, "loss": 0.3521, "step": 4431 }, { "epoch": 2.4884896125772036, "grad_norm": 0.39416205883026123, "learning_rate": 8.59975127332528e-07, "loss": 0.3765, "step": 4432 }, { "epoch": 2.489051094890511, "grad_norm": 0.38424453139305115, "learning_rate": 8.581441100394955e-07, "loss": 0.3544, "step": 4433 }, { "epoch": 2.4896125772038182, "grad_norm": 0.3914647400379181, "learning_rate": 8.563148610865674e-07, "loss": 0.3962, "step": 4434 }, { "epoch": 2.490174059517125, "grad_norm": 0.4056921899318695, "learning_rate": 8.544873812547272e-07, "loss": 0.3686, "step": 4435 }, { "epoch": 2.4907355418304324, "grad_norm": 0.3655182719230652, "learning_rate": 8.526616713242098e-07, "loss": 0.3587, "step": 4436 }, { "epoch": 2.4912970241437393, "grad_norm": 0.41270387172698975, "learning_rate": 8.508377320744876e-07, "loss": 0.3832, "step": 4437 }, { "epoch": 2.4918585064570467, "grad_norm": 0.40703558921813965, "learning_rate": 8.490155642842818e-07, "loss": 0.3669, "step": 4438 }, { "epoch": 2.4924199887703535, "grad_norm": 0.3660275936126709, "learning_rate": 8.471951687315571e-07, "loss": 0.3738, "step": 4439 }, { "epoch": 2.492981471083661, "grad_norm": 0.355326771736145, "learning_rate": 8.453765461935182e-07, "loss": 0.3691, "step": 4440 }, { "epoch": 2.493542953396968, "grad_norm": 0.42080020904541016, "learning_rate": 8.435596974466165e-07, "loss": 0.3793, "step": 4441 }, { "epoch": 2.494104435710275, "grad_norm": 0.35030075907707214, "learning_rate": 8.417446232665416e-07, "loss": 0.4133, "step": 4442 }, { "epoch": 2.4946659180235824, "grad_norm": 0.4544270634651184, "learning_rate": 8.399313244282326e-07, "loss": 0.364, "step": 4443 }, { "epoch": 2.4952274003368893, "grad_norm": 0.3721926808357239, "learning_rate": 8.381198017058634e-07, "loss": 0.3648, "step": 4444 }, { "epoch": 2.4957888826501966, "grad_norm": 0.4072818160057068, "learning_rate": 8.363100558728543e-07, "loss": 0.3496, "step": 4445 }, { "epoch": 2.4963503649635035, "grad_norm": 0.4060557186603546, "learning_rate": 8.345020877018633e-07, "loss": 0.3765, "step": 4446 }, { "epoch": 2.496911847276811, "grad_norm": 0.38404005765914917, "learning_rate": 8.326958979647926e-07, "loss": 0.3404, "step": 4447 }, { "epoch": 2.497473329590118, "grad_norm": 0.38668566942214966, "learning_rate": 8.30891487432785e-07, "loss": 0.3592, "step": 4448 }, { "epoch": 2.498034811903425, "grad_norm": 0.3998941481113434, "learning_rate": 8.290888568762201e-07, "loss": 0.3707, "step": 4449 }, { "epoch": 2.4985962942167323, "grad_norm": 0.391815721988678, "learning_rate": 8.272880070647221e-07, "loss": 0.355, "step": 4450 }, { "epoch": 2.4991577765300392, "grad_norm": 0.41935500502586365, "learning_rate": 8.25488938767151e-07, "loss": 0.3763, "step": 4451 }, { "epoch": 2.4997192588433466, "grad_norm": 0.408460408449173, "learning_rate": 8.236916527516092e-07, "loss": 0.3296, "step": 4452 }, { "epoch": 2.5002807411566534, "grad_norm": 0.3863939642906189, "learning_rate": 8.218961497854378e-07, "loss": 0.3764, "step": 4453 }, { "epoch": 2.5008422234699608, "grad_norm": 0.4112633168697357, "learning_rate": 8.201024306352134e-07, "loss": 0.3906, "step": 4454 }, { "epoch": 2.501403705783268, "grad_norm": 0.39354878664016724, "learning_rate": 8.183104960667565e-07, "loss": 0.3931, "step": 4455 }, { "epoch": 2.501965188096575, "grad_norm": 0.3655575215816498, "learning_rate": 8.165203468451193e-07, "loss": 0.37, "step": 4456 }, { "epoch": 2.502526670409882, "grad_norm": 0.38277480006217957, "learning_rate": 8.147319837345974e-07, "loss": 0.3938, "step": 4457 }, { "epoch": 2.503088152723189, "grad_norm": 0.3867373466491699, "learning_rate": 8.129454074987219e-07, "loss": 0.3763, "step": 4458 }, { "epoch": 2.5036496350364965, "grad_norm": 0.4007774591445923, "learning_rate": 8.111606189002585e-07, "loss": 0.328, "step": 4459 }, { "epoch": 2.5042111173498034, "grad_norm": 0.3716161549091339, "learning_rate": 8.093776187012136e-07, "loss": 0.3691, "step": 4460 }, { "epoch": 2.5047725996631107, "grad_norm": 0.3984174132347107, "learning_rate": 8.075964076628273e-07, "loss": 0.3627, "step": 4461 }, { "epoch": 2.5053340819764176, "grad_norm": 0.41282007098197937, "learning_rate": 8.058169865455768e-07, "loss": 0.3437, "step": 4462 }, { "epoch": 2.505895564289725, "grad_norm": 0.407710999250412, "learning_rate": 8.040393561091764e-07, "loss": 0.3727, "step": 4463 }, { "epoch": 2.506457046603032, "grad_norm": 0.4043843448162079, "learning_rate": 8.022635171125725e-07, "loss": 0.3508, "step": 4464 }, { "epoch": 2.507018528916339, "grad_norm": 0.3651256263256073, "learning_rate": 8.00489470313951e-07, "loss": 0.3725, "step": 4465 }, { "epoch": 2.5075800112296465, "grad_norm": 0.37716180086135864, "learning_rate": 7.987172164707274e-07, "loss": 0.3313, "step": 4466 }, { "epoch": 2.5081414935429533, "grad_norm": 0.40144529938697815, "learning_rate": 7.969467563395572e-07, "loss": 0.3562, "step": 4467 }, { "epoch": 2.5087029758562607, "grad_norm": 0.4037589728832245, "learning_rate": 7.951780906763274e-07, "loss": 0.3554, "step": 4468 }, { "epoch": 2.5092644581695676, "grad_norm": 0.37255367636680603, "learning_rate": 7.934112202361577e-07, "loss": 0.3829, "step": 4469 }, { "epoch": 2.509825940482875, "grad_norm": 0.36713582277297974, "learning_rate": 7.916461457734043e-07, "loss": 0.3591, "step": 4470 }, { "epoch": 2.5103874227961818, "grad_norm": 0.3756272792816162, "learning_rate": 7.898828680416526e-07, "loss": 0.3909, "step": 4471 }, { "epoch": 2.510948905109489, "grad_norm": 0.3889174461364746, "learning_rate": 7.881213877937249e-07, "loss": 0.3625, "step": 4472 }, { "epoch": 2.5115103874227964, "grad_norm": 0.34808143973350525, "learning_rate": 7.863617057816752e-07, "loss": 0.3822, "step": 4473 }, { "epoch": 2.5120718697361033, "grad_norm": 0.4198404848575592, "learning_rate": 7.846038227567871e-07, "loss": 0.3662, "step": 4474 }, { "epoch": 2.51263335204941, "grad_norm": 0.3912544548511505, "learning_rate": 7.828477394695799e-07, "loss": 0.3794, "step": 4475 }, { "epoch": 2.5131948343627175, "grad_norm": 0.4461274743080139, "learning_rate": 7.810934566698003e-07, "loss": 0.3783, "step": 4476 }, { "epoch": 2.513756316676025, "grad_norm": 0.35609954595565796, "learning_rate": 7.793409751064307e-07, "loss": 0.3492, "step": 4477 }, { "epoch": 2.5143177989893317, "grad_norm": 0.3761810064315796, "learning_rate": 7.775902955276826e-07, "loss": 0.3795, "step": 4478 }, { "epoch": 2.514879281302639, "grad_norm": 0.3722012937068939, "learning_rate": 7.758414186809959e-07, "loss": 0.373, "step": 4479 }, { "epoch": 2.5154407636159464, "grad_norm": 0.361824095249176, "learning_rate": 7.74094345313045e-07, "loss": 0.3592, "step": 4480 }, { "epoch": 2.5160022459292533, "grad_norm": 0.36059340834617615, "learning_rate": 7.723490761697305e-07, "loss": 0.3653, "step": 4481 }, { "epoch": 2.51656372824256, "grad_norm": 0.36421310901641846, "learning_rate": 7.706056119961852e-07, "loss": 0.3727, "step": 4482 }, { "epoch": 2.5171252105558675, "grad_norm": 0.39096131920814514, "learning_rate": 7.688639535367714e-07, "loss": 0.3694, "step": 4483 }, { "epoch": 2.517686692869175, "grad_norm": 0.4002029001712799, "learning_rate": 7.671241015350778e-07, "loss": 0.3919, "step": 4484 }, { "epoch": 2.5182481751824817, "grad_norm": 0.3652794361114502, "learning_rate": 7.653860567339255e-07, "loss": 0.3586, "step": 4485 }, { "epoch": 2.518809657495789, "grad_norm": 0.3739278316497803, "learning_rate": 7.6364981987536e-07, "loss": 0.3649, "step": 4486 }, { "epoch": 2.519371139809096, "grad_norm": 0.3684339225292206, "learning_rate": 7.619153917006583e-07, "loss": 0.3734, "step": 4487 }, { "epoch": 2.519932622122403, "grad_norm": 0.423453152179718, "learning_rate": 7.601827729503252e-07, "loss": 0.3649, "step": 4488 }, { "epoch": 2.52049410443571, "grad_norm": 0.39883673191070557, "learning_rate": 7.584519643640886e-07, "loss": 0.3543, "step": 4489 }, { "epoch": 2.5210555867490174, "grad_norm": 0.3873201906681061, "learning_rate": 7.567229666809095e-07, "loss": 0.3614, "step": 4490 }, { "epoch": 2.5216170690623247, "grad_norm": 0.3973695933818817, "learning_rate": 7.549957806389708e-07, "loss": 0.3816, "step": 4491 }, { "epoch": 2.5221785513756316, "grad_norm": 0.39094066619873047, "learning_rate": 7.532704069756846e-07, "loss": 0.3703, "step": 4492 }, { "epoch": 2.522740033688939, "grad_norm": 0.3979533612728119, "learning_rate": 7.515468464276909e-07, "loss": 0.3604, "step": 4493 }, { "epoch": 2.523301516002246, "grad_norm": 0.3990233242511749, "learning_rate": 7.498250997308498e-07, "loss": 0.3642, "step": 4494 }, { "epoch": 2.523862998315553, "grad_norm": 0.37460994720458984, "learning_rate": 7.481051676202538e-07, "loss": 0.3731, "step": 4495 }, { "epoch": 2.52442448062886, "grad_norm": 0.3905060887336731, "learning_rate": 7.463870508302146e-07, "loss": 0.3495, "step": 4496 }, { "epoch": 2.5249859629421674, "grad_norm": 0.4002072513103485, "learning_rate": 7.446707500942729e-07, "loss": 0.3774, "step": 4497 }, { "epoch": 2.5255474452554747, "grad_norm": 0.3520466089248657, "learning_rate": 7.429562661451939e-07, "loss": 0.3463, "step": 4498 }, { "epoch": 2.5261089275687816, "grad_norm": 0.4089442491531372, "learning_rate": 7.412435997149642e-07, "loss": 0.3708, "step": 4499 }, { "epoch": 2.5266704098820885, "grad_norm": 0.43684691190719604, "learning_rate": 7.395327515347977e-07, "loss": 0.4009, "step": 4500 }, { "epoch": 2.527231892195396, "grad_norm": 0.3590644896030426, "learning_rate": 7.378237223351292e-07, "loss": 0.386, "step": 4501 }, { "epoch": 2.527793374508703, "grad_norm": 0.4453056752681732, "learning_rate": 7.361165128456194e-07, "loss": 0.3655, "step": 4502 }, { "epoch": 2.52835485682201, "grad_norm": 0.38455432653427124, "learning_rate": 7.344111237951518e-07, "loss": 0.364, "step": 4503 }, { "epoch": 2.5289163391353173, "grad_norm": 0.3533548414707184, "learning_rate": 7.327075559118297e-07, "loss": 0.3805, "step": 4504 }, { "epoch": 2.529477821448624, "grad_norm": 0.38613495230674744, "learning_rate": 7.310058099229828e-07, "loss": 0.367, "step": 4505 }, { "epoch": 2.5300393037619315, "grad_norm": 0.37346363067626953, "learning_rate": 7.293058865551594e-07, "loss": 0.3687, "step": 4506 }, { "epoch": 2.5306007860752384, "grad_norm": 0.4183979332447052, "learning_rate": 7.276077865341324e-07, "loss": 0.3568, "step": 4507 }, { "epoch": 2.5311622683885457, "grad_norm": 0.37481752038002014, "learning_rate": 7.259115105848963e-07, "loss": 0.3572, "step": 4508 }, { "epoch": 2.531723750701853, "grad_norm": 0.3761235773563385, "learning_rate": 7.242170594316639e-07, "loss": 0.3497, "step": 4509 }, { "epoch": 2.53228523301516, "grad_norm": 0.39838141202926636, "learning_rate": 7.225244337978726e-07, "loss": 0.4007, "step": 4510 }, { "epoch": 2.5328467153284673, "grad_norm": 0.4225531816482544, "learning_rate": 7.208336344061767e-07, "loss": 0.3568, "step": 4511 }, { "epoch": 2.533408197641774, "grad_norm": 0.40367937088012695, "learning_rate": 7.191446619784536e-07, "loss": 0.3731, "step": 4512 }, { "epoch": 2.5339696799550815, "grad_norm": 0.4538433253765106, "learning_rate": 7.174575172358006e-07, "loss": 0.4004, "step": 4513 }, { "epoch": 2.5345311622683884, "grad_norm": 0.39057525992393494, "learning_rate": 7.157722008985324e-07, "loss": 0.3832, "step": 4514 }, { "epoch": 2.5350926445816957, "grad_norm": 0.3999640941619873, "learning_rate": 7.140887136861857e-07, "loss": 0.3741, "step": 4515 }, { "epoch": 2.535654126895003, "grad_norm": 0.38090863823890686, "learning_rate": 7.124070563175139e-07, "loss": 0.359, "step": 4516 }, { "epoch": 2.53621560920831, "grad_norm": 0.3674054741859436, "learning_rate": 7.107272295104906e-07, "loss": 0.3707, "step": 4517 }, { "epoch": 2.5367770915216172, "grad_norm": 0.44120660424232483, "learning_rate": 7.090492339823096e-07, "loss": 0.397, "step": 4518 }, { "epoch": 2.537338573834924, "grad_norm": 0.4223296642303467, "learning_rate": 7.073730704493781e-07, "loss": 0.3883, "step": 4519 }, { "epoch": 2.5379000561482314, "grad_norm": 0.4016600549221039, "learning_rate": 7.056987396273257e-07, "loss": 0.4039, "step": 4520 }, { "epoch": 2.5384615384615383, "grad_norm": 0.4121388792991638, "learning_rate": 7.040262422309962e-07, "loss": 0.3976, "step": 4521 }, { "epoch": 2.5390230207748457, "grad_norm": 0.3474540412425995, "learning_rate": 7.023555789744535e-07, "loss": 0.3831, "step": 4522 }, { "epoch": 2.539584503088153, "grad_norm": 0.36493581533432007, "learning_rate": 7.006867505709769e-07, "loss": 0.3612, "step": 4523 }, { "epoch": 2.54014598540146, "grad_norm": 0.38512900471687317, "learning_rate": 6.990197577330621e-07, "loss": 0.3861, "step": 4524 }, { "epoch": 2.5407074677147667, "grad_norm": 0.3958102762699127, "learning_rate": 6.97354601172422e-07, "loss": 0.4009, "step": 4525 }, { "epoch": 2.541268950028074, "grad_norm": 0.4267043471336365, "learning_rate": 6.956912815999839e-07, "loss": 0.3816, "step": 4526 }, { "epoch": 2.5418304323413814, "grad_norm": 0.41569316387176514, "learning_rate": 6.940297997258938e-07, "loss": 0.371, "step": 4527 }, { "epoch": 2.5423919146546883, "grad_norm": 0.40410739183425903, "learning_rate": 6.923701562595092e-07, "loss": 0.3763, "step": 4528 }, { "epoch": 2.5429533969679956, "grad_norm": 0.3803502023220062, "learning_rate": 6.907123519094055e-07, "loss": 0.357, "step": 4529 }, { "epoch": 2.5435148792813025, "grad_norm": 0.3862360119819641, "learning_rate": 6.890563873833739e-07, "loss": 0.3886, "step": 4530 }, { "epoch": 2.54407636159461, "grad_norm": 0.40128666162490845, "learning_rate": 6.874022633884154e-07, "loss": 0.3885, "step": 4531 }, { "epoch": 2.5446378439079167, "grad_norm": 0.35451027750968933, "learning_rate": 6.85749980630751e-07, "loss": 0.3674, "step": 4532 }, { "epoch": 2.545199326221224, "grad_norm": 0.3942285180091858, "learning_rate": 6.840995398158101e-07, "loss": 0.3707, "step": 4533 }, { "epoch": 2.5457608085345313, "grad_norm": 0.37781137228012085, "learning_rate": 6.824509416482394e-07, "loss": 0.3731, "step": 4534 }, { "epoch": 2.5463222908478382, "grad_norm": 0.376055508852005, "learning_rate": 6.808041868318993e-07, "loss": 0.3652, "step": 4535 }, { "epoch": 2.5468837731611456, "grad_norm": 0.3811047077178955, "learning_rate": 6.79159276069859e-07, "loss": 0.3759, "step": 4536 }, { "epoch": 2.5474452554744524, "grad_norm": 0.41242703795433044, "learning_rate": 6.775162100644062e-07, "loss": 0.3557, "step": 4537 }, { "epoch": 2.5480067377877598, "grad_norm": 0.3860129117965698, "learning_rate": 6.758749895170347e-07, "loss": 0.363, "step": 4538 }, { "epoch": 2.5485682201010667, "grad_norm": 0.3977435827255249, "learning_rate": 6.742356151284551e-07, "loss": 0.3509, "step": 4539 }, { "epoch": 2.549129702414374, "grad_norm": 0.41043832898139954, "learning_rate": 6.725980875985899e-07, "loss": 0.3938, "step": 4540 }, { "epoch": 2.5496911847276813, "grad_norm": 0.41317662596702576, "learning_rate": 6.709624076265686e-07, "loss": 0.382, "step": 4541 }, { "epoch": 2.550252667040988, "grad_norm": 0.37569063901901245, "learning_rate": 6.693285759107371e-07, "loss": 0.386, "step": 4542 }, { "epoch": 2.550814149354295, "grad_norm": 0.409391313791275, "learning_rate": 6.676965931486478e-07, "loss": 0.3393, "step": 4543 }, { "epoch": 2.5513756316676024, "grad_norm": 0.40186169743537903, "learning_rate": 6.66066460037067e-07, "loss": 0.3701, "step": 4544 }, { "epoch": 2.5519371139809097, "grad_norm": 0.3790963292121887, "learning_rate": 6.64438177271971e-07, "loss": 0.3673, "step": 4545 }, { "epoch": 2.5524985962942166, "grad_norm": 0.39030689001083374, "learning_rate": 6.628117455485434e-07, "loss": 0.3472, "step": 4546 }, { "epoch": 2.553060078607524, "grad_norm": 0.39446520805358887, "learning_rate": 6.611871655611807e-07, "loss": 0.3911, "step": 4547 }, { "epoch": 2.5536215609208313, "grad_norm": 0.3840888738632202, "learning_rate": 6.59564438003486e-07, "loss": 0.3403, "step": 4548 }, { "epoch": 2.554183043234138, "grad_norm": 0.40875113010406494, "learning_rate": 6.579435635682735e-07, "loss": 0.3698, "step": 4549 }, { "epoch": 2.554744525547445, "grad_norm": 0.40355628728866577, "learning_rate": 6.563245429475673e-07, "loss": 0.391, "step": 4550 }, { "epoch": 2.5553060078607523, "grad_norm": 0.3829365372657776, "learning_rate": 6.547073768325956e-07, "loss": 0.3949, "step": 4551 }, { "epoch": 2.5558674901740597, "grad_norm": 0.3903424143791199, "learning_rate": 6.530920659137996e-07, "loss": 0.3886, "step": 4552 }, { "epoch": 2.5564289724873666, "grad_norm": 0.42156457901000977, "learning_rate": 6.514786108808252e-07, "loss": 0.4043, "step": 4553 }, { "epoch": 2.556990454800674, "grad_norm": 0.4029518663883209, "learning_rate": 6.498670124225276e-07, "loss": 0.4061, "step": 4554 }, { "epoch": 2.5575519371139808, "grad_norm": 0.38258931040763855, "learning_rate": 6.482572712269702e-07, "loss": 0.3812, "step": 4555 }, { "epoch": 2.558113419427288, "grad_norm": 0.39796358346939087, "learning_rate": 6.466493879814201e-07, "loss": 0.3925, "step": 4556 }, { "epoch": 2.558674901740595, "grad_norm": 0.40095919370651245, "learning_rate": 6.450433633723557e-07, "loss": 0.3798, "step": 4557 }, { "epoch": 2.5592363840539023, "grad_norm": 0.4295879304409027, "learning_rate": 6.434391980854559e-07, "loss": 0.3694, "step": 4558 }, { "epoch": 2.5597978663672096, "grad_norm": 0.41227978467941284, "learning_rate": 6.418368928056123e-07, "loss": 0.3874, "step": 4559 }, { "epoch": 2.5603593486805165, "grad_norm": 0.41139766573905945, "learning_rate": 6.402364482169188e-07, "loss": 0.3651, "step": 4560 }, { "epoch": 2.560920830993824, "grad_norm": 0.38880011439323425, "learning_rate": 6.386378650026742e-07, "loss": 0.3914, "step": 4561 }, { "epoch": 2.5614823133071307, "grad_norm": 0.40460628271102905, "learning_rate": 6.370411438453855e-07, "loss": 0.378, "step": 4562 }, { "epoch": 2.562043795620438, "grad_norm": 0.37005814909935, "learning_rate": 6.354462854267612e-07, "loss": 0.3523, "step": 4563 }, { "epoch": 2.562605277933745, "grad_norm": 0.39378201961517334, "learning_rate": 6.338532904277178e-07, "loss": 0.4046, "step": 4564 }, { "epoch": 2.5631667602470523, "grad_norm": 0.3619510233402252, "learning_rate": 6.322621595283752e-07, "loss": 0.3616, "step": 4565 }, { "epoch": 2.5637282425603596, "grad_norm": 0.38412752747535706, "learning_rate": 6.306728934080552e-07, "loss": 0.3747, "step": 4566 }, { "epoch": 2.5642897248736665, "grad_norm": 0.36508044600486755, "learning_rate": 6.290854927452872e-07, "loss": 0.3734, "step": 4567 }, { "epoch": 2.5648512071869733, "grad_norm": 0.39637282490730286, "learning_rate": 6.274999582178004e-07, "loss": 0.3884, "step": 4568 }, { "epoch": 2.5654126895002807, "grad_norm": 0.4080197513103485, "learning_rate": 6.259162905025301e-07, "loss": 0.3551, "step": 4569 }, { "epoch": 2.565974171813588, "grad_norm": 0.3795369863510132, "learning_rate": 6.243344902756148e-07, "loss": 0.3868, "step": 4570 }, { "epoch": 2.566535654126895, "grad_norm": 0.41184499859809875, "learning_rate": 6.227545582123917e-07, "loss": 0.4021, "step": 4571 }, { "epoch": 2.567097136440202, "grad_norm": 0.3986143469810486, "learning_rate": 6.21176494987406e-07, "loss": 0.3589, "step": 4572 }, { "epoch": 2.5676586187535095, "grad_norm": 0.3753019869327545, "learning_rate": 6.196003012743995e-07, "loss": 0.3616, "step": 4573 }, { "epoch": 2.5682201010668164, "grad_norm": 0.4043245315551758, "learning_rate": 6.180259777463199e-07, "loss": 0.3552, "step": 4574 }, { "epoch": 2.5687815833801233, "grad_norm": 0.39795371890068054, "learning_rate": 6.164535250753156e-07, "loss": 0.414, "step": 4575 }, { "epoch": 2.5693430656934306, "grad_norm": 0.3849906623363495, "learning_rate": 6.148829439327341e-07, "loss": 0.3772, "step": 4576 }, { "epoch": 2.569904548006738, "grad_norm": 0.35056501626968384, "learning_rate": 6.133142349891275e-07, "loss": 0.3742, "step": 4577 }, { "epoch": 2.570466030320045, "grad_norm": 0.39419791102409363, "learning_rate": 6.117473989142442e-07, "loss": 0.3535, "step": 4578 }, { "epoch": 2.571027512633352, "grad_norm": 0.40138596296310425, "learning_rate": 6.101824363770364e-07, "loss": 0.3561, "step": 4579 }, { "epoch": 2.571588994946659, "grad_norm": 0.3633236885070801, "learning_rate": 6.086193480456565e-07, "loss": 0.3551, "step": 4580 }, { "epoch": 2.5721504772599664, "grad_norm": 0.3685232996940613, "learning_rate": 6.070581345874527e-07, "loss": 0.3681, "step": 4581 }, { "epoch": 2.5727119595732733, "grad_norm": 0.4262574315071106, "learning_rate": 6.05498796668979e-07, "loss": 0.3892, "step": 4582 }, { "epoch": 2.5732734418865806, "grad_norm": 0.42237526178359985, "learning_rate": 6.03941334955982e-07, "loss": 0.3848, "step": 4583 }, { "epoch": 2.573834924199888, "grad_norm": 0.36599472165107727, "learning_rate": 6.023857501134117e-07, "loss": 0.354, "step": 4584 }, { "epoch": 2.574396406513195, "grad_norm": 0.3963872194290161, "learning_rate": 6.008320428054176e-07, "loss": 0.364, "step": 4585 }, { "epoch": 2.574957888826502, "grad_norm": 0.3917140066623688, "learning_rate": 5.992802136953419e-07, "loss": 0.4074, "step": 4586 }, { "epoch": 2.575519371139809, "grad_norm": 0.3903845250606537, "learning_rate": 5.977302634457316e-07, "loss": 0.3729, "step": 4587 }, { "epoch": 2.5760808534531163, "grad_norm": 0.37903907895088196, "learning_rate": 5.961821927183265e-07, "loss": 0.3531, "step": 4588 }, { "epoch": 2.576642335766423, "grad_norm": 0.43257445096969604, "learning_rate": 5.946360021740666e-07, "loss": 0.3672, "step": 4589 }, { "epoch": 2.5772038180797305, "grad_norm": 0.37675371766090393, "learning_rate": 5.930916924730895e-07, "loss": 0.353, "step": 4590 }, { "epoch": 2.577765300393038, "grad_norm": 0.3847522735595703, "learning_rate": 5.915492642747273e-07, "loss": 0.3598, "step": 4591 }, { "epoch": 2.5783267827063447, "grad_norm": 0.39511391520500183, "learning_rate": 5.900087182375119e-07, "loss": 0.3634, "step": 4592 }, { "epoch": 2.5788882650196516, "grad_norm": 0.37792763113975525, "learning_rate": 5.88470055019168e-07, "loss": 0.3927, "step": 4593 }, { "epoch": 2.579449747332959, "grad_norm": 0.36911627650260925, "learning_rate": 5.869332752766194e-07, "loss": 0.3872, "step": 4594 }, { "epoch": 2.5800112296462663, "grad_norm": 0.3872385621070862, "learning_rate": 5.853983796659862e-07, "loss": 0.4161, "step": 4595 }, { "epoch": 2.580572711959573, "grad_norm": 0.43597936630249023, "learning_rate": 5.838653688425805e-07, "loss": 0.3904, "step": 4596 }, { "epoch": 2.5811341942728805, "grad_norm": 0.3709871172904968, "learning_rate": 5.823342434609142e-07, "loss": 0.3921, "step": 4597 }, { "epoch": 2.5816956765861874, "grad_norm": 0.37966254353523254, "learning_rate": 5.808050041746893e-07, "loss": 0.3464, "step": 4598 }, { "epoch": 2.5822571588994947, "grad_norm": 0.3888424336910248, "learning_rate": 5.79277651636807e-07, "loss": 0.3914, "step": 4599 }, { "epoch": 2.5828186412128016, "grad_norm": 0.3683303892612457, "learning_rate": 5.777521864993613e-07, "loss": 0.3696, "step": 4600 }, { "epoch": 2.583380123526109, "grad_norm": 0.38851767778396606, "learning_rate": 5.762286094136388e-07, "loss": 0.3655, "step": 4601 }, { "epoch": 2.5839416058394162, "grad_norm": 0.3790731728076935, "learning_rate": 5.747069210301237e-07, "loss": 0.3425, "step": 4602 }, { "epoch": 2.584503088152723, "grad_norm": 0.37242889404296875, "learning_rate": 5.731871219984886e-07, "loss": 0.3664, "step": 4603 }, { "epoch": 2.5850645704660304, "grad_norm": 0.3973209857940674, "learning_rate": 5.716692129676044e-07, "loss": 0.3665, "step": 4604 }, { "epoch": 2.5856260527793373, "grad_norm": 0.3646850287914276, "learning_rate": 5.701531945855332e-07, "loss": 0.3584, "step": 4605 }, { "epoch": 2.5861875350926447, "grad_norm": 0.39416810870170593, "learning_rate": 5.68639067499529e-07, "loss": 0.3836, "step": 4606 }, { "epoch": 2.5867490174059515, "grad_norm": 0.3916473686695099, "learning_rate": 5.671268323560397e-07, "loss": 0.3534, "step": 4607 }, { "epoch": 2.587310499719259, "grad_norm": 0.4023967981338501, "learning_rate": 5.656164898007039e-07, "loss": 0.3659, "step": 4608 }, { "epoch": 2.587871982032566, "grad_norm": 0.41172876954078674, "learning_rate": 5.641080404783539e-07, "loss": 0.3548, "step": 4609 }, { "epoch": 2.588433464345873, "grad_norm": 0.35319381952285767, "learning_rate": 5.626014850330136e-07, "loss": 0.3578, "step": 4610 }, { "epoch": 2.5889949466591804, "grad_norm": 0.3662619888782501, "learning_rate": 5.610968241078963e-07, "loss": 0.3574, "step": 4611 }, { "epoch": 2.5895564289724873, "grad_norm": 0.4355729818344116, "learning_rate": 5.595940583454091e-07, "loss": 0.3719, "step": 4612 }, { "epoch": 2.5901179112857946, "grad_norm": 0.35370078682899475, "learning_rate": 5.580931883871471e-07, "loss": 0.4007, "step": 4613 }, { "epoch": 2.5906793935991015, "grad_norm": 0.38077858090400696, "learning_rate": 5.565942148738995e-07, "loss": 0.3807, "step": 4614 }, { "epoch": 2.591240875912409, "grad_norm": 0.37898916006088257, "learning_rate": 5.550971384456416e-07, "loss": 0.3489, "step": 4615 }, { "epoch": 2.591802358225716, "grad_norm": 0.39589938521385193, "learning_rate": 5.536019597415426e-07, "loss": 0.3757, "step": 4616 }, { "epoch": 2.592363840539023, "grad_norm": 0.3894980251789093, "learning_rate": 5.521086793999608e-07, "loss": 0.3548, "step": 4617 }, { "epoch": 2.59292532285233, "grad_norm": 0.39076563715934753, "learning_rate": 5.506172980584406e-07, "loss": 0.3716, "step": 4618 }, { "epoch": 2.5934868051656372, "grad_norm": 0.3726377487182617, "learning_rate": 5.491278163537206e-07, "loss": 0.3785, "step": 4619 }, { "epoch": 2.5940482874789446, "grad_norm": 0.38418084383010864, "learning_rate": 5.476402349217241e-07, "loss": 0.3793, "step": 4620 }, { "epoch": 2.5946097697922514, "grad_norm": 0.3873618543148041, "learning_rate": 5.461545543975655e-07, "loss": 0.374, "step": 4621 }, { "epoch": 2.5951712521055588, "grad_norm": 0.3945091664791107, "learning_rate": 5.446707754155478e-07, "loss": 0.3587, "step": 4622 }, { "epoch": 2.5957327344188657, "grad_norm": 0.40265849232673645, "learning_rate": 5.431888986091605e-07, "loss": 0.38, "step": 4623 }, { "epoch": 2.596294216732173, "grad_norm": 0.3897704780101776, "learning_rate": 5.417089246110824e-07, "loss": 0.3495, "step": 4624 }, { "epoch": 2.59685569904548, "grad_norm": 0.39613351225852966, "learning_rate": 5.402308540531787e-07, "loss": 0.3431, "step": 4625 }, { "epoch": 2.597417181358787, "grad_norm": 0.4130885601043701, "learning_rate": 5.387546875665028e-07, "loss": 0.373, "step": 4626 }, { "epoch": 2.5979786636720945, "grad_norm": 0.40771156549453735, "learning_rate": 5.372804257812964e-07, "loss": 0.3882, "step": 4627 }, { "epoch": 2.5985401459854014, "grad_norm": 0.392876535654068, "learning_rate": 5.35808069326984e-07, "loss": 0.3694, "step": 4628 }, { "epoch": 2.5991016282987087, "grad_norm": 0.4370930790901184, "learning_rate": 5.343376188321819e-07, "loss": 0.3977, "step": 4629 }, { "epoch": 2.5996631106120156, "grad_norm": 0.38676297664642334, "learning_rate": 5.328690749246878e-07, "loss": 0.3713, "step": 4630 }, { "epoch": 2.600224592925323, "grad_norm": 0.37843191623687744, "learning_rate": 5.31402438231488e-07, "loss": 0.364, "step": 4631 }, { "epoch": 2.60078607523863, "grad_norm": 0.35840925574302673, "learning_rate": 5.299377093787555e-07, "loss": 0.3508, "step": 4632 }, { "epoch": 2.601347557551937, "grad_norm": 0.38232895731925964, "learning_rate": 5.284748889918456e-07, "loss": 0.3657, "step": 4633 }, { "epoch": 2.6019090398652445, "grad_norm": 0.36044394969940186, "learning_rate": 5.270139776953026e-07, "loss": 0.3915, "step": 4634 }, { "epoch": 2.6024705221785513, "grad_norm": 0.35498958826065063, "learning_rate": 5.255549761128509e-07, "loss": 0.3691, "step": 4635 }, { "epoch": 2.6030320044918582, "grad_norm": 0.3584323227405548, "learning_rate": 5.240978848674039e-07, "loss": 0.3752, "step": 4636 }, { "epoch": 2.6035934868051656, "grad_norm": 0.36285650730133057, "learning_rate": 5.226427045810583e-07, "loss": 0.365, "step": 4637 }, { "epoch": 2.604154969118473, "grad_norm": 0.3978897035121918, "learning_rate": 5.211894358750935e-07, "loss": 0.3774, "step": 4638 }, { "epoch": 2.6047164514317798, "grad_norm": 0.3848741054534912, "learning_rate": 5.197380793699747e-07, "loss": 0.3949, "step": 4639 }, { "epoch": 2.605277933745087, "grad_norm": 0.3911392092704773, "learning_rate": 5.182886356853484e-07, "loss": 0.3455, "step": 4640 }, { "epoch": 2.6058394160583944, "grad_norm": 0.39880698919296265, "learning_rate": 5.168411054400457e-07, "loss": 0.3589, "step": 4641 }, { "epoch": 2.6064008983717013, "grad_norm": 0.4017636775970459, "learning_rate": 5.15395489252083e-07, "loss": 0.4083, "step": 4642 }, { "epoch": 2.606962380685008, "grad_norm": 0.38900279998779297, "learning_rate": 5.139517877386546e-07, "loss": 0.3884, "step": 4643 }, { "epoch": 2.6075238629983155, "grad_norm": 0.3693084418773651, "learning_rate": 5.125100015161427e-07, "loss": 0.3733, "step": 4644 }, { "epoch": 2.608085345311623, "grad_norm": 0.40364840626716614, "learning_rate": 5.11070131200106e-07, "loss": 0.381, "step": 4645 }, { "epoch": 2.6086468276249297, "grad_norm": 0.3761065602302551, "learning_rate": 5.096321774052903e-07, "loss": 0.3875, "step": 4646 }, { "epoch": 2.609208309938237, "grad_norm": 0.3596464991569519, "learning_rate": 5.081961407456221e-07, "loss": 0.3347, "step": 4647 }, { "epoch": 2.609769792251544, "grad_norm": 0.3855254650115967, "learning_rate": 5.067620218342068e-07, "loss": 0.4067, "step": 4648 }, { "epoch": 2.6103312745648513, "grad_norm": 0.3731272220611572, "learning_rate": 5.053298212833335e-07, "loss": 0.3958, "step": 4649 }, { "epoch": 2.610892756878158, "grad_norm": 0.36950334906578064, "learning_rate": 5.038995397044705e-07, "loss": 0.3592, "step": 4650 }, { "epoch": 2.6114542391914655, "grad_norm": 0.36938270926475525, "learning_rate": 5.024711777082686e-07, "loss": 0.3926, "step": 4651 }, { "epoch": 2.612015721504773, "grad_norm": 0.4159104824066162, "learning_rate": 5.010447359045589e-07, "loss": 0.3513, "step": 4652 }, { "epoch": 2.6125772038180797, "grad_norm": 0.38555601239204407, "learning_rate": 4.996202149023505e-07, "loss": 0.3713, "step": 4653 }, { "epoch": 2.613138686131387, "grad_norm": 0.3723369538784027, "learning_rate": 4.981976153098355e-07, "loss": 0.3576, "step": 4654 }, { "epoch": 2.613700168444694, "grad_norm": 0.3812188506126404, "learning_rate": 4.967769377343829e-07, "loss": 0.3313, "step": 4655 }, { "epoch": 2.614261650758001, "grad_norm": 0.35964545607566833, "learning_rate": 4.953581827825421e-07, "loss": 0.3769, "step": 4656 }, { "epoch": 2.614823133071308, "grad_norm": 0.381626158952713, "learning_rate": 4.939413510600444e-07, "loss": 0.3691, "step": 4657 }, { "epoch": 2.6153846153846154, "grad_norm": 0.41381409764289856, "learning_rate": 4.925264431717941e-07, "loss": 0.3688, "step": 4658 }, { "epoch": 2.6159460976979227, "grad_norm": 0.3658832907676697, "learning_rate": 4.911134597218798e-07, "loss": 0.3858, "step": 4659 }, { "epoch": 2.6165075800112296, "grad_norm": 0.3750095069408417, "learning_rate": 4.897024013135648e-07, "loss": 0.408, "step": 4660 }, { "epoch": 2.6170690623245365, "grad_norm": 0.409168004989624, "learning_rate": 4.882932685492925e-07, "loss": 0.3802, "step": 4661 }, { "epoch": 2.617630544637844, "grad_norm": 0.3916269540786743, "learning_rate": 4.868860620306842e-07, "loss": 0.3894, "step": 4662 }, { "epoch": 2.618192026951151, "grad_norm": 0.41442450881004333, "learning_rate": 4.854807823585367e-07, "loss": 0.3665, "step": 4663 }, { "epoch": 2.618753509264458, "grad_norm": 0.3781140148639679, "learning_rate": 4.840774301328272e-07, "loss": 0.375, "step": 4664 }, { "epoch": 2.6193149915777654, "grad_norm": 0.3697752356529236, "learning_rate": 4.826760059527064e-07, "loss": 0.3486, "step": 4665 }, { "epoch": 2.6198764738910727, "grad_norm": 0.37577909231185913, "learning_rate": 4.812765104165046e-07, "loss": 0.3651, "step": 4666 }, { "epoch": 2.6204379562043796, "grad_norm": 0.38997119665145874, "learning_rate": 4.798789441217294e-07, "loss": 0.346, "step": 4667 }, { "epoch": 2.6209994385176865, "grad_norm": 0.3878112733364105, "learning_rate": 4.784833076650613e-07, "loss": 0.3717, "step": 4668 }, { "epoch": 2.621560920830994, "grad_norm": 0.40804120898246765, "learning_rate": 4.770896016423599e-07, "loss": 0.3539, "step": 4669 }, { "epoch": 2.622122403144301, "grad_norm": 0.36673036217689514, "learning_rate": 4.7569782664865837e-07, "loss": 0.3816, "step": 4670 }, { "epoch": 2.622683885457608, "grad_norm": 0.4096638262271881, "learning_rate": 4.7430798327816716e-07, "loss": 0.3784, "step": 4671 }, { "epoch": 2.6232453677709153, "grad_norm": 0.3734525442123413, "learning_rate": 4.7292007212427273e-07, "loss": 0.3656, "step": 4672 }, { "epoch": 2.623806850084222, "grad_norm": 0.3809833526611328, "learning_rate": 4.7153409377953264e-07, "loss": 0.3993, "step": 4673 }, { "epoch": 2.6243683323975295, "grad_norm": 0.35484132170677185, "learning_rate": 4.7015004883568416e-07, "loss": 0.3688, "step": 4674 }, { "epoch": 2.6249298147108364, "grad_norm": 0.3570456802845001, "learning_rate": 4.6876793788363516e-07, "loss": 0.3742, "step": 4675 }, { "epoch": 2.6254912970241437, "grad_norm": 0.38554108142852783, "learning_rate": 4.673877615134703e-07, "loss": 0.3888, "step": 4676 }, { "epoch": 2.626052779337451, "grad_norm": 0.3676082491874695, "learning_rate": 4.6600952031444826e-07, "loss": 0.37, "step": 4677 }, { "epoch": 2.626614261650758, "grad_norm": 0.3873608112335205, "learning_rate": 4.646332148749988e-07, "loss": 0.3725, "step": 4678 }, { "epoch": 2.6271757439640653, "grad_norm": 0.36516353487968445, "learning_rate": 4.632588457827297e-07, "loss": 0.3859, "step": 4679 }, { "epoch": 2.627737226277372, "grad_norm": 0.39586853981018066, "learning_rate": 4.618864136244167e-07, "loss": 0.3733, "step": 4680 }, { "epoch": 2.6282987085906795, "grad_norm": 0.3804961144924164, "learning_rate": 4.60515918986012e-07, "loss": 0.3615, "step": 4681 }, { "epoch": 2.6288601909039864, "grad_norm": 0.3642585873603821, "learning_rate": 4.591473624526421e-07, "loss": 0.4014, "step": 4682 }, { "epoch": 2.6294216732172937, "grad_norm": 0.3971051871776581, "learning_rate": 4.5778074460860134e-07, "loss": 0.3493, "step": 4683 }, { "epoch": 2.629983155530601, "grad_norm": 0.41935452818870544, "learning_rate": 4.5641606603736064e-07, "loss": 0.358, "step": 4684 }, { "epoch": 2.630544637843908, "grad_norm": 0.38088274002075195, "learning_rate": 4.550533273215596e-07, "loss": 0.3792, "step": 4685 }, { "epoch": 2.631106120157215, "grad_norm": 0.38121333718299866, "learning_rate": 4.5369252904301164e-07, "loss": 0.3633, "step": 4686 }, { "epoch": 2.631667602470522, "grad_norm": 0.36437925696372986, "learning_rate": 4.523336717827026e-07, "loss": 0.3832, "step": 4687 }, { "epoch": 2.6322290847838294, "grad_norm": 0.38126006722450256, "learning_rate": 4.509767561207862e-07, "loss": 0.3806, "step": 4688 }, { "epoch": 2.6327905670971363, "grad_norm": 0.3994176983833313, "learning_rate": 4.496217826365917e-07, "loss": 0.3887, "step": 4689 }, { "epoch": 2.6333520494104437, "grad_norm": 0.4026045501232147, "learning_rate": 4.482687519086143e-07, "loss": 0.4116, "step": 4690 }, { "epoch": 2.633913531723751, "grad_norm": 0.38685837388038635, "learning_rate": 4.46917664514524e-07, "loss": 0.3671, "step": 4691 }, { "epoch": 2.634475014037058, "grad_norm": 0.35219258069992065, "learning_rate": 4.4556852103115887e-07, "loss": 0.3759, "step": 4692 }, { "epoch": 2.6350364963503647, "grad_norm": 0.4063318371772766, "learning_rate": 4.442213220345265e-07, "loss": 0.3685, "step": 4693 }, { "epoch": 2.635597978663672, "grad_norm": 0.3796703517436981, "learning_rate": 4.428760680998073e-07, "loss": 0.3652, "step": 4694 }, { "epoch": 2.6361594609769794, "grad_norm": 0.360767126083374, "learning_rate": 4.415327598013469e-07, "loss": 0.3632, "step": 4695 }, { "epoch": 2.6367209432902863, "grad_norm": 0.34594103693962097, "learning_rate": 4.4019139771266373e-07, "loss": 0.382, "step": 4696 }, { "epoch": 2.6372824256035936, "grad_norm": 0.38451215624809265, "learning_rate": 4.3885198240644524e-07, "loss": 0.3866, "step": 4697 }, { "epoch": 2.6378439079169005, "grad_norm": 0.3874061703681946, "learning_rate": 4.3751451445454453e-07, "loss": 0.3741, "step": 4698 }, { "epoch": 2.638405390230208, "grad_norm": 0.389641672372818, "learning_rate": 4.361789944279876e-07, "loss": 0.3903, "step": 4699 }, { "epoch": 2.6389668725435147, "grad_norm": 0.37347427010536194, "learning_rate": 4.34845422896964e-07, "loss": 0.4086, "step": 4700 }, { "epoch": 2.639528354856822, "grad_norm": 0.4029987156391144, "learning_rate": 4.335138004308359e-07, "loss": 0.3659, "step": 4701 }, { "epoch": 2.6400898371701293, "grad_norm": 0.3806951344013214, "learning_rate": 4.3218412759813034e-07, "loss": 0.3593, "step": 4702 }, { "epoch": 2.6406513194834362, "grad_norm": 0.37749335169792175, "learning_rate": 4.308564049665437e-07, "loss": 0.3555, "step": 4703 }, { "epoch": 2.6412128017967436, "grad_norm": 0.35336631536483765, "learning_rate": 4.2953063310293973e-07, "loss": 0.3772, "step": 4704 }, { "epoch": 2.6417742841100504, "grad_norm": 0.3840005397796631, "learning_rate": 4.282068125733463e-07, "loss": 0.3823, "step": 4705 }, { "epoch": 2.6423357664233578, "grad_norm": 0.38034552335739136, "learning_rate": 4.268849439429634e-07, "loss": 0.3819, "step": 4706 }, { "epoch": 2.6428972487366647, "grad_norm": 0.4028221666812897, "learning_rate": 4.2556502777615194e-07, "loss": 0.3792, "step": 4707 }, { "epoch": 2.643458731049972, "grad_norm": 0.3875294029712677, "learning_rate": 4.242470646364438e-07, "loss": 0.3478, "step": 4708 }, { "epoch": 2.6440202133632793, "grad_norm": 0.3720325529575348, "learning_rate": 4.2293105508653564e-07, "loss": 0.3927, "step": 4709 }, { "epoch": 2.644581695676586, "grad_norm": 0.404727578163147, "learning_rate": 4.216169996882885e-07, "loss": 0.3569, "step": 4710 }, { "epoch": 2.645143177989893, "grad_norm": 0.3841872811317444, "learning_rate": 4.203048990027314e-07, "loss": 0.3931, "step": 4711 }, { "epoch": 2.6457046603032004, "grad_norm": 0.39979273080825806, "learning_rate": 4.1899475359005623e-07, "loss": 0.3892, "step": 4712 }, { "epoch": 2.6462661426165077, "grad_norm": 0.4141218960285187, "learning_rate": 4.1768656400962283e-07, "loss": 0.3713, "step": 4713 }, { "epoch": 2.6468276249298146, "grad_norm": 0.3638370633125305, "learning_rate": 4.1638033081995543e-07, "loss": 0.3402, "step": 4714 }, { "epoch": 2.647389107243122, "grad_norm": 0.4033310115337372, "learning_rate": 4.1507605457874025e-07, "loss": 0.404, "step": 4715 }, { "epoch": 2.647950589556429, "grad_norm": 0.3734534680843353, "learning_rate": 4.1377373584283174e-07, "loss": 0.3601, "step": 4716 }, { "epoch": 2.648512071869736, "grad_norm": 0.37064215540885925, "learning_rate": 4.1247337516824584e-07, "loss": 0.3677, "step": 4717 }, { "epoch": 2.649073554183043, "grad_norm": 0.37314051389694214, "learning_rate": 4.111749731101644e-07, "loss": 0.4062, "step": 4718 }, { "epoch": 2.6496350364963503, "grad_norm": 0.3466314375400543, "learning_rate": 4.0987853022293243e-07, "loss": 0.3505, "step": 4719 }, { "epoch": 2.6501965188096577, "grad_norm": 0.3724139332771301, "learning_rate": 4.0858404706005716e-07, "loss": 0.3807, "step": 4720 }, { "epoch": 2.6507580011229646, "grad_norm": 0.35720115900039673, "learning_rate": 4.0729152417421203e-07, "loss": 0.3681, "step": 4721 }, { "epoch": 2.651319483436272, "grad_norm": 0.3699739873409271, "learning_rate": 4.060009621172295e-07, "loss": 0.3924, "step": 4722 }, { "epoch": 2.6518809657495788, "grad_norm": 0.40552863478660583, "learning_rate": 4.0471236144010886e-07, "loss": 0.3784, "step": 4723 }, { "epoch": 2.652442448062886, "grad_norm": 0.3540946841239929, "learning_rate": 4.034257226930105e-07, "loss": 0.3771, "step": 4724 }, { "epoch": 2.653003930376193, "grad_norm": 0.37629377841949463, "learning_rate": 4.0214104642525507e-07, "loss": 0.3954, "step": 4725 }, { "epoch": 2.6535654126895003, "grad_norm": 0.39279696345329285, "learning_rate": 4.0085833318532995e-07, "loss": 0.3821, "step": 4726 }, { "epoch": 2.6541268950028076, "grad_norm": 0.3916677236557007, "learning_rate": 3.9957758352087984e-07, "loss": 0.3963, "step": 4727 }, { "epoch": 2.6546883773161145, "grad_norm": 0.36508527398109436, "learning_rate": 3.9829879797871284e-07, "loss": 0.3863, "step": 4728 }, { "epoch": 2.655249859629422, "grad_norm": 0.3778470456600189, "learning_rate": 3.9702197710480007e-07, "loss": 0.3935, "step": 4729 }, { "epoch": 2.6558113419427287, "grad_norm": 0.37333157658576965, "learning_rate": 3.957471214442715e-07, "loss": 0.3773, "step": 4730 }, { "epoch": 2.656372824256036, "grad_norm": 0.38986796140670776, "learning_rate": 3.9447423154141907e-07, "loss": 0.3721, "step": 4731 }, { "epoch": 2.656934306569343, "grad_norm": 0.3739251494407654, "learning_rate": 3.932033079396952e-07, "loss": 0.3558, "step": 4732 }, { "epoch": 2.6574957888826503, "grad_norm": 0.385038822889328, "learning_rate": 3.919343511817131e-07, "loss": 0.3617, "step": 4733 }, { "epoch": 2.6580572711959576, "grad_norm": 0.38843676447868347, "learning_rate": 3.906673618092466e-07, "loss": 0.3734, "step": 4734 }, { "epoch": 2.6586187535092645, "grad_norm": 0.38877367973327637, "learning_rate": 3.8940234036322853e-07, "loss": 0.3867, "step": 4735 }, { "epoch": 2.6591802358225713, "grad_norm": 0.40858983993530273, "learning_rate": 3.881392873837525e-07, "loss": 0.39, "step": 4736 }, { "epoch": 2.6597417181358787, "grad_norm": 0.4080926477909088, "learning_rate": 3.8687820341007043e-07, "loss": 0.3737, "step": 4737 }, { "epoch": 2.660303200449186, "grad_norm": 0.39465758204460144, "learning_rate": 3.8561908898059443e-07, "loss": 0.3596, "step": 4738 }, { "epoch": 2.660864682762493, "grad_norm": 0.3767772614955902, "learning_rate": 3.8436194463289777e-07, "loss": 0.3666, "step": 4739 }, { "epoch": 2.6614261650758, "grad_norm": 0.38374531269073486, "learning_rate": 3.8310677090370785e-07, "loss": 0.3723, "step": 4740 }, { "epoch": 2.661987647389107, "grad_norm": 0.38160935044288635, "learning_rate": 3.818535683289154e-07, "loss": 0.3522, "step": 4741 }, { "epoch": 2.6625491297024144, "grad_norm": 0.3925570547580719, "learning_rate": 3.8060233744356634e-07, "loss": 0.4069, "step": 4742 }, { "epoch": 2.6631106120157213, "grad_norm": 0.36221590638160706, "learning_rate": 3.793530787818667e-07, "loss": 0.367, "step": 4743 }, { "epoch": 2.6636720943290286, "grad_norm": 0.4206506907939911, "learning_rate": 3.781057928771803e-07, "loss": 0.3679, "step": 4744 }, { "epoch": 2.664233576642336, "grad_norm": 0.3964281976222992, "learning_rate": 3.7686048026202683e-07, "loss": 0.3752, "step": 4745 }, { "epoch": 2.664795058955643, "grad_norm": 0.38573604822158813, "learning_rate": 3.756171414680865e-07, "loss": 0.3574, "step": 4746 }, { "epoch": 2.66535654126895, "grad_norm": 0.38561582565307617, "learning_rate": 3.743757770261941e-07, "loss": 0.3439, "step": 4747 }, { "epoch": 2.665918023582257, "grad_norm": 0.36613738536834717, "learning_rate": 3.731363874663424e-07, "loss": 0.3732, "step": 4748 }, { "epoch": 2.6664795058955644, "grad_norm": 0.3748065233230591, "learning_rate": 3.718989733176831e-07, "loss": 0.3839, "step": 4749 }, { "epoch": 2.6670409882088713, "grad_norm": 0.39993637800216675, "learning_rate": 3.7066353510852024e-07, "loss": 0.383, "step": 4750 }, { "epoch": 2.6676024705221786, "grad_norm": 0.43113473057746887, "learning_rate": 3.6943007336631866e-07, "loss": 0.4022, "step": 4751 }, { "epoch": 2.668163952835486, "grad_norm": 0.391757994890213, "learning_rate": 3.6819858861769585e-07, "loss": 0.372, "step": 4752 }, { "epoch": 2.668725435148793, "grad_norm": 0.38856399059295654, "learning_rate": 3.6696908138842693e-07, "loss": 0.4078, "step": 4753 }, { "epoch": 2.6692869174620997, "grad_norm": 0.3709568977355957, "learning_rate": 3.6574155220344356e-07, "loss": 0.3537, "step": 4754 }, { "epoch": 2.669848399775407, "grad_norm": 0.3591859042644501, "learning_rate": 3.6451600158683044e-07, "loss": 0.3849, "step": 4755 }, { "epoch": 2.6704098820887143, "grad_norm": 0.43818914890289307, "learning_rate": 3.632924300618301e-07, "loss": 0.3519, "step": 4756 }, { "epoch": 2.670971364402021, "grad_norm": 0.3687063455581665, "learning_rate": 3.6207083815083754e-07, "loss": 0.3704, "step": 4757 }, { "epoch": 2.6715328467153285, "grad_norm": 0.38303446769714355, "learning_rate": 3.608512263754055e-07, "loss": 0.3714, "step": 4758 }, { "epoch": 2.672094329028636, "grad_norm": 0.3813070058822632, "learning_rate": 3.5963359525623966e-07, "loss": 0.3916, "step": 4759 }, { "epoch": 2.6726558113419427, "grad_norm": 0.38097167015075684, "learning_rate": 3.584179453131986e-07, "loss": 0.3965, "step": 4760 }, { "epoch": 2.6732172936552496, "grad_norm": 0.3935537040233612, "learning_rate": 3.5720427706529835e-07, "loss": 0.3747, "step": 4761 }, { "epoch": 2.673778775968557, "grad_norm": 0.3601261377334595, "learning_rate": 3.559925910307066e-07, "loss": 0.3712, "step": 4762 }, { "epoch": 2.6743402582818643, "grad_norm": 0.384395569562912, "learning_rate": 3.547828877267445e-07, "loss": 0.3705, "step": 4763 }, { "epoch": 2.674901740595171, "grad_norm": 0.3978480100631714, "learning_rate": 3.53575167669889e-07, "loss": 0.4083, "step": 4764 }, { "epoch": 2.6754632229084785, "grad_norm": 0.3900682330131531, "learning_rate": 3.523694313757675e-07, "loss": 0.3541, "step": 4765 }, { "epoch": 2.6760247052217854, "grad_norm": 0.4211737811565399, "learning_rate": 3.5116567935916334e-07, "loss": 0.3695, "step": 4766 }, { "epoch": 2.6765861875350927, "grad_norm": 0.3969603180885315, "learning_rate": 3.499639121340093e-07, "loss": 0.3609, "step": 4767 }, { "epoch": 2.6771476698483996, "grad_norm": 0.37369200587272644, "learning_rate": 3.487641302133932e-07, "loss": 0.363, "step": 4768 }, { "epoch": 2.677709152161707, "grad_norm": 0.3799739480018616, "learning_rate": 3.475663341095553e-07, "loss": 0.3384, "step": 4769 }, { "epoch": 2.6782706344750142, "grad_norm": 0.3490656316280365, "learning_rate": 3.463705243338861e-07, "loss": 0.3891, "step": 4770 }, { "epoch": 2.678832116788321, "grad_norm": 0.38178667426109314, "learning_rate": 3.4517670139693035e-07, "loss": 0.3709, "step": 4771 }, { "epoch": 2.6793935991016284, "grad_norm": 0.4222783148288727, "learning_rate": 3.4398486580838273e-07, "loss": 0.3834, "step": 4772 }, { "epoch": 2.6799550814149353, "grad_norm": 0.404717355966568, "learning_rate": 3.427950180770906e-07, "loss": 0.3787, "step": 4773 }, { "epoch": 2.6805165637282427, "grad_norm": 0.3969772756099701, "learning_rate": 3.416071587110531e-07, "loss": 0.3816, "step": 4774 }, { "epoch": 2.6810780460415495, "grad_norm": 0.39196255803108215, "learning_rate": 3.4042128821741736e-07, "loss": 0.3584, "step": 4775 }, { "epoch": 2.681639528354857, "grad_norm": 0.38365209102630615, "learning_rate": 3.3923740710248567e-07, "loss": 0.3673, "step": 4776 }, { "epoch": 2.682201010668164, "grad_norm": 0.3484252691268921, "learning_rate": 3.380555158717075e-07, "loss": 0.3501, "step": 4777 }, { "epoch": 2.682762492981471, "grad_norm": 0.3935635983943939, "learning_rate": 3.368756150296837e-07, "loss": 0.3815, "step": 4778 }, { "epoch": 2.683323975294778, "grad_norm": 0.40964341163635254, "learning_rate": 3.3569770508016797e-07, "loss": 0.3618, "step": 4779 }, { "epoch": 2.6838854576080853, "grad_norm": 0.39022162556648254, "learning_rate": 3.345217865260597e-07, "loss": 0.3619, "step": 4780 }, { "epoch": 2.6844469399213926, "grad_norm": 0.3765307664871216, "learning_rate": 3.333478598694112e-07, "loss": 0.3805, "step": 4781 }, { "epoch": 2.6850084222346995, "grad_norm": 0.3945939540863037, "learning_rate": 3.321759256114226e-07, "loss": 0.3573, "step": 4782 }, { "epoch": 2.685569904548007, "grad_norm": 0.37876516580581665, "learning_rate": 3.310059842524438e-07, "loss": 0.3844, "step": 4783 }, { "epoch": 2.686131386861314, "grad_norm": 0.3771460950374603, "learning_rate": 3.2983803629197566e-07, "loss": 0.3748, "step": 4784 }, { "epoch": 2.686692869174621, "grad_norm": 0.39077821373939514, "learning_rate": 3.286720822286649e-07, "loss": 0.3725, "step": 4785 }, { "epoch": 2.687254351487928, "grad_norm": 0.3804316520690918, "learning_rate": 3.2750812256030993e-07, "loss": 0.3825, "step": 4786 }, { "epoch": 2.6878158338012352, "grad_norm": 0.3815259039402008, "learning_rate": 3.263461577838545e-07, "loss": 0.3654, "step": 4787 }, { "epoch": 2.6883773161145426, "grad_norm": 0.4054114520549774, "learning_rate": 3.251861883953944e-07, "loss": 0.3687, "step": 4788 }, { "epoch": 2.6889387984278494, "grad_norm": 0.388249009847641, "learning_rate": 3.240282148901691e-07, "loss": 0.3685, "step": 4789 }, { "epoch": 2.6895002807411568, "grad_norm": 0.37450769543647766, "learning_rate": 3.2287223776256983e-07, "loss": 0.3759, "step": 4790 }, { "epoch": 2.6900617630544637, "grad_norm": 0.35757938027381897, "learning_rate": 3.217182575061339e-07, "loss": 0.3522, "step": 4791 }, { "epoch": 2.690623245367771, "grad_norm": 0.4145902991294861, "learning_rate": 3.205662746135457e-07, "loss": 0.3646, "step": 4792 }, { "epoch": 2.691184727681078, "grad_norm": 0.38880056142807007, "learning_rate": 3.194162895766384e-07, "loss": 0.3874, "step": 4793 }, { "epoch": 2.691746209994385, "grad_norm": 0.3551712930202484, "learning_rate": 3.182683028863892e-07, "loss": 0.3687, "step": 4794 }, { "epoch": 2.6923076923076925, "grad_norm": 0.4065272808074951, "learning_rate": 3.1712231503292555e-07, "loss": 0.366, "step": 4795 }, { "epoch": 2.6928691746209994, "grad_norm": 0.3793240189552307, "learning_rate": 3.1597832650551997e-07, "loss": 0.424, "step": 4796 }, { "epoch": 2.6934306569343067, "grad_norm": 0.37628766894340515, "learning_rate": 3.1483633779259004e-07, "loss": 0.3382, "step": 4797 }, { "epoch": 2.6939921392476136, "grad_norm": 0.3595896065235138, "learning_rate": 3.136963493817025e-07, "loss": 0.362, "step": 4798 }, { "epoch": 2.694553621560921, "grad_norm": 0.39449429512023926, "learning_rate": 3.1255836175956676e-07, "loss": 0.3561, "step": 4799 }, { "epoch": 2.695115103874228, "grad_norm": 0.40826526284217834, "learning_rate": 3.1142237541204103e-07, "loss": 0.3698, "step": 4800 }, { "epoch": 2.695676586187535, "grad_norm": 0.39351537823677063, "learning_rate": 3.1028839082412774e-07, "loss": 0.3563, "step": 4801 }, { "epoch": 2.6962380685008425, "grad_norm": 0.3742627203464508, "learning_rate": 3.0915640847997364e-07, "loss": 0.391, "step": 4802 }, { "epoch": 2.6967995508141493, "grad_norm": 0.3668820261955261, "learning_rate": 3.0802642886287315e-07, "loss": 0.3887, "step": 4803 }, { "epoch": 2.6973610331274562, "grad_norm": 0.4230174422264099, "learning_rate": 3.068984524552626e-07, "loss": 0.3667, "step": 4804 }, { "epoch": 2.6979225154407636, "grad_norm": 0.3838897943496704, "learning_rate": 3.057724797387257e-07, "loss": 0.3592, "step": 4805 }, { "epoch": 2.698483997754071, "grad_norm": 0.40718555450439453, "learning_rate": 3.046485111939895e-07, "loss": 0.3518, "step": 4806 }, { "epoch": 2.6990454800673778, "grad_norm": 0.35084426403045654, "learning_rate": 3.035265473009252e-07, "loss": 0.3795, "step": 4807 }, { "epoch": 2.699606962380685, "grad_norm": 0.3810840845108032, "learning_rate": 3.0240658853854855e-07, "loss": 0.3936, "step": 4808 }, { "epoch": 2.700168444693992, "grad_norm": 0.34626585245132446, "learning_rate": 3.012886353850192e-07, "loss": 0.3741, "step": 4809 }, { "epoch": 2.7007299270072993, "grad_norm": 0.3641880750656128, "learning_rate": 3.0017268831763977e-07, "loss": 0.3558, "step": 4810 }, { "epoch": 2.701291409320606, "grad_norm": 0.37435492873191833, "learning_rate": 2.990587478128587e-07, "loss": 0.3681, "step": 4811 }, { "epoch": 2.7018528916339135, "grad_norm": 0.38166794180870056, "learning_rate": 2.979468143462638e-07, "loss": 0.3556, "step": 4812 }, { "epoch": 2.702414373947221, "grad_norm": 0.3929373323917389, "learning_rate": 2.96836888392591e-07, "loss": 0.3912, "step": 4813 }, { "epoch": 2.7029758562605277, "grad_norm": 0.3725539445877075, "learning_rate": 2.957289704257138e-07, "loss": 0.3856, "step": 4814 }, { "epoch": 2.703537338573835, "grad_norm": 0.3658425211906433, "learning_rate": 2.9462306091865235e-07, "loss": 0.3792, "step": 4815 }, { "epoch": 2.704098820887142, "grad_norm": 0.42080169916152954, "learning_rate": 2.935191603435683e-07, "loss": 0.3619, "step": 4816 }, { "epoch": 2.7046603032004493, "grad_norm": 0.386111319065094, "learning_rate": 2.9241726917176363e-07, "loss": 0.3465, "step": 4817 }, { "epoch": 2.705221785513756, "grad_norm": 0.38183310627937317, "learning_rate": 2.9131738787368636e-07, "loss": 0.3659, "step": 4818 }, { "epoch": 2.7057832678270635, "grad_norm": 0.39601969718933105, "learning_rate": 2.90219516918922e-07, "loss": 0.3775, "step": 4819 }, { "epoch": 2.706344750140371, "grad_norm": 0.3761867880821228, "learning_rate": 2.8912365677620126e-07, "loss": 0.349, "step": 4820 }, { "epoch": 2.7069062324536777, "grad_norm": 0.38314539194107056, "learning_rate": 2.8802980791339476e-07, "loss": 0.398, "step": 4821 }, { "epoch": 2.707467714766985, "grad_norm": 0.3859674632549286, "learning_rate": 2.869379707975134e-07, "loss": 0.3547, "step": 4822 }, { "epoch": 2.708029197080292, "grad_norm": 0.36475884914398193, "learning_rate": 2.858481458947121e-07, "loss": 0.3766, "step": 4823 }, { "epoch": 2.708590679393599, "grad_norm": 0.3908791244029999, "learning_rate": 2.847603336702831e-07, "loss": 0.3782, "step": 4824 }, { "epoch": 2.709152161706906, "grad_norm": 0.3681521713733673, "learning_rate": 2.836745345886627e-07, "loss": 0.3968, "step": 4825 }, { "epoch": 2.7097136440202134, "grad_norm": 0.3750554621219635, "learning_rate": 2.825907491134261e-07, "loss": 0.3402, "step": 4826 }, { "epoch": 2.7102751263335207, "grad_norm": 0.3875619173049927, "learning_rate": 2.815089777072877e-07, "loss": 0.3949, "step": 4827 }, { "epoch": 2.7108366086468276, "grad_norm": 0.373585045337677, "learning_rate": 2.804292208321052e-07, "loss": 0.3681, "step": 4828 }, { "epoch": 2.7113980909601345, "grad_norm": 0.38702377676963806, "learning_rate": 2.793514789488716e-07, "loss": 0.3654, "step": 4829 }, { "epoch": 2.711959573273442, "grad_norm": 0.37067410349845886, "learning_rate": 2.782757525177243e-07, "loss": 0.3768, "step": 4830 }, { "epoch": 2.712521055586749, "grad_norm": 0.4291858375072479, "learning_rate": 2.7720204199793765e-07, "loss": 0.408, "step": 4831 }, { "epoch": 2.713082537900056, "grad_norm": 0.40115174651145935, "learning_rate": 2.7613034784792494e-07, "loss": 0.3837, "step": 4832 }, { "epoch": 2.7136440202133634, "grad_norm": 0.32870209217071533, "learning_rate": 2.7506067052524124e-07, "loss": 0.36, "step": 4833 }, { "epoch": 2.7142055025266703, "grad_norm": 0.3859201669692993, "learning_rate": 2.739930104865762e-07, "loss": 0.3803, "step": 4834 }, { "epoch": 2.7147669848399776, "grad_norm": 0.41089707612991333, "learning_rate": 2.7292736818776257e-07, "loss": 0.3873, "step": 4835 }, { "epoch": 2.7153284671532845, "grad_norm": 0.39492329955101013, "learning_rate": 2.718637440837707e-07, "loss": 0.3447, "step": 4836 }, { "epoch": 2.715889949466592, "grad_norm": 0.40502429008483887, "learning_rate": 2.7080213862870575e-07, "loss": 0.3593, "step": 4837 }, { "epoch": 2.716451431779899, "grad_norm": 0.39214956760406494, "learning_rate": 2.6974255227581624e-07, "loss": 0.3705, "step": 4838 }, { "epoch": 2.717012914093206, "grad_norm": 0.3732733428478241, "learning_rate": 2.6868498547748367e-07, "loss": 0.3652, "step": 4839 }, { "epoch": 2.7175743964065133, "grad_norm": 0.40439724922180176, "learning_rate": 2.6762943868523186e-07, "loss": 0.3743, "step": 4840 }, { "epoch": 2.71813587871982, "grad_norm": 0.4210774600505829, "learning_rate": 2.6657591234971914e-07, "loss": 0.3605, "step": 4841 }, { "epoch": 2.7186973610331275, "grad_norm": 0.39361703395843506, "learning_rate": 2.6552440692074187e-07, "loss": 0.3827, "step": 4842 }, { "epoch": 2.7192588433464344, "grad_norm": 0.39421913027763367, "learning_rate": 2.6447492284723486e-07, "loss": 0.3783, "step": 4843 }, { "epoch": 2.7198203256597417, "grad_norm": 0.3845304846763611, "learning_rate": 2.6342746057726796e-07, "loss": 0.378, "step": 4844 }, { "epoch": 2.720381807973049, "grad_norm": 0.36558249592781067, "learning_rate": 2.6238202055804904e-07, "loss": 0.3956, "step": 4845 }, { "epoch": 2.720943290286356, "grad_norm": 0.3608235716819763, "learning_rate": 2.613386032359233e-07, "loss": 0.3719, "step": 4846 }, { "epoch": 2.721504772599663, "grad_norm": 0.37958356738090515, "learning_rate": 2.6029720905636936e-07, "loss": 0.3736, "step": 4847 }, { "epoch": 2.72206625491297, "grad_norm": 0.36610057950019836, "learning_rate": 2.5925783846400655e-07, "loss": 0.4019, "step": 4848 }, { "epoch": 2.7226277372262775, "grad_norm": 0.38093310594558716, "learning_rate": 2.5822049190258547e-07, "loss": 0.36, "step": 4849 }, { "epoch": 2.7231892195395844, "grad_norm": 0.3689562976360321, "learning_rate": 2.571851698149963e-07, "loss": 0.3507, "step": 4850 }, { "epoch": 2.7237507018528917, "grad_norm": 0.392485111951828, "learning_rate": 2.561518726432638e-07, "loss": 0.3558, "step": 4851 }, { "epoch": 2.724312184166199, "grad_norm": 0.35743263363838196, "learning_rate": 2.5512060082854675e-07, "loss": 0.3646, "step": 4852 }, { "epoch": 2.724873666479506, "grad_norm": 0.39198458194732666, "learning_rate": 2.540913548111412e-07, "loss": 0.3247, "step": 4853 }, { "epoch": 2.725435148792813, "grad_norm": 0.3720390796661377, "learning_rate": 2.5306413503047686e-07, "loss": 0.3727, "step": 4854 }, { "epoch": 2.72599663110612, "grad_norm": 0.37583497166633606, "learning_rate": 2.52038941925119e-07, "loss": 0.3771, "step": 4855 }, { "epoch": 2.7265581134194274, "grad_norm": 0.366077721118927, "learning_rate": 2.510157759327686e-07, "loss": 0.355, "step": 4856 }, { "epoch": 2.7271195957327343, "grad_norm": 0.3702395260334015, "learning_rate": 2.4999463749025845e-07, "loss": 0.3274, "step": 4857 }, { "epoch": 2.7276810780460417, "grad_norm": 0.37240052223205566, "learning_rate": 2.489755270335592e-07, "loss": 0.3841, "step": 4858 }, { "epoch": 2.7282425603593485, "grad_norm": 0.3657897114753723, "learning_rate": 2.4795844499777233e-07, "loss": 0.3659, "step": 4859 }, { "epoch": 2.728804042672656, "grad_norm": 0.3993796706199646, "learning_rate": 2.469433918171349e-07, "loss": 0.3654, "step": 4860 }, { "epoch": 2.7293655249859627, "grad_norm": 0.373061865568161, "learning_rate": 2.459303679250197e-07, "loss": 0.3808, "step": 4861 }, { "epoch": 2.72992700729927, "grad_norm": 0.3775833547115326, "learning_rate": 2.4491937375392793e-07, "loss": 0.3682, "step": 4862 }, { "epoch": 2.7304884896125774, "grad_norm": 0.344818115234375, "learning_rate": 2.439104097355005e-07, "loss": 0.3652, "step": 4863 }, { "epoch": 2.7310499719258843, "grad_norm": 0.36899659037590027, "learning_rate": 2.429034763005056e-07, "loss": 0.3663, "step": 4864 }, { "epoch": 2.7316114542391916, "grad_norm": 0.3672730624675751, "learning_rate": 2.418985738788493e-07, "loss": 0.3959, "step": 4865 }, { "epoch": 2.7321729365524985, "grad_norm": 0.38151344656944275, "learning_rate": 2.4089570289956786e-07, "loss": 0.3552, "step": 4866 }, { "epoch": 2.732734418865806, "grad_norm": 0.3632182478904724, "learning_rate": 2.3989486379083106e-07, "loss": 0.3643, "step": 4867 }, { "epoch": 2.7332959011791127, "grad_norm": 0.34899887442588806, "learning_rate": 2.3889605697994146e-07, "loss": 0.3525, "step": 4868 }, { "epoch": 2.73385738349242, "grad_norm": 0.3659873902797699, "learning_rate": 2.3789928289333243e-07, "loss": 0.3667, "step": 4869 }, { "epoch": 2.7344188658057273, "grad_norm": 0.3831065595149994, "learning_rate": 2.3690454195657076e-07, "loss": 0.3851, "step": 4870 }, { "epoch": 2.7349803481190342, "grad_norm": 0.3757367730140686, "learning_rate": 2.3591183459435673e-07, "loss": 0.3674, "step": 4871 }, { "epoch": 2.735541830432341, "grad_norm": 0.37234780192375183, "learning_rate": 2.3492116123051855e-07, "loss": 0.3558, "step": 4872 }, { "epoch": 2.7361033127456484, "grad_norm": 0.3945425748825073, "learning_rate": 2.3393252228801955e-07, "loss": 0.3692, "step": 4873 }, { "epoch": 2.7366647950589558, "grad_norm": 0.34498560428619385, "learning_rate": 2.3294591818895264e-07, "loss": 0.3565, "step": 4874 }, { "epoch": 2.7372262773722627, "grad_norm": 0.3808472752571106, "learning_rate": 2.3196134935454252e-07, "loss": 0.3869, "step": 4875 }, { "epoch": 2.73778775968557, "grad_norm": 0.3774579167366028, "learning_rate": 2.3097881620514406e-07, "loss": 0.3722, "step": 4876 }, { "epoch": 2.7383492419988773, "grad_norm": 0.3817330002784729, "learning_rate": 2.299983191602445e-07, "loss": 0.3792, "step": 4877 }, { "epoch": 2.738910724312184, "grad_norm": 0.402194082736969, "learning_rate": 2.2901985863846176e-07, "loss": 0.3452, "step": 4878 }, { "epoch": 2.739472206625491, "grad_norm": 0.3935263156890869, "learning_rate": 2.2804343505754222e-07, "loss": 0.3686, "step": 4879 }, { "epoch": 2.7400336889387984, "grad_norm": 0.3999941647052765, "learning_rate": 2.270690488343652e-07, "loss": 0.3906, "step": 4880 }, { "epoch": 2.7405951712521057, "grad_norm": 0.40438055992126465, "learning_rate": 2.2609670038493737e-07, "loss": 0.3552, "step": 4881 }, { "epoch": 2.7411566535654126, "grad_norm": 0.3933880627155304, "learning_rate": 2.251263901243983e-07, "loss": 0.3874, "step": 4882 }, { "epoch": 2.74171813587872, "grad_norm": 0.3967152535915375, "learning_rate": 2.2415811846701664e-07, "loss": 0.4029, "step": 4883 }, { "epoch": 2.742279618192027, "grad_norm": 0.3612099885940552, "learning_rate": 2.231918858261878e-07, "loss": 0.369, "step": 4884 }, { "epoch": 2.742841100505334, "grad_norm": 0.4053208529949188, "learning_rate": 2.222276926144412e-07, "loss": 0.3789, "step": 4885 }, { "epoch": 2.743402582818641, "grad_norm": 0.3564148247241974, "learning_rate": 2.212655392434321e-07, "loss": 0.3694, "step": 4886 }, { "epoch": 2.7439640651319483, "grad_norm": 0.36424520611763, "learning_rate": 2.2030542612394567e-07, "loss": 0.3725, "step": 4887 }, { "epoch": 2.7445255474452557, "grad_norm": 0.3918653428554535, "learning_rate": 2.1934735366589744e-07, "loss": 0.3846, "step": 4888 }, { "epoch": 2.7450870297585626, "grad_norm": 0.42804402112960815, "learning_rate": 2.183913222783296e-07, "loss": 0.3356, "step": 4889 }, { "epoch": 2.74564851207187, "grad_norm": 0.39993607997894287, "learning_rate": 2.174373323694151e-07, "loss": 0.372, "step": 4890 }, { "epoch": 2.7462099943851768, "grad_norm": 0.39632266759872437, "learning_rate": 2.164853843464526e-07, "loss": 0.3517, "step": 4891 }, { "epoch": 2.746771476698484, "grad_norm": 0.3724188506603241, "learning_rate": 2.1553547861587143e-07, "loss": 0.39, "step": 4892 }, { "epoch": 2.747332959011791, "grad_norm": 0.37311410903930664, "learning_rate": 2.1458761558322826e-07, "loss": 0.3602, "step": 4893 }, { "epoch": 2.7478944413250983, "grad_norm": 0.39225584268569946, "learning_rate": 2.1364179565320608e-07, "loss": 0.3632, "step": 4894 }, { "epoch": 2.7484559236384056, "grad_norm": 0.3841055631637573, "learning_rate": 2.1269801922961907e-07, "loss": 0.3952, "step": 4895 }, { "epoch": 2.7490174059517125, "grad_norm": 0.38132596015930176, "learning_rate": 2.1175628671540494e-07, "loss": 0.3638, "step": 4896 }, { "epoch": 2.7495788882650194, "grad_norm": 0.32728561758995056, "learning_rate": 2.1081659851263149e-07, "loss": 0.3637, "step": 4897 }, { "epoch": 2.7501403705783267, "grad_norm": 0.3550022840499878, "learning_rate": 2.0987895502249335e-07, "loss": 0.3422, "step": 4898 }, { "epoch": 2.750701852891634, "grad_norm": 0.3624716103076935, "learning_rate": 2.089433566453103e-07, "loss": 0.365, "step": 4899 }, { "epoch": 2.751263335204941, "grad_norm": 0.4113461673259735, "learning_rate": 2.080098037805317e-07, "loss": 0.3761, "step": 4900 }, { "epoch": 2.7518248175182483, "grad_norm": 0.3925926983356476, "learning_rate": 2.0707829682673098e-07, "loss": 0.3682, "step": 4901 }, { "epoch": 2.7523862998315556, "grad_norm": 0.3586964011192322, "learning_rate": 2.0614883618160997e-07, "loss": 0.3687, "step": 4902 }, { "epoch": 2.7529477821448625, "grad_norm": 0.41690880060195923, "learning_rate": 2.0522142224199737e-07, "loss": 0.3673, "step": 4903 }, { "epoch": 2.7535092644581693, "grad_norm": 0.32287663221359253, "learning_rate": 2.0429605540384477e-07, "loss": 0.3533, "step": 4904 }, { "epoch": 2.7540707467714767, "grad_norm": 0.40812090039253235, "learning_rate": 2.0337273606223395e-07, "loss": 0.3976, "step": 4905 }, { "epoch": 2.754632229084784, "grad_norm": 0.37500515580177307, "learning_rate": 2.0245146461136843e-07, "loss": 0.3542, "step": 4906 }, { "epoch": 2.755193711398091, "grad_norm": 0.3489953875541687, "learning_rate": 2.0153224144458084e-07, "loss": 0.3791, "step": 4907 }, { "epoch": 2.755755193711398, "grad_norm": 0.3645203411579132, "learning_rate": 2.006150669543283e-07, "loss": 0.3742, "step": 4908 }, { "epoch": 2.756316676024705, "grad_norm": 0.3778213560581207, "learning_rate": 1.9969994153219098e-07, "loss": 0.3606, "step": 4909 }, { "epoch": 2.7568781583380124, "grad_norm": 0.3746356666088104, "learning_rate": 1.987868655688785e-07, "loss": 0.3833, "step": 4910 }, { "epoch": 2.7574396406513193, "grad_norm": 0.3534126877784729, "learning_rate": 1.9787583945422072e-07, "loss": 0.3732, "step": 4911 }, { "epoch": 2.7580011229646266, "grad_norm": 0.3763459622859955, "learning_rate": 1.9696686357717542e-07, "loss": 0.3824, "step": 4912 }, { "epoch": 2.758562605277934, "grad_norm": 0.3892681896686554, "learning_rate": 1.9605993832582603e-07, "loss": 0.3814, "step": 4913 }, { "epoch": 2.759124087591241, "grad_norm": 0.350077748298645, "learning_rate": 1.9515506408737616e-07, "loss": 0.3438, "step": 4914 }, { "epoch": 2.759685569904548, "grad_norm": 0.3576473295688629, "learning_rate": 1.9425224124815846e-07, "loss": 0.3494, "step": 4915 }, { "epoch": 2.760247052217855, "grad_norm": 0.41229110956192017, "learning_rate": 1.933514701936262e-07, "loss": 0.3786, "step": 4916 }, { "epoch": 2.7608085345311624, "grad_norm": 0.42996254563331604, "learning_rate": 1.9245275130835849e-07, "loss": 0.3816, "step": 4917 }, { "epoch": 2.7613700168444693, "grad_norm": 0.38392382860183716, "learning_rate": 1.9155608497605836e-07, "loss": 0.368, "step": 4918 }, { "epoch": 2.7619314991577766, "grad_norm": 0.40605247020721436, "learning_rate": 1.9066147157955128e-07, "loss": 0.3769, "step": 4919 }, { "epoch": 2.762492981471084, "grad_norm": 0.3773554265499115, "learning_rate": 1.8976891150078836e-07, "loss": 0.3699, "step": 4920 }, { "epoch": 2.763054463784391, "grad_norm": 0.4236624836921692, "learning_rate": 1.8887840512084033e-07, "loss": 0.3676, "step": 4921 }, { "epoch": 2.7636159460976977, "grad_norm": 0.39666983485221863, "learning_rate": 1.8798995281990528e-07, "loss": 0.3933, "step": 4922 }, { "epoch": 2.764177428411005, "grad_norm": 0.375496506690979, "learning_rate": 1.871035549773026e-07, "loss": 0.3635, "step": 4923 }, { "epoch": 2.7647389107243123, "grad_norm": 0.4201972186565399, "learning_rate": 1.8621921197147342e-07, "loss": 0.3684, "step": 4924 }, { "epoch": 2.765300393037619, "grad_norm": 0.37182095646858215, "learning_rate": 1.853369241799835e-07, "loss": 0.3696, "step": 4925 }, { "epoch": 2.7658618753509265, "grad_norm": 0.3841799199581146, "learning_rate": 1.8445669197951932e-07, "loss": 0.3625, "step": 4926 }, { "epoch": 2.7664233576642334, "grad_norm": 0.37596508860588074, "learning_rate": 1.835785157458908e-07, "loss": 0.3668, "step": 4927 }, { "epoch": 2.7669848399775407, "grad_norm": 0.4161820709705353, "learning_rate": 1.827023958540308e-07, "loss": 0.3822, "step": 4928 }, { "epoch": 2.7675463222908476, "grad_norm": 0.4265937805175781, "learning_rate": 1.8182833267799237e-07, "loss": 0.3483, "step": 4929 }, { "epoch": 2.768107804604155, "grad_norm": 0.3599810302257538, "learning_rate": 1.8095632659095253e-07, "loss": 0.3736, "step": 4930 }, { "epoch": 2.7686692869174623, "grad_norm": 0.42132049798965454, "learning_rate": 1.8008637796520734e-07, "loss": 0.3521, "step": 4931 }, { "epoch": 2.769230769230769, "grad_norm": 0.415012389421463, "learning_rate": 1.792184871721775e-07, "loss": 0.3615, "step": 4932 }, { "epoch": 2.7697922515440765, "grad_norm": 0.3812330663204193, "learning_rate": 1.783526545824038e-07, "loss": 0.3569, "step": 4933 }, { "epoch": 2.7703537338573834, "grad_norm": 0.36845266819000244, "learning_rate": 1.774888805655467e-07, "loss": 0.3712, "step": 4934 }, { "epoch": 2.7709152161706907, "grad_norm": 0.36809244751930237, "learning_rate": 1.766271654903906e-07, "loss": 0.3682, "step": 4935 }, { "epoch": 2.7714766984839976, "grad_norm": 0.377371221780777, "learning_rate": 1.7576750972483903e-07, "loss": 0.3752, "step": 4936 }, { "epoch": 2.772038180797305, "grad_norm": 0.4041268825531006, "learning_rate": 1.7490991363591615e-07, "loss": 0.3998, "step": 4937 }, { "epoch": 2.7725996631106122, "grad_norm": 0.35417208075523376, "learning_rate": 1.7405437758976905e-07, "loss": 0.3745, "step": 4938 }, { "epoch": 2.773161145423919, "grad_norm": 0.3685002028942108, "learning_rate": 1.7320090195166171e-07, "loss": 0.3719, "step": 4939 }, { "epoch": 2.7737226277372264, "grad_norm": 0.3582022786140442, "learning_rate": 1.723494870859821e-07, "loss": 0.3748, "step": 4940 }, { "epoch": 2.7742841100505333, "grad_norm": 0.36736878752708435, "learning_rate": 1.7150013335623495e-07, "loss": 0.4016, "step": 4941 }, { "epoch": 2.7748455923638407, "grad_norm": 0.3581879436969757, "learning_rate": 1.7065284112504744e-07, "loss": 0.3608, "step": 4942 }, { "epoch": 2.7754070746771475, "grad_norm": 0.3607819080352783, "learning_rate": 1.698076107541663e-07, "loss": 0.3995, "step": 4943 }, { "epoch": 2.775968556990455, "grad_norm": 0.3508816957473755, "learning_rate": 1.689644426044562e-07, "loss": 0.3729, "step": 4944 }, { "epoch": 2.776530039303762, "grad_norm": 0.3792043924331665, "learning_rate": 1.6812333703590423e-07, "loss": 0.3806, "step": 4945 }, { "epoch": 2.777091521617069, "grad_norm": 0.353903591632843, "learning_rate": 1.6728429440761418e-07, "loss": 0.3709, "step": 4946 }, { "epoch": 2.777653003930376, "grad_norm": 0.37974709272384644, "learning_rate": 1.664473150778101e-07, "loss": 0.3696, "step": 4947 }, { "epoch": 2.7782144862436833, "grad_norm": 0.37422218918800354, "learning_rate": 1.6561239940383612e-07, "loss": 0.3907, "step": 4948 }, { "epoch": 2.7787759685569906, "grad_norm": 0.3827471137046814, "learning_rate": 1.6477954774215377e-07, "loss": 0.365, "step": 4949 }, { "epoch": 2.7793374508702975, "grad_norm": 0.40698570013046265, "learning_rate": 1.6394876044834418e-07, "loss": 0.3531, "step": 4950 }, { "epoch": 2.779898933183605, "grad_norm": 0.3667711615562439, "learning_rate": 1.631200378771064e-07, "loss": 0.3759, "step": 4951 }, { "epoch": 2.7804604154969117, "grad_norm": 0.4113500118255615, "learning_rate": 1.6229338038225852e-07, "loss": 0.3887, "step": 4952 }, { "epoch": 2.781021897810219, "grad_norm": 0.3832653760910034, "learning_rate": 1.6146878831673818e-07, "loss": 0.3593, "step": 4953 }, { "epoch": 2.781583380123526, "grad_norm": 0.3670588433742523, "learning_rate": 1.6064626203259827e-07, "loss": 0.347, "step": 4954 }, { "epoch": 2.7821448624368332, "grad_norm": 0.4124890863895416, "learning_rate": 1.5982580188101282e-07, "loss": 0.3957, "step": 4955 }, { "epoch": 2.7827063447501406, "grad_norm": 0.3707529902458191, "learning_rate": 1.5900740821227113e-07, "loss": 0.3762, "step": 4956 }, { "epoch": 2.7832678270634474, "grad_norm": 0.3638993799686432, "learning_rate": 1.5819108137578198e-07, "loss": 0.373, "step": 4957 }, { "epoch": 2.7838293093767548, "grad_norm": 0.41168516874313354, "learning_rate": 1.5737682172007108e-07, "loss": 0.3598, "step": 4958 }, { "epoch": 2.7843907916900617, "grad_norm": 0.5139597654342651, "learning_rate": 1.565646295927814e-07, "loss": 0.3882, "step": 4959 }, { "epoch": 2.784952274003369, "grad_norm": 0.37479084730148315, "learning_rate": 1.5575450534067394e-07, "loss": 0.3592, "step": 4960 }, { "epoch": 2.785513756316676, "grad_norm": 0.38091039657592773, "learning_rate": 1.5494644930962533e-07, "loss": 0.348, "step": 4961 }, { "epoch": 2.786075238629983, "grad_norm": 0.3786410093307495, "learning_rate": 1.5414046184463127e-07, "loss": 0.3766, "step": 4962 }, { "epoch": 2.7866367209432905, "grad_norm": 0.3664923906326294, "learning_rate": 1.5333654328980264e-07, "loss": 0.3605, "step": 4963 }, { "epoch": 2.7871982032565974, "grad_norm": 0.3930692970752716, "learning_rate": 1.5253469398836707e-07, "loss": 0.349, "step": 4964 }, { "epoch": 2.7877596855699043, "grad_norm": 0.3909927010536194, "learning_rate": 1.5173491428267018e-07, "loss": 0.3538, "step": 4965 }, { "epoch": 2.7883211678832116, "grad_norm": 0.40731626749038696, "learning_rate": 1.5093720451417216e-07, "loss": 0.3716, "step": 4966 }, { "epoch": 2.788882650196519, "grad_norm": 0.3881210684776306, "learning_rate": 1.5014156502345167e-07, "loss": 0.3528, "step": 4967 }, { "epoch": 2.789444132509826, "grad_norm": 0.37402865290641785, "learning_rate": 1.4934799615020034e-07, "loss": 0.3428, "step": 4968 }, { "epoch": 2.790005614823133, "grad_norm": 0.3543051779270172, "learning_rate": 1.4855649823322883e-07, "loss": 0.3699, "step": 4969 }, { "epoch": 2.7905670971364405, "grad_norm": 0.4145013988018036, "learning_rate": 1.4776707161046178e-07, "loss": 0.4153, "step": 4970 }, { "epoch": 2.7911285794497473, "grad_norm": 0.3820224106311798, "learning_rate": 1.4697971661894072e-07, "loss": 0.3918, "step": 4971 }, { "epoch": 2.7916900617630542, "grad_norm": 0.3610817492008209, "learning_rate": 1.461944335948212e-07, "loss": 0.3419, "step": 4972 }, { "epoch": 2.7922515440763616, "grad_norm": 0.42354443669319153, "learning_rate": 1.454112228733756e-07, "loss": 0.3888, "step": 4973 }, { "epoch": 2.792813026389669, "grad_norm": 0.4077790081501007, "learning_rate": 1.446300847889903e-07, "loss": 0.4044, "step": 4974 }, { "epoch": 2.7933745087029758, "grad_norm": 0.3787437081336975, "learning_rate": 1.4385101967516911e-07, "loss": 0.3796, "step": 4975 }, { "epoch": 2.793935991016283, "grad_norm": 0.37124091386795044, "learning_rate": 1.430740278645265e-07, "loss": 0.3847, "step": 4976 }, { "epoch": 2.79449747332959, "grad_norm": 0.3606906235218048, "learning_rate": 1.4229910968879713e-07, "loss": 0.3414, "step": 4977 }, { "epoch": 2.7950589556428973, "grad_norm": 0.38749152421951294, "learning_rate": 1.4152626547882519e-07, "loss": 0.3795, "step": 4978 }, { "epoch": 2.795620437956204, "grad_norm": 0.4243083894252777, "learning_rate": 1.4075549556457347e-07, "loss": 0.3751, "step": 4979 }, { "epoch": 2.7961819202695115, "grad_norm": 0.35697245597839355, "learning_rate": 1.3998680027511703e-07, "loss": 0.3947, "step": 4980 }, { "epoch": 2.796743402582819, "grad_norm": 0.37990185618400574, "learning_rate": 1.39220179938645e-07, "loss": 0.3497, "step": 4981 }, { "epoch": 2.7973048848961257, "grad_norm": 0.34533071517944336, "learning_rate": 1.3845563488246227e-07, "loss": 0.36, "step": 4982 }, { "epoch": 2.797866367209433, "grad_norm": 0.4126512110233307, "learning_rate": 1.3769316543298604e-07, "loss": 0.3727, "step": 4983 }, { "epoch": 2.79842784952274, "grad_norm": 0.3870365619659424, "learning_rate": 1.369327719157476e-07, "loss": 0.3754, "step": 4984 }, { "epoch": 2.7989893318360473, "grad_norm": 0.3730488717556, "learning_rate": 1.361744546553939e-07, "loss": 0.3555, "step": 4985 }, { "epoch": 2.799550814149354, "grad_norm": 0.3863545060157776, "learning_rate": 1.3541821397568212e-07, "loss": 0.3551, "step": 4986 }, { "epoch": 2.8001122964626615, "grad_norm": 0.37313637137413025, "learning_rate": 1.346640501994856e-07, "loss": 0.393, "step": 4987 }, { "epoch": 2.800673778775969, "grad_norm": 0.3632469177246094, "learning_rate": 1.33911963648789e-07, "loss": 0.3823, "step": 4988 }, { "epoch": 2.8012352610892757, "grad_norm": 0.3434154987335205, "learning_rate": 1.3316195464469216e-07, "loss": 0.3952, "step": 4989 }, { "epoch": 2.8017967434025826, "grad_norm": 0.3992873430252075, "learning_rate": 1.3241402350740674e-07, "loss": 0.3674, "step": 4990 }, { "epoch": 2.80235822571589, "grad_norm": 0.3592512011528015, "learning_rate": 1.316681705562567e-07, "loss": 0.3517, "step": 4991 }, { "epoch": 2.802919708029197, "grad_norm": 0.4393292963504791, "learning_rate": 1.3092439610968065e-07, "loss": 0.3953, "step": 4992 }, { "epoch": 2.803481190342504, "grad_norm": 0.3963760435581207, "learning_rate": 1.3018270048522687e-07, "loss": 0.3768, "step": 4993 }, { "epoch": 2.8040426726558114, "grad_norm": 0.4180271327495575, "learning_rate": 1.2944308399955808e-07, "loss": 0.3917, "step": 4994 }, { "epoch": 2.8046041549691187, "grad_norm": 0.3609219789505005, "learning_rate": 1.287055469684506e-07, "loss": 0.378, "step": 4995 }, { "epoch": 2.8051656372824256, "grad_norm": 0.3971406817436218, "learning_rate": 1.2797008970678981e-07, "loss": 0.4167, "step": 4996 }, { "epoch": 2.8057271195957325, "grad_norm": 0.36231306195259094, "learning_rate": 1.2723671252857562e-07, "loss": 0.3643, "step": 4997 }, { "epoch": 2.80628860190904, "grad_norm": 0.37932369112968445, "learning_rate": 1.265054157469181e-07, "loss": 0.3643, "step": 4998 }, { "epoch": 2.806850084222347, "grad_norm": 0.38238948583602905, "learning_rate": 1.2577619967404032e-07, "loss": 0.3781, "step": 4999 }, { "epoch": 2.807411566535654, "grad_norm": 0.3659485876560211, "learning_rate": 1.2504906462127708e-07, "loss": 0.3626, "step": 5000 }, { "epoch": 2.8079730488489614, "grad_norm": 0.3565763533115387, "learning_rate": 1.243240108990734e-07, "loss": 0.3543, "step": 5001 }, { "epoch": 2.8085345311622683, "grad_norm": 0.3450455367565155, "learning_rate": 1.2360103881698726e-07, "loss": 0.3741, "step": 5002 }, { "epoch": 2.8090960134755756, "grad_norm": 0.382046103477478, "learning_rate": 1.228801486836867e-07, "loss": 0.3728, "step": 5003 }, { "epoch": 2.8096574957888825, "grad_norm": 0.38008731603622437, "learning_rate": 1.2216134080695163e-07, "loss": 0.3926, "step": 5004 }, { "epoch": 2.81021897810219, "grad_norm": 0.4063502252101898, "learning_rate": 1.2144461549367327e-07, "loss": 0.3855, "step": 5005 }, { "epoch": 2.810780460415497, "grad_norm": 0.3666808009147644, "learning_rate": 1.2072997304985178e-07, "loss": 0.379, "step": 5006 }, { "epoch": 2.811341942728804, "grad_norm": 0.3750617504119873, "learning_rate": 1.2001741378060095e-07, "loss": 0.3744, "step": 5007 }, { "epoch": 2.8119034250421113, "grad_norm": 0.36877623200416565, "learning_rate": 1.1930693799014182e-07, "loss": 0.3656, "step": 5008 }, { "epoch": 2.812464907355418, "grad_norm": 0.39016997814178467, "learning_rate": 1.1859854598180897e-07, "loss": 0.3696, "step": 5009 }, { "epoch": 2.8130263896687255, "grad_norm": 0.40267887711524963, "learning_rate": 1.1789223805804605e-07, "loss": 0.3333, "step": 5010 }, { "epoch": 2.8135878719820324, "grad_norm": 0.3925192356109619, "learning_rate": 1.1718801452040574e-07, "loss": 0.3601, "step": 5011 }, { "epoch": 2.8141493542953397, "grad_norm": 0.37634000182151794, "learning_rate": 1.164858756695536e-07, "loss": 0.3736, "step": 5012 }, { "epoch": 2.814710836608647, "grad_norm": 0.38444653153419495, "learning_rate": 1.1578582180526266e-07, "loss": 0.4024, "step": 5013 }, { "epoch": 2.815272318921954, "grad_norm": 0.37631356716156006, "learning_rate": 1.1508785322641658e-07, "loss": 0.3617, "step": 5014 }, { "epoch": 2.815833801235261, "grad_norm": 0.3501458466053009, "learning_rate": 1.1439197023100922e-07, "loss": 0.4018, "step": 5015 }, { "epoch": 2.816395283548568, "grad_norm": 0.42617690563201904, "learning_rate": 1.1369817311614296e-07, "loss": 0.3838, "step": 5016 }, { "epoch": 2.8169567658618755, "grad_norm": 0.42982926964759827, "learning_rate": 1.130064621780308e-07, "loss": 0.3669, "step": 5017 }, { "epoch": 2.8175182481751824, "grad_norm": 0.3789714574813843, "learning_rate": 1.1231683771199431e-07, "loss": 0.3402, "step": 5018 }, { "epoch": 2.8180797304884897, "grad_norm": 0.40478581190109253, "learning_rate": 1.1162930001246408e-07, "loss": 0.3979, "step": 5019 }, { "epoch": 2.8186412128017966, "grad_norm": 0.3812825381755829, "learning_rate": 1.1094384937298086e-07, "loss": 0.3291, "step": 5020 }, { "epoch": 2.819202695115104, "grad_norm": 0.3830914795398712, "learning_rate": 1.1026048608619333e-07, "loss": 0.3726, "step": 5021 }, { "epoch": 2.819764177428411, "grad_norm": 0.3727912902832031, "learning_rate": 1.0957921044385921e-07, "loss": 0.3762, "step": 5022 }, { "epoch": 2.820325659741718, "grad_norm": 0.3883983790874481, "learning_rate": 1.0890002273684475e-07, "loss": 0.3838, "step": 5023 }, { "epoch": 2.8208871420550254, "grad_norm": 0.36612069606781006, "learning_rate": 1.0822292325512517e-07, "loss": 0.3902, "step": 5024 }, { "epoch": 2.8214486243683323, "grad_norm": 0.4071313142776489, "learning_rate": 1.0754791228778428e-07, "loss": 0.3639, "step": 5025 }, { "epoch": 2.8220101066816397, "grad_norm": 0.3578735888004303, "learning_rate": 1.0687499012301316e-07, "loss": 0.386, "step": 5026 }, { "epoch": 2.8225715889949465, "grad_norm": 0.3792220950126648, "learning_rate": 1.0620415704811316e-07, "loss": 0.3497, "step": 5027 }, { "epoch": 2.823133071308254, "grad_norm": 0.3985273838043213, "learning_rate": 1.0553541334949069e-07, "loss": 0.3885, "step": 5028 }, { "epoch": 2.8236945536215607, "grad_norm": 0.4038177728652954, "learning_rate": 1.0486875931266294e-07, "loss": 0.3852, "step": 5029 }, { "epoch": 2.824256035934868, "grad_norm": 0.3905893564224243, "learning_rate": 1.0420419522225334e-07, "loss": 0.3614, "step": 5030 }, { "epoch": 2.8248175182481754, "grad_norm": 0.3921181261539459, "learning_rate": 1.035417213619938e-07, "loss": 0.3552, "step": 5031 }, { "epoch": 2.8253790005614823, "grad_norm": 0.3708798587322235, "learning_rate": 1.0288133801472312e-07, "loss": 0.3589, "step": 5032 }, { "epoch": 2.8259404828747896, "grad_norm": 0.3805485963821411, "learning_rate": 1.0222304546238792e-07, "loss": 0.3819, "step": 5033 }, { "epoch": 2.8265019651880965, "grad_norm": 0.3641560673713684, "learning_rate": 1.0156684398604233e-07, "loss": 0.3413, "step": 5034 }, { "epoch": 2.827063447501404, "grad_norm": 0.3497603237628937, "learning_rate": 1.0091273386584832e-07, "loss": 0.3712, "step": 5035 }, { "epoch": 2.8276249298147107, "grad_norm": 0.3742990791797638, "learning_rate": 1.0026071538107251e-07, "loss": 0.3742, "step": 5036 }, { "epoch": 2.828186412128018, "grad_norm": 0.3638858199119568, "learning_rate": 9.961078881009223e-08, "loss": 0.3763, "step": 5037 }, { "epoch": 2.8287478944413254, "grad_norm": 0.3492191433906555, "learning_rate": 9.896295443038773e-08, "loss": 0.3634, "step": 5038 }, { "epoch": 2.8293093767546322, "grad_norm": 0.37371760606765747, "learning_rate": 9.831721251854887e-08, "loss": 0.3411, "step": 5039 }, { "epoch": 2.829870859067939, "grad_norm": 0.38150790333747864, "learning_rate": 9.767356335027178e-08, "loss": 0.3755, "step": 5040 }, { "epoch": 2.8304323413812464, "grad_norm": 0.3773679733276367, "learning_rate": 9.703200720035777e-08, "loss": 0.3957, "step": 5041 }, { "epoch": 2.8309938236945538, "grad_norm": 0.3959170877933502, "learning_rate": 9.63925443427155e-08, "loss": 0.3566, "step": 5042 }, { "epoch": 2.8315553060078607, "grad_norm": 0.4233883321285248, "learning_rate": 9.575517505035881e-08, "loss": 0.3688, "step": 5043 }, { "epoch": 2.832116788321168, "grad_norm": 0.37511304020881653, "learning_rate": 9.511989959541113e-08, "loss": 0.3434, "step": 5044 }, { "epoch": 2.832678270634475, "grad_norm": 0.3691667318344116, "learning_rate": 9.448671824909661e-08, "loss": 0.3933, "step": 5045 }, { "epoch": 2.833239752947782, "grad_norm": 0.3682698905467987, "learning_rate": 9.385563128175068e-08, "loss": 0.3903, "step": 5046 }, { "epoch": 2.833801235261089, "grad_norm": 0.35713544487953186, "learning_rate": 9.322663896281003e-08, "loss": 0.3848, "step": 5047 }, { "epoch": 2.8343627175743964, "grad_norm": 0.4115452170372009, "learning_rate": 9.259974156081986e-08, "loss": 0.3825, "step": 5048 }, { "epoch": 2.8349241998877037, "grad_norm": 0.40094253420829773, "learning_rate": 9.197493934343049e-08, "loss": 0.3941, "step": 5049 }, { "epoch": 2.8354856822010106, "grad_norm": 0.3675900399684906, "learning_rate": 9.135223257739689e-08, "loss": 0.3708, "step": 5050 }, { "epoch": 2.836047164514318, "grad_norm": 0.35713639855384827, "learning_rate": 9.073162152858028e-08, "loss": 0.3828, "step": 5051 }, { "epoch": 2.836608646827625, "grad_norm": 0.4123381972312927, "learning_rate": 9.011310646194649e-08, "loss": 0.3812, "step": 5052 }, { "epoch": 2.837170129140932, "grad_norm": 0.4156380593776703, "learning_rate": 8.949668764156705e-08, "loss": 0.3763, "step": 5053 }, { "epoch": 2.837731611454239, "grad_norm": 0.38267359137535095, "learning_rate": 8.888236533061867e-08, "loss": 0.3668, "step": 5054 }, { "epoch": 2.8382930937675463, "grad_norm": 0.40210992097854614, "learning_rate": 8.827013979138099e-08, "loss": 0.3668, "step": 5055 }, { "epoch": 2.8388545760808537, "grad_norm": 0.3800215423107147, "learning_rate": 8.766001128524159e-08, "loss": 0.3614, "step": 5056 }, { "epoch": 2.8394160583941606, "grad_norm": 0.3652217984199524, "learning_rate": 8.705198007268989e-08, "loss": 0.3624, "step": 5057 }, { "epoch": 2.8399775407074674, "grad_norm": 0.42005476355552673, "learning_rate": 8.64460464133221e-08, "loss": 0.3841, "step": 5058 }, { "epoch": 2.8405390230207748, "grad_norm": 0.40903475880622864, "learning_rate": 8.584221056583741e-08, "loss": 0.3835, "step": 5059 }, { "epoch": 2.841100505334082, "grad_norm": 0.3518482744693756, "learning_rate": 8.52404727880396e-08, "loss": 0.3636, "step": 5060 }, { "epoch": 2.841661987647389, "grad_norm": 0.3791026771068573, "learning_rate": 8.46408333368376e-08, "loss": 0.3892, "step": 5061 }, { "epoch": 2.8422234699606963, "grad_norm": 0.35887575149536133, "learning_rate": 8.404329246824272e-08, "loss": 0.4293, "step": 5062 }, { "epoch": 2.8427849522740036, "grad_norm": 0.36264917254447937, "learning_rate": 8.344785043737258e-08, "loss": 0.3435, "step": 5063 }, { "epoch": 2.8433464345873105, "grad_norm": 0.38224324584007263, "learning_rate": 8.285450749844715e-08, "loss": 0.3704, "step": 5064 }, { "epoch": 2.8439079169006174, "grad_norm": 0.3690374791622162, "learning_rate": 8.226326390479044e-08, "loss": 0.3407, "step": 5065 }, { "epoch": 2.8444693992139247, "grad_norm": 0.4192430078983307, "learning_rate": 8.167411990883057e-08, "loss": 0.3644, "step": 5066 }, { "epoch": 2.845030881527232, "grad_norm": 0.3663051128387451, "learning_rate": 8.108707576209796e-08, "loss": 0.3578, "step": 5067 }, { "epoch": 2.845592363840539, "grad_norm": 0.3885164260864258, "learning_rate": 8.05021317152288e-08, "loss": 0.3629, "step": 5068 }, { "epoch": 2.8461538461538463, "grad_norm": 0.38649383187294006, "learning_rate": 7.991928801796111e-08, "loss": 0.4124, "step": 5069 }, { "epoch": 2.846715328467153, "grad_norm": 0.3879776895046234, "learning_rate": 7.933854491913584e-08, "loss": 0.3802, "step": 5070 }, { "epoch": 2.8472768107804605, "grad_norm": 0.3530077636241913, "learning_rate": 7.875990266669853e-08, "loss": 0.3888, "step": 5071 }, { "epoch": 2.8478382930937673, "grad_norm": 0.4120860993862152, "learning_rate": 7.818336150769602e-08, "loss": 0.3883, "step": 5072 }, { "epoch": 2.8483997754070747, "grad_norm": 0.38298526406288147, "learning_rate": 7.760892168827971e-08, "loss": 0.3788, "step": 5073 }, { "epoch": 2.848961257720382, "grad_norm": 0.3761160969734192, "learning_rate": 7.703658345370346e-08, "loss": 0.3577, "step": 5074 }, { "epoch": 2.849522740033689, "grad_norm": 0.41023948788642883, "learning_rate": 7.646634704832235e-08, "loss": 0.3885, "step": 5075 }, { "epoch": 2.850084222346996, "grad_norm": 0.38848385214805603, "learning_rate": 7.589821271559716e-08, "loss": 0.3708, "step": 5076 }, { "epoch": 2.850645704660303, "grad_norm": 0.382362425327301, "learning_rate": 7.533218069808723e-08, "loss": 0.3895, "step": 5077 }, { "epoch": 2.8512071869736104, "grad_norm": 0.4344220459461212, "learning_rate": 7.476825123745813e-08, "loss": 0.3829, "step": 5078 }, { "epoch": 2.8517686692869173, "grad_norm": 0.380564421415329, "learning_rate": 7.420642457447557e-08, "loss": 0.4136, "step": 5079 }, { "epoch": 2.8523301516002246, "grad_norm": 0.36134499311447144, "learning_rate": 7.364670094900827e-08, "loss": 0.3665, "step": 5080 }, { "epoch": 2.852891633913532, "grad_norm": 0.3762810230255127, "learning_rate": 7.308908060002617e-08, "loss": 0.3679, "step": 5081 }, { "epoch": 2.853453116226839, "grad_norm": 0.3397273123264313, "learning_rate": 7.253356376560216e-08, "loss": 0.3697, "step": 5082 }, { "epoch": 2.8540145985401457, "grad_norm": 0.37790194153785706, "learning_rate": 7.198015068291097e-08, "loss": 0.338, "step": 5083 }, { "epoch": 2.854576080853453, "grad_norm": 0.4123695194721222, "learning_rate": 7.142884158822915e-08, "loss": 0.3905, "step": 5084 }, { "epoch": 2.8551375631667604, "grad_norm": 0.38928812742233276, "learning_rate": 7.087963671693454e-08, "loss": 0.3506, "step": 5085 }, { "epoch": 2.8556990454800673, "grad_norm": 0.4048002064228058, "learning_rate": 7.033253630350622e-08, "loss": 0.4127, "step": 5086 }, { "epoch": 2.8562605277933746, "grad_norm": 0.4342040717601776, "learning_rate": 6.978754058152626e-08, "loss": 0.3724, "step": 5087 }, { "epoch": 2.856822010106682, "grad_norm": 0.3953598141670227, "learning_rate": 6.924464978367629e-08, "loss": 0.3664, "step": 5088 }, { "epoch": 2.857383492419989, "grad_norm": 0.3863067626953125, "learning_rate": 6.870386414174146e-08, "loss": 0.3409, "step": 5089 }, { "epoch": 2.8579449747332957, "grad_norm": 0.347110778093338, "learning_rate": 6.816518388660597e-08, "loss": 0.3715, "step": 5090 }, { "epoch": 2.858506457046603, "grad_norm": 0.3598485291004181, "learning_rate": 6.762860924825642e-08, "loss": 0.3711, "step": 5091 }, { "epoch": 2.8590679393599103, "grad_norm": 0.3631708323955536, "learning_rate": 6.709414045577956e-08, "loss": 0.37, "step": 5092 }, { "epoch": 2.859629421673217, "grad_norm": 0.35790494084358215, "learning_rate": 6.656177773736449e-08, "loss": 0.364, "step": 5093 }, { "epoch": 2.8601909039865245, "grad_norm": 0.4221910536289215, "learning_rate": 6.603152132029944e-08, "loss": 0.3567, "step": 5094 }, { "epoch": 2.8607523862998314, "grad_norm": 0.3849811553955078, "learning_rate": 6.550337143097441e-08, "loss": 0.3392, "step": 5095 }, { "epoch": 2.8613138686131387, "grad_norm": 0.39620891213417053, "learning_rate": 6.497732829488068e-08, "loss": 0.3866, "step": 5096 }, { "epoch": 2.8618753509264456, "grad_norm": 0.36469560861587524, "learning_rate": 6.445339213660751e-08, "loss": 0.3909, "step": 5097 }, { "epoch": 2.862436833239753, "grad_norm": 0.36793053150177, "learning_rate": 6.393156317984706e-08, "loss": 0.3819, "step": 5098 }, { "epoch": 2.8629983155530603, "grad_norm": 0.3749925494194031, "learning_rate": 6.341184164739111e-08, "loss": 0.3818, "step": 5099 }, { "epoch": 2.863559797866367, "grad_norm": 0.4095827639102936, "learning_rate": 6.289422776113163e-08, "loss": 0.3587, "step": 5100 }, { "epoch": 2.8641212801796745, "grad_norm": 0.3877618908882141, "learning_rate": 6.237872174206017e-08, "loss": 0.3791, "step": 5101 }, { "epoch": 2.8646827624929814, "grad_norm": 0.4049367904663086, "learning_rate": 6.186532381026955e-08, "loss": 0.3613, "step": 5102 }, { "epoch": 2.8652442448062887, "grad_norm": 0.3438570499420166, "learning_rate": 6.135403418495167e-08, "loss": 0.4075, "step": 5103 }, { "epoch": 2.8658057271195956, "grad_norm": 0.38621780276298523, "learning_rate": 6.084485308439858e-08, "loss": 0.3727, "step": 5104 }, { "epoch": 2.866367209432903, "grad_norm": 0.3852682411670685, "learning_rate": 6.033778072600138e-08, "loss": 0.3831, "step": 5105 }, { "epoch": 2.8669286917462102, "grad_norm": 0.39376720786094666, "learning_rate": 5.9832817326253e-08, "loss": 0.3602, "step": 5106 }, { "epoch": 2.867490174059517, "grad_norm": 0.3773861825466156, "learning_rate": 5.9329963100742685e-08, "loss": 0.3563, "step": 5107 }, { "epoch": 2.868051656372824, "grad_norm": 0.3659355342388153, "learning_rate": 5.882921826416199e-08, "loss": 0.3594, "step": 5108 }, { "epoch": 2.8686131386861313, "grad_norm": 0.36560577154159546, "learning_rate": 5.833058303030103e-08, "loss": 0.3646, "step": 5109 }, { "epoch": 2.8691746209994387, "grad_norm": 0.37810519337654114, "learning_rate": 5.7834057612048414e-08, "loss": 0.3646, "step": 5110 }, { "epoch": 2.8697361033127455, "grad_norm": 0.38533779978752136, "learning_rate": 5.733964222139343e-08, "loss": 0.3717, "step": 5111 }, { "epoch": 2.870297585626053, "grad_norm": 0.3919704556465149, "learning_rate": 5.6847337069422805e-08, "loss": 0.3565, "step": 5112 }, { "epoch": 2.87085906793936, "grad_norm": 0.4034050405025482, "learning_rate": 5.635714236632339e-08, "loss": 0.3792, "step": 5113 }, { "epoch": 2.871420550252667, "grad_norm": 0.403588205575943, "learning_rate": 5.586905832138223e-08, "loss": 0.3642, "step": 5114 }, { "epoch": 2.871982032565974, "grad_norm": 0.3855592608451843, "learning_rate": 5.538308514298152e-08, "loss": 0.3942, "step": 5115 }, { "epoch": 2.8725435148792813, "grad_norm": 0.34427282214164734, "learning_rate": 5.489922303860695e-08, "loss": 0.3749, "step": 5116 }, { "epoch": 2.8731049971925886, "grad_norm": 0.3549034893512726, "learning_rate": 5.441747221483828e-08, "loss": 0.3917, "step": 5117 }, { "epoch": 2.8736664795058955, "grad_norm": 0.3883206248283386, "learning_rate": 5.39378328773571e-08, "loss": 0.3754, "step": 5118 }, { "epoch": 2.874227961819203, "grad_norm": 0.3793310523033142, "learning_rate": 5.346030523094348e-08, "loss": 0.3428, "step": 5119 }, { "epoch": 2.8747894441325097, "grad_norm": 0.34164074063301086, "learning_rate": 5.298488947947267e-08, "loss": 0.3645, "step": 5120 }, { "epoch": 2.875350926445817, "grad_norm": 0.4076628088951111, "learning_rate": 5.251158582592286e-08, "loss": 0.3714, "step": 5121 }, { "epoch": 2.875912408759124, "grad_norm": 0.37229228019714355, "learning_rate": 5.204039447236631e-08, "loss": 0.3647, "step": 5122 }, { "epoch": 2.8764738910724312, "grad_norm": 0.3804805874824524, "learning_rate": 5.1571315619976525e-08, "loss": 0.3567, "step": 5123 }, { "epoch": 2.8770353733857386, "grad_norm": 0.34514638781547546, "learning_rate": 5.110434946902332e-08, "loss": 0.3775, "step": 5124 }, { "epoch": 2.8775968556990454, "grad_norm": 0.3661583662033081, "learning_rate": 5.063949621887554e-08, "loss": 0.3721, "step": 5125 }, { "epoch": 2.8781583380123528, "grad_norm": 0.40025779604911804, "learning_rate": 5.0176756067998324e-08, "loss": 0.3638, "step": 5126 }, { "epoch": 2.8787198203256597, "grad_norm": 0.42707881331443787, "learning_rate": 4.971612921395697e-08, "loss": 0.3734, "step": 5127 }, { "epoch": 2.879281302638967, "grad_norm": 0.37389135360717773, "learning_rate": 4.925761585341249e-08, "loss": 0.3797, "step": 5128 }, { "epoch": 2.879842784952274, "grad_norm": 0.3777991235256195, "learning_rate": 4.8801216182124965e-08, "loss": 0.3706, "step": 5129 }, { "epoch": 2.880404267265581, "grad_norm": 0.37927693128585815, "learning_rate": 4.834693039495075e-08, "loss": 0.3713, "step": 5130 }, { "epoch": 2.8809657495788885, "grad_norm": 0.40874338150024414, "learning_rate": 4.789475868584525e-08, "loss": 0.3563, "step": 5131 }, { "epoch": 2.8815272318921954, "grad_norm": 0.42913419008255005, "learning_rate": 4.744470124786016e-08, "loss": 0.3774, "step": 5132 }, { "epoch": 2.8820887142055023, "grad_norm": 0.37842491269111633, "learning_rate": 4.699675827314398e-08, "loss": 0.3691, "step": 5133 }, { "epoch": 2.8826501965188096, "grad_norm": 0.34226855635643005, "learning_rate": 4.6550929952944304e-08, "loss": 0.3569, "step": 5134 }, { "epoch": 2.883211678832117, "grad_norm": 0.3452940285205841, "learning_rate": 4.6107216477603855e-08, "loss": 0.3584, "step": 5135 }, { "epoch": 2.883773161145424, "grad_norm": 0.3690970540046692, "learning_rate": 4.566561803656444e-08, "loss": 0.3763, "step": 5136 }, { "epoch": 2.884334643458731, "grad_norm": 0.3613230586051941, "learning_rate": 4.5226134818362464e-08, "loss": 0.3736, "step": 5137 }, { "epoch": 2.884896125772038, "grad_norm": 0.357837051153183, "learning_rate": 4.4788767010634505e-08, "loss": 0.3833, "step": 5138 }, { "epoch": 2.8854576080853453, "grad_norm": 0.40119969844818115, "learning_rate": 4.4353514800110073e-08, "loss": 0.3767, "step": 5139 }, { "epoch": 2.8860190903986522, "grad_norm": 0.37592363357543945, "learning_rate": 4.392037837261831e-08, "loss": 0.3942, "step": 5140 }, { "epoch": 2.8865805727119596, "grad_norm": 0.3558832108974457, "learning_rate": 4.348935791308462e-08, "loss": 0.3899, "step": 5141 }, { "epoch": 2.887142055025267, "grad_norm": 0.3657326102256775, "learning_rate": 4.306045360553013e-08, "loss": 0.3726, "step": 5142 }, { "epoch": 2.8877035373385738, "grad_norm": 0.4071579575538635, "learning_rate": 4.263366563307281e-08, "loss": 0.3571, "step": 5143 }, { "epoch": 2.888265019651881, "grad_norm": 0.37109801173210144, "learning_rate": 4.220899417792745e-08, "loss": 0.3648, "step": 5144 }, { "epoch": 2.888826501965188, "grad_norm": 0.38448911905288696, "learning_rate": 4.178643942140403e-08, "loss": 0.372, "step": 5145 }, { "epoch": 2.8893879842784953, "grad_norm": 0.45353376865386963, "learning_rate": 4.136600154391157e-08, "loss": 0.383, "step": 5146 }, { "epoch": 2.889949466591802, "grad_norm": 0.38584989309310913, "learning_rate": 4.094768072495148e-08, "loss": 0.3843, "step": 5147 }, { "epoch": 2.8905109489051095, "grad_norm": 0.3611151874065399, "learning_rate": 4.0531477143124776e-08, "loss": 0.384, "step": 5148 }, { "epoch": 2.891072431218417, "grad_norm": 0.6672108173370361, "learning_rate": 4.0117390976126545e-08, "loss": 0.3826, "step": 5149 }, { "epoch": 2.8916339135317237, "grad_norm": 0.365304172039032, "learning_rate": 3.970542240074759e-08, "loss": 0.3699, "step": 5150 }, { "epoch": 2.892195395845031, "grad_norm": 0.405093252658844, "learning_rate": 3.929557159287667e-08, "loss": 0.3571, "step": 5151 }, { "epoch": 2.892756878158338, "grad_norm": 0.41585633158683777, "learning_rate": 3.8887838727496017e-08, "loss": 0.39, "step": 5152 }, { "epoch": 2.8933183604716453, "grad_norm": 0.39906200766563416, "learning_rate": 3.848222397868528e-08, "loss": 0.358, "step": 5153 }, { "epoch": 2.893879842784952, "grad_norm": 0.3444410562515259, "learning_rate": 3.8078727519618716e-08, "loss": 0.3565, "step": 5154 }, { "epoch": 2.8944413250982595, "grad_norm": 0.3833906054496765, "learning_rate": 3.767734952256741e-08, "loss": 0.3654, "step": 5155 }, { "epoch": 2.895002807411567, "grad_norm": 0.36682724952697754, "learning_rate": 3.72780901588965e-08, "loss": 0.3723, "step": 5156 }, { "epoch": 2.8955642897248737, "grad_norm": 0.3643714487552643, "learning_rate": 3.688094959906796e-08, "loss": 0.4065, "step": 5157 }, { "epoch": 2.8961257720381806, "grad_norm": 0.35881516337394714, "learning_rate": 3.64859280126384e-08, "loss": 0.3586, "step": 5158 }, { "epoch": 2.896687254351488, "grad_norm": 0.3635329604148865, "learning_rate": 3.6093025568259e-08, "loss": 0.3847, "step": 5159 }, { "epoch": 2.897248736664795, "grad_norm": 0.4125414788722992, "learning_rate": 3.570224243367837e-08, "loss": 0.3827, "step": 5160 }, { "epoch": 2.897810218978102, "grad_norm": 0.3682129681110382, "learning_rate": 3.53135787757386e-08, "loss": 0.3856, "step": 5161 }, { "epoch": 2.8983717012914094, "grad_norm": 0.3961210548877716, "learning_rate": 3.49270347603764e-08, "loss": 0.3883, "step": 5162 }, { "epoch": 2.8989331836047163, "grad_norm": 0.4086105525493622, "learning_rate": 3.4542610552625314e-08, "loss": 0.364, "step": 5163 }, { "epoch": 2.8994946659180236, "grad_norm": 0.38059112429618835, "learning_rate": 3.4160306316612936e-08, "loss": 0.3576, "step": 5164 }, { "epoch": 2.9000561482313305, "grad_norm": 0.3573255240917206, "learning_rate": 3.3780122215560926e-08, "loss": 0.3813, "step": 5165 }, { "epoch": 2.900617630544638, "grad_norm": 0.39040520787239075, "learning_rate": 3.340205841178778e-08, "loss": 0.368, "step": 5166 }, { "epoch": 2.901179112857945, "grad_norm": 0.3648778796195984, "learning_rate": 3.302611506670383e-08, "loss": 0.3627, "step": 5167 }, { "epoch": 2.901740595171252, "grad_norm": 0.34817689657211304, "learning_rate": 3.26522923408179e-08, "loss": 0.3688, "step": 5168 }, { "epoch": 2.9023020774845594, "grad_norm": 0.3872241675853729, "learning_rate": 3.228059039372955e-08, "loss": 0.3477, "step": 5169 }, { "epoch": 2.9028635597978663, "grad_norm": 0.35761913657188416, "learning_rate": 3.191100938413516e-08, "loss": 0.3849, "step": 5170 }, { "epoch": 2.9034250421111736, "grad_norm": 0.40400561690330505, "learning_rate": 3.154354946982574e-08, "loss": 0.3727, "step": 5171 }, { "epoch": 2.9039865244244805, "grad_norm": 0.35010647773742676, "learning_rate": 3.117821080768524e-08, "loss": 0.3602, "step": 5172 }, { "epoch": 2.904548006737788, "grad_norm": 0.37819328904151917, "learning_rate": 3.081499355369333e-08, "loss": 0.3767, "step": 5173 }, { "epoch": 2.905109489051095, "grad_norm": 0.3651933968067169, "learning_rate": 3.045389786292263e-08, "loss": 0.3498, "step": 5174 }, { "epoch": 2.905670971364402, "grad_norm": 0.39729124307632446, "learning_rate": 3.009492388954149e-08, "loss": 0.4073, "step": 5175 }, { "epoch": 2.906232453677709, "grad_norm": 0.37125611305236816, "learning_rate": 2.973807178681176e-08, "loss": 0.3815, "step": 5176 }, { "epoch": 2.906793935991016, "grad_norm": 0.35432928800582886, "learning_rate": 2.9383341707088786e-08, "loss": 0.3454, "step": 5177 }, { "epoch": 2.9073554183043235, "grad_norm": 0.37909016013145447, "learning_rate": 2.9030733801823084e-08, "loss": 0.3409, "step": 5178 }, { "epoch": 2.9079169006176304, "grad_norm": 0.4617297649383545, "learning_rate": 2.8680248221557017e-08, "loss": 0.4041, "step": 5179 }, { "epoch": 2.9084783829309377, "grad_norm": 0.3953055143356323, "learning_rate": 2.8331885115929768e-08, "loss": 0.3852, "step": 5180 }, { "epoch": 2.909039865244245, "grad_norm": 0.3417675197124481, "learning_rate": 2.798564463367237e-08, "loss": 0.3335, "step": 5181 }, { "epoch": 2.909601347557552, "grad_norm": 0.388013631105423, "learning_rate": 2.764152692260935e-08, "loss": 0.3698, "step": 5182 }, { "epoch": 2.910162829870859, "grad_norm": 0.3303796350955963, "learning_rate": 2.7299532129660967e-08, "loss": 0.3578, "step": 5183 }, { "epoch": 2.910724312184166, "grad_norm": 0.3945184051990509, "learning_rate": 2.695966040083875e-08, "loss": 0.3805, "step": 5184 }, { "epoch": 2.9112857944974735, "grad_norm": 0.36242833733558655, "learning_rate": 2.6621911881249405e-08, "loss": 0.3786, "step": 5185 }, { "epoch": 2.9118472768107804, "grad_norm": 0.3761397898197174, "learning_rate": 2.628628671509259e-08, "loss": 0.3666, "step": 5186 }, { "epoch": 2.9124087591240877, "grad_norm": 0.39549022912979126, "learning_rate": 2.59527850456609e-08, "loss": 0.379, "step": 5187 }, { "epoch": 2.9129702414373946, "grad_norm": 0.33624693751335144, "learning_rate": 2.562140701534155e-08, "loss": 0.3646, "step": 5188 }, { "epoch": 2.913531723750702, "grad_norm": 0.38407501578330994, "learning_rate": 2.5292152765614143e-08, "loss": 0.361, "step": 5189 }, { "epoch": 2.914093206064009, "grad_norm": 0.3547528088092804, "learning_rate": 2.4965022437051236e-08, "loss": 0.3618, "step": 5190 }, { "epoch": 2.914654688377316, "grad_norm": 0.36696943640708923, "learning_rate": 2.4640016169319437e-08, "loss": 0.3521, "step": 5191 }, { "epoch": 2.9152161706906234, "grad_norm": 0.3789697289466858, "learning_rate": 2.431713410117886e-08, "loss": 0.369, "step": 5192 }, { "epoch": 2.9157776530039303, "grad_norm": 0.4137653410434723, "learning_rate": 2.3996376370481467e-08, "loss": 0.3808, "step": 5193 }, { "epoch": 2.9163391353172377, "grad_norm": 0.36709830164909363, "learning_rate": 2.3677743114172146e-08, "loss": 0.3549, "step": 5194 }, { "epoch": 2.9169006176305445, "grad_norm": 0.35981956124305725, "learning_rate": 2.3361234468290415e-08, "loss": 0.4088, "step": 5195 }, { "epoch": 2.917462099943852, "grad_norm": 0.4159349799156189, "learning_rate": 2.3046850567967626e-08, "loss": 0.3743, "step": 5196 }, { "epoch": 2.9180235822571587, "grad_norm": 0.3309078514575958, "learning_rate": 2.2734591547428077e-08, "loss": 0.3893, "step": 5197 }, { "epoch": 2.918585064570466, "grad_norm": 0.3967859148979187, "learning_rate": 2.2424457539988453e-08, "loss": 0.3835, "step": 5198 }, { "epoch": 2.9191465468837734, "grad_norm": 0.4060434103012085, "learning_rate": 2.21164486780584e-08, "loss": 0.3835, "step": 5199 }, { "epoch": 2.9197080291970803, "grad_norm": 0.3679191470146179, "learning_rate": 2.18105650931405e-08, "loss": 0.3578, "step": 5200 }, { "epoch": 2.920269511510387, "grad_norm": 0.3535308837890625, "learning_rate": 2.1506806915830846e-08, "loss": 0.4066, "step": 5201 }, { "epoch": 2.9208309938236945, "grad_norm": 0.40612345933914185, "learning_rate": 2.1205174275816254e-08, "loss": 0.3935, "step": 5202 }, { "epoch": 2.921392476137002, "grad_norm": 0.3665991425514221, "learning_rate": 2.0905667301877042e-08, "loss": 0.3732, "step": 5203 }, { "epoch": 2.9219539584503087, "grad_norm": 0.3963286280632019, "learning_rate": 2.0608286121885924e-08, "loss": 0.3918, "step": 5204 }, { "epoch": 2.922515440763616, "grad_norm": 0.3834450840950012, "learning_rate": 2.0313030862808004e-08, "loss": 0.3311, "step": 5205 }, { "epoch": 2.9230769230769234, "grad_norm": 0.40146297216415405, "learning_rate": 2.001990165070078e-08, "loss": 0.3859, "step": 5206 }, { "epoch": 2.9236384053902302, "grad_norm": 0.40374496579170227, "learning_rate": 1.972889861071359e-08, "loss": 0.3899, "step": 5207 }, { "epoch": 2.924199887703537, "grad_norm": 0.4001128077507019, "learning_rate": 1.9440021867089265e-08, "loss": 0.3861, "step": 5208 }, { "epoch": 2.9247613700168444, "grad_norm": 0.3791598081588745, "learning_rate": 1.915327154316138e-08, "loss": 0.3974, "step": 5209 }, { "epoch": 2.9253228523301518, "grad_norm": 0.38149213790893555, "learning_rate": 1.8868647761355884e-08, "loss": 0.3479, "step": 5210 }, { "epoch": 2.9258843346434587, "grad_norm": 0.4113655388355255, "learning_rate": 1.8586150643192248e-08, "loss": 0.3785, "step": 5211 }, { "epoch": 2.926445816956766, "grad_norm": 0.3751506805419922, "learning_rate": 1.830578030927954e-08, "loss": 0.3539, "step": 5212 }, { "epoch": 2.927007299270073, "grad_norm": 0.36662253737449646, "learning_rate": 1.802753687932146e-08, "loss": 0.3694, "step": 5213 }, { "epoch": 2.92756878158338, "grad_norm": 0.3815315365791321, "learning_rate": 1.7751420472111312e-08, "loss": 0.3734, "step": 5214 }, { "epoch": 2.928130263896687, "grad_norm": 0.34898093342781067, "learning_rate": 1.747743120553591e-08, "loss": 0.3719, "step": 5215 }, { "epoch": 2.9286917462099944, "grad_norm": 0.36895349621772766, "learning_rate": 1.7205569196573345e-08, "loss": 0.3544, "step": 5216 }, { "epoch": 2.9292532285233017, "grad_norm": 0.44269707798957825, "learning_rate": 1.6935834561292997e-08, "loss": 0.3588, "step": 5217 }, { "epoch": 2.9298147108366086, "grad_norm": 0.36951160430908203, "learning_rate": 1.666822741485663e-08, "loss": 0.3982, "step": 5218 }, { "epoch": 2.930376193149916, "grad_norm": 0.397516667842865, "learning_rate": 1.6402747871517298e-08, "loss": 0.35, "step": 5219 }, { "epoch": 2.930937675463223, "grad_norm": 0.3809799253940582, "learning_rate": 1.613939604462045e-08, "loss": 0.3754, "step": 5220 }, { "epoch": 2.93149915777653, "grad_norm": 0.3759686052799225, "learning_rate": 1.587817204660169e-08, "loss": 0.3682, "step": 5221 }, { "epoch": 2.932060640089837, "grad_norm": 0.3850093483924866, "learning_rate": 1.561907598899015e-08, "loss": 0.3628, "step": 5222 }, { "epoch": 2.9326221224031443, "grad_norm": 0.40013206005096436, "learning_rate": 1.536210798240456e-08, "loss": 0.369, "step": 5223 }, { "epoch": 2.9331836047164517, "grad_norm": 0.3404639661312103, "learning_rate": 1.51072681365555e-08, "loss": 0.396, "step": 5224 }, { "epoch": 2.9337450870297586, "grad_norm": 0.36933597922325134, "learning_rate": 1.4854556560245925e-08, "loss": 0.3681, "step": 5225 }, { "epoch": 2.9343065693430654, "grad_norm": 0.3491629660129547, "learning_rate": 1.4603973361368984e-08, "loss": 0.3798, "step": 5226 }, { "epoch": 2.9348680516563728, "grad_norm": 0.384735643863678, "learning_rate": 1.4355518646910205e-08, "loss": 0.3843, "step": 5227 }, { "epoch": 2.93542953396968, "grad_norm": 0.377480149269104, "learning_rate": 1.4109192522945847e-08, "loss": 0.3213, "step": 5228 }, { "epoch": 2.935991016282987, "grad_norm": 0.4025007486343384, "learning_rate": 1.3864995094642898e-08, "loss": 0.3528, "step": 5229 }, { "epoch": 2.9365524985962943, "grad_norm": 0.3562312424182892, "learning_rate": 1.3622926466260178e-08, "loss": 0.3824, "step": 5230 }, { "epoch": 2.937113980909601, "grad_norm": 0.39968791604042053, "learning_rate": 1.3382986741147241e-08, "loss": 0.3493, "step": 5231 }, { "epoch": 2.9376754632229085, "grad_norm": 0.3693688213825226, "learning_rate": 1.3145176021744922e-08, "loss": 0.3654, "step": 5232 }, { "epoch": 2.9382369455362154, "grad_norm": 0.38225042819976807, "learning_rate": 1.2909494409585333e-08, "loss": 0.367, "step": 5233 }, { "epoch": 2.9387984278495227, "grad_norm": 0.3620010316371918, "learning_rate": 1.2675942005290765e-08, "loss": 0.3858, "step": 5234 }, { "epoch": 2.93935991016283, "grad_norm": 0.39435166120529175, "learning_rate": 1.2444518908575898e-08, "loss": 0.3599, "step": 5235 }, { "epoch": 2.939921392476137, "grad_norm": 0.4114034175872803, "learning_rate": 1.2215225218244475e-08, "loss": 0.3732, "step": 5236 }, { "epoch": 2.9404828747894443, "grad_norm": 0.3613292872905731, "learning_rate": 1.1988061032192633e-08, "loss": 0.3537, "step": 5237 }, { "epoch": 2.941044357102751, "grad_norm": 0.35971876978874207, "learning_rate": 1.1763026447406122e-08, "loss": 0.3555, "step": 5238 }, { "epoch": 2.9416058394160585, "grad_norm": 0.35173630714416504, "learning_rate": 1.154012155996309e-08, "loss": 0.3527, "step": 5239 }, { "epoch": 2.9421673217293653, "grad_norm": 0.33580970764160156, "learning_rate": 1.131934646503019e-08, "loss": 0.3637, "step": 5240 }, { "epoch": 2.9427288040426727, "grad_norm": 0.3690026104450226, "learning_rate": 1.110070125686702e-08, "loss": 0.3857, "step": 5241 }, { "epoch": 2.94329028635598, "grad_norm": 0.40599262714385986, "learning_rate": 1.0884186028822241e-08, "loss": 0.3679, "step": 5242 }, { "epoch": 2.943851768669287, "grad_norm": 0.3768482804298401, "learning_rate": 1.06698008733358e-08, "loss": 0.346, "step": 5243 }, { "epoch": 2.944413250982594, "grad_norm": 0.35824474692344666, "learning_rate": 1.0457545881938368e-08, "loss": 0.3494, "step": 5244 }, { "epoch": 2.944974733295901, "grad_norm": 0.39909881353378296, "learning_rate": 1.024742114525079e-08, "loss": 0.342, "step": 5245 }, { "epoch": 2.9455362156092084, "grad_norm": 0.37210598587989807, "learning_rate": 1.0039426752984637e-08, "loss": 0.3815, "step": 5246 }, { "epoch": 2.9460976979225153, "grad_norm": 0.3742882311344147, "learning_rate": 9.833562793941099e-09, "loss": 0.3409, "step": 5247 }, { "epoch": 2.9466591802358226, "grad_norm": 0.39762577414512634, "learning_rate": 9.629829356013754e-09, "loss": 0.4126, "step": 5248 }, { "epoch": 2.94722066254913, "grad_norm": 0.37140756845474243, "learning_rate": 9.428226526184691e-09, "loss": 0.364, "step": 5249 }, { "epoch": 2.947782144862437, "grad_norm": 0.370779812335968, "learning_rate": 9.228754390527284e-09, "loss": 0.404, "step": 5250 }, { "epoch": 2.9483436271757437, "grad_norm": 0.38171911239624023, "learning_rate": 9.031413034204516e-09, "loss": 0.3531, "step": 5251 }, { "epoch": 2.948905109489051, "grad_norm": 0.39524272084236145, "learning_rate": 8.836202541470107e-09, "loss": 0.4089, "step": 5252 }, { "epoch": 2.9494665918023584, "grad_norm": 0.3407927453517914, "learning_rate": 8.643122995668496e-09, "loss": 0.3756, "step": 5253 }, { "epoch": 2.9500280741156653, "grad_norm": 0.3814270496368408, "learning_rate": 8.452174479233189e-09, "loss": 0.3714, "step": 5254 }, { "epoch": 2.9505895564289726, "grad_norm": 0.347802072763443, "learning_rate": 8.263357073688972e-09, "loss": 0.3593, "step": 5255 }, { "epoch": 2.9511510387422795, "grad_norm": 0.3824666142463684, "learning_rate": 8.076670859649694e-09, "loss": 0.3866, "step": 5256 }, { "epoch": 2.951712521055587, "grad_norm": 0.3648795187473297, "learning_rate": 7.892115916821041e-09, "loss": 0.3752, "step": 5257 }, { "epoch": 2.9522740033688937, "grad_norm": 0.351816862821579, "learning_rate": 7.709692323996654e-09, "loss": 0.3661, "step": 5258 }, { "epoch": 2.952835485682201, "grad_norm": 0.3704140782356262, "learning_rate": 7.529400159060895e-09, "loss": 0.3807, "step": 5259 }, { "epoch": 2.9533969679955083, "grad_norm": 0.3473844826221466, "learning_rate": 7.3512394989894154e-09, "loss": 0.3769, "step": 5260 }, { "epoch": 2.953958450308815, "grad_norm": 0.3948903977870941, "learning_rate": 7.175210419846368e-09, "loss": 0.3669, "step": 5261 }, { "epoch": 2.9545199326221225, "grad_norm": 0.37371984124183655, "learning_rate": 7.0013129967860805e-09, "loss": 0.3531, "step": 5262 }, { "epoch": 2.9550814149354294, "grad_norm": 0.3901353180408478, "learning_rate": 6.829547304053052e-09, "loss": 0.3618, "step": 5263 }, { "epoch": 2.9556428972487367, "grad_norm": 0.3798507750034332, "learning_rate": 6.659913414981956e-09, "loss": 0.3758, "step": 5264 }, { "epoch": 2.9562043795620436, "grad_norm": 0.380633145570755, "learning_rate": 6.492411401996524e-09, "loss": 0.4124, "step": 5265 }, { "epoch": 2.956765861875351, "grad_norm": 0.35711929202079773, "learning_rate": 6.3270413366112175e-09, "loss": 0.3736, "step": 5266 }, { "epoch": 2.9573273441886583, "grad_norm": 0.400812029838562, "learning_rate": 6.163803289429005e-09, "loss": 0.3647, "step": 5267 }, { "epoch": 2.957888826501965, "grad_norm": 0.3697125017642975, "learning_rate": 6.002697330144136e-09, "loss": 0.3494, "step": 5268 }, { "epoch": 2.958450308815272, "grad_norm": 0.3908836543560028, "learning_rate": 5.843723527539369e-09, "loss": 0.3966, "step": 5269 }, { "epoch": 2.9590117911285794, "grad_norm": 0.3704424798488617, "learning_rate": 5.686881949487632e-09, "loss": 0.3741, "step": 5270 }, { "epoch": 2.9595732734418867, "grad_norm": 0.35887718200683594, "learning_rate": 5.532172662952029e-09, "loss": 0.4019, "step": 5271 }, { "epoch": 2.9601347557551936, "grad_norm": 0.39572250843048096, "learning_rate": 5.379595733983611e-09, "loss": 0.3813, "step": 5272 }, { "epoch": 2.960696238068501, "grad_norm": 0.4427841305732727, "learning_rate": 5.229151227725271e-09, "loss": 0.3582, "step": 5273 }, { "epoch": 2.9612577203818082, "grad_norm": 0.35484686493873596, "learning_rate": 5.080839208408406e-09, "loss": 0.4016, "step": 5274 }, { "epoch": 2.961819202695115, "grad_norm": 0.3327328860759735, "learning_rate": 4.934659739352921e-09, "loss": 0.3682, "step": 5275 }, { "epoch": 2.962380685008422, "grad_norm": 0.41807106137275696, "learning_rate": 4.790612882970003e-09, "loss": 0.354, "step": 5276 }, { "epoch": 2.9629421673217293, "grad_norm": 0.38834792375564575, "learning_rate": 4.648698700758792e-09, "loss": 0.3747, "step": 5277 }, { "epoch": 2.9635036496350367, "grad_norm": 0.3945891261100769, "learning_rate": 4.508917253310263e-09, "loss": 0.3794, "step": 5278 }, { "epoch": 2.9640651319483435, "grad_norm": 0.4305301606655121, "learning_rate": 4.371268600301681e-09, "loss": 0.3895, "step": 5279 }, { "epoch": 2.964626614261651, "grad_norm": 0.37037786841392517, "learning_rate": 4.235752800502146e-09, "loss": 0.3743, "step": 5280 }, { "epoch": 2.9651880965749577, "grad_norm": 0.35556507110595703, "learning_rate": 4.102369911768711e-09, "loss": 0.3773, "step": 5281 }, { "epoch": 2.965749578888265, "grad_norm": 0.3669593930244446, "learning_rate": 3.9711199910485995e-09, "loss": 0.3448, "step": 5282 }, { "epoch": 2.966311061201572, "grad_norm": 0.37403398752212524, "learning_rate": 3.842003094378654e-09, "loss": 0.3597, "step": 5283 }, { "epoch": 2.9668725435148793, "grad_norm": 0.3648824095726013, "learning_rate": 3.715019276884224e-09, "loss": 0.3741, "step": 5284 }, { "epoch": 2.9674340258281866, "grad_norm": 0.3992325961589813, "learning_rate": 3.5901685927802743e-09, "loss": 0.3598, "step": 5285 }, { "epoch": 2.9679955081414935, "grad_norm": 0.3865833282470703, "learning_rate": 3.467451095370833e-09, "loss": 0.3516, "step": 5286 }, { "epoch": 2.968556990454801, "grad_norm": 0.37446948885917664, "learning_rate": 3.3468668370489898e-09, "loss": 0.3827, "step": 5287 }, { "epoch": 2.9691184727681077, "grad_norm": 0.3692103922367096, "learning_rate": 3.2284158692991174e-09, "loss": 0.3521, "step": 5288 }, { "epoch": 2.969679955081415, "grad_norm": 0.3983895182609558, "learning_rate": 3.1120982426913194e-09, "loss": 0.3697, "step": 5289 }, { "epoch": 2.970241437394722, "grad_norm": 0.37403109669685364, "learning_rate": 2.997914006887537e-09, "loss": 0.3762, "step": 5290 }, { "epoch": 2.9708029197080292, "grad_norm": 0.3804003894329071, "learning_rate": 2.8858632106376626e-09, "loss": 0.3644, "step": 5291 }, { "epoch": 2.9713644020213366, "grad_norm": 0.3758683204650879, "learning_rate": 2.7759459017817624e-09, "loss": 0.387, "step": 5292 }, { "epoch": 2.9719258843346434, "grad_norm": 0.38183918595314026, "learning_rate": 2.6681621272478533e-09, "loss": 0.3538, "step": 5293 }, { "epoch": 2.9724873666479503, "grad_norm": 0.40723150968551636, "learning_rate": 2.5625119330535684e-09, "loss": 0.3969, "step": 5294 }, { "epoch": 2.9730488489612577, "grad_norm": 0.3519853949546814, "learning_rate": 2.4589953643056053e-09, "loss": 0.3816, "step": 5295 }, { "epoch": 2.973610331274565, "grad_norm": 0.368971586227417, "learning_rate": 2.3576124651997213e-09, "loss": 0.3591, "step": 5296 }, { "epoch": 2.974171813587872, "grad_norm": 0.34605735540390015, "learning_rate": 2.2583632790212917e-09, "loss": 0.364, "step": 5297 }, { "epoch": 2.974733295901179, "grad_norm": 0.3934346139431, "learning_rate": 2.1612478481430886e-09, "loss": 0.346, "step": 5298 }, { "epoch": 2.9752947782144865, "grad_norm": 0.36611244082450867, "learning_rate": 2.0662662140291666e-09, "loss": 0.3651, "step": 5299 }, { "epoch": 2.9758562605277934, "grad_norm": 0.3907800316810608, "learning_rate": 1.9734184172304215e-09, "loss": 0.3643, "step": 5300 }, { "epoch": 2.9764177428411003, "grad_norm": 0.36163201928138733, "learning_rate": 1.8827044973879215e-09, "loss": 0.3865, "step": 5301 }, { "epoch": 2.9769792251544076, "grad_norm": 0.37043365836143494, "learning_rate": 1.7941244932312418e-09, "loss": 0.3344, "step": 5302 }, { "epoch": 2.977540707467715, "grad_norm": 0.3765031397342682, "learning_rate": 1.7076784425795744e-09, "loss": 0.3823, "step": 5303 }, { "epoch": 2.978102189781022, "grad_norm": 0.39913755655288696, "learning_rate": 1.6233663823400635e-09, "loss": 0.3757, "step": 5304 }, { "epoch": 2.978663672094329, "grad_norm": 0.3500332236289978, "learning_rate": 1.5411883485100254e-09, "loss": 0.3855, "step": 5305 }, { "epoch": 2.979225154407636, "grad_norm": 0.3840027153491974, "learning_rate": 1.4611443761736176e-09, "loss": 0.4072, "step": 5306 }, { "epoch": 2.9797866367209433, "grad_norm": 0.3730204105377197, "learning_rate": 1.3832344995062807e-09, "loss": 0.3678, "step": 5307 }, { "epoch": 2.9803481190342502, "grad_norm": 0.3822515904903412, "learning_rate": 1.3074587517702963e-09, "loss": 0.3515, "step": 5308 }, { "epoch": 2.9809096013475576, "grad_norm": 0.3385835587978363, "learning_rate": 1.2338171653181186e-09, "loss": 0.3625, "step": 5309 }, { "epoch": 2.981471083660865, "grad_norm": 0.3736189901828766, "learning_rate": 1.1623097715901533e-09, "loss": 0.3525, "step": 5310 }, { "epoch": 2.9820325659741718, "grad_norm": 0.39100125432014465, "learning_rate": 1.0929366011169785e-09, "loss": 0.3744, "step": 5311 }, { "epoch": 2.982594048287479, "grad_norm": 0.4149768650531769, "learning_rate": 1.0256976835160138e-09, "loss": 0.3881, "step": 5312 }, { "epoch": 2.983155530600786, "grad_norm": 0.39042025804519653, "learning_rate": 9.60593047494851e-10, "loss": 0.3789, "step": 5313 }, { "epoch": 2.9837170129140933, "grad_norm": 0.34494104981422424, "learning_rate": 8.976227208501442e-10, "loss": 0.3733, "step": 5314 }, { "epoch": 2.9842784952274, "grad_norm": 0.3602551519870758, "learning_rate": 8.367867304653887e-10, "loss": 0.3544, "step": 5315 }, { "epoch": 2.9848399775407075, "grad_norm": 0.38881853222846985, "learning_rate": 7.780851023153624e-10, "loss": 0.3908, "step": 5316 }, { "epoch": 2.985401459854015, "grad_norm": 0.38089510798454285, "learning_rate": 7.215178614616847e-10, "loss": 0.3334, "step": 5317 }, { "epoch": 2.9859629421673217, "grad_norm": 0.36021387577056885, "learning_rate": 6.670850320561473e-10, "loss": 0.3626, "step": 5318 }, { "epoch": 2.9865244244806286, "grad_norm": 0.3933035135269165, "learning_rate": 6.147866373373834e-10, "loss": 0.3711, "step": 5319 }, { "epoch": 2.987085906793936, "grad_norm": 0.3768443465232849, "learning_rate": 5.646226996347537e-10, "loss": 0.374, "step": 5320 }, { "epoch": 2.9876473891072433, "grad_norm": 0.39113157987594604, "learning_rate": 5.165932403650154e-10, "loss": 0.3558, "step": 5321 }, { "epoch": 2.98820887142055, "grad_norm": 0.42018836736679077, "learning_rate": 4.706982800339876e-10, "loss": 0.3958, "step": 5322 }, { "epoch": 2.9887703537338575, "grad_norm": 0.39682823419570923, "learning_rate": 4.2693783823655187e-10, "loss": 0.3846, "step": 5323 }, { "epoch": 2.9893318360471643, "grad_norm": 0.3892541527748108, "learning_rate": 3.8531193365609623e-10, "loss": 0.3936, "step": 5324 }, { "epoch": 2.9898933183604717, "grad_norm": 0.7406057119369507, "learning_rate": 3.458205840639606e-10, "loss": 0.3543, "step": 5325 }, { "epoch": 2.9904548006737786, "grad_norm": 0.3825279474258423, "learning_rate": 3.0846380632165717e-10, "loss": 0.3526, "step": 5326 }, { "epoch": 2.991016282987086, "grad_norm": 0.3761199712753296, "learning_rate": 2.7324161637753974e-10, "loss": 0.3697, "step": 5327 }, { "epoch": 2.991577765300393, "grad_norm": 0.39621978998184204, "learning_rate": 2.401540292695792e-10, "loss": 0.3735, "step": 5328 }, { "epoch": 2.9921392476137, "grad_norm": 0.35178741812705994, "learning_rate": 2.0920105912480838e-10, "loss": 0.3511, "step": 5329 }, { "epoch": 2.9927007299270074, "grad_norm": 0.37877798080444336, "learning_rate": 1.8038271915821192e-10, "loss": 0.3968, "step": 5330 }, { "epoch": 2.9932622122403143, "grad_norm": 0.3664992153644562, "learning_rate": 1.5369902167328143e-10, "loss": 0.367, "step": 5331 }, { "epoch": 2.9938236945536216, "grad_norm": 0.35629937052726746, "learning_rate": 1.2914997806312558e-10, "loss": 0.3691, "step": 5332 }, { "epoch": 2.9943851768669285, "grad_norm": 0.3784909248352051, "learning_rate": 1.0673559880824968e-10, "loss": 0.363, "step": 5333 }, { "epoch": 2.994946659180236, "grad_norm": 0.3677016496658325, "learning_rate": 8.645589347822114e-11, "loss": 0.3605, "step": 5334 }, { "epoch": 2.995508141493543, "grad_norm": 0.4054654836654663, "learning_rate": 6.83108707316693e-11, "loss": 0.3675, "step": 5335 }, { "epoch": 2.99606962380685, "grad_norm": 0.3641449809074402, "learning_rate": 5.2300538315730455e-11, "loss": 0.3993, "step": 5336 }, { "epoch": 2.9966311061201574, "grad_norm": 0.3786807358264923, "learning_rate": 3.842490306549263e-11, "loss": 0.3503, "step": 5337 }, { "epoch": 2.9971925884334643, "grad_norm": 0.363553524017334, "learning_rate": 2.6683970905660995e-11, "loss": 0.3717, "step": 5338 }, { "epoch": 2.9977540707467716, "grad_norm": 0.34883207082748413, "learning_rate": 1.7077746847782294e-11, "loss": 0.377, "step": 5339 }, { "epoch": 2.9983155530600785, "grad_norm": 0.3623546063899994, "learning_rate": 9.606234994130603e-12, "loss": 0.3454, "step": 5340 }, { "epoch": 2.998877035373386, "grad_norm": 0.3791167140007019, "learning_rate": 4.26943853493178e-12, "loss": 0.3673, "step": 5341 }, { "epoch": 2.999438517686693, "grad_norm": 0.38299569487571716, "learning_rate": 1.0673597478083608e-12, "loss": 0.3723, "step": 5342 }, { "epoch": 3.0, "grad_norm": 0.3682976961135864, "learning_rate": 0.0, "loss": 0.363, "step": 5343 }, { "epoch": 3.0, "step": 5343, "total_flos": 4620219213545472.0, "train_loss": 0.42465074877713965, "train_runtime": 123675.358, "train_samples_per_second": 2.764, "train_steps_per_second": 0.043 } ], "logging_steps": 1.0, "max_steps": 5343, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4620219213545472.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }