{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5928147616874225, "eval_steps": 500, "global_step": 2273, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002608071982786725, "grad_norm": 77.0, "learning_rate": 0.0, "loss": 17.314273834228516, "step": 1 }, { "epoch": 0.000521614396557345, "grad_norm": 72.5, "learning_rate": 3.773584905660378e-07, "loss": 17.201396942138672, "step": 2 }, { "epoch": 0.0007824215948360175, "grad_norm": 72.5, "learning_rate": 7.547169811320755e-07, "loss": 17.240808486938477, "step": 3 }, { "epoch": 0.00104322879311469, "grad_norm": 77.0, "learning_rate": 1.1320754716981133e-06, "loss": 17.186172485351562, "step": 4 }, { "epoch": 0.0013040359913933624, "grad_norm": 75.5, "learning_rate": 1.509433962264151e-06, "loss": 17.1783447265625, "step": 5 }, { "epoch": 0.001564843189672035, "grad_norm": 91.0, "learning_rate": 1.8867924528301889e-06, "loss": 17.03378677368164, "step": 6 }, { "epoch": 0.0018256503879507074, "grad_norm": 89.5, "learning_rate": 2.2641509433962266e-06, "loss": 17.054584503173828, "step": 7 }, { "epoch": 0.00208645758622938, "grad_norm": 78.0, "learning_rate": 2.6415094339622644e-06, "loss": 16.92469596862793, "step": 8 }, { "epoch": 0.0023472647845080522, "grad_norm": 78.0, "learning_rate": 3.018867924528302e-06, "loss": 16.677852630615234, "step": 9 }, { "epoch": 0.002608071982786725, "grad_norm": 82.0, "learning_rate": 3.3962264150943395e-06, "loss": 16.411882400512695, "step": 10 }, { "epoch": 0.0028688791810653974, "grad_norm": 86.5, "learning_rate": 3.7735849056603777e-06, "loss": 16.228927612304688, "step": 11 }, { "epoch": 0.00312968637934407, "grad_norm": 89.5, "learning_rate": 4.150943396226416e-06, "loss": 15.959747314453125, "step": 12 }, { "epoch": 0.0033904935776227422, "grad_norm": 96.5, "learning_rate": 4.528301886792453e-06, "loss": 15.387005805969238, "step": 13 }, { "epoch": 0.003651300775901415, "grad_norm": 95.0, "learning_rate": 4.905660377358491e-06, "loss": 14.818863868713379, "step": 14 }, { "epoch": 0.003912107974180087, "grad_norm": 97.0, "learning_rate": 5.283018867924529e-06, "loss": 14.189617156982422, "step": 15 }, { "epoch": 0.00417291517245876, "grad_norm": 100.5, "learning_rate": 5.660377358490566e-06, "loss": 13.341421127319336, "step": 16 }, { "epoch": 0.004433722370737432, "grad_norm": 55.25, "learning_rate": 6.037735849056604e-06, "loss": 12.826044082641602, "step": 17 }, { "epoch": 0.0046945295690161044, "grad_norm": 70.0, "learning_rate": 6.415094339622642e-06, "loss": 12.598797798156738, "step": 18 }, { "epoch": 0.0049553367672947775, "grad_norm": 41.75, "learning_rate": 6.792452830188679e-06, "loss": 12.168102264404297, "step": 19 }, { "epoch": 0.00521614396557345, "grad_norm": 80.0, "learning_rate": 7.169811320754717e-06, "loss": 11.898005485534668, "step": 20 }, { "epoch": 0.005476951163852123, "grad_norm": 49.25, "learning_rate": 7.5471698113207555e-06, "loss": 11.813962936401367, "step": 21 }, { "epoch": 0.005737758362130795, "grad_norm": 40.0, "learning_rate": 7.924528301886793e-06, "loss": 11.574141502380371, "step": 22 }, { "epoch": 0.005998565560409467, "grad_norm": 137.0, "learning_rate": 8.301886792452832e-06, "loss": 11.403704643249512, "step": 23 }, { "epoch": 0.00625937275868814, "grad_norm": 40.75, "learning_rate": 8.67924528301887e-06, "loss": 11.084342956542969, "step": 24 }, { "epoch": 0.006520179956966812, "grad_norm": 34.25, "learning_rate": 9.056603773584907e-06, "loss": 11.013508796691895, "step": 25 }, { "epoch": 0.0067809871552454845, "grad_norm": 84.5, "learning_rate": 9.433962264150944e-06, "loss": 10.844001770019531, "step": 26 }, { "epoch": 0.0070417943535241575, "grad_norm": 27.0, "learning_rate": 9.811320754716981e-06, "loss": 10.781389236450195, "step": 27 }, { "epoch": 0.00730260155180283, "grad_norm": 25.75, "learning_rate": 1.018867924528302e-05, "loss": 10.518528938293457, "step": 28 }, { "epoch": 0.007563408750081502, "grad_norm": 97.0, "learning_rate": 1.0566037735849058e-05, "loss": 10.529638290405273, "step": 29 }, { "epoch": 0.007824215948360174, "grad_norm": 44.5, "learning_rate": 1.0943396226415095e-05, "loss": 10.512063980102539, "step": 30 }, { "epoch": 0.008085023146638847, "grad_norm": 27.5, "learning_rate": 1.1320754716981132e-05, "loss": 10.42243766784668, "step": 31 }, { "epoch": 0.00834583034491752, "grad_norm": 23.875, "learning_rate": 1.169811320754717e-05, "loss": 10.236409187316895, "step": 32 }, { "epoch": 0.008606637543196191, "grad_norm": 54.75, "learning_rate": 1.2075471698113209e-05, "loss": 10.11730670928955, "step": 33 }, { "epoch": 0.008867444741474865, "grad_norm": 26.875, "learning_rate": 1.2452830188679246e-05, "loss": 9.971153259277344, "step": 34 }, { "epoch": 0.009128251939753538, "grad_norm": 24.125, "learning_rate": 1.2830188679245283e-05, "loss": 9.97641658782959, "step": 35 }, { "epoch": 0.009389059138032209, "grad_norm": 31.75, "learning_rate": 1.320754716981132e-05, "loss": 9.677864074707031, "step": 36 }, { "epoch": 0.009649866336310882, "grad_norm": 24.875, "learning_rate": 1.3584905660377358e-05, "loss": 9.680337905883789, "step": 37 }, { "epoch": 0.009910673534589555, "grad_norm": 24.875, "learning_rate": 1.3962264150943397e-05, "loss": 9.397380828857422, "step": 38 }, { "epoch": 0.010171480732868228, "grad_norm": 26.375, "learning_rate": 1.4339622641509435e-05, "loss": 9.12952709197998, "step": 39 }, { "epoch": 0.0104322879311469, "grad_norm": 27.25, "learning_rate": 1.4716981132075472e-05, "loss": 9.214681625366211, "step": 40 }, { "epoch": 0.010693095129425572, "grad_norm": 34.5, "learning_rate": 1.5094339622641511e-05, "loss": 8.994095802307129, "step": 41 }, { "epoch": 0.010953902327704245, "grad_norm": 18.5, "learning_rate": 1.547169811320755e-05, "loss": 8.78971004486084, "step": 42 }, { "epoch": 0.011214709525982917, "grad_norm": 23.625, "learning_rate": 1.5849056603773586e-05, "loss": 8.471336364746094, "step": 43 }, { "epoch": 0.01147551672426159, "grad_norm": 30.5, "learning_rate": 1.6226415094339625e-05, "loss": 8.377541542053223, "step": 44 }, { "epoch": 0.011736323922540263, "grad_norm": 28.5, "learning_rate": 1.6603773584905664e-05, "loss": 8.198575973510742, "step": 45 }, { "epoch": 0.011997131120818934, "grad_norm": 26.75, "learning_rate": 1.69811320754717e-05, "loss": 7.972673416137695, "step": 46 }, { "epoch": 0.012257938319097607, "grad_norm": 24.75, "learning_rate": 1.735849056603774e-05, "loss": 8.09756851196289, "step": 47 }, { "epoch": 0.01251874551737628, "grad_norm": 20.0, "learning_rate": 1.7735849056603774e-05, "loss": 7.87360954284668, "step": 48 }, { "epoch": 0.012779552715654952, "grad_norm": 32.75, "learning_rate": 1.8113207547169813e-05, "loss": 7.462000370025635, "step": 49 }, { "epoch": 0.013040359913933625, "grad_norm": 29.0, "learning_rate": 1.8490566037735852e-05, "loss": 7.528225421905518, "step": 50 }, { "epoch": 0.013301167112212298, "grad_norm": 22.25, "learning_rate": 1.8867924528301888e-05, "loss": 7.312280654907227, "step": 51 }, { "epoch": 0.013561974310490969, "grad_norm": 26.375, "learning_rate": 1.9245283018867927e-05, "loss": 6.996704578399658, "step": 52 }, { "epoch": 0.013822781508769642, "grad_norm": 22.5, "learning_rate": 1.9622641509433963e-05, "loss": 7.07768440246582, "step": 53 }, { "epoch": 0.014083588707048315, "grad_norm": 23.125, "learning_rate": 2e-05, "loss": 6.913934230804443, "step": 54 }, { "epoch": 0.014344395905326986, "grad_norm": 17.25, "learning_rate": 2.037735849056604e-05, "loss": 7.123829364776611, "step": 55 }, { "epoch": 0.01460520310360566, "grad_norm": 23.5, "learning_rate": 2.0754716981132076e-05, "loss": 6.840793132781982, "step": 56 }, { "epoch": 0.014866010301884332, "grad_norm": 23.875, "learning_rate": 2.1132075471698115e-05, "loss": 6.265410423278809, "step": 57 }, { "epoch": 0.015126817500163004, "grad_norm": 19.875, "learning_rate": 2.150943396226415e-05, "loss": 6.487156391143799, "step": 58 }, { "epoch": 0.015387624698441677, "grad_norm": 18.75, "learning_rate": 2.188679245283019e-05, "loss": 6.40866756439209, "step": 59 }, { "epoch": 0.015648431896720348, "grad_norm": 26.25, "learning_rate": 2.226415094339623e-05, "loss": 6.230491638183594, "step": 60 }, { "epoch": 0.01590923909499902, "grad_norm": 16.75, "learning_rate": 2.2641509433962265e-05, "loss": 5.978422164916992, "step": 61 }, { "epoch": 0.016170046293277694, "grad_norm": 20.875, "learning_rate": 2.3018867924528304e-05, "loss": 5.85161828994751, "step": 62 }, { "epoch": 0.016430853491556367, "grad_norm": 20.25, "learning_rate": 2.339622641509434e-05, "loss": 5.712477207183838, "step": 63 }, { "epoch": 0.01669166068983504, "grad_norm": 17.625, "learning_rate": 2.377358490566038e-05, "loss": 5.904017448425293, "step": 64 }, { "epoch": 0.016952467888113713, "grad_norm": 18.75, "learning_rate": 2.4150943396226418e-05, "loss": 5.851974010467529, "step": 65 }, { "epoch": 0.017213275086392383, "grad_norm": 19.375, "learning_rate": 2.4528301886792453e-05, "loss": 5.791886806488037, "step": 66 }, { "epoch": 0.017474082284671056, "grad_norm": 18.5, "learning_rate": 2.4905660377358492e-05, "loss": 5.528830528259277, "step": 67 }, { "epoch": 0.01773488948294973, "grad_norm": 27.125, "learning_rate": 2.5283018867924528e-05, "loss": 5.4120564460754395, "step": 68 }, { "epoch": 0.017995696681228402, "grad_norm": 15.1875, "learning_rate": 2.5660377358490567e-05, "loss": 5.542486190795898, "step": 69 }, { "epoch": 0.018256503879507075, "grad_norm": 29.625, "learning_rate": 2.6037735849056606e-05, "loss": 5.326672554016113, "step": 70 }, { "epoch": 0.018517311077785748, "grad_norm": 21.625, "learning_rate": 2.641509433962264e-05, "loss": 5.187875270843506, "step": 71 }, { "epoch": 0.018778118276064418, "grad_norm": 17.875, "learning_rate": 2.679245283018868e-05, "loss": 5.226883888244629, "step": 72 }, { "epoch": 0.01903892547434309, "grad_norm": 14.8125, "learning_rate": 2.7169811320754716e-05, "loss": 5.023570537567139, "step": 73 }, { "epoch": 0.019299732672621764, "grad_norm": 26.375, "learning_rate": 2.7547169811320755e-05, "loss": 4.935462951660156, "step": 74 }, { "epoch": 0.019560539870900437, "grad_norm": 14.0625, "learning_rate": 2.7924528301886794e-05, "loss": 5.175811290740967, "step": 75 }, { "epoch": 0.01982134706917911, "grad_norm": 20.0, "learning_rate": 2.830188679245283e-05, "loss": 5.010772228240967, "step": 76 }, { "epoch": 0.020082154267457783, "grad_norm": 16.375, "learning_rate": 2.867924528301887e-05, "loss": 4.9048967361450195, "step": 77 }, { "epoch": 0.020342961465736456, "grad_norm": 16.0, "learning_rate": 2.9056603773584905e-05, "loss": 4.898214340209961, "step": 78 }, { "epoch": 0.020603768664015126, "grad_norm": 20.0, "learning_rate": 2.9433962264150944e-05, "loss": 4.572073936462402, "step": 79 }, { "epoch": 0.0208645758622938, "grad_norm": 15.0, "learning_rate": 2.9811320754716983e-05, "loss": 4.445930480957031, "step": 80 }, { "epoch": 0.02112538306057247, "grad_norm": 14.9375, "learning_rate": 3.0188679245283022e-05, "loss": 4.540976524353027, "step": 81 }, { "epoch": 0.021386190258851145, "grad_norm": 18.125, "learning_rate": 3.0566037735849064e-05, "loss": 4.4916791915893555, "step": 82 }, { "epoch": 0.021646997457129818, "grad_norm": 12.9375, "learning_rate": 3.09433962264151e-05, "loss": 4.635715484619141, "step": 83 }, { "epoch": 0.02190780465540849, "grad_norm": 20.25, "learning_rate": 3.1320754716981136e-05, "loss": 4.457133769989014, "step": 84 }, { "epoch": 0.02216861185368716, "grad_norm": 15.875, "learning_rate": 3.169811320754717e-05, "loss": 4.446690082550049, "step": 85 }, { "epoch": 0.022429419051965833, "grad_norm": 12.6875, "learning_rate": 3.2075471698113214e-05, "loss": 4.444411277770996, "step": 86 }, { "epoch": 0.022690226250244507, "grad_norm": 16.5, "learning_rate": 3.245283018867925e-05, "loss": 4.506210803985596, "step": 87 }, { "epoch": 0.02295103344852318, "grad_norm": 18.625, "learning_rate": 3.2830188679245285e-05, "loss": 4.7216081619262695, "step": 88 }, { "epoch": 0.023211840646801853, "grad_norm": 17.125, "learning_rate": 3.320754716981133e-05, "loss": 4.35127067565918, "step": 89 }, { "epoch": 0.023472647845080526, "grad_norm": 17.875, "learning_rate": 3.358490566037736e-05, "loss": 4.327524185180664, "step": 90 }, { "epoch": 0.023733455043359195, "grad_norm": 15.6875, "learning_rate": 3.39622641509434e-05, "loss": 4.007119178771973, "step": 91 }, { "epoch": 0.02399426224163787, "grad_norm": 14.0, "learning_rate": 3.433962264150944e-05, "loss": 4.097439765930176, "step": 92 }, { "epoch": 0.02425506943991654, "grad_norm": 16.75, "learning_rate": 3.471698113207548e-05, "loss": 4.138132095336914, "step": 93 }, { "epoch": 0.024515876638195214, "grad_norm": 13.5625, "learning_rate": 3.509433962264151e-05, "loss": 3.882037401199341, "step": 94 }, { "epoch": 0.024776683836473887, "grad_norm": 14.875, "learning_rate": 3.547169811320755e-05, "loss": 4.11362886428833, "step": 95 }, { "epoch": 0.02503749103475256, "grad_norm": 14.3125, "learning_rate": 3.584905660377359e-05, "loss": 4.373976230621338, "step": 96 }, { "epoch": 0.02529829823303123, "grad_norm": 14.5, "learning_rate": 3.6226415094339626e-05, "loss": 3.847653865814209, "step": 97 }, { "epoch": 0.025559105431309903, "grad_norm": 9.875, "learning_rate": 3.660377358490566e-05, "loss": 4.18071174621582, "step": 98 }, { "epoch": 0.025819912629588576, "grad_norm": 12.8125, "learning_rate": 3.6981132075471704e-05, "loss": 3.893112897872925, "step": 99 }, { "epoch": 0.02608071982786725, "grad_norm": 12.25, "learning_rate": 3.735849056603774e-05, "loss": 3.7350988388061523, "step": 100 }, { "epoch": 0.026341527026145922, "grad_norm": 11.1875, "learning_rate": 3.7735849056603776e-05, "loss": 3.977013111114502, "step": 101 }, { "epoch": 0.026602334224424595, "grad_norm": 15.0, "learning_rate": 3.811320754716982e-05, "loss": 3.959244728088379, "step": 102 }, { "epoch": 0.02686314142270327, "grad_norm": 13.375, "learning_rate": 3.8490566037735854e-05, "loss": 3.6513144969940186, "step": 103 }, { "epoch": 0.027123948620981938, "grad_norm": 14.6875, "learning_rate": 3.886792452830189e-05, "loss": 3.9482178688049316, "step": 104 }, { "epoch": 0.02738475581926061, "grad_norm": 16.25, "learning_rate": 3.9245283018867925e-05, "loss": 3.997860908508301, "step": 105 }, { "epoch": 0.027645563017539284, "grad_norm": 15.625, "learning_rate": 3.962264150943397e-05, "loss": 3.7535505294799805, "step": 106 }, { "epoch": 0.027906370215817957, "grad_norm": 9.875, "learning_rate": 4e-05, "loss": 3.1479573249816895, "step": 107 }, { "epoch": 0.02816717741409663, "grad_norm": 22.625, "learning_rate": 3.999999467458553e-05, "loss": 3.762944221496582, "step": 108 }, { "epoch": 0.028427984612375303, "grad_norm": 12.5625, "learning_rate": 3.999997869834493e-05, "loss": 3.4817514419555664, "step": 109 }, { "epoch": 0.028688791810653973, "grad_norm": 16.125, "learning_rate": 3.999995207128673e-05, "loss": 3.6522598266601562, "step": 110 }, { "epoch": 0.028949599008932646, "grad_norm": 11.875, "learning_rate": 3.9999914793425094e-05, "loss": 3.936000347137451, "step": 111 }, { "epoch": 0.02921040620721132, "grad_norm": 17.0, "learning_rate": 3.999986686477989e-05, "loss": 3.454770088195801, "step": 112 }, { "epoch": 0.029471213405489992, "grad_norm": 10.75, "learning_rate": 3.9999808285376626e-05, "loss": 3.5592823028564453, "step": 113 }, { "epoch": 0.029732020603768665, "grad_norm": 21.125, "learning_rate": 3.999973905524651e-05, "loss": 3.7373290061950684, "step": 114 }, { "epoch": 0.029992827802047338, "grad_norm": 12.0, "learning_rate": 3.9999659174426395e-05, "loss": 3.6499617099761963, "step": 115 }, { "epoch": 0.030253635000326008, "grad_norm": 14.6875, "learning_rate": 3.999956864295883e-05, "loss": 3.2987723350524902, "step": 116 }, { "epoch": 0.03051444219860468, "grad_norm": 15.0, "learning_rate": 3.999946746089204e-05, "loss": 3.405784845352173, "step": 117 }, { "epoch": 0.030775249396883354, "grad_norm": 12.75, "learning_rate": 3.999935562827989e-05, "loss": 3.4816479682922363, "step": 118 }, { "epoch": 0.031036056595162027, "grad_norm": 14.1875, "learning_rate": 3.999923314518194e-05, "loss": 3.635232448577881, "step": 119 }, { "epoch": 0.031296863793440696, "grad_norm": 13.625, "learning_rate": 3.999910001166342e-05, "loss": 3.6046676635742188, "step": 120 }, { "epoch": 0.03155767099171937, "grad_norm": 12.4375, "learning_rate": 3.999895622779523e-05, "loss": 3.4561996459960938, "step": 121 }, { "epoch": 0.03181847818999804, "grad_norm": 12.0, "learning_rate": 3.999880179365393e-05, "loss": 3.2301549911499023, "step": 122 }, { "epoch": 0.032079285388276715, "grad_norm": 9.9375, "learning_rate": 3.9998636709321774e-05, "loss": 3.497526168823242, "step": 123 }, { "epoch": 0.03234009258655539, "grad_norm": 9.8125, "learning_rate": 3.999846097488668e-05, "loss": 3.549671173095703, "step": 124 }, { "epoch": 0.03260089978483406, "grad_norm": 10.125, "learning_rate": 3.999827459044222e-05, "loss": 3.203861951828003, "step": 125 }, { "epoch": 0.032861706983112735, "grad_norm": 10.0, "learning_rate": 3.999807755608767e-05, "loss": 3.2915453910827637, "step": 126 }, { "epoch": 0.03312251418139141, "grad_norm": 10.8125, "learning_rate": 3.999786987192794e-05, "loss": 2.9838616847991943, "step": 127 }, { "epoch": 0.03338332137967008, "grad_norm": 9.0, "learning_rate": 3.999765153807364e-05, "loss": 3.060359001159668, "step": 128 }, { "epoch": 0.033644128577948754, "grad_norm": 9.5, "learning_rate": 3.999742255464103e-05, "loss": 2.9777638912200928, "step": 129 }, { "epoch": 0.03390493577622743, "grad_norm": 8.5625, "learning_rate": 3.9997182921752076e-05, "loss": 3.1705143451690674, "step": 130 }, { "epoch": 0.0341657429745061, "grad_norm": 8.5625, "learning_rate": 3.9996932639534376e-05, "loss": 3.0397515296936035, "step": 131 }, { "epoch": 0.034426550172784766, "grad_norm": 11.625, "learning_rate": 3.9996671708121214e-05, "loss": 3.138066530227661, "step": 132 }, { "epoch": 0.03468735737106344, "grad_norm": 8.8125, "learning_rate": 3.999640012765156e-05, "loss": 3.1769144535064697, "step": 133 }, { "epoch": 0.03494816456934211, "grad_norm": 9.3125, "learning_rate": 3.999611789827003e-05, "loss": 3.212942600250244, "step": 134 }, { "epoch": 0.035208971767620785, "grad_norm": 10.125, "learning_rate": 3.999582502012692e-05, "loss": 3.2294867038726807, "step": 135 }, { "epoch": 0.03546977896589946, "grad_norm": 9.0, "learning_rate": 3.999552149337822e-05, "loss": 3.4734344482421875, "step": 136 }, { "epoch": 0.03573058616417813, "grad_norm": 9.25, "learning_rate": 3.999520731818555e-05, "loss": 3.1776041984558105, "step": 137 }, { "epoch": 0.035991393362456804, "grad_norm": 9.875, "learning_rate": 3.999488249471623e-05, "loss": 3.1053857803344727, "step": 138 }, { "epoch": 0.03625220056073548, "grad_norm": 9.1875, "learning_rate": 3.9994547023143244e-05, "loss": 3.068758726119995, "step": 139 }, { "epoch": 0.03651300775901415, "grad_norm": 12.0625, "learning_rate": 3.999420090364523e-05, "loss": 2.9159774780273438, "step": 140 }, { "epoch": 0.03677381495729282, "grad_norm": 10.375, "learning_rate": 3.9993844136406535e-05, "loss": 3.1062023639678955, "step": 141 }, { "epoch": 0.037034622155571496, "grad_norm": 8.875, "learning_rate": 3.999347672161713e-05, "loss": 3.0980939865112305, "step": 142 }, { "epoch": 0.03729542935385017, "grad_norm": 9.5625, "learning_rate": 3.999309865947269e-05, "loss": 3.071870803833008, "step": 143 }, { "epoch": 0.037556236552128835, "grad_norm": 9.3125, "learning_rate": 3.999270995017455e-05, "loss": 3.085413932800293, "step": 144 }, { "epoch": 0.03781704375040751, "grad_norm": 12.0, "learning_rate": 3.999231059392971e-05, "loss": 3.1918039321899414, "step": 145 }, { "epoch": 0.03807785094868618, "grad_norm": 9.875, "learning_rate": 3.9991900590950844e-05, "loss": 3.2698490619659424, "step": 146 }, { "epoch": 0.038338658146964855, "grad_norm": 13.5625, "learning_rate": 3.999147994145629e-05, "loss": 2.81750750541687, "step": 147 }, { "epoch": 0.03859946534524353, "grad_norm": 9.25, "learning_rate": 3.999104864567007e-05, "loss": 2.9677963256835938, "step": 148 }, { "epoch": 0.0388602725435222, "grad_norm": 10.9375, "learning_rate": 3.999060670382187e-05, "loss": 2.9043052196502686, "step": 149 }, { "epoch": 0.039121079741800874, "grad_norm": 8.9375, "learning_rate": 3.9990154116147024e-05, "loss": 3.038243055343628, "step": 150 }, { "epoch": 0.03938188694007955, "grad_norm": 13.5625, "learning_rate": 3.998969088288657e-05, "loss": 3.111212730407715, "step": 151 }, { "epoch": 0.03964269413835822, "grad_norm": 8.5, "learning_rate": 3.9989217004287206e-05, "loss": 2.9221205711364746, "step": 152 }, { "epoch": 0.03990350133663689, "grad_norm": 9.6875, "learning_rate": 3.998873248060127e-05, "loss": 3.0534908771514893, "step": 153 }, { "epoch": 0.040164308534915566, "grad_norm": 8.75, "learning_rate": 3.99882373120868e-05, "loss": 3.266178607940674, "step": 154 }, { "epoch": 0.04042511573319424, "grad_norm": 11.4375, "learning_rate": 3.998773149900751e-05, "loss": 2.951430082321167, "step": 155 }, { "epoch": 0.04068592293147291, "grad_norm": 10.5, "learning_rate": 3.9987215041632737e-05, "loss": 3.2741990089416504, "step": 156 }, { "epoch": 0.04094673012975158, "grad_norm": 8.0, "learning_rate": 3.998668794023754e-05, "loss": 2.9319162368774414, "step": 157 }, { "epoch": 0.04120753732803025, "grad_norm": 9.5625, "learning_rate": 3.9986150195102604e-05, "loss": 2.9177396297454834, "step": 158 }, { "epoch": 0.041468344526308924, "grad_norm": 9.25, "learning_rate": 3.9985601806514315e-05, "loss": 3.190408229827881, "step": 159 }, { "epoch": 0.0417291517245876, "grad_norm": 9.25, "learning_rate": 3.998504277476471e-05, "loss": 3.000624179840088, "step": 160 }, { "epoch": 0.04198995892286627, "grad_norm": 8.5, "learning_rate": 3.998447310015149e-05, "loss": 2.9762725830078125, "step": 161 }, { "epoch": 0.04225076612114494, "grad_norm": 8.625, "learning_rate": 3.998389278297804e-05, "loss": 2.898078680038452, "step": 162 }, { "epoch": 0.042511573319423616, "grad_norm": 7.53125, "learning_rate": 3.9983301823553394e-05, "loss": 2.8846335411071777, "step": 163 }, { "epoch": 0.04277238051770229, "grad_norm": 9.125, "learning_rate": 3.9982700222192266e-05, "loss": 2.804777145385742, "step": 164 }, { "epoch": 0.04303318771598096, "grad_norm": 7.375, "learning_rate": 3.998208797921503e-05, "loss": 2.8471713066101074, "step": 165 }, { "epoch": 0.043293994914259636, "grad_norm": 9.3125, "learning_rate": 3.998146509494774e-05, "loss": 2.616511821746826, "step": 166 }, { "epoch": 0.04355480211253831, "grad_norm": 7.84375, "learning_rate": 3.99808315697221e-05, "loss": 2.9040181636810303, "step": 167 }, { "epoch": 0.04381560931081698, "grad_norm": 8.75, "learning_rate": 3.9980187403875485e-05, "loss": 2.8242077827453613, "step": 168 }, { "epoch": 0.04407641650909565, "grad_norm": 8.3125, "learning_rate": 3.997953259775095e-05, "loss": 2.8351857662200928, "step": 169 }, { "epoch": 0.04433722370737432, "grad_norm": 10.1875, "learning_rate": 3.99788671516972e-05, "loss": 3.20156192779541, "step": 170 }, { "epoch": 0.044598030905652994, "grad_norm": 9.125, "learning_rate": 3.9978191066068616e-05, "loss": 3.055712938308716, "step": 171 }, { "epoch": 0.04485883810393167, "grad_norm": 7.96875, "learning_rate": 3.9977504341225236e-05, "loss": 2.602215051651001, "step": 172 }, { "epoch": 0.04511964530221034, "grad_norm": 8.875, "learning_rate": 3.997680697753278e-05, "loss": 2.794217348098755, "step": 173 }, { "epoch": 0.04538045250048901, "grad_norm": 9.5, "learning_rate": 3.997609897536261e-05, "loss": 2.5247011184692383, "step": 174 }, { "epoch": 0.045641259698767686, "grad_norm": 12.25, "learning_rate": 3.9975380335091786e-05, "loss": 2.9152863025665283, "step": 175 }, { "epoch": 0.04590206689704636, "grad_norm": 8.25, "learning_rate": 3.9974651057102985e-05, "loss": 2.656716823577881, "step": 176 }, { "epoch": 0.04616287409532503, "grad_norm": 7.25, "learning_rate": 3.9973911141784605e-05, "loss": 2.746725082397461, "step": 177 }, { "epoch": 0.046423681293603705, "grad_norm": 7.21875, "learning_rate": 3.9973160589530665e-05, "loss": 2.4794938564300537, "step": 178 }, { "epoch": 0.04668448849188238, "grad_norm": 7.625, "learning_rate": 3.997239940074087e-05, "loss": 2.6509430408477783, "step": 179 }, { "epoch": 0.04694529569016105, "grad_norm": 7.40625, "learning_rate": 3.997162757582058e-05, "loss": 2.940674304962158, "step": 180 }, { "epoch": 0.047206102888439724, "grad_norm": 7.59375, "learning_rate": 3.997084511518083e-05, "loss": 3.0152294635772705, "step": 181 }, { "epoch": 0.04746691008671839, "grad_norm": 7.84375, "learning_rate": 3.997005201923832e-05, "loss": 2.9850046634674072, "step": 182 }, { "epoch": 0.047727717284997064, "grad_norm": 6.96875, "learning_rate": 3.996924828841539e-05, "loss": 2.8081016540527344, "step": 183 }, { "epoch": 0.04798852448327574, "grad_norm": 6.6875, "learning_rate": 3.9968433923140076e-05, "loss": 2.793598175048828, "step": 184 }, { "epoch": 0.04824933168155441, "grad_norm": 7.65625, "learning_rate": 3.9967608923846044e-05, "loss": 2.6727943420410156, "step": 185 }, { "epoch": 0.04851013887983308, "grad_norm": 7.03125, "learning_rate": 3.9966773290972654e-05, "loss": 2.6837079524993896, "step": 186 }, { "epoch": 0.048770946078111756, "grad_norm": 8.1875, "learning_rate": 3.996592702496491e-05, "loss": 2.765864133834839, "step": 187 }, { "epoch": 0.04903175327639043, "grad_norm": 6.59375, "learning_rate": 3.996507012627348e-05, "loss": 2.4099583625793457, "step": 188 }, { "epoch": 0.0492925604746691, "grad_norm": 8.0, "learning_rate": 3.99642025953547e-05, "loss": 2.8211944103240967, "step": 189 }, { "epoch": 0.049553367672947775, "grad_norm": 7.09375, "learning_rate": 3.996332443267058e-05, "loss": 2.359706163406372, "step": 190 }, { "epoch": 0.04981417487122645, "grad_norm": 7.75, "learning_rate": 3.996243563868876e-05, "loss": 3.0262551307678223, "step": 191 }, { "epoch": 0.05007498206950512, "grad_norm": 8.6875, "learning_rate": 3.996153621388256e-05, "loss": 2.709545373916626, "step": 192 }, { "epoch": 0.050335789267783794, "grad_norm": 6.9375, "learning_rate": 3.996062615873098e-05, "loss": 2.4291229248046875, "step": 193 }, { "epoch": 0.05059659646606246, "grad_norm": 6.9375, "learning_rate": 3.995970547371864e-05, "loss": 2.47729229927063, "step": 194 }, { "epoch": 0.05085740366434113, "grad_norm": 7.5, "learning_rate": 3.995877415933586e-05, "loss": 2.773533582687378, "step": 195 }, { "epoch": 0.051118210862619806, "grad_norm": 7.28125, "learning_rate": 3.995783221607859e-05, "loss": 2.86281418800354, "step": 196 }, { "epoch": 0.05137901806089848, "grad_norm": 7.28125, "learning_rate": 3.9956879644448456e-05, "loss": 2.6186606884002686, "step": 197 }, { "epoch": 0.05163982525917715, "grad_norm": 6.90625, "learning_rate": 3.995591644495275e-05, "loss": 2.787097215652466, "step": 198 }, { "epoch": 0.051900632457455825, "grad_norm": 6.5, "learning_rate": 3.995494261810441e-05, "loss": 2.50883150100708, "step": 199 }, { "epoch": 0.0521614396557345, "grad_norm": 8.1875, "learning_rate": 3.995395816442204e-05, "loss": 2.8412694931030273, "step": 200 }, { "epoch": 0.05242224685401317, "grad_norm": 6.9375, "learning_rate": 3.99529630844299e-05, "loss": 2.518864870071411, "step": 201 }, { "epoch": 0.052683054052291844, "grad_norm": 7.0625, "learning_rate": 3.9951957378657916e-05, "loss": 2.899658203125, "step": 202 }, { "epoch": 0.05294386125057052, "grad_norm": 6.5, "learning_rate": 3.995094104764167e-05, "loss": 2.6643025875091553, "step": 203 }, { "epoch": 0.05320466844884919, "grad_norm": 6.625, "learning_rate": 3.9949914091922394e-05, "loss": 2.6935393810272217, "step": 204 }, { "epoch": 0.053465475647127864, "grad_norm": 6.6875, "learning_rate": 3.994887651204698e-05, "loss": 2.3257358074188232, "step": 205 }, { "epoch": 0.05372628284540654, "grad_norm": 6.71875, "learning_rate": 3.9947828308568e-05, "loss": 2.544113874435425, "step": 206 }, { "epoch": 0.0539870900436852, "grad_norm": 7.78125, "learning_rate": 3.994676948204364e-05, "loss": 2.5802993774414062, "step": 207 }, { "epoch": 0.054247897241963876, "grad_norm": 7.1875, "learning_rate": 3.9945700033037794e-05, "loss": 2.469195604324341, "step": 208 }, { "epoch": 0.05450870444024255, "grad_norm": 7.5, "learning_rate": 3.994461996211998e-05, "loss": 2.5232973098754883, "step": 209 }, { "epoch": 0.05476951163852122, "grad_norm": 7.0, "learning_rate": 3.9943529269865375e-05, "loss": 2.701573371887207, "step": 210 }, { "epoch": 0.055030318836799895, "grad_norm": 7.5, "learning_rate": 3.994242795685482e-05, "loss": 2.9642443656921387, "step": 211 }, { "epoch": 0.05529112603507857, "grad_norm": 6.9375, "learning_rate": 3.994131602367481e-05, "loss": 2.6440212726593018, "step": 212 }, { "epoch": 0.05555193323335724, "grad_norm": 7.3125, "learning_rate": 3.99401934709175e-05, "loss": 2.4167118072509766, "step": 213 }, { "epoch": 0.055812740431635914, "grad_norm": 6.65625, "learning_rate": 3.993906029918069e-05, "loss": 2.6095056533813477, "step": 214 }, { "epoch": 0.05607354762991459, "grad_norm": 6.9375, "learning_rate": 3.9937916509067845e-05, "loss": 2.439197063446045, "step": 215 }, { "epoch": 0.05633435482819326, "grad_norm": 6.90625, "learning_rate": 3.993676210118808e-05, "loss": 2.5025954246520996, "step": 216 }, { "epoch": 0.05659516202647193, "grad_norm": 7.40625, "learning_rate": 3.993559707615616e-05, "loss": 2.774198532104492, "step": 217 }, { "epoch": 0.056855969224750606, "grad_norm": 6.5625, "learning_rate": 3.993442143459251e-05, "loss": 2.8386070728302, "step": 218 }, { "epoch": 0.05711677642302927, "grad_norm": 7.15625, "learning_rate": 3.993323517712322e-05, "loss": 2.4885847568511963, "step": 219 }, { "epoch": 0.057377583621307945, "grad_norm": 7.0625, "learning_rate": 3.993203830438001e-05, "loss": 2.4035756587982178, "step": 220 }, { "epoch": 0.05763839081958662, "grad_norm": 7.59375, "learning_rate": 3.993083081700026e-05, "loss": 2.9204483032226562, "step": 221 }, { "epoch": 0.05789919801786529, "grad_norm": 6.75, "learning_rate": 3.992961271562702e-05, "loss": 2.324254274368286, "step": 222 }, { "epoch": 0.058160005216143965, "grad_norm": 7.46875, "learning_rate": 3.9928384000908966e-05, "loss": 2.451913833618164, "step": 223 }, { "epoch": 0.05842081241442264, "grad_norm": 6.53125, "learning_rate": 3.992714467350045e-05, "loss": 2.5799005031585693, "step": 224 }, { "epoch": 0.05868161961270131, "grad_norm": 7.0625, "learning_rate": 3.9925894734061466e-05, "loss": 2.316067934036255, "step": 225 }, { "epoch": 0.058942426810979984, "grad_norm": 6.9375, "learning_rate": 3.992463418325765e-05, "loss": 2.6440839767456055, "step": 226 }, { "epoch": 0.05920323400925866, "grad_norm": 6.21875, "learning_rate": 3.99233630217603e-05, "loss": 2.3150699138641357, "step": 227 }, { "epoch": 0.05946404120753733, "grad_norm": 7.03125, "learning_rate": 3.992208125024637e-05, "loss": 2.774292469024658, "step": 228 }, { "epoch": 0.059724848405816, "grad_norm": 6.0, "learning_rate": 3.9920788869398445e-05, "loss": 2.1743381023406982, "step": 229 }, { "epoch": 0.059985655604094676, "grad_norm": 6.65625, "learning_rate": 3.991948587990479e-05, "loss": 2.6707546710968018, "step": 230 }, { "epoch": 0.06024646280237335, "grad_norm": 6.90625, "learning_rate": 3.9918172282459274e-05, "loss": 2.5074210166931152, "step": 231 }, { "epoch": 0.060507270000652015, "grad_norm": 6.53125, "learning_rate": 3.9916848077761455e-05, "loss": 2.515544891357422, "step": 232 }, { "epoch": 0.06076807719893069, "grad_norm": 7.0, "learning_rate": 3.991551326651653e-05, "loss": 2.456247568130493, "step": 233 }, { "epoch": 0.06102888439720936, "grad_norm": 6.0625, "learning_rate": 3.9914167849435344e-05, "loss": 2.4338889122009277, "step": 234 }, { "epoch": 0.061289691595488034, "grad_norm": 6.90625, "learning_rate": 3.991281182723438e-05, "loss": 2.734100580215454, "step": 235 }, { "epoch": 0.06155049879376671, "grad_norm": 6.4375, "learning_rate": 3.9911445200635775e-05, "loss": 2.555612325668335, "step": 236 }, { "epoch": 0.06181130599204538, "grad_norm": 6.25, "learning_rate": 3.9910067970367327e-05, "loss": 2.738067150115967, "step": 237 }, { "epoch": 0.06207211319032405, "grad_norm": 6.1875, "learning_rate": 3.990868013716245e-05, "loss": 2.395388603210449, "step": 238 }, { "epoch": 0.062332920388602726, "grad_norm": 6.3125, "learning_rate": 3.9907281701760235e-05, "loss": 2.574742317199707, "step": 239 }, { "epoch": 0.06259372758688139, "grad_norm": 6.65625, "learning_rate": 3.99058726649054e-05, "loss": 2.463757038116455, "step": 240 }, { "epoch": 0.06285453478516007, "grad_norm": 6.5, "learning_rate": 3.9904453027348324e-05, "loss": 2.42558217048645, "step": 241 }, { "epoch": 0.06311534198343874, "grad_norm": 6.34375, "learning_rate": 3.990302278984502e-05, "loss": 2.339848041534424, "step": 242 }, { "epoch": 0.06337614918171741, "grad_norm": 7.25, "learning_rate": 3.9901581953157135e-05, "loss": 2.7260823249816895, "step": 243 }, { "epoch": 0.06363695637999608, "grad_norm": 6.34375, "learning_rate": 3.9900130518052e-05, "loss": 2.346684694290161, "step": 244 }, { "epoch": 0.06389776357827476, "grad_norm": 12.0625, "learning_rate": 3.989866848530254e-05, "loss": 2.3759613037109375, "step": 245 }, { "epoch": 0.06415857077655343, "grad_norm": 7.0, "learning_rate": 3.989719585568736e-05, "loss": 2.450105905532837, "step": 246 }, { "epoch": 0.0644193779748321, "grad_norm": 18.5, "learning_rate": 3.98957126299907e-05, "loss": 2.4428210258483887, "step": 247 }, { "epoch": 0.06468018517311078, "grad_norm": 7.5625, "learning_rate": 3.989421880900243e-05, "loss": 2.590214252471924, "step": 248 }, { "epoch": 0.06494099237138945, "grad_norm": 6.46875, "learning_rate": 3.9892714393518073e-05, "loss": 2.6804447174072266, "step": 249 }, { "epoch": 0.06520179956966812, "grad_norm": 6.53125, "learning_rate": 3.98911993843388e-05, "loss": 2.209712266921997, "step": 250 }, { "epoch": 0.0654626067679468, "grad_norm": 7.125, "learning_rate": 3.98896737822714e-05, "loss": 2.7148003578186035, "step": 251 }, { "epoch": 0.06572341396622547, "grad_norm": 6.09375, "learning_rate": 3.9888137588128345e-05, "loss": 2.572610855102539, "step": 252 }, { "epoch": 0.06598422116450414, "grad_norm": 6.5625, "learning_rate": 3.98865908027277e-05, "loss": 2.296384572982788, "step": 253 }, { "epoch": 0.06624502836278282, "grad_norm": 6.59375, "learning_rate": 3.98850334268932e-05, "loss": 2.6526260375976562, "step": 254 }, { "epoch": 0.06650583556106149, "grad_norm": 6.1875, "learning_rate": 3.9883465461454215e-05, "loss": 2.5138492584228516, "step": 255 }, { "epoch": 0.06676664275934016, "grad_norm": 6.65625, "learning_rate": 3.988188690724575e-05, "loss": 2.2890474796295166, "step": 256 }, { "epoch": 0.06702744995761883, "grad_norm": 6.28125, "learning_rate": 3.9880297765108446e-05, "loss": 2.328469753265381, "step": 257 }, { "epoch": 0.06728825715589751, "grad_norm": 6.90625, "learning_rate": 3.9878698035888585e-05, "loss": 2.7269949913024902, "step": 258 }, { "epoch": 0.06754906435417618, "grad_norm": 6.8125, "learning_rate": 3.98770877204381e-05, "loss": 2.258955955505371, "step": 259 }, { "epoch": 0.06780987155245485, "grad_norm": 7.25, "learning_rate": 3.987546681961455e-05, "loss": 2.605950355529785, "step": 260 }, { "epoch": 0.06807067875073353, "grad_norm": 6.78125, "learning_rate": 3.987383533428111e-05, "loss": 2.2461400032043457, "step": 261 }, { "epoch": 0.0683314859490122, "grad_norm": 6.65625, "learning_rate": 3.9872193265306645e-05, "loss": 2.3346076011657715, "step": 262 }, { "epoch": 0.06859229314729086, "grad_norm": 6.0, "learning_rate": 3.987054061356561e-05, "loss": 2.512078285217285, "step": 263 }, { "epoch": 0.06885310034556953, "grad_norm": 6.5625, "learning_rate": 3.986887737993811e-05, "loss": 2.525033950805664, "step": 264 }, { "epoch": 0.0691139075438482, "grad_norm": 6.5625, "learning_rate": 3.986720356530988e-05, "loss": 2.4972288608551025, "step": 265 }, { "epoch": 0.06937471474212688, "grad_norm": 6.53125, "learning_rate": 3.986551917057231e-05, "loss": 2.5117955207824707, "step": 266 }, { "epoch": 0.06963552194040555, "grad_norm": 6.46875, "learning_rate": 3.98638241966224e-05, "loss": 2.182558298110962, "step": 267 }, { "epoch": 0.06989632913868422, "grad_norm": 6.5625, "learning_rate": 3.986211864436279e-05, "loss": 2.61698055267334, "step": 268 }, { "epoch": 0.0701571363369629, "grad_norm": 6.4375, "learning_rate": 3.986040251470177e-05, "loss": 2.098867893218994, "step": 269 }, { "epoch": 0.07041794353524157, "grad_norm": 6.28125, "learning_rate": 3.985867580855324e-05, "loss": 2.3573265075683594, "step": 270 }, { "epoch": 0.07067875073352024, "grad_norm": 6.65625, "learning_rate": 3.985693852683675e-05, "loss": 2.5895185470581055, "step": 271 }, { "epoch": 0.07093955793179892, "grad_norm": 6.46875, "learning_rate": 3.985519067047747e-05, "loss": 2.3642361164093018, "step": 272 }, { "epoch": 0.07120036513007759, "grad_norm": 6.0625, "learning_rate": 3.985343224040621e-05, "loss": 2.739882707595825, "step": 273 }, { "epoch": 0.07146117232835626, "grad_norm": 5.875, "learning_rate": 3.985166323755939e-05, "loss": 2.458867073059082, "step": 274 }, { "epoch": 0.07172197952663494, "grad_norm": 5.96875, "learning_rate": 3.98498836628791e-05, "loss": 2.274116039276123, "step": 275 }, { "epoch": 0.07198278672491361, "grad_norm": 5.9375, "learning_rate": 3.9848093517313036e-05, "loss": 2.380291700363159, "step": 276 }, { "epoch": 0.07224359392319228, "grad_norm": 6.65625, "learning_rate": 3.984629280181451e-05, "loss": 2.3993239402770996, "step": 277 }, { "epoch": 0.07250440112147095, "grad_norm": 6.625, "learning_rate": 3.984448151734248e-05, "loss": 2.4813575744628906, "step": 278 }, { "epoch": 0.07276520831974963, "grad_norm": 7.0, "learning_rate": 3.9842659664861536e-05, "loss": 2.6131296157836914, "step": 279 }, { "epoch": 0.0730260155180283, "grad_norm": 6.4375, "learning_rate": 3.9840827245341894e-05, "loss": 2.5049290657043457, "step": 280 }, { "epoch": 0.07328682271630697, "grad_norm": 6.40625, "learning_rate": 3.983898425975938e-05, "loss": 2.5281827449798584, "step": 281 }, { "epoch": 0.07354762991458565, "grad_norm": 6.3125, "learning_rate": 3.9837130709095475e-05, "loss": 2.6311264038085938, "step": 282 }, { "epoch": 0.07380843711286432, "grad_norm": 5.9375, "learning_rate": 3.9835266594337264e-05, "loss": 2.3365821838378906, "step": 283 }, { "epoch": 0.07406924431114299, "grad_norm": 6.15625, "learning_rate": 3.983339191647747e-05, "loss": 2.235295295715332, "step": 284 }, { "epoch": 0.07433005150942167, "grad_norm": 6.03125, "learning_rate": 3.983150667651442e-05, "loss": 2.445535659790039, "step": 285 }, { "epoch": 0.07459085870770034, "grad_norm": 6.40625, "learning_rate": 3.982961087545211e-05, "loss": 2.2840161323547363, "step": 286 }, { "epoch": 0.07485166590597901, "grad_norm": 5.4375, "learning_rate": 3.9827704514300105e-05, "loss": 2.376690149307251, "step": 287 }, { "epoch": 0.07511247310425767, "grad_norm": 5.8125, "learning_rate": 3.9825787594073644e-05, "loss": 2.3928864002227783, "step": 288 }, { "epoch": 0.07537328030253634, "grad_norm": 6.125, "learning_rate": 3.982386011579355e-05, "loss": 2.3445351123809814, "step": 289 }, { "epoch": 0.07563408750081502, "grad_norm": 5.9375, "learning_rate": 3.9821922080486296e-05, "loss": 2.208594560623169, "step": 290 }, { "epoch": 0.07589489469909369, "grad_norm": 5.40625, "learning_rate": 3.981997348918396e-05, "loss": 2.094682216644287, "step": 291 }, { "epoch": 0.07615570189737236, "grad_norm": 6.3125, "learning_rate": 3.9818014342924245e-05, "loss": 2.6614317893981934, "step": 292 }, { "epoch": 0.07641650909565104, "grad_norm": 6.5625, "learning_rate": 3.981604464275049e-05, "loss": 2.593371629714966, "step": 293 }, { "epoch": 0.07667731629392971, "grad_norm": 5.75, "learning_rate": 3.981406438971163e-05, "loss": 2.1724560260772705, "step": 294 }, { "epoch": 0.07693812349220838, "grad_norm": 7.5625, "learning_rate": 3.9812073584862234e-05, "loss": 2.3324060440063477, "step": 295 }, { "epoch": 0.07719893069048706, "grad_norm": 5.6875, "learning_rate": 3.9810072229262495e-05, "loss": 2.4392335414886475, "step": 296 }, { "epoch": 0.07745973788876573, "grad_norm": 5.84375, "learning_rate": 3.98080603239782e-05, "loss": 2.3669943809509277, "step": 297 }, { "epoch": 0.0777205450870444, "grad_norm": 6.40625, "learning_rate": 3.98060378700808e-05, "loss": 2.5146942138671875, "step": 298 }, { "epoch": 0.07798135228532307, "grad_norm": 5.9375, "learning_rate": 3.9804004868647315e-05, "loss": 2.3613228797912598, "step": 299 }, { "epoch": 0.07824215948360175, "grad_norm": 5.59375, "learning_rate": 3.98019613207604e-05, "loss": 2.367048740386963, "step": 300 }, { "epoch": 0.07850296668188042, "grad_norm": 6.15625, "learning_rate": 3.979990722750835e-05, "loss": 2.5720415115356445, "step": 301 }, { "epoch": 0.0787637738801591, "grad_norm": 5.53125, "learning_rate": 3.979784258998503e-05, "loss": 2.27433180809021, "step": 302 }, { "epoch": 0.07902458107843777, "grad_norm": 6.125, "learning_rate": 3.9795767409289965e-05, "loss": 2.362941265106201, "step": 303 }, { "epoch": 0.07928538827671644, "grad_norm": 5.40625, "learning_rate": 3.979368168652826e-05, "loss": 2.129410743713379, "step": 304 }, { "epoch": 0.07954619547499511, "grad_norm": 5.8125, "learning_rate": 3.9791585422810664e-05, "loss": 2.547970771789551, "step": 305 }, { "epoch": 0.07980700267327379, "grad_norm": 6.21875, "learning_rate": 3.9789478619253505e-05, "loss": 2.542482852935791, "step": 306 }, { "epoch": 0.08006780987155246, "grad_norm": 6.46875, "learning_rate": 3.978736127697876e-05, "loss": 2.6509501934051514, "step": 307 }, { "epoch": 0.08032861706983113, "grad_norm": 6.03125, "learning_rate": 3.978523339711399e-05, "loss": 2.4671332836151123, "step": 308 }, { "epoch": 0.0805894242681098, "grad_norm": 6.125, "learning_rate": 3.978309498079239e-05, "loss": 2.0929112434387207, "step": 309 }, { "epoch": 0.08085023146638848, "grad_norm": 7.0, "learning_rate": 3.978094602915275e-05, "loss": 2.3321259021759033, "step": 310 }, { "epoch": 0.08111103866466715, "grad_norm": 6.03125, "learning_rate": 3.977878654333947e-05, "loss": 2.3965871334075928, "step": 311 }, { "epoch": 0.08137184586294582, "grad_norm": 6.0, "learning_rate": 3.977661652450257e-05, "loss": 2.3253791332244873, "step": 312 }, { "epoch": 0.08163265306122448, "grad_norm": 6.90625, "learning_rate": 3.977443597379768e-05, "loss": 2.251070976257324, "step": 313 }, { "epoch": 0.08189346025950316, "grad_norm": 6.1875, "learning_rate": 3.977224489238603e-05, "loss": 2.551664352416992, "step": 314 }, { "epoch": 0.08215426745778183, "grad_norm": 5.9375, "learning_rate": 3.977004328143447e-05, "loss": 2.367215394973755, "step": 315 }, { "epoch": 0.0824150746560605, "grad_norm": 5.75, "learning_rate": 3.9767831142115426e-05, "loss": 2.073972702026367, "step": 316 }, { "epoch": 0.08267588185433918, "grad_norm": 6.09375, "learning_rate": 3.976560847560697e-05, "loss": 2.297461748123169, "step": 317 }, { "epoch": 0.08293668905261785, "grad_norm": 6.0, "learning_rate": 3.9763375283092774e-05, "loss": 2.428114891052246, "step": 318 }, { "epoch": 0.08319749625089652, "grad_norm": 6.09375, "learning_rate": 3.9761131565762084e-05, "loss": 2.568549633026123, "step": 319 }, { "epoch": 0.0834583034491752, "grad_norm": 5.875, "learning_rate": 3.9758877324809786e-05, "loss": 2.3213963508605957, "step": 320 }, { "epoch": 0.08371911064745387, "grad_norm": 5.5, "learning_rate": 3.975661256143635e-05, "loss": 1.9857629537582397, "step": 321 }, { "epoch": 0.08397991784573254, "grad_norm": 5.625, "learning_rate": 3.975433727684786e-05, "loss": 2.2222909927368164, "step": 322 }, { "epoch": 0.08424072504401121, "grad_norm": 5.84375, "learning_rate": 3.9752051472256e-05, "loss": 2.161729335784912, "step": 323 }, { "epoch": 0.08450153224228989, "grad_norm": 5.40625, "learning_rate": 3.9749755148878055e-05, "loss": 2.5349013805389404, "step": 324 }, { "epoch": 0.08476233944056856, "grad_norm": 5.9375, "learning_rate": 3.974744830793691e-05, "loss": 2.358365535736084, "step": 325 }, { "epoch": 0.08502314663884723, "grad_norm": 5.6875, "learning_rate": 3.974513095066106e-05, "loss": 2.1969616413116455, "step": 326 }, { "epoch": 0.0852839538371259, "grad_norm": 5.78125, "learning_rate": 3.974280307828459e-05, "loss": 2.1518025398254395, "step": 327 }, { "epoch": 0.08554476103540458, "grad_norm": 5.8125, "learning_rate": 3.974046469204719e-05, "loss": 2.4633238315582275, "step": 328 }, { "epoch": 0.08580556823368325, "grad_norm": 5.96875, "learning_rate": 3.9738115793194136e-05, "loss": 2.2212085723876953, "step": 329 }, { "epoch": 0.08606637543196193, "grad_norm": 5.4375, "learning_rate": 3.9735756382976324e-05, "loss": 2.3922016620635986, "step": 330 }, { "epoch": 0.0863271826302406, "grad_norm": 6.03125, "learning_rate": 3.973338646265024e-05, "loss": 2.308976650238037, "step": 331 }, { "epoch": 0.08658798982851927, "grad_norm": 5.90625, "learning_rate": 3.973100603347797e-05, "loss": 2.289294958114624, "step": 332 }, { "epoch": 0.08684879702679794, "grad_norm": 5.28125, "learning_rate": 3.972861509672717e-05, "loss": 2.188870429992676, "step": 333 }, { "epoch": 0.08710960422507662, "grad_norm": 5.84375, "learning_rate": 3.972621365367113e-05, "loss": 2.280560255050659, "step": 334 }, { "epoch": 0.08737041142335529, "grad_norm": 5.53125, "learning_rate": 3.9723801705588715e-05, "loss": 2.329860210418701, "step": 335 }, { "epoch": 0.08763121862163396, "grad_norm": 6.0, "learning_rate": 3.972137925376439e-05, "loss": 2.349754571914673, "step": 336 }, { "epoch": 0.08789202581991264, "grad_norm": 6.40625, "learning_rate": 3.9718946299488207e-05, "loss": 2.040006637573242, "step": 337 }, { "epoch": 0.0881528330181913, "grad_norm": 5.40625, "learning_rate": 3.9716502844055806e-05, "loss": 2.155709743499756, "step": 338 }, { "epoch": 0.08841364021646997, "grad_norm": 6.21875, "learning_rate": 3.971404888876844e-05, "loss": 2.2252378463745117, "step": 339 }, { "epoch": 0.08867444741474864, "grad_norm": 5.625, "learning_rate": 3.971158443493295e-05, "loss": 2.1346254348754883, "step": 340 }, { "epoch": 0.08893525461302731, "grad_norm": 5.96875, "learning_rate": 3.970910948386174e-05, "loss": 2.261709213256836, "step": 341 }, { "epoch": 0.08919606181130599, "grad_norm": 5.59375, "learning_rate": 3.970662403687283e-05, "loss": 2.0507190227508545, "step": 342 }, { "epoch": 0.08945686900958466, "grad_norm": 5.875, "learning_rate": 3.970412809528984e-05, "loss": 2.4081969261169434, "step": 343 }, { "epoch": 0.08971767620786333, "grad_norm": 5.6875, "learning_rate": 3.970162166044194e-05, "loss": 2.2700247764587402, "step": 344 }, { "epoch": 0.089978483406142, "grad_norm": 5.75, "learning_rate": 3.969910473366392e-05, "loss": 2.2401342391967773, "step": 345 }, { "epoch": 0.09023929060442068, "grad_norm": 5.65625, "learning_rate": 3.969657731629615e-05, "loss": 2.1488614082336426, "step": 346 }, { "epoch": 0.09050009780269935, "grad_norm": 5.6875, "learning_rate": 3.969403940968458e-05, "loss": 2.3223090171813965, "step": 347 }, { "epoch": 0.09076090500097803, "grad_norm": 6.0625, "learning_rate": 3.969149101518075e-05, "loss": 2.1582388877868652, "step": 348 }, { "epoch": 0.0910217121992567, "grad_norm": 5.4375, "learning_rate": 3.9688932134141795e-05, "loss": 2.297391653060913, "step": 349 }, { "epoch": 0.09128251939753537, "grad_norm": 5.59375, "learning_rate": 3.968636276793041e-05, "loss": 2.2849154472351074, "step": 350 }, { "epoch": 0.09154332659581405, "grad_norm": 6.0, "learning_rate": 3.9683782917914906e-05, "loss": 2.274324417114258, "step": 351 }, { "epoch": 0.09180413379409272, "grad_norm": 5.59375, "learning_rate": 3.9681192585469146e-05, "loss": 2.4225234985351562, "step": 352 }, { "epoch": 0.09206494099237139, "grad_norm": 5.96875, "learning_rate": 3.96785917719726e-05, "loss": 2.265866756439209, "step": 353 }, { "epoch": 0.09232574819065006, "grad_norm": 5.4375, "learning_rate": 3.96759804788103e-05, "loss": 2.2360856533050537, "step": 354 }, { "epoch": 0.09258655538892874, "grad_norm": 5.4375, "learning_rate": 3.9673358707372864e-05, "loss": 1.9127092361450195, "step": 355 }, { "epoch": 0.09284736258720741, "grad_norm": 5.9375, "learning_rate": 3.967072645905651e-05, "loss": 2.1417503356933594, "step": 356 }, { "epoch": 0.09310816978548608, "grad_norm": 5.75, "learning_rate": 3.9668083735263014e-05, "loss": 2.449788808822632, "step": 357 }, { "epoch": 0.09336897698376476, "grad_norm": 6.25, "learning_rate": 3.9665430537399725e-05, "loss": 1.9705320596694946, "step": 358 }, { "epoch": 0.09362978418204343, "grad_norm": 5.53125, "learning_rate": 3.9662766866879596e-05, "loss": 1.9395697116851807, "step": 359 }, { "epoch": 0.0938905913803221, "grad_norm": 5.625, "learning_rate": 3.966009272512113e-05, "loss": 2.2744157314300537, "step": 360 }, { "epoch": 0.09415139857860078, "grad_norm": 5.3125, "learning_rate": 3.9657408113548425e-05, "loss": 2.2560882568359375, "step": 361 }, { "epoch": 0.09441220577687945, "grad_norm": 5.625, "learning_rate": 3.965471303359114e-05, "loss": 2.3874456882476807, "step": 362 }, { "epoch": 0.09467301297515811, "grad_norm": 5.28125, "learning_rate": 3.965200748668453e-05, "loss": 2.2133641242980957, "step": 363 }, { "epoch": 0.09493382017343678, "grad_norm": 5.46875, "learning_rate": 3.96492914742694e-05, "loss": 2.106085777282715, "step": 364 }, { "epoch": 0.09519462737171545, "grad_norm": 5.5625, "learning_rate": 3.964656499779214e-05, "loss": 2.3144335746765137, "step": 365 }, { "epoch": 0.09545543456999413, "grad_norm": 5.21875, "learning_rate": 3.964382805870473e-05, "loss": 1.9709185361862183, "step": 366 }, { "epoch": 0.0957162417682728, "grad_norm": 5.65625, "learning_rate": 3.964108065846467e-05, "loss": 2.133201837539673, "step": 367 }, { "epoch": 0.09597704896655147, "grad_norm": 5.59375, "learning_rate": 3.963832279853509e-05, "loss": 2.362656354904175, "step": 368 }, { "epoch": 0.09623785616483015, "grad_norm": 5.09375, "learning_rate": 3.963555448038466e-05, "loss": 2.0962352752685547, "step": 369 }, { "epoch": 0.09649866336310882, "grad_norm": 5.46875, "learning_rate": 3.963277570548761e-05, "loss": 2.27280330657959, "step": 370 }, { "epoch": 0.09675947056138749, "grad_norm": 5.875, "learning_rate": 3.9629986475323773e-05, "loss": 2.6300599575042725, "step": 371 }, { "epoch": 0.09702027775966617, "grad_norm": 5.75, "learning_rate": 3.962718679137852e-05, "loss": 2.254918098449707, "step": 372 }, { "epoch": 0.09728108495794484, "grad_norm": 5.34375, "learning_rate": 3.96243766551428e-05, "loss": 2.279177665710449, "step": 373 }, { "epoch": 0.09754189215622351, "grad_norm": 5.09375, "learning_rate": 3.9621556068113124e-05, "loss": 2.049755096435547, "step": 374 }, { "epoch": 0.09780269935450218, "grad_norm": 5.4375, "learning_rate": 3.961872503179158e-05, "loss": 2.3396801948547363, "step": 375 }, { "epoch": 0.09806350655278086, "grad_norm": 6.21875, "learning_rate": 3.961588354768579e-05, "loss": 2.226260185241699, "step": 376 }, { "epoch": 0.09832431375105953, "grad_norm": 5.5, "learning_rate": 3.9613031617309e-05, "loss": 2.1153340339660645, "step": 377 }, { "epoch": 0.0985851209493382, "grad_norm": 5.21875, "learning_rate": 3.9610169242179944e-05, "loss": 1.9020158052444458, "step": 378 }, { "epoch": 0.09884592814761688, "grad_norm": 6.03125, "learning_rate": 3.9607296423822976e-05, "loss": 2.5020313262939453, "step": 379 }, { "epoch": 0.09910673534589555, "grad_norm": 5.15625, "learning_rate": 3.9604413163767985e-05, "loss": 1.8208396434783936, "step": 380 }, { "epoch": 0.09936754254417422, "grad_norm": 6.8125, "learning_rate": 3.960151946355043e-05, "loss": 2.2956275939941406, "step": 381 }, { "epoch": 0.0996283497424529, "grad_norm": 5.53125, "learning_rate": 3.9598615324711325e-05, "loss": 2.058102607727051, "step": 382 }, { "epoch": 0.09988915694073157, "grad_norm": 6.15625, "learning_rate": 3.9595700748797235e-05, "loss": 2.239431142807007, "step": 383 }, { "epoch": 0.10014996413901024, "grad_norm": 5.84375, "learning_rate": 3.959277573736031e-05, "loss": 2.1374924182891846, "step": 384 }, { "epoch": 0.10041077133728891, "grad_norm": 6.0, "learning_rate": 3.9589840291958224e-05, "loss": 2.121920108795166, "step": 385 }, { "epoch": 0.10067157853556759, "grad_norm": 5.71875, "learning_rate": 3.958689441415423e-05, "loss": 2.0743794441223145, "step": 386 }, { "epoch": 0.10093238573384626, "grad_norm": 5.59375, "learning_rate": 3.9583938105517127e-05, "loss": 2.166443347930908, "step": 387 }, { "epoch": 0.10119319293212492, "grad_norm": 5.40625, "learning_rate": 3.958097136762128e-05, "loss": 2.2176623344421387, "step": 388 }, { "epoch": 0.1014540001304036, "grad_norm": 6.0, "learning_rate": 3.957799420204659e-05, "loss": 2.3164663314819336, "step": 389 }, { "epoch": 0.10171480732868227, "grad_norm": 5.21875, "learning_rate": 3.9575006610378524e-05, "loss": 2.0331268310546875, "step": 390 }, { "epoch": 0.10197561452696094, "grad_norm": 5.71875, "learning_rate": 3.95720085942081e-05, "loss": 2.378547191619873, "step": 391 }, { "epoch": 0.10223642172523961, "grad_norm": 5.78125, "learning_rate": 3.956900015513189e-05, "loss": 1.9658644199371338, "step": 392 }, { "epoch": 0.10249722892351829, "grad_norm": 5.03125, "learning_rate": 3.9565981294752004e-05, "loss": 2.014651298522949, "step": 393 }, { "epoch": 0.10275803612179696, "grad_norm": 5.09375, "learning_rate": 3.9562952014676116e-05, "loss": 2.28570818901062, "step": 394 }, { "epoch": 0.10301884332007563, "grad_norm": 5.5625, "learning_rate": 3.955991231651744e-05, "loss": 2.0314409732818604, "step": 395 }, { "epoch": 0.1032796505183543, "grad_norm": 5.125, "learning_rate": 3.9556862201894745e-05, "loss": 2.262718915939331, "step": 396 }, { "epoch": 0.10354045771663298, "grad_norm": 4.96875, "learning_rate": 3.955380167243234e-05, "loss": 2.2126426696777344, "step": 397 }, { "epoch": 0.10380126491491165, "grad_norm": 5.625, "learning_rate": 3.9550730729760086e-05, "loss": 1.9544909000396729, "step": 398 }, { "epoch": 0.10406207211319032, "grad_norm": 5.0, "learning_rate": 3.954764937551338e-05, "loss": 2.2505781650543213, "step": 399 }, { "epoch": 0.104322879311469, "grad_norm": 5.40625, "learning_rate": 3.9544557611333185e-05, "loss": 2.292494297027588, "step": 400 }, { "epoch": 0.10458368650974767, "grad_norm": 5.21875, "learning_rate": 3.954145543886599e-05, "loss": 2.254851818084717, "step": 401 }, { "epoch": 0.10484449370802634, "grad_norm": 5.21875, "learning_rate": 3.9538342859763814e-05, "loss": 2.1972813606262207, "step": 402 }, { "epoch": 0.10510530090630502, "grad_norm": 5.0625, "learning_rate": 3.9535219875684256e-05, "loss": 2.0275416374206543, "step": 403 }, { "epoch": 0.10536610810458369, "grad_norm": 4.90625, "learning_rate": 3.953208648829042e-05, "loss": 2.085601806640625, "step": 404 }, { "epoch": 0.10562691530286236, "grad_norm": 5.4375, "learning_rate": 3.9528942699250975e-05, "loss": 2.362072229385376, "step": 405 }, { "epoch": 0.10588772250114104, "grad_norm": 4.90625, "learning_rate": 3.9525788510240105e-05, "loss": 2.1449997425079346, "step": 406 }, { "epoch": 0.10614852969941971, "grad_norm": 5.25, "learning_rate": 3.9522623922937565e-05, "loss": 2.0494844913482666, "step": 407 }, { "epoch": 0.10640933689769838, "grad_norm": 5.40625, "learning_rate": 3.9519448939028604e-05, "loss": 2.4976119995117188, "step": 408 }, { "epoch": 0.10667014409597705, "grad_norm": 5.90625, "learning_rate": 3.951626356020406e-05, "loss": 2.2443673610687256, "step": 409 }, { "epoch": 0.10693095129425573, "grad_norm": 5.625, "learning_rate": 3.9513067788160264e-05, "loss": 1.9980010986328125, "step": 410 }, { "epoch": 0.1071917584925344, "grad_norm": 5.4375, "learning_rate": 3.95098616245991e-05, "loss": 1.8560242652893066, "step": 411 }, { "epoch": 0.10745256569081307, "grad_norm": 5.25, "learning_rate": 3.950664507122798e-05, "loss": 2.078575611114502, "step": 412 }, { "epoch": 0.10771337288909173, "grad_norm": 5.84375, "learning_rate": 3.950341812975986e-05, "loss": 2.282409429550171, "step": 413 }, { "epoch": 0.1079741800873704, "grad_norm": 5.53125, "learning_rate": 3.950018080191321e-05, "loss": 2.147897243499756, "step": 414 }, { "epoch": 0.10823498728564908, "grad_norm": 5.28125, "learning_rate": 3.949693308941205e-05, "loss": 1.7462961673736572, "step": 415 }, { "epoch": 0.10849579448392775, "grad_norm": 5.53125, "learning_rate": 3.9493674993985906e-05, "loss": 2.07855224609375, "step": 416 }, { "epoch": 0.10875660168220642, "grad_norm": 5.25, "learning_rate": 3.949040651736987e-05, "loss": 2.297809600830078, "step": 417 }, { "epoch": 0.1090174088804851, "grad_norm": 5.03125, "learning_rate": 3.948712766130454e-05, "loss": 1.9195914268493652, "step": 418 }, { "epoch": 0.10927821607876377, "grad_norm": 5.8125, "learning_rate": 3.948383842753602e-05, "loss": 2.3484573364257812, "step": 419 }, { "epoch": 0.10953902327704244, "grad_norm": 5.5625, "learning_rate": 3.948053881781598e-05, "loss": 1.9339882135391235, "step": 420 }, { "epoch": 0.10979983047532112, "grad_norm": 5.5, "learning_rate": 3.9477228833901604e-05, "loss": 2.1224734783172607, "step": 421 }, { "epoch": 0.11006063767359979, "grad_norm": 5.03125, "learning_rate": 3.947390847755559e-05, "loss": 1.802683711051941, "step": 422 }, { "epoch": 0.11032144487187846, "grad_norm": 5.3125, "learning_rate": 3.9470577750546155e-05, "loss": 2.37097430229187, "step": 423 }, { "epoch": 0.11058225207015714, "grad_norm": 5.75, "learning_rate": 3.946723665464706e-05, "loss": 2.007756471633911, "step": 424 }, { "epoch": 0.11084305926843581, "grad_norm": 5.34375, "learning_rate": 3.946388519163757e-05, "loss": 1.7305908203125, "step": 425 }, { "epoch": 0.11110386646671448, "grad_norm": 5.78125, "learning_rate": 3.946052336330249e-05, "loss": 2.0597658157348633, "step": 426 }, { "epoch": 0.11136467366499316, "grad_norm": 5.90625, "learning_rate": 3.945715117143213e-05, "loss": 2.43072772026062, "step": 427 }, { "epoch": 0.11162548086327183, "grad_norm": 6.4375, "learning_rate": 3.9453768617822305e-05, "loss": 2.028156042098999, "step": 428 }, { "epoch": 0.1118862880615505, "grad_norm": 5.15625, "learning_rate": 3.945037570427439e-05, "loss": 1.962646722793579, "step": 429 }, { "epoch": 0.11214709525982917, "grad_norm": 6.15625, "learning_rate": 3.944697243259523e-05, "loss": 2.209850788116455, "step": 430 }, { "epoch": 0.11240790245810785, "grad_norm": 5.09375, "learning_rate": 3.944355880459723e-05, "loss": 2.0976197719573975, "step": 431 }, { "epoch": 0.11266870965638652, "grad_norm": 6.96875, "learning_rate": 3.9440134822098264e-05, "loss": 2.148529052734375, "step": 432 }, { "epoch": 0.1129295168546652, "grad_norm": 5.09375, "learning_rate": 3.9436700486921756e-05, "loss": 1.90488862991333, "step": 433 }, { "epoch": 0.11319032405294387, "grad_norm": 5.46875, "learning_rate": 3.9433255800896646e-05, "loss": 2.175771713256836, "step": 434 }, { "epoch": 0.11345113125122254, "grad_norm": 5.0, "learning_rate": 3.942980076585735e-05, "loss": 1.9042470455169678, "step": 435 }, { "epoch": 0.11371193844950121, "grad_norm": 6.34375, "learning_rate": 3.942633538364383e-05, "loss": 1.9849103689193726, "step": 436 }, { "epoch": 0.11397274564777989, "grad_norm": 5.34375, "learning_rate": 3.942285965610154e-05, "loss": 2.1257541179656982, "step": 437 }, { "epoch": 0.11423355284605854, "grad_norm": 5.40625, "learning_rate": 3.941937358508145e-05, "loss": 1.892786979675293, "step": 438 }, { "epoch": 0.11449436004433722, "grad_norm": 5.0625, "learning_rate": 3.9415877172440045e-05, "loss": 2.0031538009643555, "step": 439 }, { "epoch": 0.11475516724261589, "grad_norm": 5.375, "learning_rate": 3.9412370420039295e-05, "loss": 1.965477705001831, "step": 440 }, { "epoch": 0.11501597444089456, "grad_norm": 4.65625, "learning_rate": 3.94088533297467e-05, "loss": 2.049996852874756, "step": 441 }, { "epoch": 0.11527678163917324, "grad_norm": 5.21875, "learning_rate": 3.9405325903435254e-05, "loss": 2.175043821334839, "step": 442 }, { "epoch": 0.11553758883745191, "grad_norm": 5.34375, "learning_rate": 3.940178814298347e-05, "loss": 2.2416770458221436, "step": 443 }, { "epoch": 0.11579839603573058, "grad_norm": 6.125, "learning_rate": 3.939824005027533e-05, "loss": 1.885480523109436, "step": 444 }, { "epoch": 0.11605920323400926, "grad_norm": 5.15625, "learning_rate": 3.939468162720035e-05, "loss": 2.01684308052063, "step": 445 }, { "epoch": 0.11632001043228793, "grad_norm": 5.875, "learning_rate": 3.939111287565354e-05, "loss": 2.12481689453125, "step": 446 }, { "epoch": 0.1165808176305666, "grad_norm": 5.25, "learning_rate": 3.938753379753542e-05, "loss": 1.9678071737289429, "step": 447 }, { "epoch": 0.11684162482884528, "grad_norm": 5.9375, "learning_rate": 3.9383944394751975e-05, "loss": 1.8941469192504883, "step": 448 }, { "epoch": 0.11710243202712395, "grad_norm": 5.625, "learning_rate": 3.938034466921472e-05, "loss": 2.2009029388427734, "step": 449 }, { "epoch": 0.11736323922540262, "grad_norm": 4.75, "learning_rate": 3.937673462284066e-05, "loss": 1.7105543613433838, "step": 450 }, { "epoch": 0.1176240464236813, "grad_norm": 5.3125, "learning_rate": 3.937311425755229e-05, "loss": 1.9967753887176514, "step": 451 }, { "epoch": 0.11788485362195997, "grad_norm": 5.0625, "learning_rate": 3.9369483575277615e-05, "loss": 2.0934154987335205, "step": 452 }, { "epoch": 0.11814566082023864, "grad_norm": 5.21875, "learning_rate": 3.936584257795011e-05, "loss": 2.0326435565948486, "step": 453 }, { "epoch": 0.11840646801851731, "grad_norm": 5.71875, "learning_rate": 3.936219126750876e-05, "loss": 2.027519464492798, "step": 454 }, { "epoch": 0.11866727521679599, "grad_norm": 5.25, "learning_rate": 3.9358529645898054e-05, "loss": 2.176551580429077, "step": 455 }, { "epoch": 0.11892808241507466, "grad_norm": 5.40625, "learning_rate": 3.935485771506794e-05, "loss": 2.0712437629699707, "step": 456 }, { "epoch": 0.11918888961335333, "grad_norm": 5.1875, "learning_rate": 3.935117547697387e-05, "loss": 1.9934316873550415, "step": 457 }, { "epoch": 0.119449696811632, "grad_norm": 5.0625, "learning_rate": 3.934748293357682e-05, "loss": 2.2428603172302246, "step": 458 }, { "epoch": 0.11971050400991068, "grad_norm": 5.125, "learning_rate": 3.934378008684318e-05, "loss": 2.0849642753601074, "step": 459 }, { "epoch": 0.11997131120818935, "grad_norm": 4.90625, "learning_rate": 3.934006693874489e-05, "loss": 1.9194163084030151, "step": 460 }, { "epoch": 0.12023211840646802, "grad_norm": 5.28125, "learning_rate": 3.933634349125936e-05, "loss": 2.1880245208740234, "step": 461 }, { "epoch": 0.1204929256047467, "grad_norm": 5.46875, "learning_rate": 3.933260974636948e-05, "loss": 1.957980990409851, "step": 462 }, { "epoch": 0.12075373280302536, "grad_norm": 5.09375, "learning_rate": 3.932886570606361e-05, "loss": 1.950570821762085, "step": 463 }, { "epoch": 0.12101454000130403, "grad_norm": 5.71875, "learning_rate": 3.9325111372335616e-05, "loss": 2.274207353591919, "step": 464 }, { "epoch": 0.1212753471995827, "grad_norm": 4.96875, "learning_rate": 3.932134674718484e-05, "loss": 2.1359615325927734, "step": 465 }, { "epoch": 0.12153615439786138, "grad_norm": 5.375, "learning_rate": 3.931757183261609e-05, "loss": 2.3798720836639404, "step": 466 }, { "epoch": 0.12179696159614005, "grad_norm": 4.9375, "learning_rate": 3.9313786630639676e-05, "loss": 2.214578628540039, "step": 467 }, { "epoch": 0.12205776879441872, "grad_norm": 5.3125, "learning_rate": 3.930999114327137e-05, "loss": 2.2453343868255615, "step": 468 }, { "epoch": 0.1223185759926974, "grad_norm": 4.6875, "learning_rate": 3.930618537253242e-05, "loss": 1.9698379039764404, "step": 469 }, { "epoch": 0.12257938319097607, "grad_norm": 4.84375, "learning_rate": 3.930236932044957e-05, "loss": 1.7252676486968994, "step": 470 }, { "epoch": 0.12284019038925474, "grad_norm": 4.5625, "learning_rate": 3.929854298905502e-05, "loss": 2.0300962924957275, "step": 471 }, { "epoch": 0.12310099758753341, "grad_norm": 5.28125, "learning_rate": 3.929470638038645e-05, "loss": 2.0499699115753174, "step": 472 }, { "epoch": 0.12336180478581209, "grad_norm": 5.09375, "learning_rate": 3.9290859496487e-05, "loss": 1.9883675575256348, "step": 473 }, { "epoch": 0.12362261198409076, "grad_norm": 5.40625, "learning_rate": 3.928700233940531e-05, "loss": 2.0695719718933105, "step": 474 }, { "epoch": 0.12388341918236943, "grad_norm": 5.0625, "learning_rate": 3.928313491119548e-05, "loss": 2.2247023582458496, "step": 475 }, { "epoch": 0.1241442263806481, "grad_norm": 5.09375, "learning_rate": 3.927925721391707e-05, "loss": 2.141120195388794, "step": 476 }, { "epoch": 0.12440503357892678, "grad_norm": 4.4375, "learning_rate": 3.9275369249635106e-05, "loss": 1.8078746795654297, "step": 477 }, { "epoch": 0.12466584077720545, "grad_norm": 5.53125, "learning_rate": 3.927147102042011e-05, "loss": 2.255552053451538, "step": 478 }, { "epoch": 0.12492664797548413, "grad_norm": 5.0625, "learning_rate": 3.926756252834802e-05, "loss": 2.1611335277557373, "step": 479 }, { "epoch": 0.12518745517376278, "grad_norm": 4.90625, "learning_rate": 3.92636437755003e-05, "loss": 2.0030574798583984, "step": 480 }, { "epoch": 0.12544826237204146, "grad_norm": 4.875, "learning_rate": 3.9259714763963834e-05, "loss": 1.8799573183059692, "step": 481 }, { "epoch": 0.12570906957032013, "grad_norm": 5.0, "learning_rate": 3.925577549583099e-05, "loss": 1.946972131729126, "step": 482 }, { "epoch": 0.1259698767685988, "grad_norm": 4.59375, "learning_rate": 3.925182597319958e-05, "loss": 2.016092538833618, "step": 483 }, { "epoch": 0.12623068396687748, "grad_norm": 4.8125, "learning_rate": 3.92478661981729e-05, "loss": 2.0311694145202637, "step": 484 }, { "epoch": 0.12649149116515615, "grad_norm": 5.09375, "learning_rate": 3.924389617285969e-05, "loss": 1.8325190544128418, "step": 485 }, { "epoch": 0.12675229836343482, "grad_norm": 5.3125, "learning_rate": 3.9239915899374153e-05, "loss": 2.627725124359131, "step": 486 }, { "epoch": 0.1270131055617135, "grad_norm": 7.65625, "learning_rate": 3.923592537983595e-05, "loss": 2.2398009300231934, "step": 487 }, { "epoch": 0.12727391275999217, "grad_norm": 5.3125, "learning_rate": 3.92319246163702e-05, "loss": 1.9798343181610107, "step": 488 }, { "epoch": 0.12753471995827084, "grad_norm": 4.84375, "learning_rate": 3.922791361110747e-05, "loss": 1.962763786315918, "step": 489 }, { "epoch": 0.12779552715654952, "grad_norm": 6.8125, "learning_rate": 3.9223892366183795e-05, "loss": 2.0241403579711914, "step": 490 }, { "epoch": 0.1280563343548282, "grad_norm": 5.0625, "learning_rate": 3.921986088374064e-05, "loss": 2.0447614192962646, "step": 491 }, { "epoch": 0.12831714155310686, "grad_norm": 4.78125, "learning_rate": 3.9215819165924956e-05, "loss": 1.7903733253479004, "step": 492 }, { "epoch": 0.12857794875138553, "grad_norm": 5.65625, "learning_rate": 3.9211767214889114e-05, "loss": 2.12595796585083, "step": 493 }, { "epoch": 0.1288387559496642, "grad_norm": 5.6875, "learning_rate": 3.9207705032790944e-05, "loss": 1.962897539138794, "step": 494 }, { "epoch": 0.12909956314794288, "grad_norm": 4.90625, "learning_rate": 3.9203632621793726e-05, "loss": 1.8787273168563843, "step": 495 }, { "epoch": 0.12936037034622155, "grad_norm": 4.90625, "learning_rate": 3.91995499840662e-05, "loss": 1.9988367557525635, "step": 496 }, { "epoch": 0.12962117754450023, "grad_norm": 5.125, "learning_rate": 3.919545712178253e-05, "loss": 2.086315155029297, "step": 497 }, { "epoch": 0.1298819847427789, "grad_norm": 5.0625, "learning_rate": 3.919135403712233e-05, "loss": 1.9002385139465332, "step": 498 }, { "epoch": 0.13014279194105757, "grad_norm": 5.5, "learning_rate": 3.9187240732270675e-05, "loss": 2.037102460861206, "step": 499 }, { "epoch": 0.13040359913933625, "grad_norm": 4.96875, "learning_rate": 3.9183117209418055e-05, "loss": 1.9531993865966797, "step": 500 }, { "epoch": 0.13066440633761492, "grad_norm": 5.15625, "learning_rate": 3.917898347076043e-05, "loss": 2.143584728240967, "step": 501 }, { "epoch": 0.1309252135358936, "grad_norm": 4.78125, "learning_rate": 3.917483951849919e-05, "loss": 2.0562262535095215, "step": 502 }, { "epoch": 0.13118602073417227, "grad_norm": 5.46875, "learning_rate": 3.917068535484114e-05, "loss": 2.0279130935668945, "step": 503 }, { "epoch": 0.13144682793245094, "grad_norm": 5.0625, "learning_rate": 3.916652098199857e-05, "loss": 1.8175674676895142, "step": 504 }, { "epoch": 0.1317076351307296, "grad_norm": 5.28125, "learning_rate": 3.916234640218917e-05, "loss": 2.081401824951172, "step": 505 }, { "epoch": 0.13196844232900828, "grad_norm": 5.1875, "learning_rate": 3.915816161763607e-05, "loss": 1.9667434692382812, "step": 506 }, { "epoch": 0.13222924952728696, "grad_norm": 4.65625, "learning_rate": 3.915396663056784e-05, "loss": 1.7676701545715332, "step": 507 }, { "epoch": 0.13249005672556563, "grad_norm": 5.34375, "learning_rate": 3.91497614432185e-05, "loss": 2.0130467414855957, "step": 508 }, { "epoch": 0.1327508639238443, "grad_norm": 5.78125, "learning_rate": 3.914554605782749e-05, "loss": 2.157017707824707, "step": 509 }, { "epoch": 0.13301167112212298, "grad_norm": 5.3125, "learning_rate": 3.914132047663965e-05, "loss": 1.9875788688659668, "step": 510 }, { "epoch": 0.13327247832040165, "grad_norm": 4.875, "learning_rate": 3.91370847019053e-05, "loss": 1.7901101112365723, "step": 511 }, { "epoch": 0.13353328551868032, "grad_norm": 5.0625, "learning_rate": 3.913283873588016e-05, "loss": 2.2028799057006836, "step": 512 }, { "epoch": 0.133794092716959, "grad_norm": 5.0625, "learning_rate": 3.912858258082538e-05, "loss": 2.07771635055542, "step": 513 }, { "epoch": 0.13405489991523767, "grad_norm": 4.8125, "learning_rate": 3.9124316239007535e-05, "loss": 1.8496947288513184, "step": 514 }, { "epoch": 0.13431570711351634, "grad_norm": 4.90625, "learning_rate": 3.912003971269864e-05, "loss": 2.029522180557251, "step": 515 }, { "epoch": 0.13457651431179501, "grad_norm": 4.9375, "learning_rate": 3.911575300417612e-05, "loss": 2.2308075428009033, "step": 516 }, { "epoch": 0.1348373215100737, "grad_norm": 5.0625, "learning_rate": 3.911145611572282e-05, "loss": 2.0410444736480713, "step": 517 }, { "epoch": 0.13509812870835236, "grad_norm": 5.125, "learning_rate": 3.9107149049627014e-05, "loss": 2.071100950241089, "step": 518 }, { "epoch": 0.13535893590663103, "grad_norm": 4.8125, "learning_rate": 3.9102831808182384e-05, "loss": 1.8825916051864624, "step": 519 }, { "epoch": 0.1356197431049097, "grad_norm": 5.28125, "learning_rate": 3.9098504393688055e-05, "loss": 2.1796164512634277, "step": 520 }, { "epoch": 0.13588055030318838, "grad_norm": 5.40625, "learning_rate": 3.9094166808448546e-05, "loss": 2.2033798694610596, "step": 521 }, { "epoch": 0.13614135750146705, "grad_norm": 5.375, "learning_rate": 3.908981905477381e-05, "loss": 1.9869440793991089, "step": 522 }, { "epoch": 0.13640216469974573, "grad_norm": 5.34375, "learning_rate": 3.908546113497919e-05, "loss": 1.9655147790908813, "step": 523 }, { "epoch": 0.1366629718980244, "grad_norm": 10.5, "learning_rate": 3.908109305138547e-05, "loss": 2.0668673515319824, "step": 524 }, { "epoch": 0.13692377909630304, "grad_norm": 5.15625, "learning_rate": 3.9076714806318835e-05, "loss": 2.0768802165985107, "step": 525 }, { "epoch": 0.13718458629458172, "grad_norm": 4.96875, "learning_rate": 3.907232640211089e-05, "loss": 2.2760894298553467, "step": 526 }, { "epoch": 0.1374453934928604, "grad_norm": 4.78125, "learning_rate": 3.9067927841098614e-05, "loss": 2.092144727706909, "step": 527 }, { "epoch": 0.13770620069113906, "grad_norm": 5.09375, "learning_rate": 3.906351912562445e-05, "loss": 2.464493989944458, "step": 528 }, { "epoch": 0.13796700788941774, "grad_norm": 5.09375, "learning_rate": 3.9059100258036214e-05, "loss": 2.144874095916748, "step": 529 }, { "epoch": 0.1382278150876964, "grad_norm": 4.9375, "learning_rate": 3.9054671240687134e-05, "loss": 1.8832736015319824, "step": 530 }, { "epoch": 0.13848862228597508, "grad_norm": 5.03125, "learning_rate": 3.905023207593585e-05, "loss": 2.051281690597534, "step": 531 }, { "epoch": 0.13874942948425376, "grad_norm": 4.65625, "learning_rate": 3.904578276614639e-05, "loss": 1.959316372871399, "step": 532 }, { "epoch": 0.13901023668253243, "grad_norm": 5.09375, "learning_rate": 3.9041323313688215e-05, "loss": 2.2046756744384766, "step": 533 }, { "epoch": 0.1392710438808111, "grad_norm": 4.5625, "learning_rate": 3.903685372093615e-05, "loss": 2.0775370597839355, "step": 534 }, { "epoch": 0.13953185107908977, "grad_norm": 5.03125, "learning_rate": 3.903237399027044e-05, "loss": 2.0336546897888184, "step": 535 }, { "epoch": 0.13979265827736845, "grad_norm": 5.0625, "learning_rate": 3.902788412407675e-05, "loss": 1.987802505493164, "step": 536 }, { "epoch": 0.14005346547564712, "grad_norm": 5.03125, "learning_rate": 3.9023384124746085e-05, "loss": 1.9189298152923584, "step": 537 }, { "epoch": 0.1403142726739258, "grad_norm": 4.9375, "learning_rate": 3.901887399467491e-05, "loss": 1.8632619380950928, "step": 538 }, { "epoch": 0.14057507987220447, "grad_norm": 5.25, "learning_rate": 3.9014353736265034e-05, "loss": 2.0026168823242188, "step": 539 }, { "epoch": 0.14083588707048314, "grad_norm": 5.03125, "learning_rate": 3.90098233519237e-05, "loss": 1.866854190826416, "step": 540 }, { "epoch": 0.1410966942687618, "grad_norm": 5.1875, "learning_rate": 3.900528284406352e-05, "loss": 2.2660746574401855, "step": 541 }, { "epoch": 0.1413575014670405, "grad_norm": 5.375, "learning_rate": 3.90007322151025e-05, "loss": 2.1613218784332275, "step": 542 }, { "epoch": 0.14161830866531916, "grad_norm": 5.21875, "learning_rate": 3.899617146746404e-05, "loss": 2.2124698162078857, "step": 543 }, { "epoch": 0.14187911586359783, "grad_norm": 4.875, "learning_rate": 3.899160060357693e-05, "loss": 2.103794574737549, "step": 544 }, { "epoch": 0.1421399230618765, "grad_norm": 4.75, "learning_rate": 3.898701962587533e-05, "loss": 1.8552050590515137, "step": 545 }, { "epoch": 0.14240073026015518, "grad_norm": 5.5625, "learning_rate": 3.898242853679882e-05, "loss": 2.0167746543884277, "step": 546 }, { "epoch": 0.14266153745843385, "grad_norm": 5.53125, "learning_rate": 3.8977827338792334e-05, "loss": 2.2357492446899414, "step": 547 }, { "epoch": 0.14292234465671252, "grad_norm": 4.78125, "learning_rate": 3.89732160343062e-05, "loss": 1.7399003505706787, "step": 548 }, { "epoch": 0.1431831518549912, "grad_norm": 4.96875, "learning_rate": 3.896859462579614e-05, "loss": 2.075052499771118, "step": 549 }, { "epoch": 0.14344395905326987, "grad_norm": 4.375, "learning_rate": 3.8963963115723234e-05, "loss": 1.7772223949432373, "step": 550 }, { "epoch": 0.14370476625154854, "grad_norm": 4.53125, "learning_rate": 3.8959321506553955e-05, "loss": 1.6828080415725708, "step": 551 }, { "epoch": 0.14396557344982722, "grad_norm": 5.46875, "learning_rate": 3.8954669800760164e-05, "loss": 2.1450443267822266, "step": 552 }, { "epoch": 0.1442263806481059, "grad_norm": 5.28125, "learning_rate": 3.895000800081907e-05, "loss": 2.1769192218780518, "step": 553 }, { "epoch": 0.14448718784638456, "grad_norm": 4.8125, "learning_rate": 3.894533610921328e-05, "loss": 2.011871814727783, "step": 554 }, { "epoch": 0.14474799504466324, "grad_norm": 4.8125, "learning_rate": 3.8940654128430766e-05, "loss": 1.8749043941497803, "step": 555 }, { "epoch": 0.1450088022429419, "grad_norm": 4.9375, "learning_rate": 3.893596206096489e-05, "loss": 2.031252145767212, "step": 556 }, { "epoch": 0.14526960944122058, "grad_norm": 5.0, "learning_rate": 3.893125990931437e-05, "loss": 2.2448625564575195, "step": 557 }, { "epoch": 0.14553041663949925, "grad_norm": 5.125, "learning_rate": 3.8926547675983286e-05, "loss": 2.0833122730255127, "step": 558 }, { "epoch": 0.14579122383777793, "grad_norm": 4.5, "learning_rate": 3.89218253634811e-05, "loss": 2.0020689964294434, "step": 559 }, { "epoch": 0.1460520310360566, "grad_norm": 4.8125, "learning_rate": 3.891709297432265e-05, "loss": 1.9792006015777588, "step": 560 }, { "epoch": 0.14631283823433527, "grad_norm": 4.78125, "learning_rate": 3.891235051102812e-05, "loss": 2.0448176860809326, "step": 561 }, { "epoch": 0.14657364543261395, "grad_norm": 4.90625, "learning_rate": 3.890759797612307e-05, "loss": 2.2394766807556152, "step": 562 }, { "epoch": 0.14683445263089262, "grad_norm": 4.84375, "learning_rate": 3.890283537213842e-05, "loss": 2.0798072814941406, "step": 563 }, { "epoch": 0.1470952598291713, "grad_norm": 5.53125, "learning_rate": 3.889806270161046e-05, "loss": 2.048489809036255, "step": 564 }, { "epoch": 0.14735606702744997, "grad_norm": 5.0625, "learning_rate": 3.889327996708083e-05, "loss": 2.113455295562744, "step": 565 }, { "epoch": 0.14761687422572864, "grad_norm": 4.75, "learning_rate": 3.888848717109653e-05, "loss": 1.6562857627868652, "step": 566 }, { "epoch": 0.1478776814240073, "grad_norm": 4.84375, "learning_rate": 3.888368431620993e-05, "loss": 2.054020404815674, "step": 567 }, { "epoch": 0.14813848862228599, "grad_norm": 4.75, "learning_rate": 3.887887140497875e-05, "loss": 1.9090944528579712, "step": 568 }, { "epoch": 0.14839929582056466, "grad_norm": 5.03125, "learning_rate": 3.887404843996606e-05, "loss": 1.945820927619934, "step": 569 }, { "epoch": 0.14866010301884333, "grad_norm": 5.1875, "learning_rate": 3.8869215423740285e-05, "loss": 2.0362110137939453, "step": 570 }, { "epoch": 0.148920910217122, "grad_norm": 4.71875, "learning_rate": 3.886437235887522e-05, "loss": 1.8869293928146362, "step": 571 }, { "epoch": 0.14918171741540068, "grad_norm": 4.9375, "learning_rate": 3.8859519247949984e-05, "loss": 2.261014223098755, "step": 572 }, { "epoch": 0.14944252461367935, "grad_norm": 4.90625, "learning_rate": 3.8854656093549075e-05, "loss": 1.9760932922363281, "step": 573 }, { "epoch": 0.14970333181195802, "grad_norm": 4.5625, "learning_rate": 3.8849782898262306e-05, "loss": 1.7362785339355469, "step": 574 }, { "epoch": 0.14996413901023667, "grad_norm": 12.8125, "learning_rate": 3.884489966468486e-05, "loss": 2.5838465690612793, "step": 575 }, { "epoch": 0.15022494620851534, "grad_norm": 5.40625, "learning_rate": 3.884000639541728e-05, "loss": 1.9989123344421387, "step": 576 }, { "epoch": 0.15048575340679402, "grad_norm": 5.6875, "learning_rate": 3.883510309306541e-05, "loss": 1.9393930435180664, "step": 577 }, { "epoch": 0.1507465606050727, "grad_norm": 5.03125, "learning_rate": 3.883018976024047e-05, "loss": 1.9157646894454956, "step": 578 }, { "epoch": 0.15100736780335136, "grad_norm": 5.40625, "learning_rate": 3.8825266399559024e-05, "loss": 1.9912060499191284, "step": 579 }, { "epoch": 0.15126817500163003, "grad_norm": 5.21875, "learning_rate": 3.8820333013642945e-05, "loss": 1.8786393404006958, "step": 580 }, { "epoch": 0.1515289821999087, "grad_norm": 5.3125, "learning_rate": 3.881538960511948e-05, "loss": 1.921290397644043, "step": 581 }, { "epoch": 0.15178978939818738, "grad_norm": 5.09375, "learning_rate": 3.881043617662121e-05, "loss": 1.8739674091339111, "step": 582 }, { "epoch": 0.15205059659646605, "grad_norm": 5.0, "learning_rate": 3.880547273078602e-05, "loss": 2.0182878971099854, "step": 583 }, { "epoch": 0.15231140379474473, "grad_norm": 4.875, "learning_rate": 3.880049927025715e-05, "loss": 1.7300055027008057, "step": 584 }, { "epoch": 0.1525722109930234, "grad_norm": 5.1875, "learning_rate": 3.8795515797683194e-05, "loss": 2.154013156890869, "step": 585 }, { "epoch": 0.15283301819130207, "grad_norm": 4.8125, "learning_rate": 3.8790522315718034e-05, "loss": 2.199392318725586, "step": 586 }, { "epoch": 0.15309382538958075, "grad_norm": 4.6875, "learning_rate": 3.878551882702092e-05, "loss": 1.8654108047485352, "step": 587 }, { "epoch": 0.15335463258785942, "grad_norm": 4.5, "learning_rate": 3.878050533425642e-05, "loss": 2.0651044845581055, "step": 588 }, { "epoch": 0.1536154397861381, "grad_norm": 4.65625, "learning_rate": 3.8775481840094416e-05, "loss": 1.9368878602981567, "step": 589 }, { "epoch": 0.15387624698441676, "grad_norm": 5.40625, "learning_rate": 3.8770448347210144e-05, "loss": 1.880233645439148, "step": 590 }, { "epoch": 0.15413705418269544, "grad_norm": 5.125, "learning_rate": 3.8765404858284124e-05, "loss": 2.0680623054504395, "step": 591 }, { "epoch": 0.1543978613809741, "grad_norm": 5.78125, "learning_rate": 3.876035137600224e-05, "loss": 2.1295831203460693, "step": 592 }, { "epoch": 0.15465866857925278, "grad_norm": 11.0625, "learning_rate": 3.875528790305567e-05, "loss": 2.007826328277588, "step": 593 }, { "epoch": 0.15491947577753146, "grad_norm": 5.125, "learning_rate": 3.875021444214093e-05, "loss": 2.1482367515563965, "step": 594 }, { "epoch": 0.15518028297581013, "grad_norm": 5.21875, "learning_rate": 3.874513099595986e-05, "loss": 2.0759899616241455, "step": 595 }, { "epoch": 0.1554410901740888, "grad_norm": 5.5, "learning_rate": 3.874003756721958e-05, "loss": 2.073017120361328, "step": 596 }, { "epoch": 0.15570189737236748, "grad_norm": 5.8125, "learning_rate": 3.873493415863256e-05, "loss": 2.0745091438293457, "step": 597 }, { "epoch": 0.15596270457064615, "grad_norm": 4.6875, "learning_rate": 3.872982077291659e-05, "loss": 1.9696614742279053, "step": 598 }, { "epoch": 0.15622351176892482, "grad_norm": 5.40625, "learning_rate": 3.872469741279475e-05, "loss": 2.2111566066741943, "step": 599 }, { "epoch": 0.1564843189672035, "grad_norm": 4.625, "learning_rate": 3.8719564080995434e-05, "loss": 1.8812564611434937, "step": 600 }, { "epoch": 0.15674512616548217, "grad_norm": 4.625, "learning_rate": 3.871442078025237e-05, "loss": 2.0892837047576904, "step": 601 }, { "epoch": 0.15700593336376084, "grad_norm": 4.5625, "learning_rate": 3.870926751330458e-05, "loss": 1.8276432752609253, "step": 602 }, { "epoch": 0.15726674056203951, "grad_norm": 5.03125, "learning_rate": 3.870410428289637e-05, "loss": 1.8986891508102417, "step": 603 }, { "epoch": 0.1575275477603182, "grad_norm": 4.90625, "learning_rate": 3.86989310917774e-05, "loss": 2.030865430831909, "step": 604 }, { "epoch": 0.15778835495859686, "grad_norm": 4.84375, "learning_rate": 3.869374794270258e-05, "loss": 1.9464844465255737, "step": 605 }, { "epoch": 0.15804916215687553, "grad_norm": 5.09375, "learning_rate": 3.868855483843218e-05, "loss": 2.18831205368042, "step": 606 }, { "epoch": 0.1583099693551542, "grad_norm": 4.46875, "learning_rate": 3.868335178173174e-05, "loss": 1.8682782649993896, "step": 607 }, { "epoch": 0.15857077655343288, "grad_norm": 4.71875, "learning_rate": 3.867813877537208e-05, "loss": 1.9430899620056152, "step": 608 }, { "epoch": 0.15883158375171155, "grad_norm": 4.875, "learning_rate": 3.867291582212936e-05, "loss": 2.0944416522979736, "step": 609 }, { "epoch": 0.15909239094999023, "grad_norm": 4.78125, "learning_rate": 3.866768292478502e-05, "loss": 1.973773717880249, "step": 610 }, { "epoch": 0.1593531981482689, "grad_norm": 5.0, "learning_rate": 3.866244008612579e-05, "loss": 2.190091609954834, "step": 611 }, { "epoch": 0.15961400534654757, "grad_norm": 4.71875, "learning_rate": 3.86571873089437e-05, "loss": 1.9963393211364746, "step": 612 }, { "epoch": 0.15987481254482624, "grad_norm": 5.0, "learning_rate": 3.8651924596036066e-05, "loss": 2.2495474815368652, "step": 613 }, { "epoch": 0.16013561974310492, "grad_norm": 4.78125, "learning_rate": 3.8646651950205514e-05, "loss": 1.9934877157211304, "step": 614 }, { "epoch": 0.1603964269413836, "grad_norm": 4.6875, "learning_rate": 3.864136937425993e-05, "loss": 2.1129329204559326, "step": 615 }, { "epoch": 0.16065723413966226, "grad_norm": 4.875, "learning_rate": 3.863607687101252e-05, "loss": 1.91444993019104, "step": 616 }, { "epoch": 0.16091804133794094, "grad_norm": 4.875, "learning_rate": 3.863077444328175e-05, "loss": 1.9214363098144531, "step": 617 }, { "epoch": 0.1611788485362196, "grad_norm": 4.84375, "learning_rate": 3.862546209389139e-05, "loss": 1.8061554431915283, "step": 618 }, { "epoch": 0.16143965573449828, "grad_norm": 4.71875, "learning_rate": 3.862013982567048e-05, "loss": 1.873937726020813, "step": 619 }, { "epoch": 0.16170046293277696, "grad_norm": 5.21875, "learning_rate": 3.861480764145335e-05, "loss": 2.1080081462860107, "step": 620 }, { "epoch": 0.16196127013105563, "grad_norm": 4.78125, "learning_rate": 3.860946554407961e-05, "loss": 2.0576019287109375, "step": 621 }, { "epoch": 0.1622220773293343, "grad_norm": 5.1875, "learning_rate": 3.860411353639415e-05, "loss": 2.2772209644317627, "step": 622 }, { "epoch": 0.16248288452761298, "grad_norm": 4.6875, "learning_rate": 3.859875162124714e-05, "loss": 2.103712558746338, "step": 623 }, { "epoch": 0.16274369172589165, "grad_norm": 4.90625, "learning_rate": 3.8593379801494015e-05, "loss": 2.074312925338745, "step": 624 }, { "epoch": 0.1630044989241703, "grad_norm": 5.78125, "learning_rate": 3.858799807999549e-05, "loss": 1.9695968627929688, "step": 625 }, { "epoch": 0.16326530612244897, "grad_norm": 4.78125, "learning_rate": 3.858260645961756e-05, "loss": 1.8722575902938843, "step": 626 }, { "epoch": 0.16352611332072764, "grad_norm": 4.65625, "learning_rate": 3.857720494323149e-05, "loss": 1.9032584428787231, "step": 627 }, { "epoch": 0.1637869205190063, "grad_norm": 4.75, "learning_rate": 3.8571793533713796e-05, "loss": 1.9035115242004395, "step": 628 }, { "epoch": 0.16404772771728499, "grad_norm": 5.09375, "learning_rate": 3.856637223394629e-05, "loss": 1.9448721408843994, "step": 629 }, { "epoch": 0.16430853491556366, "grad_norm": 4.59375, "learning_rate": 3.856094104681605e-05, "loss": 1.8737752437591553, "step": 630 }, { "epoch": 0.16456934211384233, "grad_norm": 5.375, "learning_rate": 3.855549997521538e-05, "loss": 1.9294793605804443, "step": 631 }, { "epoch": 0.164830149312121, "grad_norm": 4.9375, "learning_rate": 3.85500490220419e-05, "loss": 1.7865889072418213, "step": 632 }, { "epoch": 0.16509095651039968, "grad_norm": 4.71875, "learning_rate": 3.8544588190198454e-05, "loss": 1.6698765754699707, "step": 633 }, { "epoch": 0.16535176370867835, "grad_norm": 5.0625, "learning_rate": 3.8539117482593164e-05, "loss": 2.2318742275238037, "step": 634 }, { "epoch": 0.16561257090695702, "grad_norm": 4.75, "learning_rate": 3.853363690213942e-05, "loss": 1.8279545307159424, "step": 635 }, { "epoch": 0.1658733781052357, "grad_norm": 4.84375, "learning_rate": 3.852814645175584e-05, "loss": 1.9168078899383545, "step": 636 }, { "epoch": 0.16613418530351437, "grad_norm": 4.90625, "learning_rate": 3.8522646134366336e-05, "loss": 1.9145238399505615, "step": 637 }, { "epoch": 0.16639499250179304, "grad_norm": 4.8125, "learning_rate": 3.851713595290004e-05, "loss": 1.8849806785583496, "step": 638 }, { "epoch": 0.16665579970007172, "grad_norm": 4.53125, "learning_rate": 3.851161591029135e-05, "loss": 1.6500052213668823, "step": 639 }, { "epoch": 0.1669166068983504, "grad_norm": 5.0, "learning_rate": 3.8506086009479934e-05, "loss": 1.8743053674697876, "step": 640 }, { "epoch": 0.16717741409662906, "grad_norm": 4.84375, "learning_rate": 3.850054625341068e-05, "loss": 2.2005090713500977, "step": 641 }, { "epoch": 0.16743822129490774, "grad_norm": 4.5625, "learning_rate": 3.849499664503375e-05, "loss": 1.7907366752624512, "step": 642 }, { "epoch": 0.1676990284931864, "grad_norm": 4.4375, "learning_rate": 3.848943718730452e-05, "loss": 1.8465526103973389, "step": 643 }, { "epoch": 0.16795983569146508, "grad_norm": 4.75, "learning_rate": 3.848386788318365e-05, "loss": 2.003734827041626, "step": 644 }, { "epoch": 0.16822064288974375, "grad_norm": 4.78125, "learning_rate": 3.847828873563702e-05, "loss": 1.7648651599884033, "step": 645 }, { "epoch": 0.16848145008802243, "grad_norm": 4.4375, "learning_rate": 3.847269974763576e-05, "loss": 2.020761728286743, "step": 646 }, { "epoch": 0.1687422572863011, "grad_norm": 4.6875, "learning_rate": 3.846710092215623e-05, "loss": 1.9700047969818115, "step": 647 }, { "epoch": 0.16900306448457977, "grad_norm": 4.4375, "learning_rate": 3.846149226218003e-05, "loss": 1.9700831174850464, "step": 648 }, { "epoch": 0.16926387168285845, "grad_norm": 4.5, "learning_rate": 3.845587377069403e-05, "loss": 2.1578357219696045, "step": 649 }, { "epoch": 0.16952467888113712, "grad_norm": 4.5625, "learning_rate": 3.845024545069029e-05, "loss": 2.0335230827331543, "step": 650 }, { "epoch": 0.1697854860794158, "grad_norm": 4.34375, "learning_rate": 3.8444607305166124e-05, "loss": 1.9350706338882446, "step": 651 }, { "epoch": 0.17004629327769447, "grad_norm": 4.375, "learning_rate": 3.843895933712409e-05, "loss": 2.063561201095581, "step": 652 }, { "epoch": 0.17030710047597314, "grad_norm": 4.1875, "learning_rate": 3.843330154957195e-05, "loss": 1.7344597578048706, "step": 653 }, { "epoch": 0.1705679076742518, "grad_norm": 4.78125, "learning_rate": 3.8427633945522714e-05, "loss": 2.1208086013793945, "step": 654 }, { "epoch": 0.17082871487253048, "grad_norm": 4.5625, "learning_rate": 3.842195652799463e-05, "loss": 1.8748433589935303, "step": 655 }, { "epoch": 0.17108952207080916, "grad_norm": 4.40625, "learning_rate": 3.841626930001114e-05, "loss": 1.809694766998291, "step": 656 }, { "epoch": 0.17135032926908783, "grad_norm": 4.28125, "learning_rate": 3.841057226460094e-05, "loss": 1.9290765523910522, "step": 657 }, { "epoch": 0.1716111364673665, "grad_norm": 4.6875, "learning_rate": 3.840486542479793e-05, "loss": 2.177703857421875, "step": 658 }, { "epoch": 0.17187194366564518, "grad_norm": 4.375, "learning_rate": 3.839914878364125e-05, "loss": 1.9175394773483276, "step": 659 }, { "epoch": 0.17213275086392385, "grad_norm": 6.28125, "learning_rate": 3.8393422344175234e-05, "loss": 2.1164231300354004, "step": 660 }, { "epoch": 0.17239355806220252, "grad_norm": 4.6875, "learning_rate": 3.838768610944946e-05, "loss": 2.0701417922973633, "step": 661 }, { "epoch": 0.1726543652604812, "grad_norm": 4.625, "learning_rate": 3.8381940082518704e-05, "loss": 2.0714147090911865, "step": 662 }, { "epoch": 0.17291517245875987, "grad_norm": 4.5625, "learning_rate": 3.8376184266442965e-05, "loss": 1.839314341545105, "step": 663 }, { "epoch": 0.17317597965703854, "grad_norm": 4.8125, "learning_rate": 3.837041866428745e-05, "loss": 2.001265048980713, "step": 664 }, { "epoch": 0.17343678685531722, "grad_norm": 4.125, "learning_rate": 3.83646432791226e-05, "loss": 1.6682621240615845, "step": 665 }, { "epoch": 0.1736975940535959, "grad_norm": 4.875, "learning_rate": 3.835885811402402e-05, "loss": 1.9657459259033203, "step": 666 }, { "epoch": 0.17395840125187456, "grad_norm": 4.3125, "learning_rate": 3.8353063172072564e-05, "loss": 1.709079384803772, "step": 667 }, { "epoch": 0.17421920845015323, "grad_norm": 4.46875, "learning_rate": 3.834725845635428e-05, "loss": 1.8939547538757324, "step": 668 }, { "epoch": 0.1744800156484319, "grad_norm": 4.65625, "learning_rate": 3.834144396996041e-05, "loss": 2.047983169555664, "step": 669 }, { "epoch": 0.17474082284671058, "grad_norm": 4.875, "learning_rate": 3.833561971598743e-05, "loss": 2.058068037033081, "step": 670 }, { "epoch": 0.17500163004498925, "grad_norm": 4.53125, "learning_rate": 3.832978569753697e-05, "loss": 1.89093017578125, "step": 671 }, { "epoch": 0.17526243724326793, "grad_norm": 4.875, "learning_rate": 3.83239419177159e-05, "loss": 1.9490283727645874, "step": 672 }, { "epoch": 0.1755232444415466, "grad_norm": 4.6875, "learning_rate": 3.831808837963628e-05, "loss": 2.1818556785583496, "step": 673 }, { "epoch": 0.17578405163982527, "grad_norm": 4.5625, "learning_rate": 3.831222508641535e-05, "loss": 2.1393942832946777, "step": 674 }, { "epoch": 0.17604485883810392, "grad_norm": 4.3125, "learning_rate": 3.830635204117557e-05, "loss": 1.8520056009292603, "step": 675 }, { "epoch": 0.1763056660363826, "grad_norm": 4.8125, "learning_rate": 3.8300469247044564e-05, "loss": 1.6711615324020386, "step": 676 }, { "epoch": 0.17656647323466126, "grad_norm": 4.625, "learning_rate": 3.829457670715518e-05, "loss": 1.928394079208374, "step": 677 }, { "epoch": 0.17682728043293994, "grad_norm": 4.5, "learning_rate": 3.828867442464543e-05, "loss": 1.7552798986434937, "step": 678 }, { "epoch": 0.1770880876312186, "grad_norm": 4.875, "learning_rate": 3.828276240265852e-05, "loss": 2.078004837036133, "step": 679 }, { "epoch": 0.17734889482949728, "grad_norm": 4.8125, "learning_rate": 3.827684064434286e-05, "loss": 2.1814630031585693, "step": 680 }, { "epoch": 0.17760970202777596, "grad_norm": 4.78125, "learning_rate": 3.827090915285202e-05, "loss": 2.024282932281494, "step": 681 }, { "epoch": 0.17787050922605463, "grad_norm": 4.46875, "learning_rate": 3.8264967931344774e-05, "loss": 1.7166085243225098, "step": 682 }, { "epoch": 0.1781313164243333, "grad_norm": 4.625, "learning_rate": 3.825901698298506e-05, "loss": 1.7741438150405884, "step": 683 }, { "epoch": 0.17839212362261198, "grad_norm": 4.3125, "learning_rate": 3.8253056310942015e-05, "loss": 1.6977009773254395, "step": 684 }, { "epoch": 0.17865293082089065, "grad_norm": 4.40625, "learning_rate": 3.824708591838993e-05, "loss": 1.880244493484497, "step": 685 }, { "epoch": 0.17891373801916932, "grad_norm": 4.625, "learning_rate": 3.82411058085083e-05, "loss": 2.0184810161590576, "step": 686 }, { "epoch": 0.179174545217448, "grad_norm": 4.6875, "learning_rate": 3.823511598448177e-05, "loss": 1.8925732374191284, "step": 687 }, { "epoch": 0.17943535241572667, "grad_norm": 4.625, "learning_rate": 3.822911644950018e-05, "loss": 1.8870201110839844, "step": 688 }, { "epoch": 0.17969615961400534, "grad_norm": 4.78125, "learning_rate": 3.822310720675852e-05, "loss": 2.037837028503418, "step": 689 }, { "epoch": 0.179956966812284, "grad_norm": 4.46875, "learning_rate": 3.821708825945698e-05, "loss": 1.8832042217254639, "step": 690 }, { "epoch": 0.1802177740105627, "grad_norm": 4.8125, "learning_rate": 3.821105961080088e-05, "loss": 2.1521339416503906, "step": 691 }, { "epoch": 0.18047858120884136, "grad_norm": 4.125, "learning_rate": 3.820502126400073e-05, "loss": 1.81393301486969, "step": 692 }, { "epoch": 0.18073938840712003, "grad_norm": 4.28125, "learning_rate": 3.81989732222722e-05, "loss": 1.8241595029830933, "step": 693 }, { "epoch": 0.1810001956053987, "grad_norm": 4.40625, "learning_rate": 3.819291548883612e-05, "loss": 1.8708091974258423, "step": 694 }, { "epoch": 0.18126100280367738, "grad_norm": 4.75, "learning_rate": 3.81868480669185e-05, "loss": 1.8775160312652588, "step": 695 }, { "epoch": 0.18152181000195605, "grad_norm": 4.75, "learning_rate": 3.818077095975048e-05, "loss": 1.9862464666366577, "step": 696 }, { "epoch": 0.18178261720023473, "grad_norm": 4.59375, "learning_rate": 3.817468417056836e-05, "loss": 1.988217830657959, "step": 697 }, { "epoch": 0.1820434243985134, "grad_norm": 4.9375, "learning_rate": 3.816858770261363e-05, "loss": 2.1213014125823975, "step": 698 }, { "epoch": 0.18230423159679207, "grad_norm": 4.59375, "learning_rate": 3.816248155913291e-05, "loss": 1.8472071886062622, "step": 699 }, { "epoch": 0.18256503879507074, "grad_norm": 4.15625, "learning_rate": 3.815636574337796e-05, "loss": 1.5301586389541626, "step": 700 }, { "epoch": 0.18282584599334942, "grad_norm": 4.34375, "learning_rate": 3.8150240258605714e-05, "loss": 1.8507400751113892, "step": 701 }, { "epoch": 0.1830866531916281, "grad_norm": 4.75, "learning_rate": 3.8144105108078246e-05, "loss": 1.957880973815918, "step": 702 }, { "epoch": 0.18334746038990676, "grad_norm": 4.4375, "learning_rate": 3.813796029506277e-05, "loss": 2.0707597732543945, "step": 703 }, { "epoch": 0.18360826758818544, "grad_norm": 4.3125, "learning_rate": 3.813180582283167e-05, "loss": 2.112607955932617, "step": 704 }, { "epoch": 0.1838690747864641, "grad_norm": 4.375, "learning_rate": 3.8125641694662445e-05, "loss": 1.702808141708374, "step": 705 }, { "epoch": 0.18412988198474278, "grad_norm": 4.1875, "learning_rate": 3.8119467913837754e-05, "loss": 1.906697154045105, "step": 706 }, { "epoch": 0.18439068918302146, "grad_norm": 4.5, "learning_rate": 3.811328448364538e-05, "loss": 1.8337846994400024, "step": 707 }, { "epoch": 0.18465149638130013, "grad_norm": 5.09375, "learning_rate": 3.8107091407378275e-05, "loss": 1.86478590965271, "step": 708 }, { "epoch": 0.1849123035795788, "grad_norm": 4.0625, "learning_rate": 3.81008886883345e-05, "loss": 1.6813318729400635, "step": 709 }, { "epoch": 0.18517311077785747, "grad_norm": 4.8125, "learning_rate": 3.8094676329817256e-05, "loss": 1.7286593914031982, "step": 710 }, { "epoch": 0.18543391797613615, "grad_norm": 5.03125, "learning_rate": 3.808845433513488e-05, "loss": 2.1475114822387695, "step": 711 }, { "epoch": 0.18569472517441482, "grad_norm": 4.28125, "learning_rate": 3.8082222707600854e-05, "loss": 1.9896444082260132, "step": 712 }, { "epoch": 0.1859555323726935, "grad_norm": 4.46875, "learning_rate": 3.807598145053376e-05, "loss": 2.2578413486480713, "step": 713 }, { "epoch": 0.18621633957097217, "grad_norm": 4.1875, "learning_rate": 3.806973056725735e-05, "loss": 1.6955958604812622, "step": 714 }, { "epoch": 0.18647714676925084, "grad_norm": 4.46875, "learning_rate": 3.8063470061100454e-05, "loss": 1.8761972188949585, "step": 715 }, { "epoch": 0.1867379539675295, "grad_norm": 4.375, "learning_rate": 3.805719993539707e-05, "loss": 1.8478339910507202, "step": 716 }, { "epoch": 0.18699876116580819, "grad_norm": 4.34375, "learning_rate": 3.805092019348628e-05, "loss": 1.4803378582000732, "step": 717 }, { "epoch": 0.18725956836408686, "grad_norm": 4.65625, "learning_rate": 3.8044630838712326e-05, "loss": 1.7085039615631104, "step": 718 }, { "epoch": 0.18752037556236553, "grad_norm": 4.0625, "learning_rate": 3.8038331874424546e-05, "loss": 1.6832685470581055, "step": 719 }, { "epoch": 0.1877811827606442, "grad_norm": 4.34375, "learning_rate": 3.8032023303977384e-05, "loss": 2.000880479812622, "step": 720 }, { "epoch": 0.18804198995892288, "grad_norm": 4.625, "learning_rate": 3.802570513073044e-05, "loss": 1.9390875101089478, "step": 721 }, { "epoch": 0.18830279715720155, "grad_norm": 4.90625, "learning_rate": 3.801937735804838e-05, "loss": 2.086874008178711, "step": 722 }, { "epoch": 0.18856360435548022, "grad_norm": 4.96875, "learning_rate": 3.801303998930103e-05, "loss": 2.000013828277588, "step": 723 }, { "epoch": 0.1888244115537589, "grad_norm": 4.3125, "learning_rate": 3.800669302786328e-05, "loss": 1.8913724422454834, "step": 724 }, { "epoch": 0.18908521875203754, "grad_norm": 4.375, "learning_rate": 3.800033647711515e-05, "loss": 1.7368848323822021, "step": 725 }, { "epoch": 0.18934602595031622, "grad_norm": 4.375, "learning_rate": 3.7993970340441786e-05, "loss": 1.822178840637207, "step": 726 }, { "epoch": 0.1896068331485949, "grad_norm": 4.1875, "learning_rate": 3.79875946212334e-05, "loss": 1.549401044845581, "step": 727 }, { "epoch": 0.18986764034687356, "grad_norm": 4.75, "learning_rate": 3.798120932288534e-05, "loss": 1.803915023803711, "step": 728 }, { "epoch": 0.19012844754515223, "grad_norm": 4.96875, "learning_rate": 3.797481444879803e-05, "loss": 1.8997050523757935, "step": 729 }, { "epoch": 0.1903892547434309, "grad_norm": 4.78125, "learning_rate": 3.796841000237701e-05, "loss": 1.73325777053833, "step": 730 }, { "epoch": 0.19065006194170958, "grad_norm": 4.71875, "learning_rate": 3.7961995987032924e-05, "loss": 1.9174513816833496, "step": 731 }, { "epoch": 0.19091086913998825, "grad_norm": 5.1875, "learning_rate": 3.795557240618149e-05, "loss": 1.9190399646759033, "step": 732 }, { "epoch": 0.19117167633826693, "grad_norm": 4.78125, "learning_rate": 3.794913926324353e-05, "loss": 1.8463743925094604, "step": 733 }, { "epoch": 0.1914324835365456, "grad_norm": 4.46875, "learning_rate": 3.794269656164496e-05, "loss": 1.7895328998565674, "step": 734 }, { "epoch": 0.19169329073482427, "grad_norm": 4.21875, "learning_rate": 3.793624430481679e-05, "loss": 1.724631428718567, "step": 735 }, { "epoch": 0.19195409793310295, "grad_norm": 4.59375, "learning_rate": 3.792978249619512e-05, "loss": 1.934809684753418, "step": 736 }, { "epoch": 0.19221490513138162, "grad_norm": 4.25, "learning_rate": 3.7923311139221114e-05, "loss": 1.7765518426895142, "step": 737 }, { "epoch": 0.1924757123296603, "grad_norm": 4.3125, "learning_rate": 3.791683023734105e-05, "loss": 1.8107788562774658, "step": 738 }, { "epoch": 0.19273651952793897, "grad_norm": 4.46875, "learning_rate": 3.7910339794006274e-05, "loss": 1.9428932666778564, "step": 739 }, { "epoch": 0.19299732672621764, "grad_norm": 4.46875, "learning_rate": 3.790383981267322e-05, "loss": 1.7820011377334595, "step": 740 }, { "epoch": 0.1932581339244963, "grad_norm": 4.40625, "learning_rate": 3.789733029680338e-05, "loss": 1.9347243309020996, "step": 741 }, { "epoch": 0.19351894112277498, "grad_norm": 4.40625, "learning_rate": 3.789081124986337e-05, "loss": 1.6909987926483154, "step": 742 }, { "epoch": 0.19377974832105366, "grad_norm": 5.03125, "learning_rate": 3.788428267532483e-05, "loss": 1.7528774738311768, "step": 743 }, { "epoch": 0.19404055551933233, "grad_norm": 4.59375, "learning_rate": 3.787774457666451e-05, "loss": 1.726656198501587, "step": 744 }, { "epoch": 0.194301362717611, "grad_norm": 4.71875, "learning_rate": 3.7871196957364206e-05, "loss": 2.0491385459899902, "step": 745 }, { "epoch": 0.19456216991588968, "grad_norm": 5.03125, "learning_rate": 3.78646398209108e-05, "loss": 1.9195506572723389, "step": 746 }, { "epoch": 0.19482297711416835, "grad_norm": 4.375, "learning_rate": 3.785807317079624e-05, "loss": 1.758399486541748, "step": 747 }, { "epoch": 0.19508378431244702, "grad_norm": 4.4375, "learning_rate": 3.7851497010517554e-05, "loss": 1.838735580444336, "step": 748 }, { "epoch": 0.1953445915107257, "grad_norm": 4.34375, "learning_rate": 3.78449113435768e-05, "loss": 1.9476432800292969, "step": 749 }, { "epoch": 0.19560539870900437, "grad_norm": 4.65625, "learning_rate": 3.7838316173481127e-05, "loss": 1.732071876525879, "step": 750 }, { "epoch": 0.19586620590728304, "grad_norm": 4.21875, "learning_rate": 3.783171150374273e-05, "loss": 1.670494794845581, "step": 751 }, { "epoch": 0.19612701310556171, "grad_norm": 4.53125, "learning_rate": 3.782509733787888e-05, "loss": 1.9444698095321655, "step": 752 }, { "epoch": 0.1963878203038404, "grad_norm": 4.34375, "learning_rate": 3.7818473679411886e-05, "loss": 1.8395514488220215, "step": 753 }, { "epoch": 0.19664862750211906, "grad_norm": 4.4375, "learning_rate": 3.7811840531869124e-05, "loss": 1.9476215839385986, "step": 754 }, { "epoch": 0.19690943470039773, "grad_norm": 4.9375, "learning_rate": 3.7805197898783015e-05, "loss": 2.008249282836914, "step": 755 }, { "epoch": 0.1971702418986764, "grad_norm": 4.75, "learning_rate": 3.7798545783691055e-05, "loss": 1.943685531616211, "step": 756 }, { "epoch": 0.19743104909695508, "grad_norm": 4.5625, "learning_rate": 3.7791884190135745e-05, "loss": 1.8641271591186523, "step": 757 }, { "epoch": 0.19769185629523375, "grad_norm": 4.59375, "learning_rate": 3.778521312166467e-05, "loss": 1.9630130529403687, "step": 758 }, { "epoch": 0.19795266349351243, "grad_norm": 4.5625, "learning_rate": 3.777853258183046e-05, "loss": 2.0424537658691406, "step": 759 }, { "epoch": 0.1982134706917911, "grad_norm": 4.53125, "learning_rate": 3.7771842574190765e-05, "loss": 2.040316104888916, "step": 760 }, { "epoch": 0.19847427789006977, "grad_norm": 4.75, "learning_rate": 3.7765143102308305e-05, "loss": 2.004734754562378, "step": 761 }, { "epoch": 0.19873508508834845, "grad_norm": 4.53125, "learning_rate": 3.775843416975082e-05, "loss": 1.8797520399093628, "step": 762 }, { "epoch": 0.19899589228662712, "grad_norm": 4.40625, "learning_rate": 3.775171578009109e-05, "loss": 1.61118483543396, "step": 763 }, { "epoch": 0.1992566994849058, "grad_norm": 4.375, "learning_rate": 3.7744987936906934e-05, "loss": 1.8276541233062744, "step": 764 }, { "epoch": 0.19951750668318446, "grad_norm": 4.4375, "learning_rate": 3.773825064378122e-05, "loss": 1.9090209007263184, "step": 765 }, { "epoch": 0.19977831388146314, "grad_norm": 4.59375, "learning_rate": 3.773150390430183e-05, "loss": 1.7013720273971558, "step": 766 }, { "epoch": 0.2000391210797418, "grad_norm": 5.125, "learning_rate": 3.7724747722061676e-05, "loss": 1.9808199405670166, "step": 767 }, { "epoch": 0.20029992827802048, "grad_norm": 4.34375, "learning_rate": 3.771798210065871e-05, "loss": 2.031911611557007, "step": 768 }, { "epoch": 0.20056073547629916, "grad_norm": 4.53125, "learning_rate": 3.7711207043695914e-05, "loss": 2.065260410308838, "step": 769 }, { "epoch": 0.20082154267457783, "grad_norm": 4.5, "learning_rate": 3.770442255478128e-05, "loss": 1.8915151357650757, "step": 770 }, { "epoch": 0.2010823498728565, "grad_norm": 4.78125, "learning_rate": 3.769762863752782e-05, "loss": 1.715123176574707, "step": 771 }, { "epoch": 0.20134315707113518, "grad_norm": 4.59375, "learning_rate": 3.769082529555359e-05, "loss": 1.732142686843872, "step": 772 }, { "epoch": 0.20160396426941385, "grad_norm": 5.53125, "learning_rate": 3.768401253248165e-05, "loss": 1.9372129440307617, "step": 773 }, { "epoch": 0.20186477146769252, "grad_norm": 4.71875, "learning_rate": 3.767719035194007e-05, "loss": 1.8738274574279785, "step": 774 }, { "epoch": 0.20212557866597117, "grad_norm": 4.625, "learning_rate": 3.767035875756195e-05, "loss": 1.9400805234909058, "step": 775 }, { "epoch": 0.20238638586424984, "grad_norm": 4.96875, "learning_rate": 3.76635177529854e-05, "loss": 1.754683256149292, "step": 776 }, { "epoch": 0.2026471930625285, "grad_norm": 4.3125, "learning_rate": 3.765666734185353e-05, "loss": 2.086247682571411, "step": 777 }, { "epoch": 0.2029080002608072, "grad_norm": 4.84375, "learning_rate": 3.764980752781448e-05, "loss": 1.793013572692871, "step": 778 }, { "epoch": 0.20316880745908586, "grad_norm": 4.90625, "learning_rate": 3.7642938314521374e-05, "loss": 1.8967558145523071, "step": 779 }, { "epoch": 0.20342961465736453, "grad_norm": 4.46875, "learning_rate": 3.7636059705632355e-05, "loss": 2.0488367080688477, "step": 780 }, { "epoch": 0.2036904218556432, "grad_norm": 5.28125, "learning_rate": 3.762917170481057e-05, "loss": 1.8737905025482178, "step": 781 }, { "epoch": 0.20395122905392188, "grad_norm": 4.375, "learning_rate": 3.762227431572417e-05, "loss": 1.8983911275863647, "step": 782 }, { "epoch": 0.20421203625220055, "grad_norm": 4.75, "learning_rate": 3.761536754204628e-05, "loss": 1.9074043035507202, "step": 783 }, { "epoch": 0.20447284345047922, "grad_norm": 4.34375, "learning_rate": 3.7608451387455066e-05, "loss": 1.863377332687378, "step": 784 }, { "epoch": 0.2047336506487579, "grad_norm": 4.75, "learning_rate": 3.760152585563367e-05, "loss": 1.7177624702453613, "step": 785 }, { "epoch": 0.20499445784703657, "grad_norm": 4.28125, "learning_rate": 3.75945909502702e-05, "loss": 1.6665986776351929, "step": 786 }, { "epoch": 0.20525526504531524, "grad_norm": 4.59375, "learning_rate": 3.75876466750578e-05, "loss": 1.8127803802490234, "step": 787 }, { "epoch": 0.20551607224359392, "grad_norm": 4.84375, "learning_rate": 3.7580693033694576e-05, "loss": 2.0583481788635254, "step": 788 }, { "epoch": 0.2057768794418726, "grad_norm": 4.65625, "learning_rate": 3.757373002988363e-05, "loss": 2.0759923458099365, "step": 789 }, { "epoch": 0.20603768664015126, "grad_norm": 4.53125, "learning_rate": 3.756675766733306e-05, "loss": 1.8449110984802246, "step": 790 }, { "epoch": 0.20629849383842994, "grad_norm": 4.4375, "learning_rate": 3.755977594975593e-05, "loss": 1.890228271484375, "step": 791 }, { "epoch": 0.2065593010367086, "grad_norm": 4.40625, "learning_rate": 3.7552784880870294e-05, "loss": 1.9156498908996582, "step": 792 }, { "epoch": 0.20682010823498728, "grad_norm": 4.40625, "learning_rate": 3.754578446439919e-05, "loss": 1.7611085176467896, "step": 793 }, { "epoch": 0.20708091543326596, "grad_norm": 4.3125, "learning_rate": 3.753877470407062e-05, "loss": 1.793421983718872, "step": 794 }, { "epoch": 0.20734172263154463, "grad_norm": 4.40625, "learning_rate": 3.753175560361758e-05, "loss": 2.0085411071777344, "step": 795 }, { "epoch": 0.2076025298298233, "grad_norm": 4.5625, "learning_rate": 3.752472716677803e-05, "loss": 1.8485441207885742, "step": 796 }, { "epoch": 0.20786333702810197, "grad_norm": 4.53125, "learning_rate": 3.7517689397294914e-05, "loss": 1.9082846641540527, "step": 797 }, { "epoch": 0.20812414422638065, "grad_norm": 4.15625, "learning_rate": 3.751064229891612e-05, "loss": 1.8202950954437256, "step": 798 }, { "epoch": 0.20838495142465932, "grad_norm": 4.71875, "learning_rate": 3.750358587539452e-05, "loss": 1.9479665756225586, "step": 799 }, { "epoch": 0.208645758622938, "grad_norm": 4.6875, "learning_rate": 3.749652013048797e-05, "loss": 2.0446767807006836, "step": 800 }, { "epoch": 0.20890656582121667, "grad_norm": 4.5625, "learning_rate": 3.748944506795926e-05, "loss": 1.5322918891906738, "step": 801 }, { "epoch": 0.20916737301949534, "grad_norm": 4.8125, "learning_rate": 3.7482360691576146e-05, "loss": 1.6456043720245361, "step": 802 }, { "epoch": 0.209428180217774, "grad_norm": 4.09375, "learning_rate": 3.747526700511137e-05, "loss": 1.7690880298614502, "step": 803 }, { "epoch": 0.20968898741605269, "grad_norm": 4.15625, "learning_rate": 3.74681640123426e-05, "loss": 1.7727999687194824, "step": 804 }, { "epoch": 0.20994979461433136, "grad_norm": 5.75, "learning_rate": 3.7461051717052474e-05, "loss": 2.2625675201416016, "step": 805 }, { "epoch": 0.21021060181261003, "grad_norm": 4.65625, "learning_rate": 3.7453930123028594e-05, "loss": 1.7446084022521973, "step": 806 }, { "epoch": 0.2104714090108887, "grad_norm": 4.625, "learning_rate": 3.744679923406351e-05, "loss": 1.9689126014709473, "step": 807 }, { "epoch": 0.21073221620916738, "grad_norm": 4.1875, "learning_rate": 3.7439659053954685e-05, "loss": 1.6769580841064453, "step": 808 }, { "epoch": 0.21099302340744605, "grad_norm": 4.0625, "learning_rate": 3.743250958650459e-05, "loss": 1.8509106636047363, "step": 809 }, { "epoch": 0.21125383060572472, "grad_norm": 4.28125, "learning_rate": 3.74253508355206e-05, "loss": 1.8638005256652832, "step": 810 }, { "epoch": 0.2115146378040034, "grad_norm": 4.375, "learning_rate": 3.7418182804815054e-05, "loss": 1.8554314374923706, "step": 811 }, { "epoch": 0.21177544500228207, "grad_norm": 4.5, "learning_rate": 3.741100549820522e-05, "loss": 1.6938271522521973, "step": 812 }, { "epoch": 0.21203625220056074, "grad_norm": 4.5625, "learning_rate": 3.740381891951332e-05, "loss": 2.0607852935791016, "step": 813 }, { "epoch": 0.21229705939883942, "grad_norm": 4.5625, "learning_rate": 3.739662307256649e-05, "loss": 2.0525259971618652, "step": 814 }, { "epoch": 0.2125578665971181, "grad_norm": 4.40625, "learning_rate": 3.738941796119682e-05, "loss": 1.8823179006576538, "step": 815 }, { "epoch": 0.21281867379539676, "grad_norm": 4.65625, "learning_rate": 3.738220358924134e-05, "loss": 1.9729381799697876, "step": 816 }, { "epoch": 0.21307948099367544, "grad_norm": 4.40625, "learning_rate": 3.7374979960542e-05, "loss": 1.991550087928772, "step": 817 }, { "epoch": 0.2133402881919541, "grad_norm": 4.09375, "learning_rate": 3.736774707894568e-05, "loss": 1.6832308769226074, "step": 818 }, { "epoch": 0.21360109539023278, "grad_norm": 4.40625, "learning_rate": 3.736050494830417e-05, "loss": 1.9713823795318604, "step": 819 }, { "epoch": 0.21386190258851145, "grad_norm": 4.5625, "learning_rate": 3.735325357247424e-05, "loss": 1.791365623474121, "step": 820 }, { "epoch": 0.21412270978679013, "grad_norm": 4.46875, "learning_rate": 3.7345992955317534e-05, "loss": 1.7550973892211914, "step": 821 }, { "epoch": 0.2143835169850688, "grad_norm": 4.625, "learning_rate": 3.7338723100700615e-05, "loss": 1.830931305885315, "step": 822 }, { "epoch": 0.21464432418334747, "grad_norm": 4.625, "learning_rate": 3.733144401249501e-05, "loss": 2.0567502975463867, "step": 823 }, { "epoch": 0.21490513138162615, "grad_norm": 4.28125, "learning_rate": 3.732415569457711e-05, "loss": 1.8143775463104248, "step": 824 }, { "epoch": 0.2151659385799048, "grad_norm": 4.40625, "learning_rate": 3.731685815082826e-05, "loss": 1.614667534828186, "step": 825 }, { "epoch": 0.21542674577818346, "grad_norm": 4.96875, "learning_rate": 3.73095513851347e-05, "loss": 1.9580087661743164, "step": 826 }, { "epoch": 0.21568755297646214, "grad_norm": 4.25, "learning_rate": 3.730223540138759e-05, "loss": 1.9181026220321655, "step": 827 }, { "epoch": 0.2159483601747408, "grad_norm": 4.3125, "learning_rate": 3.7294910203482984e-05, "loss": 1.6878881454467773, "step": 828 }, { "epoch": 0.21620916737301948, "grad_norm": 4.5625, "learning_rate": 3.728757579532187e-05, "loss": 2.0224173069000244, "step": 829 }, { "epoch": 0.21646997457129816, "grad_norm": 4.15625, "learning_rate": 3.728023218081011e-05, "loss": 1.8751661777496338, "step": 830 }, { "epoch": 0.21673078176957683, "grad_norm": 4.125, "learning_rate": 3.727287936385849e-05, "loss": 1.7184031009674072, "step": 831 }, { "epoch": 0.2169915889678555, "grad_norm": 4.5625, "learning_rate": 3.7265517348382683e-05, "loss": 1.8709547519683838, "step": 832 }, { "epoch": 0.21725239616613418, "grad_norm": 4.59375, "learning_rate": 3.7258146138303276e-05, "loss": 2.057793617248535, "step": 833 }, { "epoch": 0.21751320336441285, "grad_norm": 4.375, "learning_rate": 3.725076573754574e-05, "loss": 1.785442590713501, "step": 834 }, { "epoch": 0.21777401056269152, "grad_norm": 4.03125, "learning_rate": 3.724337615004045e-05, "loss": 1.485368013381958, "step": 835 }, { "epoch": 0.2180348177609702, "grad_norm": 4.4375, "learning_rate": 3.7235977379722666e-05, "loss": 1.9761755466461182, "step": 836 }, { "epoch": 0.21829562495924887, "grad_norm": 4.4375, "learning_rate": 3.722856943053253e-05, "loss": 1.9895318746566772, "step": 837 }, { "epoch": 0.21855643215752754, "grad_norm": 4.53125, "learning_rate": 3.722115230641509e-05, "loss": 1.6570758819580078, "step": 838 }, { "epoch": 0.21881723935580621, "grad_norm": 4.6875, "learning_rate": 3.721372601132027e-05, "loss": 2.0249204635620117, "step": 839 }, { "epoch": 0.2190780465540849, "grad_norm": 4.4375, "learning_rate": 3.720629054920287e-05, "loss": 1.8241016864776611, "step": 840 }, { "epoch": 0.21933885375236356, "grad_norm": 4.125, "learning_rate": 3.71988459240226e-05, "loss": 1.8797310590744019, "step": 841 }, { "epoch": 0.21959966095064223, "grad_norm": 4.90625, "learning_rate": 3.719139213974403e-05, "loss": 1.8294291496276855, "step": 842 }, { "epoch": 0.2198604681489209, "grad_norm": 4.1875, "learning_rate": 3.718392920033659e-05, "loss": 1.7440972328186035, "step": 843 }, { "epoch": 0.22012127534719958, "grad_norm": 4.21875, "learning_rate": 3.7176457109774624e-05, "loss": 1.7216081619262695, "step": 844 }, { "epoch": 0.22038208254547825, "grad_norm": 4.5625, "learning_rate": 3.716897587203733e-05, "loss": 1.734525442123413, "step": 845 }, { "epoch": 0.22064288974375693, "grad_norm": 4.25, "learning_rate": 3.716148549110876e-05, "loss": 1.861636757850647, "step": 846 }, { "epoch": 0.2209036969420356, "grad_norm": 4.375, "learning_rate": 3.7153985970977865e-05, "loss": 1.9185471534729004, "step": 847 }, { "epoch": 0.22116450414031427, "grad_norm": 4.25, "learning_rate": 3.714647731563845e-05, "loss": 1.6556049585342407, "step": 848 }, { "epoch": 0.22142531133859295, "grad_norm": 5.0, "learning_rate": 3.7138959529089175e-05, "loss": 1.8159377574920654, "step": 849 }, { "epoch": 0.22168611853687162, "grad_norm": 4.375, "learning_rate": 3.713143261533359e-05, "loss": 2.021803140640259, "step": 850 }, { "epoch": 0.2219469257351503, "grad_norm": 4.46875, "learning_rate": 3.712389657838007e-05, "loss": 1.7354223728179932, "step": 851 }, { "epoch": 0.22220773293342896, "grad_norm": 5.0, "learning_rate": 3.7116351422241894e-05, "loss": 2.040472984313965, "step": 852 }, { "epoch": 0.22246854013170764, "grad_norm": 4.15625, "learning_rate": 3.7108797150937136e-05, "loss": 1.7748935222625732, "step": 853 }, { "epoch": 0.2227293473299863, "grad_norm": 4.3125, "learning_rate": 3.710123376848878e-05, "loss": 1.8850452899932861, "step": 854 }, { "epoch": 0.22299015452826498, "grad_norm": 4.1875, "learning_rate": 3.709366127892464e-05, "loss": 1.8661198616027832, "step": 855 }, { "epoch": 0.22325096172654366, "grad_norm": 4.09375, "learning_rate": 3.7086079686277376e-05, "loss": 1.8567036390304565, "step": 856 }, { "epoch": 0.22351176892482233, "grad_norm": 4.71875, "learning_rate": 3.7078488994584496e-05, "loss": 1.8610680103302002, "step": 857 }, { "epoch": 0.223772576123101, "grad_norm": 4.6875, "learning_rate": 3.7070889207888375e-05, "loss": 1.8955552577972412, "step": 858 }, { "epoch": 0.22403338332137968, "grad_norm": 4.65625, "learning_rate": 3.706328033023619e-05, "loss": 1.9822683334350586, "step": 859 }, { "epoch": 0.22429419051965835, "grad_norm": 4.34375, "learning_rate": 3.7055662365679994e-05, "loss": 1.6399036645889282, "step": 860 }, { "epoch": 0.22455499771793702, "grad_norm": 4.34375, "learning_rate": 3.704803531827668e-05, "loss": 1.6619651317596436, "step": 861 }, { "epoch": 0.2248158049162157, "grad_norm": 4.53125, "learning_rate": 3.704039919208795e-05, "loss": 1.5787312984466553, "step": 862 }, { "epoch": 0.22507661211449437, "grad_norm": 4.09375, "learning_rate": 3.703275399118037e-05, "loss": 1.7187163829803467, "step": 863 }, { "epoch": 0.22533741931277304, "grad_norm": 4.75, "learning_rate": 3.7025099719625324e-05, "loss": 2.1511130332946777, "step": 864 }, { "epoch": 0.2255982265110517, "grad_norm": 4.40625, "learning_rate": 3.7017436381499026e-05, "loss": 1.8955907821655273, "step": 865 }, { "epoch": 0.2258590337093304, "grad_norm": 4.9375, "learning_rate": 3.700976398088252e-05, "loss": 1.884652853012085, "step": 866 }, { "epoch": 0.22611984090760906, "grad_norm": 4.625, "learning_rate": 3.7002082521861675e-05, "loss": 1.8265893459320068, "step": 867 }, { "epoch": 0.22638064810588773, "grad_norm": 4.1875, "learning_rate": 3.699439200852719e-05, "loss": 1.5954231023788452, "step": 868 }, { "epoch": 0.2266414553041664, "grad_norm": 4.625, "learning_rate": 3.69866924449746e-05, "loss": 1.8270741701126099, "step": 869 }, { "epoch": 0.22690226250244508, "grad_norm": 4.28125, "learning_rate": 3.6978983835304204e-05, "loss": 1.938004732131958, "step": 870 }, { "epoch": 0.22716306970072375, "grad_norm": 4.1875, "learning_rate": 3.697126618362119e-05, "loss": 1.695405125617981, "step": 871 }, { "epoch": 0.22742387689900243, "grad_norm": 4.96875, "learning_rate": 3.69635394940355e-05, "loss": 1.9706010818481445, "step": 872 }, { "epoch": 0.2276846840972811, "grad_norm": 4.4375, "learning_rate": 3.695580377066194e-05, "loss": 1.896984577178955, "step": 873 }, { "epoch": 0.22794549129555977, "grad_norm": 4.78125, "learning_rate": 3.6948059017620095e-05, "loss": 1.9504402875900269, "step": 874 }, { "epoch": 0.22820629849383842, "grad_norm": 4.4375, "learning_rate": 3.694030523903436e-05, "loss": 1.9126397371292114, "step": 875 }, { "epoch": 0.2284671056921171, "grad_norm": 4.4375, "learning_rate": 3.6932542439033955e-05, "loss": 1.790719747543335, "step": 876 }, { "epoch": 0.22872791289039576, "grad_norm": 4.4375, "learning_rate": 3.692477062175289e-05, "loss": 1.9425745010375977, "step": 877 }, { "epoch": 0.22898872008867444, "grad_norm": 4.5, "learning_rate": 3.691698979132996e-05, "loss": 1.9001245498657227, "step": 878 }, { "epoch": 0.2292495272869531, "grad_norm": 4.34375, "learning_rate": 3.690919995190881e-05, "loss": 1.8583731651306152, "step": 879 }, { "epoch": 0.22951033448523178, "grad_norm": 4.15625, "learning_rate": 3.690140110763784e-05, "loss": 1.8205029964447021, "step": 880 }, { "epoch": 0.22977114168351045, "grad_norm": 4.5625, "learning_rate": 3.6893593262670246e-05, "loss": 1.9404447078704834, "step": 881 }, { "epoch": 0.23003194888178913, "grad_norm": 3.96875, "learning_rate": 3.688577642116405e-05, "loss": 1.7789771556854248, "step": 882 }, { "epoch": 0.2302927560800678, "grad_norm": 4.40625, "learning_rate": 3.6877950587282025e-05, "loss": 2.017627716064453, "step": 883 }, { "epoch": 0.23055356327834647, "grad_norm": 4.34375, "learning_rate": 3.687011576519177e-05, "loss": 1.93501615524292, "step": 884 }, { "epoch": 0.23081437047662515, "grad_norm": 4.6875, "learning_rate": 3.686227195906564e-05, "loss": 1.960984706878662, "step": 885 }, { "epoch": 0.23107517767490382, "grad_norm": 4.03125, "learning_rate": 3.6854419173080784e-05, "loss": 1.81727933883667, "step": 886 }, { "epoch": 0.2313359848731825, "grad_norm": 4.65625, "learning_rate": 3.6846557411419145e-05, "loss": 1.7302303314208984, "step": 887 }, { "epoch": 0.23159679207146117, "grad_norm": 4.28125, "learning_rate": 3.683868667826744e-05, "loss": 1.8151025772094727, "step": 888 }, { "epoch": 0.23185759926973984, "grad_norm": 4.625, "learning_rate": 3.683080697781715e-05, "loss": 1.9974563121795654, "step": 889 }, { "epoch": 0.2321184064680185, "grad_norm": 4.0625, "learning_rate": 3.682291831426454e-05, "loss": 1.8311131000518799, "step": 890 }, { "epoch": 0.23237921366629719, "grad_norm": 4.40625, "learning_rate": 3.6815020691810664e-05, "loss": 1.8545184135437012, "step": 891 }, { "epoch": 0.23264002086457586, "grad_norm": 5.3125, "learning_rate": 3.680711411466133e-05, "loss": 1.6972341537475586, "step": 892 }, { "epoch": 0.23290082806285453, "grad_norm": 4.90625, "learning_rate": 3.679919858702711e-05, "loss": 2.1778340339660645, "step": 893 }, { "epoch": 0.2331616352611332, "grad_norm": 4.625, "learning_rate": 3.679127411312336e-05, "loss": 2.0662150382995605, "step": 894 }, { "epoch": 0.23342244245941188, "grad_norm": 4.0625, "learning_rate": 3.678334069717018e-05, "loss": 1.7575409412384033, "step": 895 }, { "epoch": 0.23368324965769055, "grad_norm": 4.96875, "learning_rate": 3.6775398343392444e-05, "loss": 2.143287420272827, "step": 896 }, { "epoch": 0.23394405685596922, "grad_norm": 4.09375, "learning_rate": 3.67674470560198e-05, "loss": 1.69294011592865, "step": 897 }, { "epoch": 0.2342048640542479, "grad_norm": 4.25, "learning_rate": 3.675948683928662e-05, "loss": 1.6591373682022095, "step": 898 }, { "epoch": 0.23446567125252657, "grad_norm": 6.625, "learning_rate": 3.675151769743206e-05, "loss": 2.171626329421997, "step": 899 }, { "epoch": 0.23472647845080524, "grad_norm": 4.34375, "learning_rate": 3.674353963470001e-05, "loss": 1.7921541929244995, "step": 900 }, { "epoch": 0.23498728564908392, "grad_norm": 4.78125, "learning_rate": 3.673555265533913e-05, "loss": 1.960420846939087, "step": 901 }, { "epoch": 0.2352480928473626, "grad_norm": 4.75, "learning_rate": 3.672755676360281e-05, "loss": 2.0104479789733887, "step": 902 }, { "epoch": 0.23550890004564126, "grad_norm": 4.6875, "learning_rate": 3.671955196374919e-05, "loss": 2.0661163330078125, "step": 903 }, { "epoch": 0.23576970724391993, "grad_norm": 4.375, "learning_rate": 3.671153826004116e-05, "loss": 2.0342602729797363, "step": 904 }, { "epoch": 0.2360305144421986, "grad_norm": 4.3125, "learning_rate": 3.6703515656746365e-05, "loss": 1.9294260740280151, "step": 905 }, { "epoch": 0.23629132164047728, "grad_norm": 3.953125, "learning_rate": 3.669548415813715e-05, "loss": 1.389187216758728, "step": 906 }, { "epoch": 0.23655212883875595, "grad_norm": 4.125, "learning_rate": 3.668744376849064e-05, "loss": 1.5896613597869873, "step": 907 }, { "epoch": 0.23681293603703463, "grad_norm": 4.09375, "learning_rate": 3.6679394492088666e-05, "loss": 1.739761233329773, "step": 908 }, { "epoch": 0.2370737432353133, "grad_norm": 4.125, "learning_rate": 3.66713363332178e-05, "loss": 1.8526043891906738, "step": 909 }, { "epoch": 0.23733455043359197, "grad_norm": 4.5625, "learning_rate": 3.666326929616935e-05, "loss": 1.7840735912322998, "step": 910 }, { "epoch": 0.23759535763187065, "grad_norm": 4.53125, "learning_rate": 3.665519338523935e-05, "loss": 1.9899275302886963, "step": 911 }, { "epoch": 0.23785616483014932, "grad_norm": 4.5, "learning_rate": 3.6647108604728546e-05, "loss": 1.8728816509246826, "step": 912 }, { "epoch": 0.238116972028428, "grad_norm": 4.09375, "learning_rate": 3.6639014958942436e-05, "loss": 1.7578339576721191, "step": 913 }, { "epoch": 0.23837777922670667, "grad_norm": 4.15625, "learning_rate": 3.66309124521912e-05, "loss": 1.9648104906082153, "step": 914 }, { "epoch": 0.23863858642498534, "grad_norm": 4.25, "learning_rate": 3.662280108878978e-05, "loss": 1.7992619276046753, "step": 915 }, { "epoch": 0.238899393623264, "grad_norm": 10.1875, "learning_rate": 3.6614680873057796e-05, "loss": 2.0010461807250977, "step": 916 }, { "epoch": 0.23916020082154268, "grad_norm": 4.625, "learning_rate": 3.6606551809319614e-05, "loss": 1.8452863693237305, "step": 917 }, { "epoch": 0.23942100801982136, "grad_norm": 4.375, "learning_rate": 3.659841390190429e-05, "loss": 1.7582532167434692, "step": 918 }, { "epoch": 0.23968181521810003, "grad_norm": 4.53125, "learning_rate": 3.65902671551456e-05, "loss": 1.7853057384490967, "step": 919 }, { "epoch": 0.2399426224163787, "grad_norm": 4.5625, "learning_rate": 3.658211157338202e-05, "loss": 1.660017490386963, "step": 920 }, { "epoch": 0.24020342961465738, "grad_norm": 4.625, "learning_rate": 3.657394716095673e-05, "loss": 1.7863976955413818, "step": 921 }, { "epoch": 0.24046423681293605, "grad_norm": 4.21875, "learning_rate": 3.656577392221763e-05, "loss": 1.577195405960083, "step": 922 }, { "epoch": 0.24072504401121472, "grad_norm": 4.34375, "learning_rate": 3.655759186151731e-05, "loss": 1.7188184261322021, "step": 923 }, { "epoch": 0.2409858512094934, "grad_norm": 4.0625, "learning_rate": 3.654940098321305e-05, "loss": 1.5993618965148926, "step": 924 }, { "epoch": 0.24124665840777204, "grad_norm": 4.34375, "learning_rate": 3.654120129166682e-05, "loss": 1.9271817207336426, "step": 925 }, { "epoch": 0.24150746560605071, "grad_norm": 4.09375, "learning_rate": 3.653299279124532e-05, "loss": 1.5634753704071045, "step": 926 }, { "epoch": 0.2417682728043294, "grad_norm": 4.15625, "learning_rate": 3.65247754863199e-05, "loss": 1.7926197052001953, "step": 927 }, { "epoch": 0.24202908000260806, "grad_norm": 4.1875, "learning_rate": 3.651654938126662e-05, "loss": 2.1989662647247314, "step": 928 }, { "epoch": 0.24228988720088673, "grad_norm": 4.21875, "learning_rate": 3.650831448046623e-05, "loss": 1.8017569780349731, "step": 929 }, { "epoch": 0.2425506943991654, "grad_norm": 4.1875, "learning_rate": 3.650007078830414e-05, "loss": 1.9014790058135986, "step": 930 }, { "epoch": 0.24281150159744408, "grad_norm": 4.25, "learning_rate": 3.649181830917046e-05, "loss": 1.6705433130264282, "step": 931 }, { "epoch": 0.24307230879572275, "grad_norm": 4.65625, "learning_rate": 3.6483557047459994e-05, "loss": 1.9792215824127197, "step": 932 }, { "epoch": 0.24333311599400143, "grad_norm": 3.921875, "learning_rate": 3.6475287007572194e-05, "loss": 1.7080910205841064, "step": 933 }, { "epoch": 0.2435939231922801, "grad_norm": 4.46875, "learning_rate": 3.6467008193911195e-05, "loss": 2.0196614265441895, "step": 934 }, { "epoch": 0.24385473039055877, "grad_norm": 4.0625, "learning_rate": 3.645872061088581e-05, "loss": 1.619052529335022, "step": 935 }, { "epoch": 0.24411553758883744, "grad_norm": 4.28125, "learning_rate": 3.645042426290954e-05, "loss": 1.7661082744598389, "step": 936 }, { "epoch": 0.24437634478711612, "grad_norm": 4.0625, "learning_rate": 3.6442119154400506e-05, "loss": 1.5300219058990479, "step": 937 }, { "epoch": 0.2446371519853948, "grad_norm": 4.25, "learning_rate": 3.6433805289781535e-05, "loss": 1.6125133037567139, "step": 938 }, { "epoch": 0.24489795918367346, "grad_norm": 4.75, "learning_rate": 3.6425482673480114e-05, "loss": 1.6887743473052979, "step": 939 }, { "epoch": 0.24515876638195214, "grad_norm": 4.40625, "learning_rate": 3.641715130992836e-05, "loss": 1.908347249031067, "step": 940 }, { "epoch": 0.2454195735802308, "grad_norm": 4.25, "learning_rate": 3.6408811203563084e-05, "loss": 2.0740489959716797, "step": 941 }, { "epoch": 0.24568038077850948, "grad_norm": 4.28125, "learning_rate": 3.640046235882574e-05, "loss": 1.7063522338867188, "step": 942 }, { "epoch": 0.24594118797678816, "grad_norm": 4.3125, "learning_rate": 3.6392104780162425e-05, "loss": 1.7976257801055908, "step": 943 }, { "epoch": 0.24620199517506683, "grad_norm": 4.28125, "learning_rate": 3.63837384720239e-05, "loss": 1.89145827293396, "step": 944 }, { "epoch": 0.2464628023733455, "grad_norm": 4.125, "learning_rate": 3.6375363438865574e-05, "loss": 1.5576190948486328, "step": 945 }, { "epoch": 0.24672360957162418, "grad_norm": 4.46875, "learning_rate": 3.63669796851475e-05, "loss": 1.8441598415374756, "step": 946 }, { "epoch": 0.24698441676990285, "grad_norm": 4.125, "learning_rate": 3.6358587215334355e-05, "loss": 1.5619696378707886, "step": 947 }, { "epoch": 0.24724522396818152, "grad_norm": 4.40625, "learning_rate": 3.6350186033895505e-05, "loss": 1.5821995735168457, "step": 948 }, { "epoch": 0.2475060311664602, "grad_norm": 4.0, "learning_rate": 3.634177614530491e-05, "loss": 1.66362464427948, "step": 949 }, { "epoch": 0.24776683836473887, "grad_norm": 4.15625, "learning_rate": 3.633335755404119e-05, "loss": 1.8080108165740967, "step": 950 }, { "epoch": 0.24802764556301754, "grad_norm": 4.53125, "learning_rate": 3.63249302645876e-05, "loss": 2.0615901947021484, "step": 951 }, { "epoch": 0.2482884527612962, "grad_norm": 4.25, "learning_rate": 3.6316494281432e-05, "loss": 1.8065787553787231, "step": 952 }, { "epoch": 0.2485492599595749, "grad_norm": 4.4375, "learning_rate": 3.630804960906693e-05, "loss": 1.9311704635620117, "step": 953 }, { "epoch": 0.24881006715785356, "grad_norm": 4.125, "learning_rate": 3.62995962519895e-05, "loss": 1.6823601722717285, "step": 954 }, { "epoch": 0.24907087435613223, "grad_norm": 4.15625, "learning_rate": 3.629113421470149e-05, "loss": 1.729414463043213, "step": 955 }, { "epoch": 0.2493316815544109, "grad_norm": 3.921875, "learning_rate": 3.628266350170929e-05, "loss": 1.53072190284729, "step": 956 }, { "epoch": 0.24959248875268958, "grad_norm": 4.96875, "learning_rate": 3.6274184117523885e-05, "loss": 2.1300301551818848, "step": 957 }, { "epoch": 0.24985329595096825, "grad_norm": 4.3125, "learning_rate": 3.626569606666092e-05, "loss": 1.8616907596588135, "step": 958 }, { "epoch": 0.2501141031492469, "grad_norm": 4.4375, "learning_rate": 3.625719935364061e-05, "loss": 1.9490491151809692, "step": 959 }, { "epoch": 0.25037491034752557, "grad_norm": 4.3125, "learning_rate": 3.624869398298783e-05, "loss": 1.6784336566925049, "step": 960 }, { "epoch": 0.25063571754580427, "grad_norm": 3.96875, "learning_rate": 3.624017995923204e-05, "loss": 1.5734858512878418, "step": 961 }, { "epoch": 0.2508965247440829, "grad_norm": 4.0625, "learning_rate": 3.6231657286907294e-05, "loss": 1.7256945371627808, "step": 962 }, { "epoch": 0.2511573319423616, "grad_norm": 4.3125, "learning_rate": 3.622312597055229e-05, "loss": 1.8039542436599731, "step": 963 }, { "epoch": 0.25141813914064026, "grad_norm": 4.125, "learning_rate": 3.6214586014710285e-05, "loss": 1.8100578784942627, "step": 964 }, { "epoch": 0.25167894633891896, "grad_norm": 4.53125, "learning_rate": 3.6206037423929175e-05, "loss": 2.144407033920288, "step": 965 }, { "epoch": 0.2519397535371976, "grad_norm": 4.375, "learning_rate": 3.619748020276143e-05, "loss": 1.6897966861724854, "step": 966 }, { "epoch": 0.2522005607354763, "grad_norm": 4.25, "learning_rate": 3.618891435576414e-05, "loss": 1.4720361232757568, "step": 967 }, { "epoch": 0.25246136793375495, "grad_norm": 4.4375, "learning_rate": 3.6180339887498953e-05, "loss": 1.9113588333129883, "step": 968 }, { "epoch": 0.25272217513203366, "grad_norm": 4.1875, "learning_rate": 3.617175680253214e-05, "loss": 1.7395099401474, "step": 969 }, { "epoch": 0.2529829823303123, "grad_norm": 4.40625, "learning_rate": 3.6163165105434545e-05, "loss": 1.8829009532928467, "step": 970 }, { "epoch": 0.253243789528591, "grad_norm": 4.40625, "learning_rate": 3.615456480078162e-05, "loss": 1.7133891582489014, "step": 971 }, { "epoch": 0.25350459672686965, "grad_norm": 4.46875, "learning_rate": 3.6145955893153355e-05, "loss": 1.9484844207763672, "step": 972 }, { "epoch": 0.25376540392514835, "grad_norm": 4.28125, "learning_rate": 3.613733838713437e-05, "loss": 1.6234208345413208, "step": 973 }, { "epoch": 0.254026211123427, "grad_norm": 4.8125, "learning_rate": 3.612871228731384e-05, "loss": 1.8469617366790771, "step": 974 }, { "epoch": 0.2542870183217057, "grad_norm": 4.3125, "learning_rate": 3.612007759828552e-05, "loss": 1.8527976274490356, "step": 975 }, { "epoch": 0.25454782551998434, "grad_norm": 4.15625, "learning_rate": 3.611143432464773e-05, "loss": 1.6421080827713013, "step": 976 }, { "epoch": 0.25480863271826304, "grad_norm": 4.0, "learning_rate": 3.610278247100339e-05, "loss": 1.7878233194351196, "step": 977 }, { "epoch": 0.2550694399165417, "grad_norm": 4.78125, "learning_rate": 3.609412204195996e-05, "loss": 1.8771930932998657, "step": 978 }, { "epoch": 0.2553302471148204, "grad_norm": 4.625, "learning_rate": 3.608545304212948e-05, "loss": 2.0811333656311035, "step": 979 }, { "epoch": 0.25559105431309903, "grad_norm": 4.34375, "learning_rate": 3.607677547612855e-05, "loss": 1.7313621044158936, "step": 980 }, { "epoch": 0.25585186151137773, "grad_norm": 4.46875, "learning_rate": 3.6068089348578335e-05, "loss": 1.8115135431289673, "step": 981 }, { "epoch": 0.2561126687096564, "grad_norm": 4.40625, "learning_rate": 3.6059394664104554e-05, "loss": 2.017341136932373, "step": 982 }, { "epoch": 0.2563734759079351, "grad_norm": 4.28125, "learning_rate": 3.605069142733749e-05, "loss": 1.7018239498138428, "step": 983 }, { "epoch": 0.2566342831062137, "grad_norm": 4.03125, "learning_rate": 3.604197964291199e-05, "loss": 1.8607988357543945, "step": 984 }, { "epoch": 0.2568950903044924, "grad_norm": 4.0, "learning_rate": 3.6033259315467406e-05, "loss": 1.794417142868042, "step": 985 }, { "epoch": 0.25715589750277107, "grad_norm": 4.21875, "learning_rate": 3.602453044964771e-05, "loss": 1.9194737672805786, "step": 986 }, { "epoch": 0.25741670470104977, "grad_norm": 4.3125, "learning_rate": 3.6015793050101364e-05, "loss": 1.6993803977966309, "step": 987 }, { "epoch": 0.2576775118993284, "grad_norm": 4.46875, "learning_rate": 3.60070471214814e-05, "loss": 1.7307885885238647, "step": 988 }, { "epoch": 0.2579383190976071, "grad_norm": 4.40625, "learning_rate": 3.59982926684454e-05, "loss": 1.7992185354232788, "step": 989 }, { "epoch": 0.25819912629588576, "grad_norm": 4.0625, "learning_rate": 3.598952969565545e-05, "loss": 1.8427271842956543, "step": 990 }, { "epoch": 0.25845993349416446, "grad_norm": 4.40625, "learning_rate": 3.598075820777822e-05, "loss": 1.8364369869232178, "step": 991 }, { "epoch": 0.2587207406924431, "grad_norm": 3.96875, "learning_rate": 3.597197820948487e-05, "loss": 1.682704210281372, "step": 992 }, { "epoch": 0.2589815478907218, "grad_norm": 4.75, "learning_rate": 3.5963189705451124e-05, "loss": 1.9118919372558594, "step": 993 }, { "epoch": 0.25924235508900045, "grad_norm": 3.96875, "learning_rate": 3.595439270035722e-05, "loss": 1.6127727031707764, "step": 994 }, { "epoch": 0.25950316228727915, "grad_norm": 4.03125, "learning_rate": 3.594558719888793e-05, "loss": 1.6650826930999756, "step": 995 }, { "epoch": 0.2597639694855578, "grad_norm": 4.15625, "learning_rate": 3.593677320573256e-05, "loss": 1.7605465650558472, "step": 996 }, { "epoch": 0.2600247766838365, "grad_norm": 4.25, "learning_rate": 3.5927950725584905e-05, "loss": 1.442620038986206, "step": 997 }, { "epoch": 0.26028558388211515, "grad_norm": 4.125, "learning_rate": 3.591911976314332e-05, "loss": 2.185831069946289, "step": 998 }, { "epoch": 0.26054639108039385, "grad_norm": 4.03125, "learning_rate": 3.591028032311065e-05, "loss": 1.6497381925582886, "step": 999 }, { "epoch": 0.2608071982786725, "grad_norm": 4.59375, "learning_rate": 3.590143241019426e-05, "loss": 1.868830680847168, "step": 1000 }, { "epoch": 0.26106800547695114, "grad_norm": 4.125, "learning_rate": 3.5892576029106034e-05, "loss": 1.7232383489608765, "step": 1001 }, { "epoch": 0.26132881267522984, "grad_norm": 4.8125, "learning_rate": 3.588371118456237e-05, "loss": 1.8657559156417847, "step": 1002 }, { "epoch": 0.2615896198735085, "grad_norm": 4.15625, "learning_rate": 3.587483788128415e-05, "loss": 1.80161452293396, "step": 1003 }, { "epoch": 0.2618504270717872, "grad_norm": 4.25, "learning_rate": 3.5865956123996785e-05, "loss": 1.4268081188201904, "step": 1004 }, { "epoch": 0.26211123427006583, "grad_norm": 4.5, "learning_rate": 3.585706591743018e-05, "loss": 1.8081510066986084, "step": 1005 }, { "epoch": 0.26237204146834453, "grad_norm": 4.3125, "learning_rate": 3.584816726631873e-05, "loss": 1.6240954399108887, "step": 1006 }, { "epoch": 0.2626328486666232, "grad_norm": 4.625, "learning_rate": 3.5839260175401345e-05, "loss": 1.733473300933838, "step": 1007 }, { "epoch": 0.2628936558649019, "grad_norm": 4.15625, "learning_rate": 3.5830344649421416e-05, "loss": 1.8932445049285889, "step": 1008 }, { "epoch": 0.2631544630631805, "grad_norm": 4.59375, "learning_rate": 3.5821420693126834e-05, "loss": 1.9630615711212158, "step": 1009 }, { "epoch": 0.2634152702614592, "grad_norm": 4.375, "learning_rate": 3.581248831126996e-05, "loss": 1.6601322889328003, "step": 1010 }, { "epoch": 0.26367607745973787, "grad_norm": 4.1875, "learning_rate": 3.580354750860768e-05, "loss": 1.8083221912384033, "step": 1011 }, { "epoch": 0.26393688465801657, "grad_norm": 4.5625, "learning_rate": 3.579459828990133e-05, "loss": 1.8334442377090454, "step": 1012 }, { "epoch": 0.2641976918562952, "grad_norm": 4.96875, "learning_rate": 3.5785640659916736e-05, "loss": 2.0063252449035645, "step": 1013 }, { "epoch": 0.2644584990545739, "grad_norm": 4.40625, "learning_rate": 3.5776674623424226e-05, "loss": 2.0151400566101074, "step": 1014 }, { "epoch": 0.26471930625285256, "grad_norm": 3.84375, "learning_rate": 3.5767700185198556e-05, "loss": 1.4993696212768555, "step": 1015 }, { "epoch": 0.26498011345113126, "grad_norm": 4.25, "learning_rate": 3.575871735001901e-05, "loss": 1.8012977838516235, "step": 1016 }, { "epoch": 0.2652409206494099, "grad_norm": 4.28125, "learning_rate": 3.5749726122669316e-05, "loss": 1.8131016492843628, "step": 1017 }, { "epoch": 0.2655017278476886, "grad_norm": 4.40625, "learning_rate": 3.574072650793767e-05, "loss": 1.7579952478408813, "step": 1018 }, { "epoch": 0.26576253504596725, "grad_norm": 4.78125, "learning_rate": 3.573171851061674e-05, "loss": 1.9900325536727905, "step": 1019 }, { "epoch": 0.26602334224424595, "grad_norm": 3.984375, "learning_rate": 3.5722702135503664e-05, "loss": 1.738937258720398, "step": 1020 }, { "epoch": 0.2662841494425246, "grad_norm": 4.78125, "learning_rate": 3.571367738740003e-05, "loss": 1.9216761589050293, "step": 1021 }, { "epoch": 0.2665449566408033, "grad_norm": 3.984375, "learning_rate": 3.570464427111189e-05, "loss": 1.5162055492401123, "step": 1022 }, { "epoch": 0.26680576383908194, "grad_norm": 4.5625, "learning_rate": 3.569560279144976e-05, "loss": 1.8345317840576172, "step": 1023 }, { "epoch": 0.26706657103736064, "grad_norm": 4.4375, "learning_rate": 3.568655295322859e-05, "loss": 1.8400464057922363, "step": 1024 }, { "epoch": 0.2673273782356393, "grad_norm": 4.15625, "learning_rate": 3.567749476126781e-05, "loss": 1.690098762512207, "step": 1025 }, { "epoch": 0.267588185433918, "grad_norm": 4.34375, "learning_rate": 3.566842822039127e-05, "loss": 1.9592373371124268, "step": 1026 }, { "epoch": 0.26784899263219664, "grad_norm": 4.5, "learning_rate": 3.565935333542729e-05, "loss": 1.7009608745574951, "step": 1027 }, { "epoch": 0.26810979983047534, "grad_norm": 3.9375, "learning_rate": 3.56502701112086e-05, "loss": 1.7458827495574951, "step": 1028 }, { "epoch": 0.268370607028754, "grad_norm": 4.34375, "learning_rate": 3.564117855257242e-05, "loss": 1.8003566265106201, "step": 1029 }, { "epoch": 0.2686314142270327, "grad_norm": 4.71875, "learning_rate": 3.5632078664360365e-05, "loss": 1.5618364810943604, "step": 1030 }, { "epoch": 0.26889222142531133, "grad_norm": 4.1875, "learning_rate": 3.562297045141851e-05, "loss": 1.9563771486282349, "step": 1031 }, { "epoch": 0.26915302862359003, "grad_norm": 4.75, "learning_rate": 3.561385391859736e-05, "loss": 1.5928397178649902, "step": 1032 }, { "epoch": 0.2694138358218687, "grad_norm": 4.5625, "learning_rate": 3.560472907075183e-05, "loss": 1.7969861030578613, "step": 1033 }, { "epoch": 0.2696746430201474, "grad_norm": 3.890625, "learning_rate": 3.559559591274129e-05, "loss": 1.464586615562439, "step": 1034 }, { "epoch": 0.269935450218426, "grad_norm": 4.40625, "learning_rate": 3.558645444942953e-05, "loss": 1.7187366485595703, "step": 1035 }, { "epoch": 0.2701962574167047, "grad_norm": 4.4375, "learning_rate": 3.557730468568476e-05, "loss": 1.5232993364334106, "step": 1036 }, { "epoch": 0.27045706461498337, "grad_norm": 4.3125, "learning_rate": 3.556814662637959e-05, "loss": 1.7988381385803223, "step": 1037 }, { "epoch": 0.27071787181326207, "grad_norm": 4.25, "learning_rate": 3.555898027639109e-05, "loss": 1.8296515941619873, "step": 1038 }, { "epoch": 0.2709786790115407, "grad_norm": 4.125, "learning_rate": 3.55498056406007e-05, "loss": 1.3963137865066528, "step": 1039 }, { "epoch": 0.2712394862098194, "grad_norm": 4.75, "learning_rate": 3.554062272389431e-05, "loss": 1.8541409969329834, "step": 1040 }, { "epoch": 0.27150029340809806, "grad_norm": 4.15625, "learning_rate": 3.553143153116219e-05, "loss": 1.891303300857544, "step": 1041 }, { "epoch": 0.27176110060637676, "grad_norm": 4.875, "learning_rate": 3.552223206729904e-05, "loss": 2.0532147884368896, "step": 1042 }, { "epoch": 0.2720219078046554, "grad_norm": 3.859375, "learning_rate": 3.551302433720396e-05, "loss": 1.566463828086853, "step": 1043 }, { "epoch": 0.2722827150029341, "grad_norm": 4.375, "learning_rate": 3.550380834578044e-05, "loss": 1.5827128887176514, "step": 1044 }, { "epoch": 0.27254352220121275, "grad_norm": 4.4375, "learning_rate": 3.5494584097936375e-05, "loss": 1.8478995561599731, "step": 1045 }, { "epoch": 0.27280432939949145, "grad_norm": 6.125, "learning_rate": 3.5485351598584066e-05, "loss": 1.8030246496200562, "step": 1046 }, { "epoch": 0.2730651365977701, "grad_norm": 4.3125, "learning_rate": 3.54761108526402e-05, "loss": 1.7071260213851929, "step": 1047 }, { "epoch": 0.2733259437960488, "grad_norm": 4.09375, "learning_rate": 3.5466861865025856e-05, "loss": 1.7506829500198364, "step": 1048 }, { "epoch": 0.27358675099432744, "grad_norm": 4.0625, "learning_rate": 3.54576046406665e-05, "loss": 2.0367753505706787, "step": 1049 }, { "epoch": 0.2738475581926061, "grad_norm": 3.890625, "learning_rate": 3.544833918449199e-05, "loss": 1.6471517086029053, "step": 1050 }, { "epoch": 0.2741083653908848, "grad_norm": 4.3125, "learning_rate": 3.5439065501436575e-05, "loss": 2.0317742824554443, "step": 1051 }, { "epoch": 0.27436917258916343, "grad_norm": 3.9375, "learning_rate": 3.5429783596438864e-05, "loss": 1.732642650604248, "step": 1052 }, { "epoch": 0.27462997978744214, "grad_norm": 4.03125, "learning_rate": 3.5420493474441855e-05, "loss": 1.9185147285461426, "step": 1053 }, { "epoch": 0.2748907869857208, "grad_norm": 4.0625, "learning_rate": 3.5411195140392936e-05, "loss": 1.7298734188079834, "step": 1054 }, { "epoch": 0.2751515941839995, "grad_norm": 4.28125, "learning_rate": 3.540188859924384e-05, "loss": 1.9768015146255493, "step": 1055 }, { "epoch": 0.2754124013822781, "grad_norm": 5.40625, "learning_rate": 3.539257385595069e-05, "loss": 1.898254632949829, "step": 1056 }, { "epoch": 0.2756732085805568, "grad_norm": 4.15625, "learning_rate": 3.538325091547398e-05, "loss": 1.6079350709915161, "step": 1057 }, { "epoch": 0.2759340157788355, "grad_norm": 4.21875, "learning_rate": 3.537391978277856e-05, "loss": 1.719800591468811, "step": 1058 }, { "epoch": 0.2761948229771142, "grad_norm": 26.0, "learning_rate": 3.536458046283364e-05, "loss": 2.0866620540618896, "step": 1059 }, { "epoch": 0.2764556301753928, "grad_norm": 4.53125, "learning_rate": 3.535523296061279e-05, "loss": 1.798237919807434, "step": 1060 }, { "epoch": 0.2767164373736715, "grad_norm": 3.921875, "learning_rate": 3.534587728109396e-05, "loss": 1.8382494449615479, "step": 1061 }, { "epoch": 0.27697724457195017, "grad_norm": 4.6875, "learning_rate": 3.533651342925942e-05, "loss": 1.8185405731201172, "step": 1062 }, { "epoch": 0.27723805177022887, "grad_norm": 4.46875, "learning_rate": 3.532714141009583e-05, "loss": 1.7275090217590332, "step": 1063 }, { "epoch": 0.2774988589685075, "grad_norm": 4.46875, "learning_rate": 3.531776122859415e-05, "loss": 2.008340358734131, "step": 1064 }, { "epoch": 0.2777596661667862, "grad_norm": 4.4375, "learning_rate": 3.530837288974974e-05, "loss": 1.8108479976654053, "step": 1065 }, { "epoch": 0.27802047336506486, "grad_norm": 4.125, "learning_rate": 3.529897639856226e-05, "loss": 1.894986867904663, "step": 1066 }, { "epoch": 0.27828128056334356, "grad_norm": 3.984375, "learning_rate": 3.528957176003575e-05, "loss": 1.6555955410003662, "step": 1067 }, { "epoch": 0.2785420877616222, "grad_norm": 4.53125, "learning_rate": 3.528015897917856e-05, "loss": 1.951297402381897, "step": 1068 }, { "epoch": 0.2788028949599009, "grad_norm": 4.21875, "learning_rate": 3.527073806100338e-05, "loss": 1.9099540710449219, "step": 1069 }, { "epoch": 0.27906370215817955, "grad_norm": 3.71875, "learning_rate": 3.5261309010527256e-05, "loss": 1.778804063796997, "step": 1070 }, { "epoch": 0.27932450935645825, "grad_norm": 4.21875, "learning_rate": 3.525187183277153e-05, "loss": 1.7820208072662354, "step": 1071 }, { "epoch": 0.2795853165547369, "grad_norm": 3.953125, "learning_rate": 3.52424265327619e-05, "loss": 1.6652235984802246, "step": 1072 }, { "epoch": 0.2798461237530156, "grad_norm": 4.03125, "learning_rate": 3.5232973115528375e-05, "loss": 1.863331913948059, "step": 1073 }, { "epoch": 0.28010693095129424, "grad_norm": 3.859375, "learning_rate": 3.52235115861053e-05, "loss": 1.4964725971221924, "step": 1074 }, { "epoch": 0.28036773814957294, "grad_norm": 4.03125, "learning_rate": 3.5214041949531324e-05, "loss": 1.6539411544799805, "step": 1075 }, { "epoch": 0.2806285453478516, "grad_norm": 4.625, "learning_rate": 3.5204564210849425e-05, "loss": 1.8654205799102783, "step": 1076 }, { "epoch": 0.2808893525461303, "grad_norm": 4.21875, "learning_rate": 3.519507837510689e-05, "loss": 1.7179417610168457, "step": 1077 }, { "epoch": 0.28115015974440893, "grad_norm": 3.953125, "learning_rate": 3.518558444735532e-05, "loss": 1.6654682159423828, "step": 1078 }, { "epoch": 0.28141096694268763, "grad_norm": 4.125, "learning_rate": 3.517608243265063e-05, "loss": 1.6149661540985107, "step": 1079 }, { "epoch": 0.2816717741409663, "grad_norm": 4.25, "learning_rate": 3.516657233605302e-05, "loss": 1.7557415962219238, "step": 1080 }, { "epoch": 0.281932581339245, "grad_norm": 4.03125, "learning_rate": 3.5157054162627036e-05, "loss": 1.8852646350860596, "step": 1081 }, { "epoch": 0.2821933885375236, "grad_norm": 4.3125, "learning_rate": 3.514752791744147e-05, "loss": 2.0640451908111572, "step": 1082 }, { "epoch": 0.2824541957358023, "grad_norm": 4.59375, "learning_rate": 3.513799360556947e-05, "loss": 1.7899377346038818, "step": 1083 }, { "epoch": 0.282715002934081, "grad_norm": 4.0625, "learning_rate": 3.512845123208844e-05, "loss": 1.7269141674041748, "step": 1084 }, { "epoch": 0.2829758101323597, "grad_norm": 4.1875, "learning_rate": 3.511890080208008e-05, "loss": 1.6378408670425415, "step": 1085 }, { "epoch": 0.2832366173306383, "grad_norm": 4.28125, "learning_rate": 3.510934232063041e-05, "loss": 1.911826491355896, "step": 1086 }, { "epoch": 0.283497424528917, "grad_norm": 4.71875, "learning_rate": 3.509977579282971e-05, "loss": 1.7262004613876343, "step": 1087 }, { "epoch": 0.28375823172719566, "grad_norm": 4.40625, "learning_rate": 3.509020122377254e-05, "loss": 1.9643630981445312, "step": 1088 }, { "epoch": 0.28401903892547437, "grad_norm": 4.125, "learning_rate": 3.508061861855777e-05, "loss": 1.7308140993118286, "step": 1089 }, { "epoch": 0.284279846123753, "grad_norm": 4.21875, "learning_rate": 3.507102798228852e-05, "loss": 1.5944818258285522, "step": 1090 }, { "epoch": 0.2845406533220317, "grad_norm": 4.15625, "learning_rate": 3.5061429320072225e-05, "loss": 1.4689910411834717, "step": 1091 }, { "epoch": 0.28480146052031036, "grad_norm": 3.8125, "learning_rate": 3.505182263702055e-05, "loss": 1.7175511121749878, "step": 1092 }, { "epoch": 0.28506226771858906, "grad_norm": 4.625, "learning_rate": 3.504220793824945e-05, "loss": 1.98661470413208, "step": 1093 }, { "epoch": 0.2853230749168677, "grad_norm": 4.375, "learning_rate": 3.503258522887917e-05, "loss": 1.6025145053863525, "step": 1094 }, { "epoch": 0.2855838821151464, "grad_norm": 3.703125, "learning_rate": 3.502295451403419e-05, "loss": 1.6541790962219238, "step": 1095 }, { "epoch": 0.28584468931342505, "grad_norm": 3.828125, "learning_rate": 3.501331579884325e-05, "loss": 1.834327220916748, "step": 1096 }, { "epoch": 0.28610549651170375, "grad_norm": 3.921875, "learning_rate": 3.5003669088439395e-05, "loss": 1.7379425764083862, "step": 1097 }, { "epoch": 0.2863663037099824, "grad_norm": 4.1875, "learning_rate": 3.499401438795988e-05, "loss": 1.7236168384552002, "step": 1098 }, { "epoch": 0.2866271109082611, "grad_norm": 4.1875, "learning_rate": 3.498435170254624e-05, "loss": 1.6990818977355957, "step": 1099 }, { "epoch": 0.28688791810653974, "grad_norm": 4.03125, "learning_rate": 3.497468103734424e-05, "loss": 1.7150160074234009, "step": 1100 }, { "epoch": 0.2871487253048184, "grad_norm": 4.28125, "learning_rate": 3.4965002397503936e-05, "loss": 1.8012375831604004, "step": 1101 }, { "epoch": 0.2874095325030971, "grad_norm": 3.96875, "learning_rate": 3.495531578817958e-05, "loss": 1.4892457723617554, "step": 1102 }, { "epoch": 0.28767033970137573, "grad_norm": 4.3125, "learning_rate": 3.4945621214529705e-05, "loss": 1.8427650928497314, "step": 1103 }, { "epoch": 0.28793114689965443, "grad_norm": 3.75, "learning_rate": 3.493591868171709e-05, "loss": 1.5933095216751099, "step": 1104 }, { "epoch": 0.2881919540979331, "grad_norm": 4.375, "learning_rate": 3.49262081949087e-05, "loss": 1.6940237283706665, "step": 1105 }, { "epoch": 0.2884527612962118, "grad_norm": 3.953125, "learning_rate": 3.4916489759275794e-05, "loss": 1.495390772819519, "step": 1106 }, { "epoch": 0.2887135684944904, "grad_norm": 3.859375, "learning_rate": 3.490676337999383e-05, "loss": 1.590932846069336, "step": 1107 }, { "epoch": 0.2889743756927691, "grad_norm": 3.96875, "learning_rate": 3.489702906224253e-05, "loss": 1.7562646865844727, "step": 1108 }, { "epoch": 0.28923518289104777, "grad_norm": 3.9375, "learning_rate": 3.48872868112058e-05, "loss": 1.5710026025772095, "step": 1109 }, { "epoch": 0.28949599008932647, "grad_norm": 4.125, "learning_rate": 3.48775366320718e-05, "loss": 1.8648991584777832, "step": 1110 }, { "epoch": 0.2897567972876051, "grad_norm": 4.21875, "learning_rate": 3.48677785300329e-05, "loss": 1.6395639181137085, "step": 1111 }, { "epoch": 0.2900176044858838, "grad_norm": 4.5, "learning_rate": 3.48580125102857e-05, "loss": 1.6861207485198975, "step": 1112 }, { "epoch": 0.29027841168416246, "grad_norm": 4.21875, "learning_rate": 3.484823857803101e-05, "loss": 1.7843270301818848, "step": 1113 }, { "epoch": 0.29053921888244116, "grad_norm": 4.15625, "learning_rate": 3.483845673847386e-05, "loss": 1.8121414184570312, "step": 1114 }, { "epoch": 0.2908000260807198, "grad_norm": 4.09375, "learning_rate": 3.482866699682347e-05, "loss": 1.9957536458969116, "step": 1115 }, { "epoch": 0.2910608332789985, "grad_norm": 4.46875, "learning_rate": 3.4818869358293285e-05, "loss": 1.9593991041183472, "step": 1116 }, { "epoch": 0.29132164047727716, "grad_norm": 4.09375, "learning_rate": 3.4809063828100965e-05, "loss": 1.7332122325897217, "step": 1117 }, { "epoch": 0.29158244767555586, "grad_norm": 4.375, "learning_rate": 3.479925041146836e-05, "loss": 1.657759189605713, "step": 1118 }, { "epoch": 0.2918432548738345, "grad_norm": 4.625, "learning_rate": 3.4789429113621517e-05, "loss": 1.997182011604309, "step": 1119 }, { "epoch": 0.2921040620721132, "grad_norm": 4.0625, "learning_rate": 3.477959993979068e-05, "loss": 1.7544701099395752, "step": 1120 }, { "epoch": 0.29236486927039185, "grad_norm": 4.3125, "learning_rate": 3.476976289521029e-05, "loss": 1.771339774131775, "step": 1121 }, { "epoch": 0.29262567646867055, "grad_norm": 4.125, "learning_rate": 3.475991798511899e-05, "loss": 1.9331625699996948, "step": 1122 }, { "epoch": 0.2928864836669492, "grad_norm": 4.0625, "learning_rate": 3.47500652147596e-05, "loss": 2.144134521484375, "step": 1123 }, { "epoch": 0.2931472908652279, "grad_norm": 4.53125, "learning_rate": 3.4740204589379125e-05, "loss": 1.9860496520996094, "step": 1124 }, { "epoch": 0.29340809806350654, "grad_norm": 4.34375, "learning_rate": 3.4730336114228756e-05, "loss": 1.868838906288147, "step": 1125 }, { "epoch": 0.29366890526178524, "grad_norm": 4.09375, "learning_rate": 3.472045979456387e-05, "loss": 1.7470306158065796, "step": 1126 }, { "epoch": 0.2939297124600639, "grad_norm": 4.1875, "learning_rate": 3.471057563564403e-05, "loss": 1.7575474977493286, "step": 1127 }, { "epoch": 0.2941905196583426, "grad_norm": 4.21875, "learning_rate": 3.470068364273292e-05, "loss": 1.9841556549072266, "step": 1128 }, { "epoch": 0.29445132685662123, "grad_norm": 4.53125, "learning_rate": 3.4690783821098476e-05, "loss": 1.9896951913833618, "step": 1129 }, { "epoch": 0.29471213405489993, "grad_norm": 4.125, "learning_rate": 3.468087617601275e-05, "loss": 1.8877949714660645, "step": 1130 }, { "epoch": 0.2949729412531786, "grad_norm": 3.9375, "learning_rate": 3.4670960712751965e-05, "loss": 1.5972007513046265, "step": 1131 }, { "epoch": 0.2952337484514573, "grad_norm": 4.25, "learning_rate": 3.4661037436596526e-05, "loss": 1.761352777481079, "step": 1132 }, { "epoch": 0.2954945556497359, "grad_norm": 4.75, "learning_rate": 3.465110635283099e-05, "loss": 1.855859637260437, "step": 1133 }, { "epoch": 0.2957553628480146, "grad_norm": 4.09375, "learning_rate": 3.464116746674407e-05, "loss": 1.6813459396362305, "step": 1134 }, { "epoch": 0.29601617004629327, "grad_norm": 3.8125, "learning_rate": 3.4631220783628624e-05, "loss": 1.7292029857635498, "step": 1135 }, { "epoch": 0.29627697724457197, "grad_norm": 4.53125, "learning_rate": 3.462126630878168e-05, "loss": 1.9538642168045044, "step": 1136 }, { "epoch": 0.2965377844428506, "grad_norm": 4.09375, "learning_rate": 3.4611304047504416e-05, "loss": 1.7921640872955322, "step": 1137 }, { "epoch": 0.2967985916411293, "grad_norm": 4.4375, "learning_rate": 3.460133400510214e-05, "loss": 1.724027156829834, "step": 1138 }, { "epoch": 0.29705939883940796, "grad_norm": 4.375, "learning_rate": 3.459135618688431e-05, "loss": 1.9083806276321411, "step": 1139 }, { "epoch": 0.29732020603768666, "grad_norm": 4.0, "learning_rate": 3.4581370598164535e-05, "loss": 1.5803298950195312, "step": 1140 }, { "epoch": 0.2975810132359653, "grad_norm": 3.875, "learning_rate": 3.4571377244260554e-05, "loss": 1.4782859086990356, "step": 1141 }, { "epoch": 0.297841820434244, "grad_norm": 4.375, "learning_rate": 3.456137613049424e-05, "loss": 1.9525219202041626, "step": 1142 }, { "epoch": 0.29810262763252265, "grad_norm": 3.921875, "learning_rate": 3.45513672621916e-05, "loss": 1.9345943927764893, "step": 1143 }, { "epoch": 0.29836343483080136, "grad_norm": 4.125, "learning_rate": 3.454135064468278e-05, "loss": 1.9073323011398315, "step": 1144 }, { "epoch": 0.29862424202908, "grad_norm": 4.21875, "learning_rate": 3.453132628330203e-05, "loss": 1.7295219898223877, "step": 1145 }, { "epoch": 0.2988850492273587, "grad_norm": 4.28125, "learning_rate": 3.452129418338775e-05, "loss": 1.6505075693130493, "step": 1146 }, { "epoch": 0.29914585642563735, "grad_norm": 4.25, "learning_rate": 3.4511254350282436e-05, "loss": 1.7804162502288818, "step": 1147 }, { "epoch": 0.29940666362391605, "grad_norm": 4.125, "learning_rate": 3.450120678933273e-05, "loss": 1.8169751167297363, "step": 1148 }, { "epoch": 0.2996674708221947, "grad_norm": 4.28125, "learning_rate": 3.449115150588936e-05, "loss": 1.6251006126403809, "step": 1149 }, { "epoch": 0.29992827802047334, "grad_norm": 4.25, "learning_rate": 3.448108850530719e-05, "loss": 1.8406720161437988, "step": 1150 }, { "epoch": 0.30018908521875204, "grad_norm": 4.25, "learning_rate": 3.4471017792945186e-05, "loss": 1.9019505977630615, "step": 1151 }, { "epoch": 0.3004498924170307, "grad_norm": 4.125, "learning_rate": 3.4460939374166424e-05, "loss": 1.896089792251587, "step": 1152 }, { "epoch": 0.3007106996153094, "grad_norm": 4.03125, "learning_rate": 3.4450853254338065e-05, "loss": 1.679295301437378, "step": 1153 }, { "epoch": 0.30097150681358803, "grad_norm": 4.25, "learning_rate": 3.44407594388314e-05, "loss": 1.6873747110366821, "step": 1154 }, { "epoch": 0.30123231401186673, "grad_norm": 4.0, "learning_rate": 3.443065793302179e-05, "loss": 1.688307523727417, "step": 1155 }, { "epoch": 0.3014931212101454, "grad_norm": 4.5625, "learning_rate": 3.4420548742288716e-05, "loss": 1.8257405757904053, "step": 1156 }, { "epoch": 0.3017539284084241, "grad_norm": 4.53125, "learning_rate": 3.441043187201574e-05, "loss": 1.7855994701385498, "step": 1157 }, { "epoch": 0.3020147356067027, "grad_norm": 3.859375, "learning_rate": 3.4400307327590517e-05, "loss": 1.6374878883361816, "step": 1158 }, { "epoch": 0.3022755428049814, "grad_norm": 4.25, "learning_rate": 3.439017511440478e-05, "loss": 1.905398964881897, "step": 1159 }, { "epoch": 0.30253635000326007, "grad_norm": 4.21875, "learning_rate": 3.4380035237854355e-05, "loss": 1.79018235206604, "step": 1160 }, { "epoch": 0.30279715720153877, "grad_norm": 4.4375, "learning_rate": 3.436988770333915e-05, "loss": 1.934191107749939, "step": 1161 }, { "epoch": 0.3030579643998174, "grad_norm": 4.34375, "learning_rate": 3.435973251626314e-05, "loss": 1.959614634513855, "step": 1162 }, { "epoch": 0.3033187715980961, "grad_norm": 4.96875, "learning_rate": 3.4349569682034394e-05, "loss": 1.8245642185211182, "step": 1163 }, { "epoch": 0.30357957879637476, "grad_norm": 4.03125, "learning_rate": 3.433939920606503e-05, "loss": 1.9029959440231323, "step": 1164 }, { "epoch": 0.30384038599465346, "grad_norm": 4.3125, "learning_rate": 3.432922109377125e-05, "loss": 1.563682198524475, "step": 1165 }, { "epoch": 0.3041011931929321, "grad_norm": 4.1875, "learning_rate": 3.4319035350573334e-05, "loss": 2.009978771209717, "step": 1166 }, { "epoch": 0.3043620003912108, "grad_norm": 3.703125, "learning_rate": 3.43088419818956e-05, "loss": 1.6859945058822632, "step": 1167 }, { "epoch": 0.30462280758948945, "grad_norm": 3.71875, "learning_rate": 3.4298640993166446e-05, "loss": 1.7476444244384766, "step": 1168 }, { "epoch": 0.30488361478776815, "grad_norm": 3.875, "learning_rate": 3.4288432389818314e-05, "loss": 1.4783931970596313, "step": 1169 }, { "epoch": 0.3051444219860468, "grad_norm": 3.9375, "learning_rate": 3.427821617728771e-05, "loss": 1.5559719800949097, "step": 1170 }, { "epoch": 0.3054052291843255, "grad_norm": 4.03125, "learning_rate": 3.4267992361015196e-05, "loss": 1.6822757720947266, "step": 1171 }, { "epoch": 0.30566603638260414, "grad_norm": 3.859375, "learning_rate": 3.4257760946445375e-05, "loss": 1.708196759223938, "step": 1172 }, { "epoch": 0.30592684358088285, "grad_norm": 4.40625, "learning_rate": 3.42475219390269e-05, "loss": 1.585463523864746, "step": 1173 }, { "epoch": 0.3061876507791615, "grad_norm": 4.15625, "learning_rate": 3.423727534421247e-05, "loss": 1.7142834663391113, "step": 1174 }, { "epoch": 0.3064484579774402, "grad_norm": 4.0625, "learning_rate": 3.422702116745881e-05, "loss": 1.822965383529663, "step": 1175 }, { "epoch": 0.30670926517571884, "grad_norm": 3.734375, "learning_rate": 3.42167594142267e-05, "loss": 1.6955723762512207, "step": 1176 }, { "epoch": 0.30697007237399754, "grad_norm": 3.796875, "learning_rate": 3.420649008998095e-05, "loss": 1.9219430685043335, "step": 1177 }, { "epoch": 0.3072308795722762, "grad_norm": 4.03125, "learning_rate": 3.419621320019041e-05, "loss": 1.8803141117095947, "step": 1178 }, { "epoch": 0.3074916867705549, "grad_norm": 3.859375, "learning_rate": 3.418592875032793e-05, "loss": 1.6397581100463867, "step": 1179 }, { "epoch": 0.30775249396883353, "grad_norm": 3.9375, "learning_rate": 3.417563674587043e-05, "loss": 1.4602605104446411, "step": 1180 }, { "epoch": 0.30801330116711223, "grad_norm": 3.890625, "learning_rate": 3.4165337192298804e-05, "loss": 1.7662440538406372, "step": 1181 }, { "epoch": 0.3082741083653909, "grad_norm": 4.125, "learning_rate": 3.415503009509801e-05, "loss": 1.7334061861038208, "step": 1182 }, { "epoch": 0.3085349155636696, "grad_norm": 4.90625, "learning_rate": 3.4144715459756995e-05, "loss": 1.7847474813461304, "step": 1183 }, { "epoch": 0.3087957227619482, "grad_norm": 3.953125, "learning_rate": 3.413439329176874e-05, "loss": 1.8660763502120972, "step": 1184 }, { "epoch": 0.3090565299602269, "grad_norm": 3.875, "learning_rate": 3.41240635966302e-05, "loss": 1.4467748403549194, "step": 1185 }, { "epoch": 0.30931733715850557, "grad_norm": 3.765625, "learning_rate": 3.411372637984241e-05, "loss": 1.6627625226974487, "step": 1186 }, { "epoch": 0.30957814435678427, "grad_norm": 4.21875, "learning_rate": 3.4103381646910325e-05, "loss": 1.893566370010376, "step": 1187 }, { "epoch": 0.3098389515550629, "grad_norm": 4.71875, "learning_rate": 3.409302940334296e-05, "loss": 1.5209918022155762, "step": 1188 }, { "epoch": 0.3100997587533416, "grad_norm": 4.125, "learning_rate": 3.408266965465332e-05, "loss": 1.8906986713409424, "step": 1189 }, { "epoch": 0.31036056595162026, "grad_norm": 4.25, "learning_rate": 3.40723024063584e-05, "loss": 1.5039809942245483, "step": 1190 }, { "epoch": 0.31062137314989896, "grad_norm": 4.15625, "learning_rate": 3.406192766397918e-05, "loss": 1.6729252338409424, "step": 1191 }, { "epoch": 0.3108821803481776, "grad_norm": 4.125, "learning_rate": 3.405154543304065e-05, "loss": 1.5411571264266968, "step": 1192 }, { "epoch": 0.3111429875464563, "grad_norm": 4.03125, "learning_rate": 3.404115571907176e-05, "loss": 1.9417058229446411, "step": 1193 }, { "epoch": 0.31140379474473495, "grad_norm": 3.734375, "learning_rate": 3.4030758527605484e-05, "loss": 1.615490198135376, "step": 1194 }, { "epoch": 0.31166460194301365, "grad_norm": 4.375, "learning_rate": 3.402035386417875e-05, "loss": 2.0656657218933105, "step": 1195 }, { "epoch": 0.3119254091412923, "grad_norm": 4.375, "learning_rate": 3.4009941734332476e-05, "loss": 1.805072546005249, "step": 1196 }, { "epoch": 0.312186216339571, "grad_norm": 4.1875, "learning_rate": 3.399952214361154e-05, "loss": 1.7341935634613037, "step": 1197 }, { "epoch": 0.31244702353784964, "grad_norm": 3.984375, "learning_rate": 3.398909509756482e-05, "loss": 1.8699414730072021, "step": 1198 }, { "epoch": 0.3127078307361283, "grad_norm": 3.75, "learning_rate": 3.397866060174515e-05, "loss": 1.4938279390335083, "step": 1199 }, { "epoch": 0.312968637934407, "grad_norm": 4.15625, "learning_rate": 3.3968218661709315e-05, "loss": 1.7651143074035645, "step": 1200 }, { "epoch": 0.31322944513268564, "grad_norm": 3.953125, "learning_rate": 3.3957769283018106e-05, "loss": 1.6848008632659912, "step": 1201 }, { "epoch": 0.31349025233096434, "grad_norm": 4.0625, "learning_rate": 3.394731247123623e-05, "loss": 1.8530536890029907, "step": 1202 }, { "epoch": 0.313751059529243, "grad_norm": 4.3125, "learning_rate": 3.393684823193238e-05, "loss": 2.0332212448120117, "step": 1203 }, { "epoch": 0.3140118667275217, "grad_norm": 4.0, "learning_rate": 3.39263765706792e-05, "loss": 1.8089815378189087, "step": 1204 }, { "epoch": 0.3142726739258003, "grad_norm": 3.9375, "learning_rate": 3.3915897493053274e-05, "loss": 1.7100820541381836, "step": 1205 }, { "epoch": 0.31453348112407903, "grad_norm": 3.890625, "learning_rate": 3.390541100463515e-05, "loss": 1.4569581747055054, "step": 1206 }, { "epoch": 0.3147942883223577, "grad_norm": 3.953125, "learning_rate": 3.3894917111009325e-05, "loss": 1.7145471572875977, "step": 1207 }, { "epoch": 0.3150550955206364, "grad_norm": 4.03125, "learning_rate": 3.388441581776423e-05, "loss": 1.8081591129302979, "step": 1208 }, { "epoch": 0.315315902718915, "grad_norm": 4.6875, "learning_rate": 3.3873907130492236e-05, "loss": 1.6673164367675781, "step": 1209 }, { "epoch": 0.3155767099171937, "grad_norm": 4.125, "learning_rate": 3.386339105478964e-05, "loss": 1.787574052810669, "step": 1210 }, { "epoch": 0.31583751711547237, "grad_norm": 3.6875, "learning_rate": 3.385286759625672e-05, "loss": 1.460659146308899, "step": 1211 }, { "epoch": 0.31609832431375107, "grad_norm": 4.0, "learning_rate": 3.384233676049762e-05, "loss": 1.524317979812622, "step": 1212 }, { "epoch": 0.3163591315120297, "grad_norm": 4.0625, "learning_rate": 3.3831798553120475e-05, "loss": 1.6283299922943115, "step": 1213 }, { "epoch": 0.3166199387103084, "grad_norm": 3.609375, "learning_rate": 3.38212529797373e-05, "loss": 1.3031306266784668, "step": 1214 }, { "epoch": 0.31688074590858706, "grad_norm": 4.625, "learning_rate": 3.381070004596405e-05, "loss": 1.764394998550415, "step": 1215 }, { "epoch": 0.31714155310686576, "grad_norm": 4.0, "learning_rate": 3.3800139757420604e-05, "loss": 1.557260513305664, "step": 1216 }, { "epoch": 0.3174023603051444, "grad_norm": 3.875, "learning_rate": 3.3789572119730766e-05, "loss": 1.5873910188674927, "step": 1217 }, { "epoch": 0.3176631675034231, "grad_norm": 4.09375, "learning_rate": 3.377899713852222e-05, "loss": 1.6127475500106812, "step": 1218 }, { "epoch": 0.31792397470170175, "grad_norm": 4.03125, "learning_rate": 3.376841481942659e-05, "loss": 1.7642548084259033, "step": 1219 }, { "epoch": 0.31818478189998045, "grad_norm": 4.15625, "learning_rate": 3.3757825168079396e-05, "loss": 1.804701328277588, "step": 1220 }, { "epoch": 0.3184455890982591, "grad_norm": 3.9375, "learning_rate": 3.374722819012008e-05, "loss": 1.7389459609985352, "step": 1221 }, { "epoch": 0.3187063962965378, "grad_norm": 4.0625, "learning_rate": 3.373662389119196e-05, "loss": 1.4407830238342285, "step": 1222 }, { "epoch": 0.31896720349481644, "grad_norm": 4.09375, "learning_rate": 3.372601227694226e-05, "loss": 1.6595262289047241, "step": 1223 }, { "epoch": 0.31922801069309514, "grad_norm": 4.15625, "learning_rate": 3.3715393353022116e-05, "loss": 1.8524608612060547, "step": 1224 }, { "epoch": 0.3194888178913738, "grad_norm": 4.125, "learning_rate": 3.370476712508654e-05, "loss": 1.6108800172805786, "step": 1225 }, { "epoch": 0.3197496250896525, "grad_norm": 4.0625, "learning_rate": 3.369413359879445e-05, "loss": 1.7554543018341064, "step": 1226 }, { "epoch": 0.32001043228793113, "grad_norm": 3.6875, "learning_rate": 3.368349277980861e-05, "loss": 1.501410961151123, "step": 1227 }, { "epoch": 0.32027123948620984, "grad_norm": 4.21875, "learning_rate": 3.367284467379572e-05, "loss": 2.095388412475586, "step": 1228 }, { "epoch": 0.3205320466844885, "grad_norm": 4.1875, "learning_rate": 3.366218928642634e-05, "loss": 1.7752673625946045, "step": 1229 }, { "epoch": 0.3207928538827672, "grad_norm": 3.9375, "learning_rate": 3.36515266233749e-05, "loss": 1.790653944015503, "step": 1230 }, { "epoch": 0.3210536610810458, "grad_norm": 3.875, "learning_rate": 3.364085669031971e-05, "loss": 1.5785245895385742, "step": 1231 }, { "epoch": 0.3213144682793245, "grad_norm": 4.09375, "learning_rate": 3.3630179492942944e-05, "loss": 1.9329664707183838, "step": 1232 }, { "epoch": 0.3215752754776032, "grad_norm": 3.90625, "learning_rate": 3.361949503693066e-05, "loss": 1.6032710075378418, "step": 1233 }, { "epoch": 0.3218360826758819, "grad_norm": 4.0, "learning_rate": 3.360880332797278e-05, "loss": 1.7220112085342407, "step": 1234 }, { "epoch": 0.3220968898741605, "grad_norm": 3.953125, "learning_rate": 3.359810437176307e-05, "loss": 1.717103362083435, "step": 1235 }, { "epoch": 0.3223576970724392, "grad_norm": 3.890625, "learning_rate": 3.3587398173999166e-05, "loss": 1.6662729978561401, "step": 1236 }, { "epoch": 0.32261850427071787, "grad_norm": 3.796875, "learning_rate": 3.3576684740382584e-05, "loss": 1.6473667621612549, "step": 1237 }, { "epoch": 0.32287931146899657, "grad_norm": 3.828125, "learning_rate": 3.356596407661864e-05, "loss": 1.7563276290893555, "step": 1238 }, { "epoch": 0.3231401186672752, "grad_norm": 3.75, "learning_rate": 3.3555236188416555e-05, "loss": 1.6142089366912842, "step": 1239 }, { "epoch": 0.3234009258655539, "grad_norm": 4.125, "learning_rate": 3.354450108148937e-05, "loss": 1.732367753982544, "step": 1240 }, { "epoch": 0.32366173306383256, "grad_norm": 4.0, "learning_rate": 3.3533758761553966e-05, "loss": 1.6796579360961914, "step": 1241 }, { "epoch": 0.32392254026211126, "grad_norm": 3.890625, "learning_rate": 3.352300923433108e-05, "loss": 1.8754359483718872, "step": 1242 }, { "epoch": 0.3241833474603899, "grad_norm": 4.1875, "learning_rate": 3.351225250554528e-05, "loss": 1.778498649597168, "step": 1243 }, { "epoch": 0.3244441546586686, "grad_norm": 4.21875, "learning_rate": 3.350148858092497e-05, "loss": 2.1445441246032715, "step": 1244 }, { "epoch": 0.32470496185694725, "grad_norm": 4.46875, "learning_rate": 3.349071746620238e-05, "loss": 1.8901653289794922, "step": 1245 }, { "epoch": 0.32496576905522595, "grad_norm": 4.34375, "learning_rate": 3.347993916711358e-05, "loss": 1.9751983880996704, "step": 1246 }, { "epoch": 0.3252265762535046, "grad_norm": 3.5625, "learning_rate": 3.346915368939847e-05, "loss": 1.3669779300689697, "step": 1247 }, { "epoch": 0.3254873834517833, "grad_norm": 4.15625, "learning_rate": 3.3458361038800754e-05, "loss": 1.8745354413986206, "step": 1248 }, { "epoch": 0.32574819065006194, "grad_norm": 4.46875, "learning_rate": 3.3447561221067964e-05, "loss": 1.5999069213867188, "step": 1249 }, { "epoch": 0.3260089978483406, "grad_norm": 4.34375, "learning_rate": 3.343675424195146e-05, "loss": 2.0362579822540283, "step": 1250 }, { "epoch": 0.3262698050466193, "grad_norm": 4.0625, "learning_rate": 3.342594010720639e-05, "loss": 1.6629884243011475, "step": 1251 }, { "epoch": 0.32653061224489793, "grad_norm": 4.3125, "learning_rate": 3.3415118822591744e-05, "loss": 2.0487184524536133, "step": 1252 }, { "epoch": 0.32679141944317663, "grad_norm": 4.0, "learning_rate": 3.34042903938703e-05, "loss": 1.6013741493225098, "step": 1253 }, { "epoch": 0.3270522266414553, "grad_norm": 4.0, "learning_rate": 3.3393454826808645e-05, "loss": 1.8632688522338867, "step": 1254 }, { "epoch": 0.327313033839734, "grad_norm": 4.65625, "learning_rate": 3.3382612127177166e-05, "loss": 2.017110586166382, "step": 1255 }, { "epoch": 0.3275738410380126, "grad_norm": 4.34375, "learning_rate": 3.337176230075005e-05, "loss": 1.9384896755218506, "step": 1256 }, { "epoch": 0.3278346482362913, "grad_norm": 4.46875, "learning_rate": 3.3360905353305284e-05, "loss": 1.7275506258010864, "step": 1257 }, { "epoch": 0.32809545543456997, "grad_norm": 4.15625, "learning_rate": 3.335004129062464e-05, "loss": 1.731511116027832, "step": 1258 }, { "epoch": 0.32835626263284867, "grad_norm": 4.15625, "learning_rate": 3.3339170118493674e-05, "loss": 1.887991189956665, "step": 1259 }, { "epoch": 0.3286170698311273, "grad_norm": 3.96875, "learning_rate": 3.332829184270175e-05, "loss": 1.6650071144104004, "step": 1260 }, { "epoch": 0.328877877029406, "grad_norm": 4.09375, "learning_rate": 3.331740646904199e-05, "loss": 1.8969206809997559, "step": 1261 }, { "epoch": 0.32913868422768466, "grad_norm": 4.0625, "learning_rate": 3.3306514003311305e-05, "loss": 1.8191999197006226, "step": 1262 }, { "epoch": 0.32939949142596336, "grad_norm": 3.6875, "learning_rate": 3.32956144513104e-05, "loss": 1.7069573402404785, "step": 1263 }, { "epoch": 0.329660298624242, "grad_norm": 3.796875, "learning_rate": 3.328470781884372e-05, "loss": 1.6841368675231934, "step": 1264 }, { "epoch": 0.3299211058225207, "grad_norm": 4.5, "learning_rate": 3.327379411171951e-05, "loss": 1.7754472494125366, "step": 1265 }, { "epoch": 0.33018191302079936, "grad_norm": 4.125, "learning_rate": 3.326287333574977e-05, "loss": 1.608618140220642, "step": 1266 }, { "epoch": 0.33044272021907806, "grad_norm": 4.0625, "learning_rate": 3.3251945496750253e-05, "loss": 1.667032241821289, "step": 1267 }, { "epoch": 0.3307035274173567, "grad_norm": 4.0625, "learning_rate": 3.324101060054051e-05, "loss": 1.7562623023986816, "step": 1268 }, { "epoch": 0.3309643346156354, "grad_norm": 4.03125, "learning_rate": 3.32300686529438e-05, "loss": 1.6734570264816284, "step": 1269 }, { "epoch": 0.33122514181391405, "grad_norm": 4.21875, "learning_rate": 3.321911965978718e-05, "loss": 1.7349474430084229, "step": 1270 }, { "epoch": 0.33148594901219275, "grad_norm": 4.25, "learning_rate": 3.320816362690145e-05, "loss": 1.8015398979187012, "step": 1271 }, { "epoch": 0.3317467562104714, "grad_norm": 4.03125, "learning_rate": 3.319720056012113e-05, "loss": 1.8009923696517944, "step": 1272 }, { "epoch": 0.3320075634087501, "grad_norm": 4.21875, "learning_rate": 3.318623046528453e-05, "loss": 1.8440279960632324, "step": 1273 }, { "epoch": 0.33226837060702874, "grad_norm": 3.953125, "learning_rate": 3.3175253348233654e-05, "loss": 1.6640136241912842, "step": 1274 }, { "epoch": 0.33252917780530744, "grad_norm": 4.40625, "learning_rate": 3.316426921481429e-05, "loss": 1.7970125675201416, "step": 1275 }, { "epoch": 0.3327899850035861, "grad_norm": 4.125, "learning_rate": 3.315327807087595e-05, "loss": 1.9269849061965942, "step": 1276 }, { "epoch": 0.3330507922018648, "grad_norm": 3.984375, "learning_rate": 3.314227992227186e-05, "loss": 1.5076572895050049, "step": 1277 }, { "epoch": 0.33331159940014343, "grad_norm": 4.6875, "learning_rate": 3.3131274774859e-05, "loss": 1.6746129989624023, "step": 1278 }, { "epoch": 0.33357240659842213, "grad_norm": 3.890625, "learning_rate": 3.312026263449805e-05, "loss": 1.6816201210021973, "step": 1279 }, { "epoch": 0.3338332137967008, "grad_norm": 4.125, "learning_rate": 3.310924350705345e-05, "loss": 1.7868664264678955, "step": 1280 }, { "epoch": 0.3340940209949795, "grad_norm": 3.875, "learning_rate": 3.309821739839333e-05, "loss": 1.4542313814163208, "step": 1281 }, { "epoch": 0.3343548281932581, "grad_norm": 3.875, "learning_rate": 3.308718431438956e-05, "loss": 1.7355276346206665, "step": 1282 }, { "epoch": 0.3346156353915368, "grad_norm": 3.859375, "learning_rate": 3.3076144260917705e-05, "loss": 1.4578869342803955, "step": 1283 }, { "epoch": 0.33487644258981547, "grad_norm": 3.875, "learning_rate": 3.306509724385706e-05, "loss": 1.5557750463485718, "step": 1284 }, { "epoch": 0.33513724978809417, "grad_norm": 3.84375, "learning_rate": 3.3054043269090614e-05, "loss": 1.6612582206726074, "step": 1285 }, { "epoch": 0.3353980569863728, "grad_norm": 4.0625, "learning_rate": 3.304298234250506e-05, "loss": 1.7065796852111816, "step": 1286 }, { "epoch": 0.3356588641846515, "grad_norm": 4.25, "learning_rate": 3.303191446999082e-05, "loss": 1.929694652557373, "step": 1287 }, { "epoch": 0.33591967138293016, "grad_norm": 3.875, "learning_rate": 3.302083965744198e-05, "loss": 1.6119188070297241, "step": 1288 }, { "epoch": 0.33618047858120886, "grad_norm": 4.21875, "learning_rate": 3.300975791075633e-05, "loss": 1.7780215740203857, "step": 1289 }, { "epoch": 0.3364412857794875, "grad_norm": 3.90625, "learning_rate": 3.2998669235835374e-05, "loss": 1.674952745437622, "step": 1290 }, { "epoch": 0.3367020929777662, "grad_norm": 4.03125, "learning_rate": 3.298757363858429e-05, "loss": 1.6719112396240234, "step": 1291 }, { "epoch": 0.33696290017604486, "grad_norm": 3.890625, "learning_rate": 3.297647112491193e-05, "loss": 1.6144449710845947, "step": 1292 }, { "epoch": 0.33722370737432356, "grad_norm": 4.28125, "learning_rate": 3.2965361700730856e-05, "loss": 2.0725719928741455, "step": 1293 }, { "epoch": 0.3374845145726022, "grad_norm": 4.1875, "learning_rate": 3.2954245371957294e-05, "loss": 1.604860782623291, "step": 1294 }, { "epoch": 0.3377453217708809, "grad_norm": 4.3125, "learning_rate": 3.294312214451115e-05, "loss": 1.7692651748657227, "step": 1295 }, { "epoch": 0.33800612896915955, "grad_norm": 3.875, "learning_rate": 3.293199202431599e-05, "loss": 1.7182050943374634, "step": 1296 }, { "epoch": 0.33826693616743825, "grad_norm": 3.8125, "learning_rate": 3.292085501729909e-05, "loss": 1.5841974020004272, "step": 1297 }, { "epoch": 0.3385277433657169, "grad_norm": 3.859375, "learning_rate": 3.290971112939135e-05, "loss": 1.6726911067962646, "step": 1298 }, { "epoch": 0.33878855056399554, "grad_norm": 3.546875, "learning_rate": 3.289856036652736e-05, "loss": 1.6453895568847656, "step": 1299 }, { "epoch": 0.33904935776227424, "grad_norm": 4.03125, "learning_rate": 3.288740273464535e-05, "loss": 1.8880107402801514, "step": 1300 }, { "epoch": 0.3393101649605529, "grad_norm": 3.859375, "learning_rate": 3.287623823968724e-05, "loss": 1.6029824018478394, "step": 1301 }, { "epoch": 0.3395709721588316, "grad_norm": 4.0, "learning_rate": 3.2865066887598566e-05, "loss": 1.5460083484649658, "step": 1302 }, { "epoch": 0.33983177935711023, "grad_norm": 3.75, "learning_rate": 3.285388868432856e-05, "loss": 1.419440746307373, "step": 1303 }, { "epoch": 0.34009258655538893, "grad_norm": 4.15625, "learning_rate": 3.284270363583005e-05, "loss": 1.952426791191101, "step": 1304 }, { "epoch": 0.3403533937536676, "grad_norm": 3.9375, "learning_rate": 3.283151174805957e-05, "loss": 1.7862493991851807, "step": 1305 }, { "epoch": 0.3406142009519463, "grad_norm": 4.1875, "learning_rate": 3.282031302697724e-05, "loss": 1.757668137550354, "step": 1306 }, { "epoch": 0.3408750081502249, "grad_norm": 4.0, "learning_rate": 3.280910747854685e-05, "loss": 1.7689999341964722, "step": 1307 }, { "epoch": 0.3411358153485036, "grad_norm": 4.0625, "learning_rate": 3.279789510873583e-05, "loss": 1.647916555404663, "step": 1308 }, { "epoch": 0.34139662254678227, "grad_norm": 4.125, "learning_rate": 3.278667592351521e-05, "loss": 1.7402260303497314, "step": 1309 }, { "epoch": 0.34165742974506097, "grad_norm": 4.40625, "learning_rate": 3.277544992885969e-05, "loss": 1.8295246362686157, "step": 1310 }, { "epoch": 0.3419182369433396, "grad_norm": 4.21875, "learning_rate": 3.2764217130747566e-05, "loss": 1.785295009613037, "step": 1311 }, { "epoch": 0.3421790441416183, "grad_norm": 3.71875, "learning_rate": 3.275297753516078e-05, "loss": 1.67706298828125, "step": 1312 }, { "epoch": 0.34243985133989696, "grad_norm": 3.75, "learning_rate": 3.274173114808487e-05, "loss": 1.673936128616333, "step": 1313 }, { "epoch": 0.34270065853817566, "grad_norm": 4.03125, "learning_rate": 3.273047797550901e-05, "loss": 1.7098627090454102, "step": 1314 }, { "epoch": 0.3429614657364543, "grad_norm": 4.125, "learning_rate": 3.2719218023425976e-05, "loss": 1.8933629989624023, "step": 1315 }, { "epoch": 0.343222272934733, "grad_norm": 4.25, "learning_rate": 3.270795129783217e-05, "loss": 1.7215807437896729, "step": 1316 }, { "epoch": 0.34348308013301165, "grad_norm": 4.3125, "learning_rate": 3.2696677804727574e-05, "loss": 1.659914493560791, "step": 1317 }, { "epoch": 0.34374388733129035, "grad_norm": 3.875, "learning_rate": 3.26853975501158e-05, "loss": 1.5705589056015015, "step": 1318 }, { "epoch": 0.344004694529569, "grad_norm": 4.125, "learning_rate": 3.267411054000406e-05, "loss": 1.8723785877227783, "step": 1319 }, { "epoch": 0.3442655017278477, "grad_norm": 4.0, "learning_rate": 3.266281678040314e-05, "loss": 1.6139472723007202, "step": 1320 }, { "epoch": 0.34452630892612635, "grad_norm": 3.875, "learning_rate": 3.265151627732744e-05, "loss": 1.7100131511688232, "step": 1321 }, { "epoch": 0.34478711612440505, "grad_norm": 3.890625, "learning_rate": 3.2640209036794946e-05, "loss": 1.6928737163543701, "step": 1322 }, { "epoch": 0.3450479233226837, "grad_norm": 3.84375, "learning_rate": 3.262889506482723e-05, "loss": 1.6608216762542725, "step": 1323 }, { "epoch": 0.3453087305209624, "grad_norm": 3.84375, "learning_rate": 3.261757436744946e-05, "loss": 1.524505376815796, "step": 1324 }, { "epoch": 0.34556953771924104, "grad_norm": 3.75, "learning_rate": 3.2606246950690365e-05, "loss": 1.627318024635315, "step": 1325 }, { "epoch": 0.34583034491751974, "grad_norm": 4.03125, "learning_rate": 3.259491282058227e-05, "loss": 1.7727982997894287, "step": 1326 }, { "epoch": 0.3460911521157984, "grad_norm": 4.3125, "learning_rate": 3.2583571983161074e-05, "loss": 1.734523892402649, "step": 1327 }, { "epoch": 0.3463519593140771, "grad_norm": 4.03125, "learning_rate": 3.2572224444466226e-05, "loss": 1.7290492057800293, "step": 1328 }, { "epoch": 0.34661276651235573, "grad_norm": 4.375, "learning_rate": 3.2560870210540786e-05, "loss": 1.9091473817825317, "step": 1329 }, { "epoch": 0.34687357371063443, "grad_norm": 4.125, "learning_rate": 3.254950928743133e-05, "loss": 1.7047741413116455, "step": 1330 }, { "epoch": 0.3471343809089131, "grad_norm": 3.6875, "learning_rate": 3.2538141681188035e-05, "loss": 1.5911457538604736, "step": 1331 }, { "epoch": 0.3473951881071918, "grad_norm": 3.90625, "learning_rate": 3.2526767397864614e-05, "loss": 1.6464033126831055, "step": 1332 }, { "epoch": 0.3476559953054704, "grad_norm": 3.703125, "learning_rate": 3.2515386443518356e-05, "loss": 1.6222318410873413, "step": 1333 }, { "epoch": 0.3479168025037491, "grad_norm": 3.8125, "learning_rate": 3.250399882421007e-05, "loss": 1.6447246074676514, "step": 1334 }, { "epoch": 0.34817760970202777, "grad_norm": 3.6875, "learning_rate": 3.249260454600416e-05, "loss": 1.5842132568359375, "step": 1335 }, { "epoch": 0.34843841690030647, "grad_norm": 4.0, "learning_rate": 3.2481203614968544e-05, "loss": 1.8774179220199585, "step": 1336 }, { "epoch": 0.3486992240985851, "grad_norm": 3.875, "learning_rate": 3.246979603717467e-05, "loss": 1.8143694400787354, "step": 1337 }, { "epoch": 0.3489600312968638, "grad_norm": 3.84375, "learning_rate": 3.2458381818697576e-05, "loss": 1.4577972888946533, "step": 1338 }, { "epoch": 0.34922083849514246, "grad_norm": 4.125, "learning_rate": 3.244696096561579e-05, "loss": 1.8097434043884277, "step": 1339 }, { "epoch": 0.34948164569342116, "grad_norm": 3.796875, "learning_rate": 3.2435533484011385e-05, "loss": 1.5269027948379517, "step": 1340 }, { "epoch": 0.3497424528916998, "grad_norm": 4.0, "learning_rate": 3.242409937996999e-05, "loss": 1.495283603668213, "step": 1341 }, { "epoch": 0.3500032600899785, "grad_norm": 4.09375, "learning_rate": 3.2412658659580715e-05, "loss": 1.5215660333633423, "step": 1342 }, { "epoch": 0.35026406728825715, "grad_norm": 3.828125, "learning_rate": 3.240121132893623e-05, "loss": 1.4833729267120361, "step": 1343 }, { "epoch": 0.35052487448653585, "grad_norm": 3.734375, "learning_rate": 3.2389757394132706e-05, "loss": 1.6995420455932617, "step": 1344 }, { "epoch": 0.3507856816848145, "grad_norm": 3.765625, "learning_rate": 3.2378296861269854e-05, "loss": 1.4850215911865234, "step": 1345 }, { "epoch": 0.3510464888830932, "grad_norm": 4.28125, "learning_rate": 3.236682973645087e-05, "loss": 1.8568706512451172, "step": 1346 }, { "epoch": 0.35130729608137184, "grad_norm": 4.09375, "learning_rate": 3.235535602578246e-05, "loss": 1.7425060272216797, "step": 1347 }, { "epoch": 0.35156810327965055, "grad_norm": 4.0625, "learning_rate": 3.234387573537488e-05, "loss": 1.425740361213684, "step": 1348 }, { "epoch": 0.3518289104779292, "grad_norm": 3.609375, "learning_rate": 3.233238887134184e-05, "loss": 1.6624091863632202, "step": 1349 }, { "epoch": 0.35208971767620784, "grad_norm": 4.25, "learning_rate": 3.2320895439800584e-05, "loss": 1.684777021408081, "step": 1350 }, { "epoch": 0.35235052487448654, "grad_norm": 4.15625, "learning_rate": 3.230939544687183e-05, "loss": 1.9346016645431519, "step": 1351 }, { "epoch": 0.3526113320727652, "grad_norm": 4.0, "learning_rate": 3.229788889867981e-05, "loss": 1.682251214981079, "step": 1352 }, { "epoch": 0.3528721392710439, "grad_norm": 4.0, "learning_rate": 3.2286375801352225e-05, "loss": 1.6682612895965576, "step": 1353 }, { "epoch": 0.35313294646932253, "grad_norm": 4.1875, "learning_rate": 3.2274856161020284e-05, "loss": 1.6654565334320068, "step": 1354 }, { "epoch": 0.35339375366760123, "grad_norm": 3.953125, "learning_rate": 3.226332998381867e-05, "loss": 1.6384330987930298, "step": 1355 }, { "epoch": 0.3536545608658799, "grad_norm": 4.15625, "learning_rate": 3.225179727588556e-05, "loss": 1.5750579833984375, "step": 1356 }, { "epoch": 0.3539153680641586, "grad_norm": 3.828125, "learning_rate": 3.2240258043362593e-05, "loss": 1.6457281112670898, "step": 1357 }, { "epoch": 0.3541761752624372, "grad_norm": 3.765625, "learning_rate": 3.222871229239489e-05, "loss": 1.5718562602996826, "step": 1358 }, { "epoch": 0.3544369824607159, "grad_norm": 4.1875, "learning_rate": 3.221716002913103e-05, "loss": 1.8065204620361328, "step": 1359 }, { "epoch": 0.35469778965899457, "grad_norm": 4.03125, "learning_rate": 3.220560125972309e-05, "loss": 1.7895827293395996, "step": 1360 }, { "epoch": 0.35495859685727327, "grad_norm": 3.859375, "learning_rate": 3.219403599032659e-05, "loss": 1.6034718751907349, "step": 1361 }, { "epoch": 0.3552194040555519, "grad_norm": 4.21875, "learning_rate": 3.21824642271005e-05, "loss": 1.5476500988006592, "step": 1362 }, { "epoch": 0.3554802112538306, "grad_norm": 4.34375, "learning_rate": 3.217088597620728e-05, "loss": 1.6288042068481445, "step": 1363 }, { "epoch": 0.35574101845210926, "grad_norm": 3.796875, "learning_rate": 3.215930124381282e-05, "loss": 1.642390489578247, "step": 1364 }, { "epoch": 0.35600182565038796, "grad_norm": 3.859375, "learning_rate": 3.2147710036086475e-05, "loss": 1.6608468294143677, "step": 1365 }, { "epoch": 0.3562626328486666, "grad_norm": 3.703125, "learning_rate": 3.2136112359201043e-05, "loss": 1.7599210739135742, "step": 1366 }, { "epoch": 0.3565234400469453, "grad_norm": 4.09375, "learning_rate": 3.212450821933277e-05, "loss": 1.5129823684692383, "step": 1367 }, { "epoch": 0.35678424724522395, "grad_norm": 4.0625, "learning_rate": 3.211289762266132e-05, "loss": 1.5158467292785645, "step": 1368 }, { "epoch": 0.35704505444350265, "grad_norm": 3.609375, "learning_rate": 3.210128057536985e-05, "loss": 1.4805352687835693, "step": 1369 }, { "epoch": 0.3573058616417813, "grad_norm": 3.9375, "learning_rate": 3.20896570836449e-05, "loss": 1.7511478662490845, "step": 1370 }, { "epoch": 0.35756666884006, "grad_norm": 4.15625, "learning_rate": 3.207802715367647e-05, "loss": 1.7829283475875854, "step": 1371 }, { "epoch": 0.35782747603833864, "grad_norm": 3.96875, "learning_rate": 3.2066390791657966e-05, "loss": 1.6291167736053467, "step": 1372 }, { "epoch": 0.35808828323661734, "grad_norm": 3.984375, "learning_rate": 3.2054748003786245e-05, "loss": 1.9838212728500366, "step": 1373 }, { "epoch": 0.358349090434896, "grad_norm": 4.03125, "learning_rate": 3.2043098796261575e-05, "loss": 1.783674955368042, "step": 1374 }, { "epoch": 0.3586098976331747, "grad_norm": 3.734375, "learning_rate": 3.203144317528764e-05, "loss": 1.5914146900177002, "step": 1375 }, { "epoch": 0.35887070483145334, "grad_norm": 4.09375, "learning_rate": 3.2019781147071526e-05, "loss": 1.6053392887115479, "step": 1376 }, { "epoch": 0.35913151202973204, "grad_norm": 3.671875, "learning_rate": 3.2008112717823765e-05, "loss": 1.4937987327575684, "step": 1377 }, { "epoch": 0.3593923192280107, "grad_norm": 3.890625, "learning_rate": 3.199643789375828e-05, "loss": 1.5063151121139526, "step": 1378 }, { "epoch": 0.3596531264262894, "grad_norm": 4.03125, "learning_rate": 3.198475668109239e-05, "loss": 1.5862890481948853, "step": 1379 }, { "epoch": 0.359913933624568, "grad_norm": 3.3125, "learning_rate": 3.197306908604682e-05, "loss": 1.3541672229766846, "step": 1380 }, { "epoch": 0.36017474082284673, "grad_norm": 3.90625, "learning_rate": 3.196137511484571e-05, "loss": 1.6211347579956055, "step": 1381 }, { "epoch": 0.3604355480211254, "grad_norm": 3.9375, "learning_rate": 3.194967477371658e-05, "loss": 1.8768589496612549, "step": 1382 }, { "epoch": 0.3606963552194041, "grad_norm": 3.640625, "learning_rate": 3.1937968068890346e-05, "loss": 1.3836852312088013, "step": 1383 }, { "epoch": 0.3609571624176827, "grad_norm": 3.734375, "learning_rate": 3.192625500660132e-05, "loss": 1.5915032625198364, "step": 1384 }, { "epoch": 0.3612179696159614, "grad_norm": 3.84375, "learning_rate": 3.191453559308718e-05, "loss": 1.5451784133911133, "step": 1385 }, { "epoch": 0.36147877681424007, "grad_norm": 3.921875, "learning_rate": 3.190280983458901e-05, "loss": 1.7580339908599854, "step": 1386 }, { "epoch": 0.36173958401251877, "grad_norm": 4.15625, "learning_rate": 3.189107773735126e-05, "loss": 1.6647831201553345, "step": 1387 }, { "epoch": 0.3620003912107974, "grad_norm": 3.921875, "learning_rate": 3.1879339307621765e-05, "loss": 1.7435990571975708, "step": 1388 }, { "epoch": 0.3622611984090761, "grad_norm": 3.875, "learning_rate": 3.1867594551651704e-05, "loss": 1.602888822555542, "step": 1389 }, { "epoch": 0.36252200560735476, "grad_norm": 3.53125, "learning_rate": 3.185584347569567e-05, "loss": 1.54853093624115, "step": 1390 }, { "epoch": 0.36278281280563346, "grad_norm": 3.890625, "learning_rate": 3.184408608601158e-05, "loss": 1.764925241470337, "step": 1391 }, { "epoch": 0.3630436200039121, "grad_norm": 4.03125, "learning_rate": 3.183232238886075e-05, "loss": 1.7794710397720337, "step": 1392 }, { "epoch": 0.3633044272021908, "grad_norm": 6.5625, "learning_rate": 3.182055239050782e-05, "loss": 1.8926446437835693, "step": 1393 }, { "epoch": 0.36356523440046945, "grad_norm": 3.953125, "learning_rate": 3.18087760972208e-05, "loss": 1.7303396463394165, "step": 1394 }, { "epoch": 0.36382604159874815, "grad_norm": 4.09375, "learning_rate": 3.1796993515271075e-05, "loss": 1.884756088256836, "step": 1395 }, { "epoch": 0.3640868487970268, "grad_norm": 4.0, "learning_rate": 3.1785204650933334e-05, "loss": 1.5551866292953491, "step": 1396 }, { "epoch": 0.3643476559953055, "grad_norm": 4.03125, "learning_rate": 3.177340951048566e-05, "loss": 1.7906616926193237, "step": 1397 }, { "epoch": 0.36460846319358414, "grad_norm": 3.96875, "learning_rate": 3.176160810020943e-05, "loss": 1.7069594860076904, "step": 1398 }, { "epoch": 0.3648692703918628, "grad_norm": 3.78125, "learning_rate": 3.1749800426389405e-05, "loss": 1.9896750450134277, "step": 1399 }, { "epoch": 0.3651300775901415, "grad_norm": 4.09375, "learning_rate": 3.1737986495313644e-05, "loss": 1.3037837743759155, "step": 1400 }, { "epoch": 0.36539088478842013, "grad_norm": 3.96875, "learning_rate": 3.1726166313273565e-05, "loss": 1.5864887237548828, "step": 1401 }, { "epoch": 0.36565169198669883, "grad_norm": 4.03125, "learning_rate": 3.1714339886563896e-05, "loss": 1.5846668481826782, "step": 1402 }, { "epoch": 0.3659124991849775, "grad_norm": 3.609375, "learning_rate": 3.170250722148271e-05, "loss": 1.5023304224014282, "step": 1403 }, { "epoch": 0.3661733063832562, "grad_norm": 4.0, "learning_rate": 3.169066832433139e-05, "loss": 1.797839879989624, "step": 1404 }, { "epoch": 0.3664341135815348, "grad_norm": 4.0625, "learning_rate": 3.167882320141463e-05, "loss": 1.8674392700195312, "step": 1405 }, { "epoch": 0.3666949207798135, "grad_norm": 3.6875, "learning_rate": 3.166697185904046e-05, "loss": 1.5936272144317627, "step": 1406 }, { "epoch": 0.36695572797809217, "grad_norm": 4.21875, "learning_rate": 3.1655114303520216e-05, "loss": 1.7863364219665527, "step": 1407 }, { "epoch": 0.3672165351763709, "grad_norm": 3.8125, "learning_rate": 3.1643250541168515e-05, "loss": 1.576570987701416, "step": 1408 }, { "epoch": 0.3674773423746495, "grad_norm": 4.125, "learning_rate": 3.163138057830332e-05, "loss": 1.69913649559021, "step": 1409 }, { "epoch": 0.3677381495729282, "grad_norm": 3.703125, "learning_rate": 3.161950442124587e-05, "loss": 1.4786731004714966, "step": 1410 }, { "epoch": 0.36799895677120686, "grad_norm": 3.828125, "learning_rate": 3.160762207632071e-05, "loss": 1.6378079652786255, "step": 1411 }, { "epoch": 0.36825976396948557, "grad_norm": 3.5, "learning_rate": 3.1595733549855697e-05, "loss": 1.553241491317749, "step": 1412 }, { "epoch": 0.3685205711677642, "grad_norm": 3.75, "learning_rate": 3.158383884818195e-05, "loss": 1.5734972953796387, "step": 1413 }, { "epoch": 0.3687813783660429, "grad_norm": 4.0625, "learning_rate": 3.1571937977633884e-05, "loss": 1.666394591331482, "step": 1414 }, { "epoch": 0.36904218556432156, "grad_norm": 3.84375, "learning_rate": 3.1560030944549226e-05, "loss": 1.7138550281524658, "step": 1415 }, { "epoch": 0.36930299276260026, "grad_norm": 3.8125, "learning_rate": 3.1548117755268945e-05, "loss": 1.4449464082717896, "step": 1416 }, { "epoch": 0.3695637999608789, "grad_norm": 4.0625, "learning_rate": 3.1536198416137325e-05, "loss": 1.5007790327072144, "step": 1417 }, { "epoch": 0.3698246071591576, "grad_norm": 4.03125, "learning_rate": 3.1524272933501895e-05, "loss": 1.4527007341384888, "step": 1418 }, { "epoch": 0.37008541435743625, "grad_norm": 3.5, "learning_rate": 3.151234131371348e-05, "loss": 1.612570881843567, "step": 1419 }, { "epoch": 0.37034622155571495, "grad_norm": 4.09375, "learning_rate": 3.150040356312614e-05, "loss": 1.774835467338562, "step": 1420 }, { "epoch": 0.3706070287539936, "grad_norm": 3.9375, "learning_rate": 3.148845968809725e-05, "loss": 1.7513749599456787, "step": 1421 }, { "epoch": 0.3708678359522723, "grad_norm": 3.9375, "learning_rate": 3.147650969498741e-05, "loss": 1.6648533344268799, "step": 1422 }, { "epoch": 0.37112864315055094, "grad_norm": 3.890625, "learning_rate": 3.146455359016048e-05, "loss": 1.6554616689682007, "step": 1423 }, { "epoch": 0.37138945034882964, "grad_norm": 4.15625, "learning_rate": 3.1452591379983574e-05, "loss": 1.7418549060821533, "step": 1424 }, { "epoch": 0.3716502575471083, "grad_norm": 4.28125, "learning_rate": 3.144062307082709e-05, "loss": 1.7103866338729858, "step": 1425 }, { "epoch": 0.371911064745387, "grad_norm": 3.515625, "learning_rate": 3.142864866906462e-05, "loss": 1.495460867881775, "step": 1426 }, { "epoch": 0.37217187194366563, "grad_norm": 4.53125, "learning_rate": 3.141666818107306e-05, "loss": 1.5710725784301758, "step": 1427 }, { "epoch": 0.37243267914194433, "grad_norm": 4.03125, "learning_rate": 3.1404681613232476e-05, "loss": 1.6216447353363037, "step": 1428 }, { "epoch": 0.372693486340223, "grad_norm": 4.0, "learning_rate": 3.139268897192625e-05, "loss": 1.8143309354782104, "step": 1429 }, { "epoch": 0.3729542935385017, "grad_norm": 3.84375, "learning_rate": 3.138069026354095e-05, "loss": 1.5283796787261963, "step": 1430 }, { "epoch": 0.3732151007367803, "grad_norm": 4.28125, "learning_rate": 3.1368685494466375e-05, "loss": 1.5353344678878784, "step": 1431 }, { "epoch": 0.373475907935059, "grad_norm": 4.1875, "learning_rate": 3.1356674671095564e-05, "loss": 1.8828372955322266, "step": 1432 }, { "epoch": 0.37373671513333767, "grad_norm": 4.0, "learning_rate": 3.134465779982479e-05, "loss": 1.6504466533660889, "step": 1433 }, { "epoch": 0.37399752233161637, "grad_norm": 4.0625, "learning_rate": 3.133263488705353e-05, "loss": 1.8509012460708618, "step": 1434 }, { "epoch": 0.374258329529895, "grad_norm": 3.75, "learning_rate": 3.132060593918448e-05, "loss": 1.6439563035964966, "step": 1435 }, { "epoch": 0.3745191367281737, "grad_norm": 4.125, "learning_rate": 3.1308570962623554e-05, "loss": 1.6253024339675903, "step": 1436 }, { "epoch": 0.37477994392645236, "grad_norm": 4.28125, "learning_rate": 3.129652996377987e-05, "loss": 1.900834321975708, "step": 1437 }, { "epoch": 0.37504075112473106, "grad_norm": 3.75, "learning_rate": 3.1284482949065776e-05, "loss": 1.5885049104690552, "step": 1438 }, { "epoch": 0.3753015583230097, "grad_norm": 4.375, "learning_rate": 3.127242992489679e-05, "loss": 1.9497604370117188, "step": 1439 }, { "epoch": 0.3755623655212884, "grad_norm": 3.734375, "learning_rate": 3.126037089769165e-05, "loss": 1.7633403539657593, "step": 1440 }, { "epoch": 0.37582317271956706, "grad_norm": 3.625, "learning_rate": 3.12483058738723e-05, "loss": 1.619724988937378, "step": 1441 }, { "epoch": 0.37608397991784576, "grad_norm": 3.609375, "learning_rate": 3.123623485986385e-05, "loss": 1.611403465270996, "step": 1442 }, { "epoch": 0.3763447871161244, "grad_norm": 3.921875, "learning_rate": 3.1224157862094624e-05, "loss": 1.6440799236297607, "step": 1443 }, { "epoch": 0.3766055943144031, "grad_norm": 3.734375, "learning_rate": 3.121207488699612e-05, "loss": 1.7324094772338867, "step": 1444 }, { "epoch": 0.37686640151268175, "grad_norm": 3.8125, "learning_rate": 3.1199985941003025e-05, "loss": 1.7377610206604004, "step": 1445 }, { "epoch": 0.37712720871096045, "grad_norm": 3.734375, "learning_rate": 3.11878910305532e-05, "loss": 1.4954262971878052, "step": 1446 }, { "epoch": 0.3773880159092391, "grad_norm": 3.71875, "learning_rate": 3.117579016208769e-05, "loss": 1.6021431684494019, "step": 1447 }, { "epoch": 0.3776488231075178, "grad_norm": 3.828125, "learning_rate": 3.1163683342050716e-05, "loss": 1.5622243881225586, "step": 1448 }, { "epoch": 0.37790963030579644, "grad_norm": 3.84375, "learning_rate": 3.115157057688964e-05, "loss": 1.568746566772461, "step": 1449 }, { "epoch": 0.3781704375040751, "grad_norm": 3.78125, "learning_rate": 3.113945187305504e-05, "loss": 1.6814937591552734, "step": 1450 }, { "epoch": 0.3784312447023538, "grad_norm": 3.984375, "learning_rate": 3.11273272370006e-05, "loss": 1.6775844097137451, "step": 1451 }, { "epoch": 0.37869205190063243, "grad_norm": 3.953125, "learning_rate": 3.1115196675183216e-05, "loss": 1.4709616899490356, "step": 1452 }, { "epoch": 0.37895285909891113, "grad_norm": 3.84375, "learning_rate": 3.11030601940629e-05, "loss": 1.746448040008545, "step": 1453 }, { "epoch": 0.3792136662971898, "grad_norm": 3.75, "learning_rate": 3.109091780010283e-05, "loss": 1.7106285095214844, "step": 1454 }, { "epoch": 0.3794744734954685, "grad_norm": 3.859375, "learning_rate": 3.107876949976934e-05, "loss": 1.6017910242080688, "step": 1455 }, { "epoch": 0.3797352806937471, "grad_norm": 3.953125, "learning_rate": 3.106661529953191e-05, "loss": 1.951802134513855, "step": 1456 }, { "epoch": 0.3799960878920258, "grad_norm": 3.65625, "learning_rate": 3.105445520586314e-05, "loss": 1.2404531240463257, "step": 1457 }, { "epoch": 0.38025689509030447, "grad_norm": 3.5625, "learning_rate": 3.1042289225238796e-05, "loss": 1.6789586544036865, "step": 1458 }, { "epoch": 0.38051770228858317, "grad_norm": 3.765625, "learning_rate": 3.103011736413776e-05, "loss": 1.8510938882827759, "step": 1459 }, { "epoch": 0.3807785094868618, "grad_norm": 4.03125, "learning_rate": 3.101793962904205e-05, "loss": 1.9462577104568481, "step": 1460 }, { "epoch": 0.3810393166851405, "grad_norm": 3.8125, "learning_rate": 3.100575602643683e-05, "loss": 1.7952697277069092, "step": 1461 }, { "epoch": 0.38130012388341916, "grad_norm": 3.609375, "learning_rate": 3.099356656281035e-05, "loss": 1.5525479316711426, "step": 1462 }, { "epoch": 0.38156093108169786, "grad_norm": 3.734375, "learning_rate": 3.098137124465403e-05, "loss": 1.5512548685073853, "step": 1463 }, { "epoch": 0.3818217382799765, "grad_norm": 3.84375, "learning_rate": 3.096917007846237e-05, "loss": 1.47243070602417, "step": 1464 }, { "epoch": 0.3820825454782552, "grad_norm": 7.34375, "learning_rate": 3.095696307073299e-05, "loss": 2.14477801322937, "step": 1465 }, { "epoch": 0.38234335267653385, "grad_norm": 4.0625, "learning_rate": 3.094475022796664e-05, "loss": 1.8921345472335815, "step": 1466 }, { "epoch": 0.38260415987481255, "grad_norm": 3.953125, "learning_rate": 3.093253155666715e-05, "loss": 1.5091718435287476, "step": 1467 }, { "epoch": 0.3828649670730912, "grad_norm": 3.71875, "learning_rate": 3.0920307063341485e-05, "loss": 1.737138032913208, "step": 1468 }, { "epoch": 0.3831257742713699, "grad_norm": 3.390625, "learning_rate": 3.090807675449969e-05, "loss": 1.533186435699463, "step": 1469 }, { "epoch": 0.38338658146964855, "grad_norm": 4.53125, "learning_rate": 3.0895840636654906e-05, "loss": 1.9540660381317139, "step": 1470 }, { "epoch": 0.38364738866792725, "grad_norm": 3.921875, "learning_rate": 3.088359871632337e-05, "loss": 1.6391453742980957, "step": 1471 }, { "epoch": 0.3839081958662059, "grad_norm": 3.65625, "learning_rate": 3.0871351000024425e-05, "loss": 1.4946112632751465, "step": 1472 }, { "epoch": 0.3841690030644846, "grad_norm": 3.53125, "learning_rate": 3.085909749428048e-05, "loss": 1.6111618280410767, "step": 1473 }, { "epoch": 0.38442981026276324, "grad_norm": 4.15625, "learning_rate": 3.084683820561703e-05, "loss": 1.9099327325820923, "step": 1474 }, { "epoch": 0.38469061746104194, "grad_norm": 4.125, "learning_rate": 3.083457314056267e-05, "loss": 1.5812824964523315, "step": 1475 }, { "epoch": 0.3849514246593206, "grad_norm": 4.0625, "learning_rate": 3.082230230564904e-05, "loss": 1.604433536529541, "step": 1476 }, { "epoch": 0.3852122318575993, "grad_norm": 4.03125, "learning_rate": 3.081002570741086e-05, "loss": 1.717907428741455, "step": 1477 }, { "epoch": 0.38547303905587793, "grad_norm": 4.125, "learning_rate": 3.0797743352385956e-05, "loss": 1.7131648063659668, "step": 1478 }, { "epoch": 0.38573384625415663, "grad_norm": 3.8125, "learning_rate": 3.078545524711517e-05, "loss": 1.4925047159194946, "step": 1479 }, { "epoch": 0.3859946534524353, "grad_norm": 3.9375, "learning_rate": 3.0773161398142435e-05, "loss": 1.5858057737350464, "step": 1480 }, { "epoch": 0.386255460650714, "grad_norm": 3.875, "learning_rate": 3.076086181201474e-05, "loss": 1.7264738082885742, "step": 1481 }, { "epoch": 0.3865162678489926, "grad_norm": 3.84375, "learning_rate": 3.0748556495282104e-05, "loss": 1.7423782348632812, "step": 1482 }, { "epoch": 0.3867770750472713, "grad_norm": 3.78125, "learning_rate": 3.0736245454497634e-05, "loss": 1.4618861675262451, "step": 1483 }, { "epoch": 0.38703788224554997, "grad_norm": 3.734375, "learning_rate": 3.072392869621747e-05, "loss": 1.5003267526626587, "step": 1484 }, { "epoch": 0.38729868944382867, "grad_norm": 3.953125, "learning_rate": 3.0711606227000794e-05, "loss": 1.821825623512268, "step": 1485 }, { "epoch": 0.3875594966421073, "grad_norm": 4.0, "learning_rate": 3.069927805340983e-05, "loss": 1.6688201427459717, "step": 1486 }, { "epoch": 0.387820303840386, "grad_norm": 4.09375, "learning_rate": 3.068694418200985e-05, "loss": 1.8684214353561401, "step": 1487 }, { "epoch": 0.38808111103866466, "grad_norm": 3.921875, "learning_rate": 3.0674604619369136e-05, "loss": 1.8903346061706543, "step": 1488 }, { "epoch": 0.38834191823694336, "grad_norm": 3.875, "learning_rate": 3.0662259372059026e-05, "loss": 1.7969683408737183, "step": 1489 }, { "epoch": 0.388602725435222, "grad_norm": 3.90625, "learning_rate": 3.064990844665388e-05, "loss": 1.8461135625839233, "step": 1490 }, { "epoch": 0.3888635326335007, "grad_norm": 3.78125, "learning_rate": 3.063755184973107e-05, "loss": 1.934802532196045, "step": 1491 }, { "epoch": 0.38912433983177935, "grad_norm": 3.859375, "learning_rate": 3.062518958787099e-05, "loss": 1.5508052110671997, "step": 1492 }, { "epoch": 0.38938514703005805, "grad_norm": 3.8125, "learning_rate": 3.061282166765707e-05, "loss": 1.5577564239501953, "step": 1493 }, { "epoch": 0.3896459542283367, "grad_norm": 3.890625, "learning_rate": 3.0600448095675736e-05, "loss": 1.7710663080215454, "step": 1494 }, { "epoch": 0.3899067614266154, "grad_norm": 3.71875, "learning_rate": 3.0588068878516435e-05, "loss": 1.5946844816207886, "step": 1495 }, { "epoch": 0.39016756862489405, "grad_norm": 3.890625, "learning_rate": 3.0575684022771595e-05, "loss": 1.5420547723770142, "step": 1496 }, { "epoch": 0.39042837582317275, "grad_norm": 3.890625, "learning_rate": 3.0563293535036676e-05, "loss": 1.6369975805282593, "step": 1497 }, { "epoch": 0.3906891830214514, "grad_norm": 3.71875, "learning_rate": 3.055089742191013e-05, "loss": 1.6601423025131226, "step": 1498 }, { "epoch": 0.39094999021973004, "grad_norm": 4.21875, "learning_rate": 3.05384956899934e-05, "loss": 1.8204602003097534, "step": 1499 }, { "epoch": 0.39121079741800874, "grad_norm": 4.21875, "learning_rate": 3.052608834589091e-05, "loss": 1.9024076461791992, "step": 1500 }, { "epoch": 0.3914716046162874, "grad_norm": 4.03125, "learning_rate": 3.0513675396210094e-05, "loss": 1.7785924673080444, "step": 1501 }, { "epoch": 0.3917324118145661, "grad_norm": 4.4375, "learning_rate": 3.050125684756137e-05, "loss": 1.9502075910568237, "step": 1502 }, { "epoch": 0.39199321901284473, "grad_norm": 3.90625, "learning_rate": 3.048883270655812e-05, "loss": 1.6539990901947021, "step": 1503 }, { "epoch": 0.39225402621112343, "grad_norm": 3.921875, "learning_rate": 3.047640297981671e-05, "loss": 1.8033804893493652, "step": 1504 }, { "epoch": 0.3925148334094021, "grad_norm": 4.09375, "learning_rate": 3.04639676739565e-05, "loss": 1.6304492950439453, "step": 1505 }, { "epoch": 0.3927756406076808, "grad_norm": 3.734375, "learning_rate": 3.045152679559979e-05, "loss": 1.5990183353424072, "step": 1506 }, { "epoch": 0.3930364478059594, "grad_norm": 3.640625, "learning_rate": 3.0439080351371875e-05, "loss": 1.6147968769073486, "step": 1507 }, { "epoch": 0.3932972550042381, "grad_norm": 3.421875, "learning_rate": 3.0426628347900996e-05, "loss": 1.553178071975708, "step": 1508 }, { "epoch": 0.39355806220251677, "grad_norm": 3.734375, "learning_rate": 3.0414170791818355e-05, "loss": 1.5697667598724365, "step": 1509 }, { "epoch": 0.39381886940079547, "grad_norm": 3.265625, "learning_rate": 3.0401707689758133e-05, "loss": 1.266568899154663, "step": 1510 }, { "epoch": 0.3940796765990741, "grad_norm": 3.578125, "learning_rate": 3.0389239048357437e-05, "loss": 1.5824153423309326, "step": 1511 }, { "epoch": 0.3943404837973528, "grad_norm": 4.1875, "learning_rate": 3.0376764874256337e-05, "loss": 1.812801718711853, "step": 1512 }, { "epoch": 0.39460129099563146, "grad_norm": 3.609375, "learning_rate": 3.036428517409785e-05, "loss": 1.650933027267456, "step": 1513 }, { "epoch": 0.39486209819391016, "grad_norm": 3.515625, "learning_rate": 3.0351799954527927e-05, "loss": 1.4789142608642578, "step": 1514 }, { "epoch": 0.3951229053921888, "grad_norm": 3.765625, "learning_rate": 3.033930922219548e-05, "loss": 1.656179428100586, "step": 1515 }, { "epoch": 0.3953837125904675, "grad_norm": 4.40625, "learning_rate": 3.032681298375233e-05, "loss": 1.6824867725372314, "step": 1516 }, { "epoch": 0.39564451978874615, "grad_norm": 4.0625, "learning_rate": 3.031431124585324e-05, "loss": 1.7417092323303223, "step": 1517 }, { "epoch": 0.39590532698702485, "grad_norm": 3.796875, "learning_rate": 3.0301804015155906e-05, "loss": 1.629252314567566, "step": 1518 }, { "epoch": 0.3961661341853035, "grad_norm": 3.5625, "learning_rate": 3.0289291298320952e-05, "loss": 1.5575857162475586, "step": 1519 }, { "epoch": 0.3964269413835822, "grad_norm": 3.953125, "learning_rate": 3.027677310201192e-05, "loss": 1.5468329191207886, "step": 1520 }, { "epoch": 0.39668774858186084, "grad_norm": 3.6875, "learning_rate": 3.0264249432895254e-05, "loss": 1.6860032081604004, "step": 1521 }, { "epoch": 0.39694855578013954, "grad_norm": 3.78125, "learning_rate": 3.0251720297640336e-05, "loss": 1.6084399223327637, "step": 1522 }, { "epoch": 0.3972093629784182, "grad_norm": 4.46875, "learning_rate": 3.0239185702919452e-05, "loss": 1.4937280416488647, "step": 1523 }, { "epoch": 0.3974701701766969, "grad_norm": 3.625, "learning_rate": 3.0226645655407795e-05, "loss": 1.6108170747756958, "step": 1524 }, { "epoch": 0.39773097737497554, "grad_norm": 4.34375, "learning_rate": 3.0214100161783445e-05, "loss": 1.752903699874878, "step": 1525 }, { "epoch": 0.39799178457325424, "grad_norm": 3.78125, "learning_rate": 3.0201549228727417e-05, "loss": 1.7260315418243408, "step": 1526 }, { "epoch": 0.3982525917715329, "grad_norm": 3.6875, "learning_rate": 3.018899286292359e-05, "loss": 1.5754700899124146, "step": 1527 }, { "epoch": 0.3985133989698116, "grad_norm": 4.3125, "learning_rate": 3.017643107105876e-05, "loss": 1.630751609802246, "step": 1528 }, { "epoch": 0.39877420616809023, "grad_norm": 3.890625, "learning_rate": 3.0163863859822596e-05, "loss": 1.6684372425079346, "step": 1529 }, { "epoch": 0.39903501336636893, "grad_norm": 3.984375, "learning_rate": 3.0151291235907643e-05, "loss": 1.8939087390899658, "step": 1530 }, { "epoch": 0.3992958205646476, "grad_norm": 3.828125, "learning_rate": 3.0138713206009376e-05, "loss": 1.4833927154541016, "step": 1531 }, { "epoch": 0.3995566277629263, "grad_norm": 3.96875, "learning_rate": 3.0126129776826095e-05, "loss": 1.6794382333755493, "step": 1532 }, { "epoch": 0.3998174349612049, "grad_norm": 3.6875, "learning_rate": 3.0113540955059e-05, "loss": 1.7359819412231445, "step": 1533 }, { "epoch": 0.4000782421594836, "grad_norm": 3.78125, "learning_rate": 3.0100946747412173e-05, "loss": 1.6286976337432861, "step": 1534 }, { "epoch": 0.40033904935776227, "grad_norm": 3.796875, "learning_rate": 3.0088347160592534e-05, "loss": 1.8103272914886475, "step": 1535 }, { "epoch": 0.40059985655604097, "grad_norm": 4.09375, "learning_rate": 3.0075742201309898e-05, "loss": 1.7070512771606445, "step": 1536 }, { "epoch": 0.4008606637543196, "grad_norm": 3.3125, "learning_rate": 3.0063131876276917e-05, "loss": 1.4519729614257812, "step": 1537 }, { "epoch": 0.4011214709525983, "grad_norm": 3.828125, "learning_rate": 3.0050516192209126e-05, "loss": 1.6558887958526611, "step": 1538 }, { "epoch": 0.40138227815087696, "grad_norm": 3.8125, "learning_rate": 3.003789515582489e-05, "loss": 1.4730424880981445, "step": 1539 }, { "epoch": 0.40164308534915566, "grad_norm": 3.875, "learning_rate": 3.0025268773845436e-05, "loss": 1.8217370510101318, "step": 1540 }, { "epoch": 0.4019038925474343, "grad_norm": 3.9375, "learning_rate": 3.001263705299484e-05, "loss": 1.6720647811889648, "step": 1541 }, { "epoch": 0.402164699745713, "grad_norm": 3.796875, "learning_rate": 3.0000000000000004e-05, "loss": 1.663779377937317, "step": 1542 }, { "epoch": 0.40242550694399165, "grad_norm": 3.6875, "learning_rate": 2.9987357621590693e-05, "loss": 1.4999834299087524, "step": 1543 }, { "epoch": 0.40268631414227035, "grad_norm": 3.71875, "learning_rate": 2.9974709924499498e-05, "loss": 1.628490924835205, "step": 1544 }, { "epoch": 0.402947121340549, "grad_norm": 4.125, "learning_rate": 2.9962056915461844e-05, "loss": 1.6849451065063477, "step": 1545 }, { "epoch": 0.4032079285388277, "grad_norm": 4.0625, "learning_rate": 2.994939860121598e-05, "loss": 1.4971082210540771, "step": 1546 }, { "epoch": 0.40346873573710634, "grad_norm": 4.0625, "learning_rate": 2.993673498850297e-05, "loss": 1.6485989093780518, "step": 1547 }, { "epoch": 0.40372954293538504, "grad_norm": 3.875, "learning_rate": 2.9924066084066737e-05, "loss": 1.4791631698608398, "step": 1548 }, { "epoch": 0.4039903501336637, "grad_norm": 3.921875, "learning_rate": 2.9911391894653975e-05, "loss": 1.6571983098983765, "step": 1549 }, { "epoch": 0.40425115733194233, "grad_norm": 3.703125, "learning_rate": 2.9898712427014227e-05, "loss": 1.6108894348144531, "step": 1550 }, { "epoch": 0.40451196453022104, "grad_norm": 3.546875, "learning_rate": 2.9886027687899843e-05, "loss": 1.719677209854126, "step": 1551 }, { "epoch": 0.4047727717284997, "grad_norm": 3.890625, "learning_rate": 2.9873337684065945e-05, "loss": 1.770775318145752, "step": 1552 }, { "epoch": 0.4050335789267784, "grad_norm": 3.796875, "learning_rate": 2.9860642422270517e-05, "loss": 1.6228448152542114, "step": 1553 }, { "epoch": 0.405294386125057, "grad_norm": 3.65625, "learning_rate": 2.9847941909274295e-05, "loss": 1.7396163940429688, "step": 1554 }, { "epoch": 0.4055551933233357, "grad_norm": 3.375, "learning_rate": 2.983523615184083e-05, "loss": 1.175154685974121, "step": 1555 }, { "epoch": 0.4058160005216144, "grad_norm": 4.09375, "learning_rate": 2.9822525156736467e-05, "loss": 1.6514532566070557, "step": 1556 }, { "epoch": 0.4060768077198931, "grad_norm": 3.71875, "learning_rate": 2.980980893073034e-05, "loss": 1.2293200492858887, "step": 1557 }, { "epoch": 0.4063376149181717, "grad_norm": 3.6875, "learning_rate": 2.9797087480594366e-05, "loss": 1.437143087387085, "step": 1558 }, { "epoch": 0.4065984221164504, "grad_norm": 3.515625, "learning_rate": 2.9784360813103236e-05, "loss": 1.4445056915283203, "step": 1559 }, { "epoch": 0.40685922931472907, "grad_norm": 3.734375, "learning_rate": 2.9771628935034434e-05, "loss": 1.692763328552246, "step": 1560 }, { "epoch": 0.40712003651300777, "grad_norm": 3.828125, "learning_rate": 2.9758891853168213e-05, "loss": 1.7990310192108154, "step": 1561 }, { "epoch": 0.4073808437112864, "grad_norm": 3.71875, "learning_rate": 2.97461495742876e-05, "loss": 1.3989152908325195, "step": 1562 }, { "epoch": 0.4076416509095651, "grad_norm": 3.796875, "learning_rate": 2.973340210517839e-05, "loss": 1.4927902221679688, "step": 1563 }, { "epoch": 0.40790245810784376, "grad_norm": 3.5625, "learning_rate": 2.9720649452629123e-05, "loss": 1.634340524673462, "step": 1564 }, { "epoch": 0.40816326530612246, "grad_norm": 3.90625, "learning_rate": 2.9707891623431126e-05, "loss": 1.6583830118179321, "step": 1565 }, { "epoch": 0.4084240725044011, "grad_norm": 3.71875, "learning_rate": 2.9695128624378468e-05, "loss": 1.4272788763046265, "step": 1566 }, { "epoch": 0.4086848797026798, "grad_norm": 3.765625, "learning_rate": 2.968236046226798e-05, "loss": 1.62437105178833, "step": 1567 }, { "epoch": 0.40894568690095845, "grad_norm": 3.484375, "learning_rate": 2.9669587143899235e-05, "loss": 1.4198951721191406, "step": 1568 }, { "epoch": 0.40920649409923715, "grad_norm": 3.6875, "learning_rate": 2.965680867607455e-05, "loss": 1.5796867609024048, "step": 1569 }, { "epoch": 0.4094673012975158, "grad_norm": 3.3125, "learning_rate": 2.9644025065599e-05, "loss": 1.423644781112671, "step": 1570 }, { "epoch": 0.4097281084957945, "grad_norm": 3.90625, "learning_rate": 2.9631236319280376e-05, "loss": 1.70688796043396, "step": 1571 }, { "epoch": 0.40998891569407314, "grad_norm": 3.875, "learning_rate": 2.9618442443929218e-05, "loss": 1.974204182624817, "step": 1572 }, { "epoch": 0.41024972289235184, "grad_norm": 3.609375, "learning_rate": 2.9605643446358798e-05, "loss": 1.5921353101730347, "step": 1573 }, { "epoch": 0.4105105300906305, "grad_norm": 4.03125, "learning_rate": 2.959283933338511e-05, "loss": 1.6732794046401978, "step": 1574 }, { "epoch": 0.4107713372889092, "grad_norm": 3.640625, "learning_rate": 2.9580030111826882e-05, "loss": 1.6262917518615723, "step": 1575 }, { "epoch": 0.41103214448718783, "grad_norm": 3.828125, "learning_rate": 2.9567215788505544e-05, "loss": 1.9774980545043945, "step": 1576 }, { "epoch": 0.41129295168546653, "grad_norm": 3.671875, "learning_rate": 2.955439637024526e-05, "loss": 1.4540205001831055, "step": 1577 }, { "epoch": 0.4115537588837452, "grad_norm": 3.65625, "learning_rate": 2.9541571863872903e-05, "loss": 1.752079725265503, "step": 1578 }, { "epoch": 0.4118145660820239, "grad_norm": 3.75, "learning_rate": 2.9528742276218053e-05, "loss": 1.6492635011672974, "step": 1579 }, { "epoch": 0.4120753732803025, "grad_norm": 3.625, "learning_rate": 2.9515907614113e-05, "loss": 1.4227832555770874, "step": 1580 }, { "epoch": 0.4123361804785812, "grad_norm": 3.59375, "learning_rate": 2.9503067884392726e-05, "loss": 1.2912888526916504, "step": 1581 }, { "epoch": 0.41259698767685987, "grad_norm": 4.21875, "learning_rate": 2.9490223093894924e-05, "loss": 1.5593420267105103, "step": 1582 }, { "epoch": 0.4128577948751386, "grad_norm": 3.9375, "learning_rate": 2.9477373249459974e-05, "loss": 1.7273547649383545, "step": 1583 }, { "epoch": 0.4131186020734172, "grad_norm": 3.484375, "learning_rate": 2.946451835793096e-05, "loss": 1.5384719371795654, "step": 1584 }, { "epoch": 0.4133794092716959, "grad_norm": 3.875, "learning_rate": 2.945165842615364e-05, "loss": 1.8384695053100586, "step": 1585 }, { "epoch": 0.41364021646997456, "grad_norm": 3.84375, "learning_rate": 2.943879346097645e-05, "loss": 1.7200520038604736, "step": 1586 }, { "epoch": 0.41390102366825327, "grad_norm": 3.984375, "learning_rate": 2.942592346925053e-05, "loss": 1.5968172550201416, "step": 1587 }, { "epoch": 0.4141618308665319, "grad_norm": 3.65625, "learning_rate": 2.941304845782968e-05, "loss": 1.5619614124298096, "step": 1588 }, { "epoch": 0.4144226380648106, "grad_norm": 3.828125, "learning_rate": 2.9400168433570378e-05, "loss": 1.551422119140625, "step": 1589 }, { "epoch": 0.41468344526308926, "grad_norm": 3.8125, "learning_rate": 2.938728340333177e-05, "loss": 1.6820770502090454, "step": 1590 }, { "epoch": 0.41494425246136796, "grad_norm": 3.78125, "learning_rate": 2.9374393373975663e-05, "loss": 1.576513648033142, "step": 1591 }, { "epoch": 0.4152050596596466, "grad_norm": 4.0625, "learning_rate": 2.936149835236655e-05, "loss": 1.6498585939407349, "step": 1592 }, { "epoch": 0.4154658668579253, "grad_norm": 3.5, "learning_rate": 2.9348598345371545e-05, "loss": 1.6826300621032715, "step": 1593 }, { "epoch": 0.41572667405620395, "grad_norm": 3.78125, "learning_rate": 2.933569335986044e-05, "loss": 1.864450216293335, "step": 1594 }, { "epoch": 0.41598748125448265, "grad_norm": 3.9375, "learning_rate": 2.9322783402705687e-05, "loss": 1.5203943252563477, "step": 1595 }, { "epoch": 0.4162482884527613, "grad_norm": 3.875, "learning_rate": 2.9309868480782356e-05, "loss": 1.7502679824829102, "step": 1596 }, { "epoch": 0.41650909565104, "grad_norm": 3.8125, "learning_rate": 2.9296948600968194e-05, "loss": 1.60343599319458, "step": 1597 }, { "epoch": 0.41676990284931864, "grad_norm": 3.59375, "learning_rate": 2.928402377014356e-05, "loss": 1.7471981048583984, "step": 1598 }, { "epoch": 0.4170307100475973, "grad_norm": 3.84375, "learning_rate": 2.9271093995191467e-05, "loss": 1.9182765483856201, "step": 1599 }, { "epoch": 0.417291517245876, "grad_norm": 3.859375, "learning_rate": 2.9258159282997555e-05, "loss": 1.9174833297729492, "step": 1600 }, { "epoch": 0.41755232444415463, "grad_norm": 3.609375, "learning_rate": 2.9245219640450103e-05, "loss": 1.5726749897003174, "step": 1601 }, { "epoch": 0.41781313164243333, "grad_norm": 3.9375, "learning_rate": 2.9232275074439996e-05, "loss": 1.4135874509811401, "step": 1602 }, { "epoch": 0.418073938840712, "grad_norm": 3.578125, "learning_rate": 2.9219325591860753e-05, "loss": 1.641174554824829, "step": 1603 }, { "epoch": 0.4183347460389907, "grad_norm": 3.890625, "learning_rate": 2.9206371199608518e-05, "loss": 1.7001125812530518, "step": 1604 }, { "epoch": 0.4185955532372693, "grad_norm": 4.28125, "learning_rate": 2.9193411904582033e-05, "loss": 1.8250946998596191, "step": 1605 }, { "epoch": 0.418856360435548, "grad_norm": 3.8125, "learning_rate": 2.9180447713682664e-05, "loss": 1.8040634393692017, "step": 1606 }, { "epoch": 0.41911716763382667, "grad_norm": 3.96875, "learning_rate": 2.9167478633814376e-05, "loss": 1.6127195358276367, "step": 1607 }, { "epoch": 0.41937797483210537, "grad_norm": 3.65625, "learning_rate": 2.9154504671883747e-05, "loss": 1.5782577991485596, "step": 1608 }, { "epoch": 0.419638782030384, "grad_norm": 3.4375, "learning_rate": 2.9141525834799952e-05, "loss": 1.6237680912017822, "step": 1609 }, { "epoch": 0.4198995892286627, "grad_norm": 3.8125, "learning_rate": 2.912854212947475e-05, "loss": 1.5012826919555664, "step": 1610 }, { "epoch": 0.42016039642694136, "grad_norm": 4.15625, "learning_rate": 2.9115553562822508e-05, "loss": 1.7397178411483765, "step": 1611 }, { "epoch": 0.42042120362522006, "grad_norm": 3.78125, "learning_rate": 2.9102560141760178e-05, "loss": 1.4469175338745117, "step": 1612 }, { "epoch": 0.4206820108234987, "grad_norm": 3.75, "learning_rate": 2.908956187320729e-05, "loss": 1.3980048894882202, "step": 1613 }, { "epoch": 0.4209428180217774, "grad_norm": 3.546875, "learning_rate": 2.9076558764085966e-05, "loss": 1.52538001537323, "step": 1614 }, { "epoch": 0.42120362522005605, "grad_norm": 4.0, "learning_rate": 2.9063550821320897e-05, "loss": 1.826903223991394, "step": 1615 }, { "epoch": 0.42146443241833476, "grad_norm": 3.4375, "learning_rate": 2.9050538051839355e-05, "loss": 1.2689564228057861, "step": 1616 }, { "epoch": 0.4217252396166134, "grad_norm": 3.640625, "learning_rate": 2.903752046257117e-05, "loss": 1.546547532081604, "step": 1617 }, { "epoch": 0.4219860468148921, "grad_norm": 3.625, "learning_rate": 2.9024498060448758e-05, "loss": 1.3963298797607422, "step": 1618 }, { "epoch": 0.42224685401317075, "grad_norm": 3.859375, "learning_rate": 2.901147085240709e-05, "loss": 1.47343909740448, "step": 1619 }, { "epoch": 0.42250766121144945, "grad_norm": 3.890625, "learning_rate": 2.899843884538368e-05, "loss": 1.4737505912780762, "step": 1620 }, { "epoch": 0.4227684684097281, "grad_norm": 3.578125, "learning_rate": 2.8985402046318625e-05, "loss": 1.5501151084899902, "step": 1621 }, { "epoch": 0.4230292756080068, "grad_norm": 4.0, "learning_rate": 2.8972360462154557e-05, "loss": 1.7948087453842163, "step": 1622 }, { "epoch": 0.42329008280628544, "grad_norm": 3.90625, "learning_rate": 2.8959314099836654e-05, "loss": 1.4594502449035645, "step": 1623 }, { "epoch": 0.42355089000456414, "grad_norm": 3.828125, "learning_rate": 2.8946262966312652e-05, "loss": 1.6952762603759766, "step": 1624 }, { "epoch": 0.4238116972028428, "grad_norm": 3.6875, "learning_rate": 2.893320706853282e-05, "loss": 1.5199551582336426, "step": 1625 }, { "epoch": 0.4240725044011215, "grad_norm": 3.8125, "learning_rate": 2.892014641344997e-05, "loss": 1.7249001264572144, "step": 1626 }, { "epoch": 0.42433331159940013, "grad_norm": 3.828125, "learning_rate": 2.890708100801943e-05, "loss": 1.660285472869873, "step": 1627 }, { "epoch": 0.42459411879767883, "grad_norm": 3.765625, "learning_rate": 2.8894010859199073e-05, "loss": 1.7036306858062744, "step": 1628 }, { "epoch": 0.4248549259959575, "grad_norm": 3.765625, "learning_rate": 2.8880935973949304e-05, "loss": 1.5827271938323975, "step": 1629 }, { "epoch": 0.4251157331942362, "grad_norm": 4.125, "learning_rate": 2.8867856359233027e-05, "loss": 1.3944931030273438, "step": 1630 }, { "epoch": 0.4253765403925148, "grad_norm": 3.9375, "learning_rate": 2.8854772022015694e-05, "loss": 1.742563247680664, "step": 1631 }, { "epoch": 0.4256373475907935, "grad_norm": 3.578125, "learning_rate": 2.8841682969265242e-05, "loss": 1.4844765663146973, "step": 1632 }, { "epoch": 0.42589815478907217, "grad_norm": 3.78125, "learning_rate": 2.8828589207952152e-05, "loss": 1.5548579692840576, "step": 1633 }, { "epoch": 0.42615896198735087, "grad_norm": 3.53125, "learning_rate": 2.8815490745049372e-05, "loss": 1.2696446180343628, "step": 1634 }, { "epoch": 0.4264197691856295, "grad_norm": 3.890625, "learning_rate": 2.8802387587532395e-05, "loss": 1.6950145959854126, "step": 1635 }, { "epoch": 0.4266805763839082, "grad_norm": 3.6875, "learning_rate": 2.8789279742379196e-05, "loss": 1.7176172733306885, "step": 1636 }, { "epoch": 0.42694138358218686, "grad_norm": 4.5, "learning_rate": 2.8776167216570225e-05, "loss": 1.8122673034667969, "step": 1637 }, { "epoch": 0.42720219078046556, "grad_norm": 3.953125, "learning_rate": 2.876305001708847e-05, "loss": 1.6122087240219116, "step": 1638 }, { "epoch": 0.4274629979787442, "grad_norm": 3.90625, "learning_rate": 2.874992815091937e-05, "loss": 1.713416337966919, "step": 1639 }, { "epoch": 0.4277238051770229, "grad_norm": 3.703125, "learning_rate": 2.873680162505087e-05, "loss": 1.697394609451294, "step": 1640 }, { "epoch": 0.42798461237530155, "grad_norm": 4.0625, "learning_rate": 2.8723670446473373e-05, "loss": 1.6875641345977783, "step": 1641 }, { "epoch": 0.42824541957358025, "grad_norm": 3.78125, "learning_rate": 2.8710534622179797e-05, "loss": 1.820279598236084, "step": 1642 }, { "epoch": 0.4285062267718589, "grad_norm": 3.734375, "learning_rate": 2.8697394159165505e-05, "loss": 1.5553900003433228, "step": 1643 }, { "epoch": 0.4287670339701376, "grad_norm": 5.375, "learning_rate": 2.868424906442833e-05, "loss": 1.6352019309997559, "step": 1644 }, { "epoch": 0.42902784116841625, "grad_norm": 3.625, "learning_rate": 2.867109934496859e-05, "loss": 1.4842324256896973, "step": 1645 }, { "epoch": 0.42928864836669495, "grad_norm": 3.859375, "learning_rate": 2.865794500778905e-05, "loss": 1.4266735315322876, "step": 1646 }, { "epoch": 0.4295494555649736, "grad_norm": 3.890625, "learning_rate": 2.864478605989494e-05, "loss": 1.5456125736236572, "step": 1647 }, { "epoch": 0.4298102627632523, "grad_norm": 3.515625, "learning_rate": 2.8631622508293957e-05, "loss": 1.5186576843261719, "step": 1648 }, { "epoch": 0.43007106996153094, "grad_norm": 3.859375, "learning_rate": 2.8618454359996217e-05, "loss": 1.5909136533737183, "step": 1649 }, { "epoch": 0.4303318771598096, "grad_norm": 4.03125, "learning_rate": 2.8605281622014315e-05, "loss": 1.7734607458114624, "step": 1650 }, { "epoch": 0.4305926843580883, "grad_norm": 4.0, "learning_rate": 2.8592104301363285e-05, "loss": 1.7636637687683105, "step": 1651 }, { "epoch": 0.43085349155636693, "grad_norm": 3.890625, "learning_rate": 2.8578922405060593e-05, "loss": 1.7131767272949219, "step": 1652 }, { "epoch": 0.43111429875464563, "grad_norm": 3.796875, "learning_rate": 2.8565735940126146e-05, "loss": 1.5171430110931396, "step": 1653 }, { "epoch": 0.4313751059529243, "grad_norm": 4.28125, "learning_rate": 2.855254491358227e-05, "loss": 1.241947889328003, "step": 1654 }, { "epoch": 0.431635913151203, "grad_norm": 4.0, "learning_rate": 2.8539349332453758e-05, "loss": 1.6244800090789795, "step": 1655 }, { "epoch": 0.4318967203494816, "grad_norm": 3.828125, "learning_rate": 2.852614920376778e-05, "loss": 1.6008328199386597, "step": 1656 }, { "epoch": 0.4321575275477603, "grad_norm": 3.828125, "learning_rate": 2.8512944534553968e-05, "loss": 1.7080821990966797, "step": 1657 }, { "epoch": 0.43241833474603897, "grad_norm": 4.1875, "learning_rate": 2.849973533184435e-05, "loss": 1.818986177444458, "step": 1658 }, { "epoch": 0.43267914194431767, "grad_norm": 3.796875, "learning_rate": 2.8486521602673368e-05, "loss": 1.7770615816116333, "step": 1659 }, { "epoch": 0.4329399491425963, "grad_norm": 3.6875, "learning_rate": 2.8473303354077894e-05, "loss": 1.6648075580596924, "step": 1660 }, { "epoch": 0.433200756340875, "grad_norm": 3.84375, "learning_rate": 2.8460080593097177e-05, "loss": 1.87071692943573, "step": 1661 }, { "epoch": 0.43346156353915366, "grad_norm": 3.59375, "learning_rate": 2.8446853326772902e-05, "loss": 1.5345838069915771, "step": 1662 }, { "epoch": 0.43372237073743236, "grad_norm": 4.0, "learning_rate": 2.8433621562149122e-05, "loss": 1.5579233169555664, "step": 1663 }, { "epoch": 0.433983177935711, "grad_norm": 3.765625, "learning_rate": 2.8420385306272303e-05, "loss": 1.5652178525924683, "step": 1664 }, { "epoch": 0.4342439851339897, "grad_norm": 3.890625, "learning_rate": 2.8407144566191315e-05, "loss": 1.7516753673553467, "step": 1665 }, { "epoch": 0.43450479233226835, "grad_norm": 3.765625, "learning_rate": 2.839389934895738e-05, "loss": 1.4254251718521118, "step": 1666 }, { "epoch": 0.43476559953054705, "grad_norm": 3.609375, "learning_rate": 2.8380649661624135e-05, "loss": 1.5584107637405396, "step": 1667 }, { "epoch": 0.4350264067288257, "grad_norm": 3.75, "learning_rate": 2.836739551124759e-05, "loss": 1.6238042116165161, "step": 1668 }, { "epoch": 0.4352872139271044, "grad_norm": 3.796875, "learning_rate": 2.8354136904886123e-05, "loss": 1.6247994899749756, "step": 1669 }, { "epoch": 0.43554802112538304, "grad_norm": 3.6875, "learning_rate": 2.8340873849600502e-05, "loss": 1.4396026134490967, "step": 1670 }, { "epoch": 0.43580882832366175, "grad_norm": 3.828125, "learning_rate": 2.832760635245383e-05, "loss": 1.3682128190994263, "step": 1671 }, { "epoch": 0.4360696355219404, "grad_norm": 4.0, "learning_rate": 2.831433442051163e-05, "loss": 1.592055082321167, "step": 1672 }, { "epoch": 0.4363304427202191, "grad_norm": 3.65625, "learning_rate": 2.830105806084174e-05, "loss": 1.4300789833068848, "step": 1673 }, { "epoch": 0.43659124991849774, "grad_norm": 3.625, "learning_rate": 2.828777728051437e-05, "loss": 1.423119068145752, "step": 1674 }, { "epoch": 0.43685205711677644, "grad_norm": 3.90625, "learning_rate": 2.8274492086602085e-05, "loss": 1.757512092590332, "step": 1675 }, { "epoch": 0.4371128643150551, "grad_norm": 3.84375, "learning_rate": 2.826120248617981e-05, "loss": 1.5190269947052002, "step": 1676 }, { "epoch": 0.4373736715133338, "grad_norm": 4.0625, "learning_rate": 2.8247908486324807e-05, "loss": 1.508905291557312, "step": 1677 }, { "epoch": 0.43763447871161243, "grad_norm": 3.65625, "learning_rate": 2.8234610094116676e-05, "loss": 1.6778496503829956, "step": 1678 }, { "epoch": 0.43789528590989113, "grad_norm": 3.671875, "learning_rate": 2.8221307316637365e-05, "loss": 1.9285204410552979, "step": 1679 }, { "epoch": 0.4381560931081698, "grad_norm": 3.953125, "learning_rate": 2.8208000160971153e-05, "loss": 1.6861218214035034, "step": 1680 }, { "epoch": 0.4384169003064485, "grad_norm": 3.78125, "learning_rate": 2.8194688634204647e-05, "loss": 1.6404638290405273, "step": 1681 }, { "epoch": 0.4386777075047271, "grad_norm": 3.84375, "learning_rate": 2.8181372743426805e-05, "loss": 1.6675424575805664, "step": 1682 }, { "epoch": 0.4389385147030058, "grad_norm": 3.8125, "learning_rate": 2.8168052495728866e-05, "loss": 1.6485905647277832, "step": 1683 }, { "epoch": 0.43919932190128447, "grad_norm": 4.1875, "learning_rate": 2.8154727898204434e-05, "loss": 1.7377148866653442, "step": 1684 }, { "epoch": 0.43946012909956317, "grad_norm": 3.703125, "learning_rate": 2.8141398957949397e-05, "loss": 1.6704241037368774, "step": 1685 }, { "epoch": 0.4397209362978418, "grad_norm": 3.703125, "learning_rate": 2.8128065682061975e-05, "loss": 1.7559125423431396, "step": 1686 }, { "epoch": 0.4399817434961205, "grad_norm": 3.8125, "learning_rate": 2.8114728077642693e-05, "loss": 1.6977081298828125, "step": 1687 }, { "epoch": 0.44024255069439916, "grad_norm": 3.53125, "learning_rate": 2.8101386151794362e-05, "loss": 1.3879613876342773, "step": 1688 }, { "epoch": 0.44050335789267786, "grad_norm": 3.765625, "learning_rate": 2.8088039911622133e-05, "loss": 1.4685391187667847, "step": 1689 }, { "epoch": 0.4407641650909565, "grad_norm": 3.75, "learning_rate": 2.8074689364233414e-05, "loss": 1.8438893556594849, "step": 1690 }, { "epoch": 0.4410249722892352, "grad_norm": 3.65625, "learning_rate": 2.8061334516737936e-05, "loss": 1.6127426624298096, "step": 1691 }, { "epoch": 0.44128577948751385, "grad_norm": 3.453125, "learning_rate": 2.80479753762477e-05, "loss": 1.3222421407699585, "step": 1692 }, { "epoch": 0.44154658668579255, "grad_norm": 3.71875, "learning_rate": 2.8034611949877005e-05, "loss": 1.6235663890838623, "step": 1693 }, { "epoch": 0.4418073938840712, "grad_norm": 4.4375, "learning_rate": 2.8021244244742437e-05, "loss": 1.9033315181732178, "step": 1694 }, { "epoch": 0.4420682010823499, "grad_norm": 3.75, "learning_rate": 2.8007872267962844e-05, "loss": 1.569352388381958, "step": 1695 }, { "epoch": 0.44232900828062854, "grad_norm": 3.484375, "learning_rate": 2.7994496026659363e-05, "loss": 1.3566945791244507, "step": 1696 }, { "epoch": 0.44258981547890724, "grad_norm": 4.0, "learning_rate": 2.798111552795539e-05, "loss": 1.5274192094802856, "step": 1697 }, { "epoch": 0.4428506226771859, "grad_norm": 3.625, "learning_rate": 2.7967730778976596e-05, "loss": 1.5132932662963867, "step": 1698 }, { "epoch": 0.44311142987546454, "grad_norm": 3.90625, "learning_rate": 2.795434178685093e-05, "loss": 1.6494386196136475, "step": 1699 }, { "epoch": 0.44337223707374324, "grad_norm": 3.671875, "learning_rate": 2.7940948558708567e-05, "loss": 1.6655917167663574, "step": 1700 }, { "epoch": 0.4436330442720219, "grad_norm": 3.609375, "learning_rate": 2.792755110168196e-05, "loss": 1.5109964609146118, "step": 1701 }, { "epoch": 0.4438938514703006, "grad_norm": 3.625, "learning_rate": 2.791414942290582e-05, "loss": 1.4205164909362793, "step": 1702 }, { "epoch": 0.4441546586685792, "grad_norm": 3.890625, "learning_rate": 2.7900743529517087e-05, "loss": 1.5652105808258057, "step": 1703 }, { "epoch": 0.44441546586685793, "grad_norm": 3.546875, "learning_rate": 2.7887333428654955e-05, "loss": 1.481246829032898, "step": 1704 }, { "epoch": 0.4446762730651366, "grad_norm": 3.5625, "learning_rate": 2.7873919127460857e-05, "loss": 1.777045726776123, "step": 1705 }, { "epoch": 0.4449370802634153, "grad_norm": 3.40625, "learning_rate": 2.7860500633078475e-05, "loss": 1.5149214267730713, "step": 1706 }, { "epoch": 0.4451978874616939, "grad_norm": 3.65625, "learning_rate": 2.7847077952653704e-05, "loss": 1.6715518236160278, "step": 1707 }, { "epoch": 0.4454586946599726, "grad_norm": 3.546875, "learning_rate": 2.7833651093334686e-05, "loss": 1.6214019060134888, "step": 1708 }, { "epoch": 0.44571950185825127, "grad_norm": 3.734375, "learning_rate": 2.7820220062271768e-05, "loss": 1.4817593097686768, "step": 1709 }, { "epoch": 0.44598030905652997, "grad_norm": 3.703125, "learning_rate": 2.780678486661753e-05, "loss": 1.4205501079559326, "step": 1710 }, { "epoch": 0.4462411162548086, "grad_norm": 3.5, "learning_rate": 2.779334551352679e-05, "loss": 1.573449730873108, "step": 1711 }, { "epoch": 0.4465019234530873, "grad_norm": 3.546875, "learning_rate": 2.7779902010156542e-05, "loss": 1.5362462997436523, "step": 1712 }, { "epoch": 0.44676273065136596, "grad_norm": 3.75, "learning_rate": 2.776645436366602e-05, "loss": 1.4577736854553223, "step": 1713 }, { "epoch": 0.44702353784964466, "grad_norm": 3.390625, "learning_rate": 2.7753002581216636e-05, "loss": 1.5574638843536377, "step": 1714 }, { "epoch": 0.4472843450479233, "grad_norm": 3.46875, "learning_rate": 2.7739546669972046e-05, "loss": 1.3225568532943726, "step": 1715 }, { "epoch": 0.447545152246202, "grad_norm": 3.546875, "learning_rate": 2.7726086637098064e-05, "loss": 1.5711350440979004, "step": 1716 }, { "epoch": 0.44780595944448065, "grad_norm": 4.125, "learning_rate": 2.771262248976272e-05, "loss": 1.6645259857177734, "step": 1717 }, { "epoch": 0.44806676664275935, "grad_norm": 3.5, "learning_rate": 2.769915423513623e-05, "loss": 1.3624897003173828, "step": 1718 }, { "epoch": 0.448327573841038, "grad_norm": 3.796875, "learning_rate": 2.7685681880390995e-05, "loss": 1.7277605533599854, "step": 1719 }, { "epoch": 0.4485883810393167, "grad_norm": 3.890625, "learning_rate": 2.7672205432701607e-05, "loss": 1.7653427124023438, "step": 1720 }, { "epoch": 0.44884918823759534, "grad_norm": 3.734375, "learning_rate": 2.7658724899244833e-05, "loss": 1.591906189918518, "step": 1721 }, { "epoch": 0.44910999543587404, "grad_norm": 3.609375, "learning_rate": 2.7645240287199606e-05, "loss": 1.6018164157867432, "step": 1722 }, { "epoch": 0.4493708026341527, "grad_norm": 3.53125, "learning_rate": 2.7631751603747058e-05, "loss": 1.5946130752563477, "step": 1723 }, { "epoch": 0.4496316098324314, "grad_norm": 3.53125, "learning_rate": 2.7618258856070458e-05, "loss": 1.4172343015670776, "step": 1724 }, { "epoch": 0.44989241703071003, "grad_norm": 3.875, "learning_rate": 2.7604762051355262e-05, "loss": 1.9235308170318604, "step": 1725 }, { "epoch": 0.45015322422898874, "grad_norm": 3.421875, "learning_rate": 2.7591261196789072e-05, "loss": 1.4664264917373657, "step": 1726 }, { "epoch": 0.4504140314272674, "grad_norm": 3.640625, "learning_rate": 2.7577756299561654e-05, "loss": 1.406175136566162, "step": 1727 }, { "epoch": 0.4506748386255461, "grad_norm": 3.546875, "learning_rate": 2.7564247366864926e-05, "loss": 1.6239509582519531, "step": 1728 }, { "epoch": 0.4509356458238247, "grad_norm": 3.5, "learning_rate": 2.7550734405892954e-05, "loss": 1.8283617496490479, "step": 1729 }, { "epoch": 0.4511964530221034, "grad_norm": 3.546875, "learning_rate": 2.753721742384196e-05, "loss": 1.433314561843872, "step": 1730 }, { "epoch": 0.4514572602203821, "grad_norm": 3.640625, "learning_rate": 2.7523696427910272e-05, "loss": 1.6373162269592285, "step": 1731 }, { "epoch": 0.4517180674186608, "grad_norm": 3.6875, "learning_rate": 2.7510171425298408e-05, "loss": 1.5311187505722046, "step": 1732 }, { "epoch": 0.4519788746169394, "grad_norm": 3.6875, "learning_rate": 2.7496642423208975e-05, "loss": 1.498328685760498, "step": 1733 }, { "epoch": 0.4522396818152181, "grad_norm": 3.703125, "learning_rate": 2.7483109428846736e-05, "loss": 1.7286779880523682, "step": 1734 }, { "epoch": 0.45250048901349677, "grad_norm": 3.9375, "learning_rate": 2.7469572449418564e-05, "loss": 1.7031946182250977, "step": 1735 }, { "epoch": 0.45276129621177547, "grad_norm": 3.9375, "learning_rate": 2.7456031492133472e-05, "loss": 1.6917610168457031, "step": 1736 }, { "epoch": 0.4530221034100541, "grad_norm": 3.765625, "learning_rate": 2.7442486564202577e-05, "loss": 1.4163310527801514, "step": 1737 }, { "epoch": 0.4532829106083328, "grad_norm": 3.875, "learning_rate": 2.742893767283911e-05, "loss": 1.7884117364883423, "step": 1738 }, { "epoch": 0.45354371780661146, "grad_norm": 3.515625, "learning_rate": 2.741538482525842e-05, "loss": 1.5097811222076416, "step": 1739 }, { "epoch": 0.45380452500489016, "grad_norm": 3.75, "learning_rate": 2.740182802867796e-05, "loss": 1.71995210647583, "step": 1740 }, { "epoch": 0.4540653322031688, "grad_norm": 3.703125, "learning_rate": 2.738826729031728e-05, "loss": 1.719955563545227, "step": 1741 }, { "epoch": 0.4543261394014475, "grad_norm": 3.78125, "learning_rate": 2.7374702617398052e-05, "loss": 1.5219910144805908, "step": 1742 }, { "epoch": 0.45458694659972615, "grad_norm": 4.25, "learning_rate": 2.7361134017144012e-05, "loss": 1.7454659938812256, "step": 1743 }, { "epoch": 0.45484775379800485, "grad_norm": 3.546875, "learning_rate": 2.7347561496781007e-05, "loss": 1.7231475114822388, "step": 1744 }, { "epoch": 0.4551085609962835, "grad_norm": 3.46875, "learning_rate": 2.7333985063536963e-05, "loss": 1.6025141477584839, "step": 1745 }, { "epoch": 0.4553693681945622, "grad_norm": 4.0, "learning_rate": 2.73204047246419e-05, "loss": 1.6790502071380615, "step": 1746 }, { "epoch": 0.45563017539284084, "grad_norm": 3.734375, "learning_rate": 2.7306820487327906e-05, "loss": 1.6374878883361816, "step": 1747 }, { "epoch": 0.45589098259111954, "grad_norm": 3.578125, "learning_rate": 2.7293232358829146e-05, "loss": 1.6954729557037354, "step": 1748 }, { "epoch": 0.4561517897893982, "grad_norm": 3.734375, "learning_rate": 2.7279640346381877e-05, "loss": 1.764048457145691, "step": 1749 }, { "epoch": 0.45641259698767683, "grad_norm": 3.578125, "learning_rate": 2.7266044457224394e-05, "loss": 1.6400312185287476, "step": 1750 }, { "epoch": 0.45667340418595553, "grad_norm": 3.59375, "learning_rate": 2.7252444698597078e-05, "loss": 1.5373930931091309, "step": 1751 }, { "epoch": 0.4569342113842342, "grad_norm": 3.546875, "learning_rate": 2.723884107774236e-05, "loss": 1.3972481489181519, "step": 1752 }, { "epoch": 0.4571950185825129, "grad_norm": 3.5625, "learning_rate": 2.7225233601904738e-05, "loss": 1.7710322141647339, "step": 1753 }, { "epoch": 0.4574558257807915, "grad_norm": 3.4375, "learning_rate": 2.721162227833076e-05, "loss": 1.5051907300949097, "step": 1754 }, { "epoch": 0.4577166329790702, "grad_norm": 4.0625, "learning_rate": 2.7198007114269005e-05, "loss": 1.6756287813186646, "step": 1755 }, { "epoch": 0.45797744017734887, "grad_norm": 3.59375, "learning_rate": 2.7184388116970124e-05, "loss": 1.4749387502670288, "step": 1756 }, { "epoch": 0.45823824737562757, "grad_norm": 3.84375, "learning_rate": 2.7170765293686792e-05, "loss": 1.9337652921676636, "step": 1757 }, { "epoch": 0.4584990545739062, "grad_norm": 3.53125, "learning_rate": 2.715713865167373e-05, "loss": 1.5626142024993896, "step": 1758 }, { "epoch": 0.4587598617721849, "grad_norm": 3.453125, "learning_rate": 2.7143508198187695e-05, "loss": 1.5024030208587646, "step": 1759 }, { "epoch": 0.45902066897046356, "grad_norm": 3.609375, "learning_rate": 2.712987394048746e-05, "loss": 1.4093332290649414, "step": 1760 }, { "epoch": 0.45928147616874226, "grad_norm": 3.71875, "learning_rate": 2.7116235885833835e-05, "loss": 1.5896199941635132, "step": 1761 }, { "epoch": 0.4595422833670209, "grad_norm": 3.609375, "learning_rate": 2.710259404148965e-05, "loss": 1.6533608436584473, "step": 1762 }, { "epoch": 0.4598030905652996, "grad_norm": 3.53125, "learning_rate": 2.708894841471975e-05, "loss": 1.6842036247253418, "step": 1763 }, { "epoch": 0.46006389776357826, "grad_norm": 3.65625, "learning_rate": 2.7075299012791003e-05, "loss": 1.488107442855835, "step": 1764 }, { "epoch": 0.46032470496185696, "grad_norm": 3.703125, "learning_rate": 2.7061645842972266e-05, "loss": 1.551996111869812, "step": 1765 }, { "epoch": 0.4605855121601356, "grad_norm": 3.671875, "learning_rate": 2.704798891253444e-05, "loss": 1.6990528106689453, "step": 1766 }, { "epoch": 0.4608463193584143, "grad_norm": 3.609375, "learning_rate": 2.703432822875039e-05, "loss": 1.503858208656311, "step": 1767 }, { "epoch": 0.46110712655669295, "grad_norm": 3.609375, "learning_rate": 2.7020663798895003e-05, "loss": 1.5476514101028442, "step": 1768 }, { "epoch": 0.46136793375497165, "grad_norm": 3.71875, "learning_rate": 2.7006995630245155e-05, "loss": 1.4874258041381836, "step": 1769 }, { "epoch": 0.4616287409532503, "grad_norm": 3.6875, "learning_rate": 2.6993323730079707e-05, "loss": 1.786684513092041, "step": 1770 }, { "epoch": 0.461889548151529, "grad_norm": 3.84375, "learning_rate": 2.6979648105679523e-05, "loss": 1.6916794776916504, "step": 1771 }, { "epoch": 0.46215035534980764, "grad_norm": 3.859375, "learning_rate": 2.6965968764327425e-05, "loss": 1.5331114530563354, "step": 1772 }, { "epoch": 0.46241116254808634, "grad_norm": 3.765625, "learning_rate": 2.695228571330824e-05, "loss": 1.9602546691894531, "step": 1773 }, { "epoch": 0.462671969746365, "grad_norm": 3.421875, "learning_rate": 2.6938598959908752e-05, "loss": 1.4702377319335938, "step": 1774 }, { "epoch": 0.4629327769446437, "grad_norm": 3.703125, "learning_rate": 2.6924908511417737e-05, "loss": 1.6842867136001587, "step": 1775 }, { "epoch": 0.46319358414292233, "grad_norm": 3.4375, "learning_rate": 2.6911214375125917e-05, "loss": 1.5531668663024902, "step": 1776 }, { "epoch": 0.46345439134120103, "grad_norm": 3.21875, "learning_rate": 2.6897516558325984e-05, "loss": 1.2932814359664917, "step": 1777 }, { "epoch": 0.4637151985394797, "grad_norm": 4.34375, "learning_rate": 2.6883815068312597e-05, "loss": 1.5881843566894531, "step": 1778 }, { "epoch": 0.4639760057377584, "grad_norm": 3.375, "learning_rate": 2.687010991238237e-05, "loss": 1.4304906129837036, "step": 1779 }, { "epoch": 0.464236812936037, "grad_norm": 3.625, "learning_rate": 2.6856401097833863e-05, "loss": 1.5073388814926147, "step": 1780 }, { "epoch": 0.4644976201343157, "grad_norm": 3.796875, "learning_rate": 2.68426886319676e-05, "loss": 1.7035486698150635, "step": 1781 }, { "epoch": 0.46475842733259437, "grad_norm": 3.890625, "learning_rate": 2.6828972522086013e-05, "loss": 1.650022029876709, "step": 1782 }, { "epoch": 0.46501923453087307, "grad_norm": 3.921875, "learning_rate": 2.6815252775493523e-05, "loss": 1.7883479595184326, "step": 1783 }, { "epoch": 0.4652800417291517, "grad_norm": 3.53125, "learning_rate": 2.6801529399496446e-05, "loss": 1.5400499105453491, "step": 1784 }, { "epoch": 0.4655408489274304, "grad_norm": 3.9375, "learning_rate": 2.6787802401403062e-05, "loss": 1.4867982864379883, "step": 1785 }, { "epoch": 0.46580165612570906, "grad_norm": 3.328125, "learning_rate": 2.677407178852356e-05, "loss": 1.2686187028884888, "step": 1786 }, { "epoch": 0.46606246332398776, "grad_norm": 3.671875, "learning_rate": 2.6760337568170056e-05, "loss": 1.719443917274475, "step": 1787 }, { "epoch": 0.4663232705222664, "grad_norm": 3.46875, "learning_rate": 2.6746599747656605e-05, "loss": 1.4701546430587769, "step": 1788 }, { "epoch": 0.4665840777205451, "grad_norm": 3.46875, "learning_rate": 2.6732858334299155e-05, "loss": 1.4711968898773193, "step": 1789 }, { "epoch": 0.46684488491882375, "grad_norm": 3.6875, "learning_rate": 2.6719113335415572e-05, "loss": 1.6861298084259033, "step": 1790 }, { "epoch": 0.46710569211710246, "grad_norm": 3.8125, "learning_rate": 2.670536475832566e-05, "loss": 1.5621323585510254, "step": 1791 }, { "epoch": 0.4673664993153811, "grad_norm": 3.703125, "learning_rate": 2.6691612610351084e-05, "loss": 1.4175682067871094, "step": 1792 }, { "epoch": 0.4676273065136598, "grad_norm": 3.6875, "learning_rate": 2.6677856898815443e-05, "loss": 1.5656529664993286, "step": 1793 }, { "epoch": 0.46788811371193845, "grad_norm": 3.421875, "learning_rate": 2.6664097631044224e-05, "loss": 1.355933427810669, "step": 1794 }, { "epoch": 0.46814892091021715, "grad_norm": 3.390625, "learning_rate": 2.66503348143648e-05, "loss": 1.2955758571624756, "step": 1795 }, { "epoch": 0.4684097281084958, "grad_norm": 4.09375, "learning_rate": 2.663656845610645e-05, "loss": 1.8883273601531982, "step": 1796 }, { "epoch": 0.4686705353067745, "grad_norm": 3.65625, "learning_rate": 2.6622798563600324e-05, "loss": 1.5600733757019043, "step": 1797 }, { "epoch": 0.46893134250505314, "grad_norm": 3.5, "learning_rate": 2.660902514417947e-05, "loss": 1.6002708673477173, "step": 1798 }, { "epoch": 0.4691921497033318, "grad_norm": 3.71875, "learning_rate": 2.659524820517879e-05, "loss": 1.5094125270843506, "step": 1799 }, { "epoch": 0.4694529569016105, "grad_norm": 3.71875, "learning_rate": 2.658146775393509e-05, "loss": 1.6929123401641846, "step": 1800 }, { "epoch": 0.46971376409988913, "grad_norm": 3.609375, "learning_rate": 2.656768379778702e-05, "loss": 1.4931977987289429, "step": 1801 }, { "epoch": 0.46997457129816783, "grad_norm": 3.40625, "learning_rate": 2.6553896344075113e-05, "loss": 1.4910286664962769, "step": 1802 }, { "epoch": 0.4702353784964465, "grad_norm": 3.546875, "learning_rate": 2.6540105400141756e-05, "loss": 1.7399252653121948, "step": 1803 }, { "epoch": 0.4704961856947252, "grad_norm": 3.609375, "learning_rate": 2.652631097333121e-05, "loss": 1.6574151515960693, "step": 1804 }, { "epoch": 0.4707569928930038, "grad_norm": 3.640625, "learning_rate": 2.6512513070989567e-05, "loss": 1.7319456338882446, "step": 1805 }, { "epoch": 0.4710178000912825, "grad_norm": 3.671875, "learning_rate": 2.6498711700464786e-05, "loss": 1.7306711673736572, "step": 1806 }, { "epoch": 0.47127860728956117, "grad_norm": 3.484375, "learning_rate": 2.648490686910666e-05, "loss": 1.4612176418304443, "step": 1807 }, { "epoch": 0.47153941448783987, "grad_norm": 3.5625, "learning_rate": 2.647109858426685e-05, "loss": 1.3843921422958374, "step": 1808 }, { "epoch": 0.4718002216861185, "grad_norm": 3.375, "learning_rate": 2.645728685329883e-05, "loss": 1.324055552482605, "step": 1809 }, { "epoch": 0.4720610288843972, "grad_norm": 3.484375, "learning_rate": 2.6443471683557925e-05, "loss": 1.6804945468902588, "step": 1810 }, { "epoch": 0.47232183608267586, "grad_norm": 3.703125, "learning_rate": 2.6429653082401276e-05, "loss": 1.5219495296478271, "step": 1811 }, { "epoch": 0.47258264328095456, "grad_norm": 3.65625, "learning_rate": 2.6415831057187867e-05, "loss": 1.696804404258728, "step": 1812 }, { "epoch": 0.4728434504792332, "grad_norm": 3.765625, "learning_rate": 2.6402005615278505e-05, "loss": 1.6895540952682495, "step": 1813 }, { "epoch": 0.4731042576775119, "grad_norm": 5.28125, "learning_rate": 2.6388176764035794e-05, "loss": 1.4110807180404663, "step": 1814 }, { "epoch": 0.47336506487579055, "grad_norm": 3.640625, "learning_rate": 2.6374344510824194e-05, "loss": 1.7220664024353027, "step": 1815 }, { "epoch": 0.47362587207406925, "grad_norm": 3.609375, "learning_rate": 2.6360508863009928e-05, "loss": 1.6977365016937256, "step": 1816 }, { "epoch": 0.4738866792723479, "grad_norm": 3.578125, "learning_rate": 2.634666982796107e-05, "loss": 1.7042901515960693, "step": 1817 }, { "epoch": 0.4741474864706266, "grad_norm": 3.84375, "learning_rate": 2.6332827413047475e-05, "loss": 1.6732020378112793, "step": 1818 }, { "epoch": 0.47440829366890525, "grad_norm": 4.0625, "learning_rate": 2.63189816256408e-05, "loss": 1.7227288484573364, "step": 1819 }, { "epoch": 0.47466910086718395, "grad_norm": 3.375, "learning_rate": 2.6305132473114502e-05, "loss": 1.3994221687316895, "step": 1820 }, { "epoch": 0.4749299080654626, "grad_norm": 3.828125, "learning_rate": 2.6291279962843828e-05, "loss": 1.4534467458724976, "step": 1821 }, { "epoch": 0.4751907152637413, "grad_norm": 3.875, "learning_rate": 2.6277424102205817e-05, "loss": 1.467596173286438, "step": 1822 }, { "epoch": 0.47545152246201994, "grad_norm": 3.578125, "learning_rate": 2.626356489857929e-05, "loss": 1.4756677150726318, "step": 1823 }, { "epoch": 0.47571232966029864, "grad_norm": 3.625, "learning_rate": 2.624970235934484e-05, "loss": 1.6284886598587036, "step": 1824 }, { "epoch": 0.4759731368585773, "grad_norm": 3.71875, "learning_rate": 2.6235836491884845e-05, "loss": 1.5227932929992676, "step": 1825 }, { "epoch": 0.476233944056856, "grad_norm": 3.75, "learning_rate": 2.6221967303583463e-05, "loss": 1.5853811502456665, "step": 1826 }, { "epoch": 0.47649475125513463, "grad_norm": 3.59375, "learning_rate": 2.6208094801826603e-05, "loss": 1.5051393508911133, "step": 1827 }, { "epoch": 0.47675555845341333, "grad_norm": 3.484375, "learning_rate": 2.6194218994001956e-05, "loss": 1.3453019857406616, "step": 1828 }, { "epoch": 0.477016365651692, "grad_norm": 3.890625, "learning_rate": 2.618033988749895e-05, "loss": 1.648142695426941, "step": 1829 }, { "epoch": 0.4772771728499707, "grad_norm": 3.8125, "learning_rate": 2.61664574897088e-05, "loss": 1.4317494630813599, "step": 1830 }, { "epoch": 0.4775379800482493, "grad_norm": 3.796875, "learning_rate": 2.6152571808024446e-05, "loss": 1.5820590257644653, "step": 1831 }, { "epoch": 0.477798787246528, "grad_norm": 3.75, "learning_rate": 2.6138682849840602e-05, "loss": 1.859946608543396, "step": 1832 }, { "epoch": 0.47805959444480667, "grad_norm": 3.671875, "learning_rate": 2.6124790622553696e-05, "loss": 1.586352825164795, "step": 1833 }, { "epoch": 0.47832040164308537, "grad_norm": 3.4375, "learning_rate": 2.611089513356193e-05, "loss": 1.6238490343093872, "step": 1834 }, { "epoch": 0.478581208841364, "grad_norm": 3.734375, "learning_rate": 2.6096996390265226e-05, "loss": 1.5462846755981445, "step": 1835 }, { "epoch": 0.4788420160396427, "grad_norm": 3.6875, "learning_rate": 2.6083094400065236e-05, "loss": 1.3191076517105103, "step": 1836 }, { "epoch": 0.47910282323792136, "grad_norm": 3.734375, "learning_rate": 2.6069189170365354e-05, "loss": 1.6824617385864258, "step": 1837 }, { "epoch": 0.47936363043620006, "grad_norm": 3.59375, "learning_rate": 2.6055280708570667e-05, "loss": 1.3834890127182007, "step": 1838 }, { "epoch": 0.4796244376344787, "grad_norm": 3.515625, "learning_rate": 2.6041369022088044e-05, "loss": 1.6102575063705444, "step": 1839 }, { "epoch": 0.4798852448327574, "grad_norm": 3.53125, "learning_rate": 2.602745411832601e-05, "loss": 1.3793972730636597, "step": 1840 }, { "epoch": 0.48014605203103605, "grad_norm": 3.890625, "learning_rate": 2.601353600469483e-05, "loss": 1.7498921155929565, "step": 1841 }, { "epoch": 0.48040685922931475, "grad_norm": 3.71875, "learning_rate": 2.5999614688606482e-05, "loss": 1.7192661762237549, "step": 1842 }, { "epoch": 0.4806676664275934, "grad_norm": 3.4375, "learning_rate": 2.5985690177474646e-05, "loss": 1.7017711400985718, "step": 1843 }, { "epoch": 0.4809284736258721, "grad_norm": 3.515625, "learning_rate": 2.59717624787147e-05, "loss": 1.6084730625152588, "step": 1844 }, { "epoch": 0.48118928082415074, "grad_norm": 3.59375, "learning_rate": 2.5957831599743713e-05, "loss": 1.6439201831817627, "step": 1845 }, { "epoch": 0.48145008802242945, "grad_norm": 3.703125, "learning_rate": 2.594389754798046e-05, "loss": 1.3723888397216797, "step": 1846 }, { "epoch": 0.4817108952207081, "grad_norm": 3.640625, "learning_rate": 2.5929960330845402e-05, "loss": 1.4593439102172852, "step": 1847 }, { "epoch": 0.4819717024189868, "grad_norm": 3.734375, "learning_rate": 2.5916019955760687e-05, "loss": 1.5289146900177002, "step": 1848 }, { "epoch": 0.48223250961726544, "grad_norm": 3.78125, "learning_rate": 2.5902076430150143e-05, "loss": 1.5675907135009766, "step": 1849 }, { "epoch": 0.4824933168155441, "grad_norm": 3.671875, "learning_rate": 2.5888129761439268e-05, "loss": 1.5333256721496582, "step": 1850 }, { "epoch": 0.4827541240138228, "grad_norm": 4.09375, "learning_rate": 2.587417995705525e-05, "loss": 1.9039056301116943, "step": 1851 }, { "epoch": 0.48301493121210143, "grad_norm": 3.4375, "learning_rate": 2.586022702442693e-05, "loss": 1.4152421951293945, "step": 1852 }, { "epoch": 0.48327573841038013, "grad_norm": 3.4375, "learning_rate": 2.5846270970984826e-05, "loss": 1.4902153015136719, "step": 1853 }, { "epoch": 0.4835365456086588, "grad_norm": 3.6875, "learning_rate": 2.5832311804161127e-05, "loss": 1.768594741821289, "step": 1854 }, { "epoch": 0.4837973528069375, "grad_norm": 3.671875, "learning_rate": 2.581834953138964e-05, "loss": 1.5538244247436523, "step": 1855 }, { "epoch": 0.4840581600052161, "grad_norm": 3.84375, "learning_rate": 2.580438416010588e-05, "loss": 1.8284088373184204, "step": 1856 }, { "epoch": 0.4843189672034948, "grad_norm": 3.5, "learning_rate": 2.5790415697746976e-05, "loss": 1.479146122932434, "step": 1857 }, { "epoch": 0.48457977440177347, "grad_norm": 3.78125, "learning_rate": 2.5776444151751712e-05, "loss": 1.6835163831710815, "step": 1858 }, { "epoch": 0.48484058160005217, "grad_norm": 3.671875, "learning_rate": 2.5762469529560514e-05, "loss": 1.6849534511566162, "step": 1859 }, { "epoch": 0.4851013887983308, "grad_norm": 3.640625, "learning_rate": 2.5748491838615457e-05, "loss": 1.5472744703292847, "step": 1860 }, { "epoch": 0.4853621959966095, "grad_norm": 3.609375, "learning_rate": 2.5734511086360236e-05, "loss": 1.6519335508346558, "step": 1861 }, { "epoch": 0.48562300319488816, "grad_norm": 3.53125, "learning_rate": 2.5720527280240172e-05, "loss": 1.5347322225570679, "step": 1862 }, { "epoch": 0.48588381039316686, "grad_norm": 3.515625, "learning_rate": 2.5706540427702234e-05, "loss": 1.4124343395233154, "step": 1863 }, { "epoch": 0.4861446175914455, "grad_norm": 3.765625, "learning_rate": 2.5692550536194988e-05, "loss": 1.6010658740997314, "step": 1864 }, { "epoch": 0.4864054247897242, "grad_norm": 3.703125, "learning_rate": 2.5678557613168645e-05, "loss": 1.6980068683624268, "step": 1865 }, { "epoch": 0.48666623198800285, "grad_norm": 3.796875, "learning_rate": 2.566456166607501e-05, "loss": 1.8062810897827148, "step": 1866 }, { "epoch": 0.48692703918628155, "grad_norm": 4.03125, "learning_rate": 2.56505627023675e-05, "loss": 1.6686768531799316, "step": 1867 }, { "epoch": 0.4871878463845602, "grad_norm": 3.96875, "learning_rate": 2.5636560729501154e-05, "loss": 1.660166621208191, "step": 1868 }, { "epoch": 0.4874486535828389, "grad_norm": 3.625, "learning_rate": 2.56225557549326e-05, "loss": 1.8616141080856323, "step": 1869 }, { "epoch": 0.48770946078111754, "grad_norm": 4.0, "learning_rate": 2.5608547786120056e-05, "loss": 1.7120239734649658, "step": 1870 }, { "epoch": 0.48797026797939624, "grad_norm": 3.578125, "learning_rate": 2.5594536830523362e-05, "loss": 1.5653290748596191, "step": 1871 }, { "epoch": 0.4882310751776749, "grad_norm": 4.15625, "learning_rate": 2.5580522895603917e-05, "loss": 1.5002496242523193, "step": 1872 }, { "epoch": 0.4884918823759536, "grad_norm": 3.75, "learning_rate": 2.5566505988824738e-05, "loss": 1.466421127319336, "step": 1873 }, { "epoch": 0.48875268957423224, "grad_norm": 4.0, "learning_rate": 2.5552486117650398e-05, "loss": 1.4143470525741577, "step": 1874 }, { "epoch": 0.48901349677251094, "grad_norm": 3.84375, "learning_rate": 2.5538463289547068e-05, "loss": 1.8513165712356567, "step": 1875 }, { "epoch": 0.4892743039707896, "grad_norm": 3.5625, "learning_rate": 2.5524437511982472e-05, "loss": 1.4836419820785522, "step": 1876 }, { "epoch": 0.4895351111690683, "grad_norm": 3.5625, "learning_rate": 2.551040879242593e-05, "loss": 1.5473915338516235, "step": 1877 }, { "epoch": 0.4897959183673469, "grad_norm": 3.5625, "learning_rate": 2.5496377138348313e-05, "loss": 1.438124418258667, "step": 1878 }, { "epoch": 0.49005672556562563, "grad_norm": 3.625, "learning_rate": 2.5482342557222057e-05, "loss": 1.5178227424621582, "step": 1879 }, { "epoch": 0.4903175327639043, "grad_norm": 3.6875, "learning_rate": 2.546830505652116e-05, "loss": 1.6444778442382812, "step": 1880 }, { "epoch": 0.490578339962183, "grad_norm": 3.5625, "learning_rate": 2.545426464372117e-05, "loss": 1.6002541780471802, "step": 1881 }, { "epoch": 0.4908391471604616, "grad_norm": 3.546875, "learning_rate": 2.5440221326299187e-05, "loss": 1.7041120529174805, "step": 1882 }, { "epoch": 0.4910999543587403, "grad_norm": 3.578125, "learning_rate": 2.5426175111733873e-05, "loss": 1.511730432510376, "step": 1883 }, { "epoch": 0.49136076155701897, "grad_norm": 3.75, "learning_rate": 2.5412126007505404e-05, "loss": 1.7230110168457031, "step": 1884 }, { "epoch": 0.49162156875529767, "grad_norm": 3.703125, "learning_rate": 2.5398074021095517e-05, "loss": 1.4343509674072266, "step": 1885 }, { "epoch": 0.4918823759535763, "grad_norm": 3.5625, "learning_rate": 2.5384019159987473e-05, "loss": 1.3298263549804688, "step": 1886 }, { "epoch": 0.492143183151855, "grad_norm": 3.78125, "learning_rate": 2.536996143166608e-05, "loss": 1.591562271118164, "step": 1887 }, { "epoch": 0.49240399035013366, "grad_norm": 3.9375, "learning_rate": 2.535590084361764e-05, "loss": 1.6372700929641724, "step": 1888 }, { "epoch": 0.49266479754841236, "grad_norm": 4.28125, "learning_rate": 2.5341837403330015e-05, "loss": 1.6347156763076782, "step": 1889 }, { "epoch": 0.492925604746691, "grad_norm": 3.5625, "learning_rate": 2.5327771118292575e-05, "loss": 1.5804529190063477, "step": 1890 }, { "epoch": 0.4931864119449697, "grad_norm": 3.890625, "learning_rate": 2.5313701995996177e-05, "loss": 1.7619483470916748, "step": 1891 }, { "epoch": 0.49344721914324835, "grad_norm": 3.453125, "learning_rate": 2.529963004393324e-05, "loss": 1.387556791305542, "step": 1892 }, { "epoch": 0.49370802634152705, "grad_norm": 3.78125, "learning_rate": 2.5285555269597635e-05, "loss": 1.744908094406128, "step": 1893 }, { "epoch": 0.4939688335398057, "grad_norm": 3.8125, "learning_rate": 2.5271477680484776e-05, "loss": 1.6964850425720215, "step": 1894 }, { "epoch": 0.4942296407380844, "grad_norm": 3.703125, "learning_rate": 2.5257397284091572e-05, "loss": 1.4730907678604126, "step": 1895 }, { "epoch": 0.49449044793636304, "grad_norm": 4.375, "learning_rate": 2.52433140879164e-05, "loss": 1.6733832359313965, "step": 1896 }, { "epoch": 0.49475125513464174, "grad_norm": 3.453125, "learning_rate": 2.5229228099459153e-05, "loss": 1.5286256074905396, "step": 1897 }, { "epoch": 0.4950120623329204, "grad_norm": 3.765625, "learning_rate": 2.5215139326221206e-05, "loss": 1.682433843612671, "step": 1898 }, { "epoch": 0.49527286953119903, "grad_norm": 3.765625, "learning_rate": 2.5201047775705414e-05, "loss": 1.7269163131713867, "step": 1899 }, { "epoch": 0.49553367672947773, "grad_norm": 4.375, "learning_rate": 2.5186953455416106e-05, "loss": 1.81180739402771, "step": 1900 }, { "epoch": 0.4957944839277564, "grad_norm": 3.875, "learning_rate": 2.51728563728591e-05, "loss": 1.527209758758545, "step": 1901 }, { "epoch": 0.4960552911260351, "grad_norm": 3.828125, "learning_rate": 2.515875653554167e-05, "loss": 1.8661195039749146, "step": 1902 }, { "epoch": 0.4963160983243137, "grad_norm": 3.421875, "learning_rate": 2.5144653950972565e-05, "loss": 1.3593074083328247, "step": 1903 }, { "epoch": 0.4965769055225924, "grad_norm": 3.765625, "learning_rate": 2.5130548626662002e-05, "loss": 1.5782065391540527, "step": 1904 }, { "epoch": 0.49683771272087107, "grad_norm": 3.484375, "learning_rate": 2.511644057012164e-05, "loss": 1.5576541423797607, "step": 1905 }, { "epoch": 0.4970985199191498, "grad_norm": 3.4375, "learning_rate": 2.510232978886461e-05, "loss": 1.5748167037963867, "step": 1906 }, { "epoch": 0.4973593271174284, "grad_norm": 3.453125, "learning_rate": 2.5088216290405495e-05, "loss": 1.518371820449829, "step": 1907 }, { "epoch": 0.4976201343157071, "grad_norm": 3.5625, "learning_rate": 2.5074100082260304e-05, "loss": 1.7400927543640137, "step": 1908 }, { "epoch": 0.49788094151398576, "grad_norm": 3.59375, "learning_rate": 2.5059981171946515e-05, "loss": 1.4117454290390015, "step": 1909 }, { "epoch": 0.49814174871226446, "grad_norm": 3.578125, "learning_rate": 2.5045859566983016e-05, "loss": 1.3402498960494995, "step": 1910 }, { "epoch": 0.4984025559105431, "grad_norm": 4.0, "learning_rate": 2.5031735274890176e-05, "loss": 1.872905969619751, "step": 1911 }, { "epoch": 0.4986633631088218, "grad_norm": 3.9375, "learning_rate": 2.501760830318974e-05, "loss": 1.6585395336151123, "step": 1912 }, { "epoch": 0.49892417030710046, "grad_norm": 3.78125, "learning_rate": 2.5003478659404906e-05, "loss": 1.6110572814941406, "step": 1913 }, { "epoch": 0.49918497750537916, "grad_norm": 3.609375, "learning_rate": 2.4989346351060314e-05, "loss": 1.711033582687378, "step": 1914 }, { "epoch": 0.4994457847036578, "grad_norm": 3.640625, "learning_rate": 2.4975211385681986e-05, "loss": 1.8739043474197388, "step": 1915 }, { "epoch": 0.4997065919019365, "grad_norm": 3.65625, "learning_rate": 2.4961073770797394e-05, "loss": 1.5758452415466309, "step": 1916 }, { "epoch": 0.49996739910021515, "grad_norm": 3.390625, "learning_rate": 2.494693351393539e-05, "loss": 1.5570107698440552, "step": 1917 }, { "epoch": 0.5002282062984938, "grad_norm": 3.5625, "learning_rate": 2.4932790622626247e-05, "loss": 1.5484333038330078, "step": 1918 }, { "epoch": 0.5004890134967726, "grad_norm": 3.59375, "learning_rate": 2.4918645104401648e-05, "loss": 1.6143624782562256, "step": 1919 }, { "epoch": 0.5007498206950511, "grad_norm": 3.640625, "learning_rate": 2.4904496966794662e-05, "loss": 1.6884369850158691, "step": 1920 }, { "epoch": 0.5010106278933298, "grad_norm": 3.71875, "learning_rate": 2.4890346217339768e-05, "loss": 1.5124157667160034, "step": 1921 }, { "epoch": 0.5012714350916085, "grad_norm": 3.828125, "learning_rate": 2.4876192863572816e-05, "loss": 1.6060041189193726, "step": 1922 }, { "epoch": 0.5015322422898872, "grad_norm": 3.828125, "learning_rate": 2.4862036913031053e-05, "loss": 1.4871320724487305, "step": 1923 }, { "epoch": 0.5017930494881658, "grad_norm": 3.5625, "learning_rate": 2.4847878373253118e-05, "loss": 1.3923563957214355, "step": 1924 }, { "epoch": 0.5020538566864445, "grad_norm": 3.578125, "learning_rate": 2.4833717251779014e-05, "loss": 1.5572978258132935, "step": 1925 }, { "epoch": 0.5023146638847232, "grad_norm": 3.53125, "learning_rate": 2.4819553556150134e-05, "loss": 1.6809754371643066, "step": 1926 }, { "epoch": 0.5025754710830019, "grad_norm": 4.03125, "learning_rate": 2.4805387293909214e-05, "loss": 1.8645470142364502, "step": 1927 }, { "epoch": 0.5028362782812805, "grad_norm": 3.859375, "learning_rate": 2.4791218472600396e-05, "loss": 1.772527813911438, "step": 1928 }, { "epoch": 0.5030970854795592, "grad_norm": 3.828125, "learning_rate": 2.4777047099769157e-05, "loss": 1.609299898147583, "step": 1929 }, { "epoch": 0.5033578926778379, "grad_norm": 3.625, "learning_rate": 2.4762873182962338e-05, "loss": 1.748628854751587, "step": 1930 }, { "epoch": 0.5036186998761166, "grad_norm": 3.5625, "learning_rate": 2.4748696729728135e-05, "loss": 1.6891801357269287, "step": 1931 }, { "epoch": 0.5038795070743952, "grad_norm": 3.984375, "learning_rate": 2.4734517747616106e-05, "loss": 1.5500144958496094, "step": 1932 }, { "epoch": 0.5041403142726739, "grad_norm": 3.328125, "learning_rate": 2.472033624417715e-05, "loss": 1.177302598953247, "step": 1933 }, { "epoch": 0.5044011214709526, "grad_norm": 3.765625, "learning_rate": 2.4706152226963484e-05, "loss": 1.510635495185852, "step": 1934 }, { "epoch": 0.5046619286692313, "grad_norm": 3.875, "learning_rate": 2.46919657035287e-05, "loss": 1.5966476202011108, "step": 1935 }, { "epoch": 0.5049227358675099, "grad_norm": 3.625, "learning_rate": 2.467777668142771e-05, "loss": 1.3704627752304077, "step": 1936 }, { "epoch": 0.5051835430657886, "grad_norm": 3.640625, "learning_rate": 2.466358516821675e-05, "loss": 1.6136150360107422, "step": 1937 }, { "epoch": 0.5054443502640673, "grad_norm": 3.671875, "learning_rate": 2.46493911714534e-05, "loss": 1.286433219909668, "step": 1938 }, { "epoch": 0.505705157462346, "grad_norm": 3.640625, "learning_rate": 2.4635194698696544e-05, "loss": 1.5212979316711426, "step": 1939 }, { "epoch": 0.5059659646606246, "grad_norm": 3.484375, "learning_rate": 2.4620995757506393e-05, "loss": 1.5038641691207886, "step": 1940 }, { "epoch": 0.5062267718589033, "grad_norm": 3.546875, "learning_rate": 2.4606794355444467e-05, "loss": 1.5387482643127441, "step": 1941 }, { "epoch": 0.506487579057182, "grad_norm": 3.671875, "learning_rate": 2.4592590500073607e-05, "loss": 1.5222899913787842, "step": 1942 }, { "epoch": 0.5067483862554607, "grad_norm": 3.75, "learning_rate": 2.4578384198957957e-05, "loss": 1.7417032718658447, "step": 1943 }, { "epoch": 0.5070091934537393, "grad_norm": 3.265625, "learning_rate": 2.456417545966295e-05, "loss": 1.7328388690948486, "step": 1944 }, { "epoch": 0.507270000652018, "grad_norm": 3.359375, "learning_rate": 2.4549964289755347e-05, "loss": 1.432841181755066, "step": 1945 }, { "epoch": 0.5075308078502967, "grad_norm": 4.125, "learning_rate": 2.453575069680317e-05, "loss": 1.5138942003250122, "step": 1946 }, { "epoch": 0.5077916150485754, "grad_norm": 3.703125, "learning_rate": 2.4521534688375747e-05, "loss": 1.4251571893692017, "step": 1947 }, { "epoch": 0.508052422246854, "grad_norm": 3.828125, "learning_rate": 2.4507316272043705e-05, "loss": 1.2827256917953491, "step": 1948 }, { "epoch": 0.5083132294451327, "grad_norm": 3.703125, "learning_rate": 2.449309545537892e-05, "loss": 1.6564579010009766, "step": 1949 }, { "epoch": 0.5085740366434114, "grad_norm": 3.78125, "learning_rate": 2.447887224595458e-05, "loss": 1.5971944332122803, "step": 1950 }, { "epoch": 0.50883484384169, "grad_norm": 3.640625, "learning_rate": 2.4464646651345133e-05, "loss": 1.6550348997116089, "step": 1951 }, { "epoch": 0.5090956510399687, "grad_norm": 3.4375, "learning_rate": 2.445041867912629e-05, "loss": 1.5908136367797852, "step": 1952 }, { "epoch": 0.5093564582382474, "grad_norm": 3.5625, "learning_rate": 2.4436188336875044e-05, "loss": 1.4241156578063965, "step": 1953 }, { "epoch": 0.5096172654365261, "grad_norm": 3.25, "learning_rate": 2.4421955632169638e-05, "loss": 1.4336529970169067, "step": 1954 }, { "epoch": 0.5098780726348047, "grad_norm": 3.78125, "learning_rate": 2.440772057258958e-05, "loss": 1.6574651002883911, "step": 1955 }, { "epoch": 0.5101388798330834, "grad_norm": 3.84375, "learning_rate": 2.439348316571563e-05, "loss": 1.8338189125061035, "step": 1956 }, { "epoch": 0.5103996870313621, "grad_norm": 3.671875, "learning_rate": 2.4379243419129788e-05, "loss": 1.7848278284072876, "step": 1957 }, { "epoch": 0.5106604942296408, "grad_norm": 3.5, "learning_rate": 2.436500134041532e-05, "loss": 1.265929102897644, "step": 1958 }, { "epoch": 0.5109213014279194, "grad_norm": 3.703125, "learning_rate": 2.4350756937156718e-05, "loss": 1.5478613376617432, "step": 1959 }, { "epoch": 0.5111821086261981, "grad_norm": 3.984375, "learning_rate": 2.433651021693972e-05, "loss": 1.741408348083496, "step": 1960 }, { "epoch": 0.5114429158244768, "grad_norm": 3.46875, "learning_rate": 2.4322261187351287e-05, "loss": 1.689663290977478, "step": 1961 }, { "epoch": 0.5117037230227555, "grad_norm": 3.546875, "learning_rate": 2.430800985597963e-05, "loss": 1.541811466217041, "step": 1962 }, { "epoch": 0.511964530221034, "grad_norm": 3.359375, "learning_rate": 2.429375623041417e-05, "loss": 1.5056332349777222, "step": 1963 }, { "epoch": 0.5122253374193128, "grad_norm": 3.625, "learning_rate": 2.4279500318245542e-05, "loss": 1.6486773490905762, "step": 1964 }, { "epoch": 0.5124861446175915, "grad_norm": 3.734375, "learning_rate": 2.426524212706563e-05, "loss": 1.5406248569488525, "step": 1965 }, { "epoch": 0.5127469518158702, "grad_norm": 3.40625, "learning_rate": 2.4250981664467488e-05, "loss": 1.1409528255462646, "step": 1966 }, { "epoch": 0.5130077590141487, "grad_norm": 3.890625, "learning_rate": 2.423671893804543e-05, "loss": 1.627926230430603, "step": 1967 }, { "epoch": 0.5132685662124274, "grad_norm": 3.515625, "learning_rate": 2.4222453955394928e-05, "loss": 1.7604104280471802, "step": 1968 }, { "epoch": 0.5135293734107061, "grad_norm": 3.5625, "learning_rate": 2.4208186724112683e-05, "loss": 1.5717003345489502, "step": 1969 }, { "epoch": 0.5137901806089848, "grad_norm": 3.375, "learning_rate": 2.419391725179659e-05, "loss": 1.3920912742614746, "step": 1970 }, { "epoch": 0.5140509878072634, "grad_norm": 3.5, "learning_rate": 2.417964554604573e-05, "loss": 1.8886692523956299, "step": 1971 }, { "epoch": 0.5143117950055421, "grad_norm": 3.265625, "learning_rate": 2.4165371614460388e-05, "loss": 1.5830036401748657, "step": 1972 }, { "epoch": 0.5145726022038208, "grad_norm": 3.84375, "learning_rate": 2.415109546464201e-05, "loss": 1.675565481185913, "step": 1973 }, { "epoch": 0.5148334094020995, "grad_norm": 3.515625, "learning_rate": 2.4136817104193244e-05, "loss": 1.5147334337234497, "step": 1974 }, { "epoch": 0.5150942166003781, "grad_norm": 3.3125, "learning_rate": 2.412253654071791e-05, "loss": 1.4753445386886597, "step": 1975 }, { "epoch": 0.5153550237986568, "grad_norm": 3.640625, "learning_rate": 2.4108253781820998e-05, "loss": 1.3679883480072021, "step": 1976 }, { "epoch": 0.5156158309969355, "grad_norm": 3.546875, "learning_rate": 2.4093968835108674e-05, "loss": 1.4993984699249268, "step": 1977 }, { "epoch": 0.5158766381952142, "grad_norm": 3.421875, "learning_rate": 2.407968170818825e-05, "loss": 1.2360178232192993, "step": 1978 }, { "epoch": 0.5161374453934928, "grad_norm": 3.6875, "learning_rate": 2.406539240866823e-05, "loss": 1.4463670253753662, "step": 1979 }, { "epoch": 0.5163982525917715, "grad_norm": 3.546875, "learning_rate": 2.4051100944158252e-05, "loss": 1.7772748470306396, "step": 1980 }, { "epoch": 0.5166590597900502, "grad_norm": 3.46875, "learning_rate": 2.403680732226911e-05, "loss": 1.677770733833313, "step": 1981 }, { "epoch": 0.5169198669883289, "grad_norm": 3.671875, "learning_rate": 2.4022511550612757e-05, "loss": 1.669695258140564, "step": 1982 }, { "epoch": 0.5171806741866075, "grad_norm": 3.46875, "learning_rate": 2.400821363680227e-05, "loss": 1.4655897617340088, "step": 1983 }, { "epoch": 0.5174414813848862, "grad_norm": 3.546875, "learning_rate": 2.3993913588451898e-05, "loss": 1.4056119918823242, "step": 1984 }, { "epoch": 0.5177022885831649, "grad_norm": 3.40625, "learning_rate": 2.3979611413177003e-05, "loss": 1.5410664081573486, "step": 1985 }, { "epoch": 0.5179630957814436, "grad_norm": 3.5625, "learning_rate": 2.3965307118594077e-05, "loss": 1.8617842197418213, "step": 1986 }, { "epoch": 0.5182239029797222, "grad_norm": 3.484375, "learning_rate": 2.3951000712320768e-05, "loss": 1.5264947414398193, "step": 1987 }, { "epoch": 0.5184847101780009, "grad_norm": 3.625, "learning_rate": 2.393669220197581e-05, "loss": 1.6375635862350464, "step": 1988 }, { "epoch": 0.5187455173762796, "grad_norm": 3.578125, "learning_rate": 2.3922381595179096e-05, "loss": 1.56282377243042, "step": 1989 }, { "epoch": 0.5190063245745583, "grad_norm": 3.625, "learning_rate": 2.3908068899551604e-05, "loss": 1.568946123123169, "step": 1990 }, { "epoch": 0.5192671317728369, "grad_norm": 3.203125, "learning_rate": 2.3893754122715446e-05, "loss": 1.2424476146697998, "step": 1991 }, { "epoch": 0.5195279389711156, "grad_norm": 3.640625, "learning_rate": 2.3879437272293827e-05, "loss": 1.5834656953811646, "step": 1992 }, { "epoch": 0.5197887461693943, "grad_norm": 3.28125, "learning_rate": 2.3865118355911066e-05, "loss": 1.5506184101104736, "step": 1993 }, { "epoch": 0.520049553367673, "grad_norm": 3.703125, "learning_rate": 2.3850797381192586e-05, "loss": 1.4371000528335571, "step": 1994 }, { "epoch": 0.5203103605659516, "grad_norm": 3.828125, "learning_rate": 2.3836474355764887e-05, "loss": 1.6684209108352661, "step": 1995 }, { "epoch": 0.5205711677642303, "grad_norm": 3.671875, "learning_rate": 2.382214928725559e-05, "loss": 1.3157470226287842, "step": 1996 }, { "epoch": 0.520831974962509, "grad_norm": 3.609375, "learning_rate": 2.380782218329337e-05, "loss": 1.7018301486968994, "step": 1997 }, { "epoch": 0.5210927821607877, "grad_norm": 3.265625, "learning_rate": 2.3793493051508012e-05, "loss": 1.5252659320831299, "step": 1998 }, { "epoch": 0.5213535893590663, "grad_norm": 3.59375, "learning_rate": 2.3779161899530383e-05, "loss": 1.513904094696045, "step": 1999 }, { "epoch": 0.521614396557345, "grad_norm": 3.703125, "learning_rate": 2.3764828734992392e-05, "loss": 1.476542592048645, "step": 2000 }, { "epoch": 0.5218752037556237, "grad_norm": 3.328125, "learning_rate": 2.3750493565527063e-05, "loss": 1.3485337495803833, "step": 2001 }, { "epoch": 0.5221360109539023, "grad_norm": 3.53125, "learning_rate": 2.373615639876846e-05, "loss": 1.6200525760650635, "step": 2002 }, { "epoch": 0.522396818152181, "grad_norm": 3.578125, "learning_rate": 2.372181724235172e-05, "loss": 1.9040131568908691, "step": 2003 }, { "epoch": 0.5226576253504597, "grad_norm": 3.90625, "learning_rate": 2.3707476103913037e-05, "loss": 1.5234922170639038, "step": 2004 }, { "epoch": 0.5229184325487384, "grad_norm": 3.4375, "learning_rate": 2.3693132991089663e-05, "loss": 1.3111425638198853, "step": 2005 }, { "epoch": 0.523179239747017, "grad_norm": 3.40625, "learning_rate": 2.36787879115199e-05, "loss": 1.6711585521697998, "step": 2006 }, { "epoch": 0.5234400469452957, "grad_norm": 3.484375, "learning_rate": 2.3664440872843098e-05, "loss": 1.6181418895721436, "step": 2007 }, { "epoch": 0.5237008541435744, "grad_norm": 3.84375, "learning_rate": 2.365009188269965e-05, "loss": 1.4819923639297485, "step": 2008 }, { "epoch": 0.5239616613418531, "grad_norm": 3.453125, "learning_rate": 2.363574094873098e-05, "loss": 1.5990458726882935, "step": 2009 }, { "epoch": 0.5242224685401317, "grad_norm": 4.53125, "learning_rate": 2.3621388078579566e-05, "loss": 1.9280414581298828, "step": 2010 }, { "epoch": 0.5244832757384104, "grad_norm": 3.453125, "learning_rate": 2.3607033279888905e-05, "loss": 1.5945062637329102, "step": 2011 }, { "epoch": 0.5247440829366891, "grad_norm": 3.21875, "learning_rate": 2.3592676560303512e-05, "loss": 1.4565190076828003, "step": 2012 }, { "epoch": 0.5250048901349678, "grad_norm": 3.359375, "learning_rate": 2.357831792746895e-05, "loss": 1.3984203338623047, "step": 2013 }, { "epoch": 0.5252656973332464, "grad_norm": 3.421875, "learning_rate": 2.356395738903177e-05, "loss": 1.573155403137207, "step": 2014 }, { "epoch": 0.525526504531525, "grad_norm": 3.53125, "learning_rate": 2.354959495263957e-05, "loss": 1.536868929862976, "step": 2015 }, { "epoch": 0.5257873117298038, "grad_norm": 3.453125, "learning_rate": 2.3535230625940936e-05, "loss": 1.5260744094848633, "step": 2016 }, { "epoch": 0.5260481189280825, "grad_norm": 3.46875, "learning_rate": 2.352086441658546e-05, "loss": 1.3998295068740845, "step": 2017 }, { "epoch": 0.526308926126361, "grad_norm": 3.65625, "learning_rate": 2.350649633222376e-05, "loss": 1.337104320526123, "step": 2018 }, { "epoch": 0.5265697333246397, "grad_norm": 3.546875, "learning_rate": 2.349212638050742e-05, "loss": 1.6615405082702637, "step": 2019 }, { "epoch": 0.5268305405229184, "grad_norm": 3.71875, "learning_rate": 2.347775456908904e-05, "loss": 1.4596977233886719, "step": 2020 }, { "epoch": 0.5270913477211971, "grad_norm": 3.421875, "learning_rate": 2.3463380905622214e-05, "loss": 1.543012022972107, "step": 2021 }, { "epoch": 0.5273521549194757, "grad_norm": 3.890625, "learning_rate": 2.34490053977615e-05, "loss": 1.8023302555084229, "step": 2022 }, { "epoch": 0.5276129621177544, "grad_norm": 3.53125, "learning_rate": 2.3434628053162465e-05, "loss": 1.5883944034576416, "step": 2023 }, { "epoch": 0.5278737693160331, "grad_norm": 3.6875, "learning_rate": 2.3420248879481632e-05, "loss": 1.5604071617126465, "step": 2024 }, { "epoch": 0.5281345765143118, "grad_norm": 3.625, "learning_rate": 2.3405867884376504e-05, "loss": 1.4639250040054321, "step": 2025 }, { "epoch": 0.5283953837125904, "grad_norm": 3.546875, "learning_rate": 2.3391485075505567e-05, "loss": 1.5720698833465576, "step": 2026 }, { "epoch": 0.5286561909108691, "grad_norm": 3.34375, "learning_rate": 2.3377100460528256e-05, "loss": 1.2630044221878052, "step": 2027 }, { "epoch": 0.5289169981091478, "grad_norm": 3.546875, "learning_rate": 2.3362714047104987e-05, "loss": 1.364597201347351, "step": 2028 }, { "epoch": 0.5291778053074265, "grad_norm": 3.6875, "learning_rate": 2.3348325842897102e-05, "loss": 1.5645570755004883, "step": 2029 }, { "epoch": 0.5294386125057051, "grad_norm": 3.5, "learning_rate": 2.3333935855566922e-05, "loss": 1.4444186687469482, "step": 2030 }, { "epoch": 0.5296994197039838, "grad_norm": 3.484375, "learning_rate": 2.331954409277772e-05, "loss": 1.4813719987869263, "step": 2031 }, { "epoch": 0.5299602269022625, "grad_norm": 3.375, "learning_rate": 2.33051505621937e-05, "loss": 1.4920804500579834, "step": 2032 }, { "epoch": 0.5302210341005412, "grad_norm": 3.59375, "learning_rate": 2.329075527148002e-05, "loss": 1.4645588397979736, "step": 2033 }, { "epoch": 0.5304818412988198, "grad_norm": 3.484375, "learning_rate": 2.3276358228302757e-05, "loss": 1.6457539796829224, "step": 2034 }, { "epoch": 0.5307426484970985, "grad_norm": 3.375, "learning_rate": 2.326195944032894e-05, "loss": 1.6536197662353516, "step": 2035 }, { "epoch": 0.5310034556953772, "grad_norm": 3.59375, "learning_rate": 2.3247558915226526e-05, "loss": 1.5841575860977173, "step": 2036 }, { "epoch": 0.5312642628936559, "grad_norm": 3.609375, "learning_rate": 2.3233156660664384e-05, "loss": 1.562570333480835, "step": 2037 }, { "epoch": 0.5315250700919345, "grad_norm": 3.484375, "learning_rate": 2.3218752684312308e-05, "loss": 1.4996845722198486, "step": 2038 }, { "epoch": 0.5317858772902132, "grad_norm": 3.75, "learning_rate": 2.320434699384102e-05, "loss": 1.6510419845581055, "step": 2039 }, { "epoch": 0.5320466844884919, "grad_norm": 3.578125, "learning_rate": 2.3189939596922156e-05, "loss": 1.5520703792572021, "step": 2040 }, { "epoch": 0.5323074916867706, "grad_norm": 3.46875, "learning_rate": 2.3175530501228227e-05, "loss": 1.5717568397521973, "step": 2041 }, { "epoch": 0.5325682988850492, "grad_norm": 3.984375, "learning_rate": 2.3161119714432693e-05, "loss": 1.6424689292907715, "step": 2042 }, { "epoch": 0.5328291060833279, "grad_norm": 3.53125, "learning_rate": 2.314670724420989e-05, "loss": 1.5468082427978516, "step": 2043 }, { "epoch": 0.5330899132816066, "grad_norm": 3.515625, "learning_rate": 2.3132293098235056e-05, "loss": 1.6908055543899536, "step": 2044 }, { "epoch": 0.5333507204798853, "grad_norm": 3.515625, "learning_rate": 2.3117877284184322e-05, "loss": 1.4291343688964844, "step": 2045 }, { "epoch": 0.5336115276781639, "grad_norm": 3.46875, "learning_rate": 2.3103459809734706e-05, "loss": 1.5319087505340576, "step": 2046 }, { "epoch": 0.5338723348764426, "grad_norm": 3.921875, "learning_rate": 2.3089040682564104e-05, "loss": 1.5633468627929688, "step": 2047 }, { "epoch": 0.5341331420747213, "grad_norm": 3.890625, "learning_rate": 2.307461991035131e-05, "loss": 1.7004632949829102, "step": 2048 }, { "epoch": 0.5343939492729999, "grad_norm": 3.34375, "learning_rate": 2.3060197500775977e-05, "loss": 1.5623658895492554, "step": 2049 }, { "epoch": 0.5346547564712786, "grad_norm": 3.65625, "learning_rate": 2.304577346151864e-05, "loss": 1.5550426244735718, "step": 2050 }, { "epoch": 0.5349155636695573, "grad_norm": 3.5, "learning_rate": 2.303134780026069e-05, "loss": 1.608680248260498, "step": 2051 }, { "epoch": 0.535176370867836, "grad_norm": 3.984375, "learning_rate": 2.3016920524684396e-05, "loss": 1.9391027688980103, "step": 2052 }, { "epoch": 0.5354371780661146, "grad_norm": 3.8125, "learning_rate": 2.300249164247288e-05, "loss": 1.6023705005645752, "step": 2053 }, { "epoch": 0.5356979852643933, "grad_norm": 3.53125, "learning_rate": 2.298806116131012e-05, "loss": 1.5014597177505493, "step": 2054 }, { "epoch": 0.535958792462672, "grad_norm": 3.828125, "learning_rate": 2.297362908888093e-05, "loss": 1.5135198831558228, "step": 2055 }, { "epoch": 0.5362195996609507, "grad_norm": 3.515625, "learning_rate": 2.2959195432871012e-05, "loss": 1.4736385345458984, "step": 2056 }, { "epoch": 0.5364804068592293, "grad_norm": 3.734375, "learning_rate": 2.2944760200966876e-05, "loss": 1.570051670074463, "step": 2057 }, { "epoch": 0.536741214057508, "grad_norm": 3.5625, "learning_rate": 2.2930323400855875e-05, "loss": 1.6457270383834839, "step": 2058 }, { "epoch": 0.5370020212557867, "grad_norm": 3.4375, "learning_rate": 2.2915885040226205e-05, "loss": 1.40768301486969, "step": 2059 }, { "epoch": 0.5372628284540654, "grad_norm": 3.921875, "learning_rate": 2.29014451267669e-05, "loss": 1.632968544960022, "step": 2060 }, { "epoch": 0.537523635652344, "grad_norm": 3.203125, "learning_rate": 2.2887003668167803e-05, "loss": 1.3858742713928223, "step": 2061 }, { "epoch": 0.5377844428506227, "grad_norm": 3.40625, "learning_rate": 2.28725606721196e-05, "loss": 1.4711406230926514, "step": 2062 }, { "epoch": 0.5380452500489014, "grad_norm": 3.53125, "learning_rate": 2.2858116146313772e-05, "loss": 1.7600085735321045, "step": 2063 }, { "epoch": 0.5383060572471801, "grad_norm": 3.53125, "learning_rate": 2.2843670098442634e-05, "loss": 1.6487839221954346, "step": 2064 }, { "epoch": 0.5385668644454586, "grad_norm": 3.296875, "learning_rate": 2.2829222536199308e-05, "loss": 1.547398567199707, "step": 2065 }, { "epoch": 0.5388276716437373, "grad_norm": 3.484375, "learning_rate": 2.281477346727772e-05, "loss": 1.616747260093689, "step": 2066 }, { "epoch": 0.539088478842016, "grad_norm": 3.828125, "learning_rate": 2.2800322899372586e-05, "loss": 1.5411043167114258, "step": 2067 }, { "epoch": 0.5393492860402948, "grad_norm": 3.984375, "learning_rate": 2.2785870840179437e-05, "loss": 1.7756426334381104, "step": 2068 }, { "epoch": 0.5396100932385733, "grad_norm": 3.28125, "learning_rate": 2.2771417297394613e-05, "loss": 1.355830192565918, "step": 2069 }, { "epoch": 0.539870900436852, "grad_norm": 3.765625, "learning_rate": 2.27569622787152e-05, "loss": 1.398935317993164, "step": 2070 }, { "epoch": 0.5401317076351307, "grad_norm": 3.8125, "learning_rate": 2.274250579183911e-05, "loss": 1.6798679828643799, "step": 2071 }, { "epoch": 0.5403925148334094, "grad_norm": 3.78125, "learning_rate": 2.2728047844465006e-05, "loss": 1.7959800958633423, "step": 2072 }, { "epoch": 0.540653322031688, "grad_norm": 3.390625, "learning_rate": 2.2713588444292358e-05, "loss": 1.4074865579605103, "step": 2073 }, { "epoch": 0.5409141292299667, "grad_norm": 3.46875, "learning_rate": 2.2699127599021397e-05, "loss": 1.409622311592102, "step": 2074 }, { "epoch": 0.5411749364282454, "grad_norm": 3.328125, "learning_rate": 2.2684665316353112e-05, "loss": 1.2508794069290161, "step": 2075 }, { "epoch": 0.5414357436265241, "grad_norm": 3.578125, "learning_rate": 2.2670201603989275e-05, "loss": 1.6247743368148804, "step": 2076 }, { "epoch": 0.5416965508248027, "grad_norm": 3.203125, "learning_rate": 2.265573646963241e-05, "loss": 1.2508021593093872, "step": 2077 }, { "epoch": 0.5419573580230814, "grad_norm": 3.453125, "learning_rate": 2.26412699209858e-05, "loss": 1.3514494895935059, "step": 2078 }, { "epoch": 0.5422181652213601, "grad_norm": 3.5, "learning_rate": 2.2626801965753483e-05, "loss": 1.5631085634231567, "step": 2079 }, { "epoch": 0.5424789724196388, "grad_norm": 3.578125, "learning_rate": 2.2612332611640243e-05, "loss": 1.5388072729110718, "step": 2080 }, { "epoch": 0.5427397796179174, "grad_norm": 3.328125, "learning_rate": 2.259786186635161e-05, "loss": 1.2644977569580078, "step": 2081 }, { "epoch": 0.5430005868161961, "grad_norm": 3.40625, "learning_rate": 2.258338973759386e-05, "loss": 1.6229095458984375, "step": 2082 }, { "epoch": 0.5432613940144748, "grad_norm": 3.46875, "learning_rate": 2.2568916233074004e-05, "loss": 1.4737906455993652, "step": 2083 }, { "epoch": 0.5435222012127535, "grad_norm": 3.421875, "learning_rate": 2.2554441360499775e-05, "loss": 1.6421562433242798, "step": 2084 }, { "epoch": 0.5437830084110321, "grad_norm": 3.59375, "learning_rate": 2.253996512757964e-05, "loss": 1.3732681274414062, "step": 2085 }, { "epoch": 0.5440438156093108, "grad_norm": 3.5, "learning_rate": 2.2525487542022808e-05, "loss": 1.5210660696029663, "step": 2086 }, { "epoch": 0.5443046228075895, "grad_norm": 3.25, "learning_rate": 2.2511008611539177e-05, "loss": 1.2596254348754883, "step": 2087 }, { "epoch": 0.5445654300058682, "grad_norm": 3.265625, "learning_rate": 2.249652834383939e-05, "loss": 1.3855814933776855, "step": 2088 }, { "epoch": 0.5448262372041468, "grad_norm": 3.53125, "learning_rate": 2.2482046746634784e-05, "loss": 1.484673023223877, "step": 2089 }, { "epoch": 0.5450870444024255, "grad_norm": 3.78125, "learning_rate": 2.2467563827637414e-05, "loss": 1.5831198692321777, "step": 2090 }, { "epoch": 0.5453478516007042, "grad_norm": 3.484375, "learning_rate": 2.2453079594560025e-05, "loss": 1.5446763038635254, "step": 2091 }, { "epoch": 0.5456086587989829, "grad_norm": 3.5625, "learning_rate": 2.2438594055116077e-05, "loss": 1.385252594947815, "step": 2092 }, { "epoch": 0.5458694659972615, "grad_norm": 3.609375, "learning_rate": 2.2424107217019724e-05, "loss": 1.5257917642593384, "step": 2093 }, { "epoch": 0.5461302731955402, "grad_norm": 3.625, "learning_rate": 2.24096190879858e-05, "loss": 1.4672267436981201, "step": 2094 }, { "epoch": 0.5463910803938189, "grad_norm": 3.515625, "learning_rate": 2.2395129675729845e-05, "loss": 1.6204917430877686, "step": 2095 }, { "epoch": 0.5466518875920976, "grad_norm": 3.734375, "learning_rate": 2.238063898796806e-05, "loss": 1.6788408756256104, "step": 2096 }, { "epoch": 0.5469126947903762, "grad_norm": 3.53125, "learning_rate": 2.236614703241734e-05, "loss": 1.4796342849731445, "step": 2097 }, { "epoch": 0.5471735019886549, "grad_norm": 3.625, "learning_rate": 2.2351653816795263e-05, "loss": 1.5955634117126465, "step": 2098 }, { "epoch": 0.5474343091869336, "grad_norm": 3.40625, "learning_rate": 2.233715934882005e-05, "loss": 1.5928817987442017, "step": 2099 }, { "epoch": 0.5476951163852122, "grad_norm": 3.40625, "learning_rate": 2.232266363621062e-05, "loss": 1.5585155487060547, "step": 2100 }, { "epoch": 0.5479559235834909, "grad_norm": 3.859375, "learning_rate": 2.230816668668653e-05, "loss": 1.6276161670684814, "step": 2101 }, { "epoch": 0.5482167307817696, "grad_norm": 3.421875, "learning_rate": 2.2293668507968015e-05, "loss": 1.4985865354537964, "step": 2102 }, { "epoch": 0.5484775379800483, "grad_norm": 3.75, "learning_rate": 2.2279169107775944e-05, "loss": 1.6169122457504272, "step": 2103 }, { "epoch": 0.5487383451783269, "grad_norm": 3.390625, "learning_rate": 2.2264668493831863e-05, "loss": 1.5767686367034912, "step": 2104 }, { "epoch": 0.5489991523766056, "grad_norm": 3.84375, "learning_rate": 2.225016667385795e-05, "loss": 1.4845507144927979, "step": 2105 }, { "epoch": 0.5492599595748843, "grad_norm": 3.59375, "learning_rate": 2.2235663655577006e-05, "loss": 1.5769743919372559, "step": 2106 }, { "epoch": 0.549520766773163, "grad_norm": 3.59375, "learning_rate": 2.2221159446712513e-05, "loss": 1.3913387060165405, "step": 2107 }, { "epoch": 0.5497815739714416, "grad_norm": 3.59375, "learning_rate": 2.2206654054988545e-05, "loss": 1.4873542785644531, "step": 2108 }, { "epoch": 0.5500423811697203, "grad_norm": 3.515625, "learning_rate": 2.2192147488129837e-05, "loss": 1.367735505104065, "step": 2109 }, { "epoch": 0.550303188367999, "grad_norm": 3.421875, "learning_rate": 2.2177639753861735e-05, "loss": 1.3265107870101929, "step": 2110 }, { "epoch": 0.5505639955662777, "grad_norm": 3.625, "learning_rate": 2.21631308599102e-05, "loss": 1.437415361404419, "step": 2111 }, { "epoch": 0.5508248027645563, "grad_norm": 3.53125, "learning_rate": 2.2148620814001828e-05, "loss": 1.553528070449829, "step": 2112 }, { "epoch": 0.551085609962835, "grad_norm": 3.6875, "learning_rate": 2.2134109623863815e-05, "loss": 1.4988465309143066, "step": 2113 }, { "epoch": 0.5513464171611137, "grad_norm": 3.53125, "learning_rate": 2.2119597297223976e-05, "loss": 1.4551095962524414, "step": 2114 }, { "epoch": 0.5516072243593924, "grad_norm": 3.46875, "learning_rate": 2.2105083841810718e-05, "loss": 1.2718291282653809, "step": 2115 }, { "epoch": 0.551868031557671, "grad_norm": 3.484375, "learning_rate": 2.209056926535307e-05, "loss": 1.6388218402862549, "step": 2116 }, { "epoch": 0.5521288387559496, "grad_norm": 3.71875, "learning_rate": 2.207605357558064e-05, "loss": 1.6339783668518066, "step": 2117 }, { "epoch": 0.5523896459542283, "grad_norm": 3.28125, "learning_rate": 2.2061536780223634e-05, "loss": 1.6556386947631836, "step": 2118 }, { "epoch": 0.552650453152507, "grad_norm": 3.625, "learning_rate": 2.2047018887012838e-05, "loss": 1.7499902248382568, "step": 2119 }, { "epoch": 0.5529112603507856, "grad_norm": 3.390625, "learning_rate": 2.2032499903679648e-05, "loss": 1.5917760133743286, "step": 2120 }, { "epoch": 0.5531720675490643, "grad_norm": 3.359375, "learning_rate": 2.201797983795601e-05, "loss": 1.4344977140426636, "step": 2121 }, { "epoch": 0.553432874747343, "grad_norm": 3.234375, "learning_rate": 2.200345869757448e-05, "loss": 1.4399155378341675, "step": 2122 }, { "epoch": 0.5536936819456217, "grad_norm": 3.359375, "learning_rate": 2.1988936490268142e-05, "loss": 1.5594520568847656, "step": 2123 }, { "epoch": 0.5539544891439003, "grad_norm": 3.515625, "learning_rate": 2.1974413223770695e-05, "loss": 1.6037397384643555, "step": 2124 }, { "epoch": 0.554215296342179, "grad_norm": 3.875, "learning_rate": 2.195988890581637e-05, "loss": 1.6540207862854004, "step": 2125 }, { "epoch": 0.5544761035404577, "grad_norm": 3.640625, "learning_rate": 2.1945363544139963e-05, "loss": 1.4645299911499023, "step": 2126 }, { "epoch": 0.5547369107387364, "grad_norm": 3.34375, "learning_rate": 2.193083714647685e-05, "loss": 1.3683663606643677, "step": 2127 }, { "epoch": 0.554997717937015, "grad_norm": 3.453125, "learning_rate": 2.1916309720562915e-05, "loss": 1.2491161823272705, "step": 2128 }, { "epoch": 0.5552585251352937, "grad_norm": 3.609375, "learning_rate": 2.1901781274134633e-05, "loss": 1.5425299406051636, "step": 2129 }, { "epoch": 0.5555193323335724, "grad_norm": 3.59375, "learning_rate": 2.1887251814928998e-05, "loss": 1.514522671699524, "step": 2130 }, { "epoch": 0.5557801395318511, "grad_norm": 3.40625, "learning_rate": 2.1872721350683552e-05, "loss": 1.4008015394210815, "step": 2131 }, { "epoch": 0.5560409467301297, "grad_norm": 3.5, "learning_rate": 2.1858189889136363e-05, "loss": 1.4071357250213623, "step": 2132 }, { "epoch": 0.5563017539284084, "grad_norm": 3.46875, "learning_rate": 2.1843657438026038e-05, "loss": 1.4505395889282227, "step": 2133 }, { "epoch": 0.5565625611266871, "grad_norm": 5.34375, "learning_rate": 2.182912400509172e-05, "loss": 2.069634437561035, "step": 2134 }, { "epoch": 0.5568233683249658, "grad_norm": 3.53125, "learning_rate": 2.181458959807305e-05, "loss": 1.4999897480010986, "step": 2135 }, { "epoch": 0.5570841755232444, "grad_norm": 3.203125, "learning_rate": 2.1800054224710213e-05, "loss": 1.4152805805206299, "step": 2136 }, { "epoch": 0.5573449827215231, "grad_norm": 3.6875, "learning_rate": 2.1785517892743887e-05, "loss": 1.6478434801101685, "step": 2137 }, { "epoch": 0.5576057899198018, "grad_norm": 3.765625, "learning_rate": 2.1770980609915283e-05, "loss": 1.5277208089828491, "step": 2138 }, { "epoch": 0.5578665971180805, "grad_norm": 3.609375, "learning_rate": 2.1756442383966102e-05, "loss": 1.4693225622177124, "step": 2139 }, { "epoch": 0.5581274043163591, "grad_norm": 3.171875, "learning_rate": 2.174190322263855e-05, "loss": 1.1739648580551147, "step": 2140 }, { "epoch": 0.5583882115146378, "grad_norm": 3.4375, "learning_rate": 2.172736313367533e-05, "loss": 1.4084835052490234, "step": 2141 }, { "epoch": 0.5586490187129165, "grad_norm": 3.578125, "learning_rate": 2.171282212481965e-05, "loss": 1.7365339994430542, "step": 2142 }, { "epoch": 0.5589098259111952, "grad_norm": 3.421875, "learning_rate": 2.1698280203815193e-05, "loss": 1.377223014831543, "step": 2143 }, { "epoch": 0.5591706331094738, "grad_norm": 3.671875, "learning_rate": 2.1683737378406143e-05, "loss": 1.5929501056671143, "step": 2144 }, { "epoch": 0.5594314403077525, "grad_norm": 3.640625, "learning_rate": 2.1669193656337147e-05, "loss": 1.577168345451355, "step": 2145 }, { "epoch": 0.5596922475060312, "grad_norm": 3.609375, "learning_rate": 2.1654649045353348e-05, "loss": 1.5571691989898682, "step": 2146 }, { "epoch": 0.5599530547043099, "grad_norm": 3.75, "learning_rate": 2.164010355320035e-05, "loss": 1.7082301378250122, "step": 2147 }, { "epoch": 0.5602138619025885, "grad_norm": 3.375, "learning_rate": 2.162555718762423e-05, "loss": 1.5191845893859863, "step": 2148 }, { "epoch": 0.5604746691008672, "grad_norm": 3.421875, "learning_rate": 2.1611009956371533e-05, "loss": 1.469129204750061, "step": 2149 }, { "epoch": 0.5607354762991459, "grad_norm": 3.578125, "learning_rate": 2.1596461867189257e-05, "loss": 1.5439426898956299, "step": 2150 }, { "epoch": 0.5609962834974245, "grad_norm": 3.625, "learning_rate": 2.1581912927824878e-05, "loss": 1.470725655555725, "step": 2151 }, { "epoch": 0.5612570906957032, "grad_norm": 3.609375, "learning_rate": 2.156736314602629e-05, "loss": 1.6040174961090088, "step": 2152 }, { "epoch": 0.5615178978939819, "grad_norm": 3.703125, "learning_rate": 2.1552812529541865e-05, "loss": 1.6059777736663818, "step": 2153 }, { "epoch": 0.5617787050922606, "grad_norm": 3.59375, "learning_rate": 2.1538261086120408e-05, "loss": 1.501955509185791, "step": 2154 }, { "epoch": 0.5620395122905392, "grad_norm": 3.515625, "learning_rate": 2.1523708823511168e-05, "loss": 1.4939640760421753, "step": 2155 }, { "epoch": 0.5623003194888179, "grad_norm": 3.390625, "learning_rate": 2.1509155749463823e-05, "loss": 1.7001293897628784, "step": 2156 }, { "epoch": 0.5625611266870966, "grad_norm": 3.625, "learning_rate": 2.149460187172849e-05, "loss": 1.603058099746704, "step": 2157 }, { "epoch": 0.5628219338853753, "grad_norm": 3.875, "learning_rate": 2.148004719805571e-05, "loss": 1.5233949422836304, "step": 2158 }, { "epoch": 0.5630827410836539, "grad_norm": 3.296875, "learning_rate": 2.146549173619646e-05, "loss": 1.4987554550170898, "step": 2159 }, { "epoch": 0.5633435482819326, "grad_norm": 3.46875, "learning_rate": 2.145093549390211e-05, "loss": 1.684701681137085, "step": 2160 }, { "epoch": 0.5636043554802113, "grad_norm": 3.46875, "learning_rate": 2.143637847892448e-05, "loss": 1.4607975482940674, "step": 2161 }, { "epoch": 0.56386516267849, "grad_norm": 3.390625, "learning_rate": 2.1421820699015763e-05, "loss": 1.4865243434906006, "step": 2162 }, { "epoch": 0.5641259698767686, "grad_norm": 3.375, "learning_rate": 2.1407262161928607e-05, "loss": 1.3964680433273315, "step": 2163 }, { "epoch": 0.5643867770750473, "grad_norm": 3.453125, "learning_rate": 2.1392702875416017e-05, "loss": 1.5070065259933472, "step": 2164 }, { "epoch": 0.564647584273326, "grad_norm": 3.453125, "learning_rate": 2.1378142847231417e-05, "loss": 1.4372220039367676, "step": 2165 }, { "epoch": 0.5649083914716047, "grad_norm": 3.59375, "learning_rate": 2.1363582085128635e-05, "loss": 1.735957384109497, "step": 2166 }, { "epoch": 0.5651691986698832, "grad_norm": 3.546875, "learning_rate": 2.134902059686187e-05, "loss": 1.5055205821990967, "step": 2167 }, { "epoch": 0.565430005868162, "grad_norm": 3.421875, "learning_rate": 2.1334458390185736e-05, "loss": 1.274411916732788, "step": 2168 }, { "epoch": 0.5656908130664406, "grad_norm": 3.4375, "learning_rate": 2.131989547285519e-05, "loss": 1.6582330465316772, "step": 2169 }, { "epoch": 0.5659516202647193, "grad_norm": 3.78125, "learning_rate": 2.1305331852625596e-05, "loss": 1.6835277080535889, "step": 2170 }, { "epoch": 0.5662124274629979, "grad_norm": 3.265625, "learning_rate": 2.129076753725269e-05, "loss": 1.3967995643615723, "step": 2171 }, { "epoch": 0.5664732346612766, "grad_norm": 3.578125, "learning_rate": 2.1276202534492566e-05, "loss": 1.6190301179885864, "step": 2172 }, { "epoch": 0.5667340418595553, "grad_norm": 3.453125, "learning_rate": 2.126163685210171e-05, "loss": 1.456328272819519, "step": 2173 }, { "epoch": 0.566994849057834, "grad_norm": 3.53125, "learning_rate": 2.1247070497836926e-05, "loss": 1.5893317461013794, "step": 2174 }, { "epoch": 0.5672556562561126, "grad_norm": 3.625, "learning_rate": 2.123250347945542e-05, "loss": 1.6280925273895264, "step": 2175 }, { "epoch": 0.5675164634543913, "grad_norm": 3.359375, "learning_rate": 2.1217935804714722e-05, "loss": 1.316066861152649, "step": 2176 }, { "epoch": 0.56777727065267, "grad_norm": 3.515625, "learning_rate": 2.120336748137273e-05, "loss": 1.6684691905975342, "step": 2177 }, { "epoch": 0.5680380778509487, "grad_norm": 3.46875, "learning_rate": 2.1188798517187683e-05, "loss": 1.643919587135315, "step": 2178 }, { "epoch": 0.5682988850492273, "grad_norm": 3.828125, "learning_rate": 2.117422891991814e-05, "loss": 1.4487578868865967, "step": 2179 }, { "epoch": 0.568559692247506, "grad_norm": 3.546875, "learning_rate": 2.1159658697323044e-05, "loss": 1.3514080047607422, "step": 2180 }, { "epoch": 0.5688204994457847, "grad_norm": 3.703125, "learning_rate": 2.1145087857161614e-05, "loss": 1.6331803798675537, "step": 2181 }, { "epoch": 0.5690813066440634, "grad_norm": 3.5625, "learning_rate": 2.1130516407193445e-05, "loss": 1.6580138206481934, "step": 2182 }, { "epoch": 0.569342113842342, "grad_norm": 3.765625, "learning_rate": 2.1115944355178427e-05, "loss": 1.5402663946151733, "step": 2183 }, { "epoch": 0.5696029210406207, "grad_norm": 3.453125, "learning_rate": 2.1101371708876786e-05, "loss": 1.4032838344573975, "step": 2184 }, { "epoch": 0.5698637282388994, "grad_norm": 3.375, "learning_rate": 2.1086798476049068e-05, "loss": 1.3917063474655151, "step": 2185 }, { "epoch": 0.5701245354371781, "grad_norm": 3.5, "learning_rate": 2.1072224664456114e-05, "loss": 1.6108860969543457, "step": 2186 }, { "epoch": 0.5703853426354567, "grad_norm": 3.421875, "learning_rate": 2.1057650281859083e-05, "loss": 1.412126064300537, "step": 2187 }, { "epoch": 0.5706461498337354, "grad_norm": 3.734375, "learning_rate": 2.104307533601944e-05, "loss": 1.647977352142334, "step": 2188 }, { "epoch": 0.5709069570320141, "grad_norm": 3.59375, "learning_rate": 2.1028499834698946e-05, "loss": 1.6325180530548096, "step": 2189 }, { "epoch": 0.5711677642302928, "grad_norm": 3.296875, "learning_rate": 2.101392378565967e-05, "loss": 1.5504090785980225, "step": 2190 }, { "epoch": 0.5714285714285714, "grad_norm": 3.421875, "learning_rate": 2.0999347196663943e-05, "loss": 1.5248416662216187, "step": 2191 }, { "epoch": 0.5716893786268501, "grad_norm": 3.421875, "learning_rate": 2.0984770075474414e-05, "loss": 1.59442138671875, "step": 2192 }, { "epoch": 0.5719501858251288, "grad_norm": 3.453125, "learning_rate": 2.0970192429854004e-05, "loss": 1.3801133632659912, "step": 2193 }, { "epoch": 0.5722109930234075, "grad_norm": 3.609375, "learning_rate": 2.0955614267565915e-05, "loss": 1.6320899724960327, "step": 2194 }, { "epoch": 0.5724718002216861, "grad_norm": 3.609375, "learning_rate": 2.0941035596373625e-05, "loss": 1.6819474697113037, "step": 2195 }, { "epoch": 0.5727326074199648, "grad_norm": 3.390625, "learning_rate": 2.0926456424040865e-05, "loss": 1.2906243801116943, "step": 2196 }, { "epoch": 0.5729934146182435, "grad_norm": 3.28125, "learning_rate": 2.091187675833167e-05, "loss": 1.4808273315429688, "step": 2197 }, { "epoch": 0.5732542218165222, "grad_norm": 3.84375, "learning_rate": 2.08972966070103e-05, "loss": 1.463750958442688, "step": 2198 }, { "epoch": 0.5735150290148008, "grad_norm": 3.8125, "learning_rate": 2.0882715977841296e-05, "loss": 1.793060302734375, "step": 2199 }, { "epoch": 0.5737758362130795, "grad_norm": 3.515625, "learning_rate": 2.0868134878589452e-05, "loss": 1.4461506605148315, "step": 2200 }, { "epoch": 0.5740366434113582, "grad_norm": 3.328125, "learning_rate": 2.0853553317019798e-05, "loss": 1.4556095600128174, "step": 2201 }, { "epoch": 0.5742974506096368, "grad_norm": 3.546875, "learning_rate": 2.083897130089763e-05, "loss": 1.4989452362060547, "step": 2202 }, { "epoch": 0.5745582578079155, "grad_norm": 3.578125, "learning_rate": 2.082438883798847e-05, "loss": 1.666111946105957, "step": 2203 }, { "epoch": 0.5748190650061942, "grad_norm": 3.359375, "learning_rate": 2.080980593605808e-05, "loss": 1.2804036140441895, "step": 2204 }, { "epoch": 0.5750798722044729, "grad_norm": 3.765625, "learning_rate": 2.079522260287247e-05, "loss": 1.3963851928710938, "step": 2205 }, { "epoch": 0.5753406794027515, "grad_norm": 3.53125, "learning_rate": 2.0780638846197857e-05, "loss": 1.283686637878418, "step": 2206 }, { "epoch": 0.5756014866010302, "grad_norm": 3.296875, "learning_rate": 2.076605467380071e-05, "loss": 1.511763572692871, "step": 2207 }, { "epoch": 0.5758622937993089, "grad_norm": 3.703125, "learning_rate": 2.0751470093447694e-05, "loss": 1.594511866569519, "step": 2208 }, { "epoch": 0.5761231009975876, "grad_norm": 3.484375, "learning_rate": 2.0736885112905708e-05, "loss": 1.5738933086395264, "step": 2209 }, { "epoch": 0.5763839081958662, "grad_norm": 3.640625, "learning_rate": 2.0722299739941857e-05, "loss": 1.3943856954574585, "step": 2210 }, { "epoch": 0.5766447153941449, "grad_norm": 3.578125, "learning_rate": 2.0707713982323456e-05, "loss": 1.4696235656738281, "step": 2211 }, { "epoch": 0.5769055225924236, "grad_norm": 3.15625, "learning_rate": 2.069312784781803e-05, "loss": 1.5212304592132568, "step": 2212 }, { "epoch": 0.5771663297907023, "grad_norm": 3.453125, "learning_rate": 2.067854134419329e-05, "loss": 1.3783750534057617, "step": 2213 }, { "epoch": 0.5774271369889808, "grad_norm": 3.734375, "learning_rate": 2.066395447921717e-05, "loss": 1.7481436729431152, "step": 2214 }, { "epoch": 0.5776879441872595, "grad_norm": 3.796875, "learning_rate": 2.064936726065776e-05, "loss": 1.6693224906921387, "step": 2215 }, { "epoch": 0.5779487513855383, "grad_norm": 3.515625, "learning_rate": 2.063477969628337e-05, "loss": 1.3997015953063965, "step": 2216 }, { "epoch": 0.578209558583817, "grad_norm": 3.484375, "learning_rate": 2.0620191793862485e-05, "loss": 1.6478688716888428, "step": 2217 }, { "epoch": 0.5784703657820955, "grad_norm": 3.625, "learning_rate": 2.0605603561163762e-05, "loss": 1.5532352924346924, "step": 2218 }, { "epoch": 0.5787311729803742, "grad_norm": 3.5, "learning_rate": 2.059101500595605e-05, "loss": 1.639623761177063, "step": 2219 }, { "epoch": 0.5789919801786529, "grad_norm": 3.75, "learning_rate": 2.0576426136008344e-05, "loss": 1.3887747526168823, "step": 2220 }, { "epoch": 0.5792527873769316, "grad_norm": 3.4375, "learning_rate": 2.0561836959089828e-05, "loss": 1.713215947151184, "step": 2221 }, { "epoch": 0.5795135945752102, "grad_norm": 3.296875, "learning_rate": 2.054724748296985e-05, "loss": 1.2385673522949219, "step": 2222 }, { "epoch": 0.5797744017734889, "grad_norm": 3.671875, "learning_rate": 2.0532657715417895e-05, "loss": 1.5485032796859741, "step": 2223 }, { "epoch": 0.5800352089717676, "grad_norm": 3.328125, "learning_rate": 2.0518067664203643e-05, "loss": 1.702805519104004, "step": 2224 }, { "epoch": 0.5802960161700463, "grad_norm": 3.34375, "learning_rate": 2.0503477337096878e-05, "loss": 1.4689847230911255, "step": 2225 }, { "epoch": 0.5805568233683249, "grad_norm": 3.5625, "learning_rate": 2.048888674186756e-05, "loss": 1.5059980154037476, "step": 2226 }, { "epoch": 0.5808176305666036, "grad_norm": 3.5, "learning_rate": 2.0474295886285797e-05, "loss": 1.4363094568252563, "step": 2227 }, { "epoch": 0.5810784377648823, "grad_norm": 3.921875, "learning_rate": 2.045970477812181e-05, "loss": 1.6230552196502686, "step": 2228 }, { "epoch": 0.581339244963161, "grad_norm": 3.75, "learning_rate": 2.0445113425145983e-05, "loss": 1.429176688194275, "step": 2229 }, { "epoch": 0.5816000521614396, "grad_norm": 3.765625, "learning_rate": 2.0430521835128795e-05, "loss": 1.6176044940948486, "step": 2230 }, { "epoch": 0.5818608593597183, "grad_norm": 3.625, "learning_rate": 2.0415930015840896e-05, "loss": 1.7044963836669922, "step": 2231 }, { "epoch": 0.582121666557997, "grad_norm": 3.390625, "learning_rate": 2.0401337975053024e-05, "loss": 1.4895380735397339, "step": 2232 }, { "epoch": 0.5823824737562757, "grad_norm": 3.265625, "learning_rate": 2.038674572053604e-05, "loss": 1.3310184478759766, "step": 2233 }, { "epoch": 0.5826432809545543, "grad_norm": 3.28125, "learning_rate": 2.0372153260060937e-05, "loss": 1.7036465406417847, "step": 2234 }, { "epoch": 0.582904088152833, "grad_norm": 3.234375, "learning_rate": 2.035756060139879e-05, "loss": 1.4056061506271362, "step": 2235 }, { "epoch": 0.5831648953511117, "grad_norm": 3.453125, "learning_rate": 2.034296775232081e-05, "loss": 1.565107822418213, "step": 2236 }, { "epoch": 0.5834257025493904, "grad_norm": 3.625, "learning_rate": 2.0328374720598286e-05, "loss": 1.4062299728393555, "step": 2237 }, { "epoch": 0.583686509747669, "grad_norm": 3.375, "learning_rate": 2.0313781514002615e-05, "loss": 1.3680182695388794, "step": 2238 }, { "epoch": 0.5839473169459477, "grad_norm": 3.6875, "learning_rate": 2.0299188140305276e-05, "loss": 1.4250578880310059, "step": 2239 }, { "epoch": 0.5842081241442264, "grad_norm": 3.515625, "learning_rate": 2.028459460727785e-05, "loss": 1.310401439666748, "step": 2240 }, { "epoch": 0.5844689313425051, "grad_norm": 3.4375, "learning_rate": 2.027000092269201e-05, "loss": 1.3361576795578003, "step": 2241 }, { "epoch": 0.5847297385407837, "grad_norm": 3.375, "learning_rate": 2.025540709431948e-05, "loss": 1.2321393489837646, "step": 2242 }, { "epoch": 0.5849905457390624, "grad_norm": 3.59375, "learning_rate": 2.0240813129932086e-05, "loss": 1.6547679901123047, "step": 2243 }, { "epoch": 0.5852513529373411, "grad_norm": 3.453125, "learning_rate": 2.0226219037301723e-05, "loss": 1.5261492729187012, "step": 2244 }, { "epoch": 0.5855121601356198, "grad_norm": 3.1875, "learning_rate": 2.021162482420034e-05, "loss": 1.3989250659942627, "step": 2245 }, { "epoch": 0.5857729673338984, "grad_norm": 3.3125, "learning_rate": 2.0197030498399975e-05, "loss": 1.4565719366073608, "step": 2246 }, { "epoch": 0.5860337745321771, "grad_norm": 3.578125, "learning_rate": 2.0182436067672695e-05, "loss": 1.4274851083755493, "step": 2247 }, { "epoch": 0.5862945817304558, "grad_norm": 3.4375, "learning_rate": 2.0167841539790657e-05, "loss": 1.513249158859253, "step": 2248 }, { "epoch": 0.5865553889287344, "grad_norm": 3.234375, "learning_rate": 2.0153246922526034e-05, "loss": 1.4265291690826416, "step": 2249 }, { "epoch": 0.5868161961270131, "grad_norm": 3.515625, "learning_rate": 2.0138652223651084e-05, "loss": 1.5014431476593018, "step": 2250 }, { "epoch": 0.5870770033252918, "grad_norm": 3.5625, "learning_rate": 2.0124057450938062e-05, "loss": 1.5550901889801025, "step": 2251 }, { "epoch": 0.5873378105235705, "grad_norm": 3.40625, "learning_rate": 2.0109462612159314e-05, "loss": 1.456763505935669, "step": 2252 }, { "epoch": 0.5875986177218491, "grad_norm": 3.671875, "learning_rate": 2.0094867715087192e-05, "loss": 1.6487913131713867, "step": 2253 }, { "epoch": 0.5878594249201278, "grad_norm": 3.75, "learning_rate": 2.0080272767494075e-05, "loss": 1.5697367191314697, "step": 2254 }, { "epoch": 0.5881202321184065, "grad_norm": 3.296875, "learning_rate": 2.0065677777152387e-05, "loss": 1.5290368795394897, "step": 2255 }, { "epoch": 0.5883810393166852, "grad_norm": 3.625, "learning_rate": 2.0051082751834548e-05, "loss": 1.5546298027038574, "step": 2256 }, { "epoch": 0.5886418465149638, "grad_norm": 3.4375, "learning_rate": 2.0036487699313035e-05, "loss": 1.705029845237732, "step": 2257 }, { "epoch": 0.5889026537132425, "grad_norm": 3.59375, "learning_rate": 2.0021892627360313e-05, "loss": 1.6264777183532715, "step": 2258 }, { "epoch": 0.5891634609115212, "grad_norm": 3.359375, "learning_rate": 2.0007297543748856e-05, "loss": 1.5446674823760986, "step": 2259 }, { "epoch": 0.5894242681097999, "grad_norm": 3.8125, "learning_rate": 1.999270245625115e-05, "loss": 1.6087820529937744, "step": 2260 }, { "epoch": 0.5896850753080785, "grad_norm": 3.328125, "learning_rate": 1.9978107372639697e-05, "loss": 1.6164394617080688, "step": 2261 }, { "epoch": 0.5899458825063572, "grad_norm": 3.609375, "learning_rate": 1.996351230068697e-05, "loss": 1.3870233297348022, "step": 2262 }, { "epoch": 0.5902066897046359, "grad_norm": 3.4375, "learning_rate": 1.9948917248165452e-05, "loss": 1.3601336479187012, "step": 2263 }, { "epoch": 0.5904674969029146, "grad_norm": 3.515625, "learning_rate": 1.9934322222847626e-05, "loss": 1.4149237871170044, "step": 2264 }, { "epoch": 0.5907283041011931, "grad_norm": 3.28125, "learning_rate": 1.991972723250593e-05, "loss": 1.4162180423736572, "step": 2265 }, { "epoch": 0.5909891112994718, "grad_norm": 3.375, "learning_rate": 1.990513228491281e-05, "loss": 1.2737969160079956, "step": 2266 }, { "epoch": 0.5912499184977505, "grad_norm": 3.578125, "learning_rate": 1.9890537387840693e-05, "loss": 1.515159249305725, "step": 2267 }, { "epoch": 0.5915107256960292, "grad_norm": 3.546875, "learning_rate": 1.987594254906194e-05, "loss": 1.4385640621185303, "step": 2268 }, { "epoch": 0.5917715328943078, "grad_norm": 3.6875, "learning_rate": 1.986134777634893e-05, "loss": 1.728789210319519, "step": 2269 }, { "epoch": 0.5920323400925865, "grad_norm": 3.59375, "learning_rate": 1.984675307747397e-05, "loss": 1.5056780576705933, "step": 2270 }, { "epoch": 0.5922931472908652, "grad_norm": 3.34375, "learning_rate": 1.9832158460209346e-05, "loss": 1.262589693069458, "step": 2271 }, { "epoch": 0.5925539544891439, "grad_norm": 3.734375, "learning_rate": 1.9817563932327312e-05, "loss": 1.6616630554199219, "step": 2272 }, { "epoch": 0.5928147616874225, "grad_norm": 3.65625, "learning_rate": 1.9802969501600028e-05, "loss": 1.808161735534668, "step": 2273 } ], "logging_steps": 1, "max_steps": 4411, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1324, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.0573094378695557e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }